Ridge and Lasso Regression

This notebook demonstrates regularized regression with statsmodels, comparing Ridge and Lasso against an ordinary least squares (OLS) baseline.

Notebook Contents

This notebook covers the following regularized regression techniques (the shared penalized objective is sketched after the list):

  • Ridge Regression: L2 regularization for preventing overfitting
  • Lasso Regression: L1 regularization with feature selection
  • Elastic Net: Combining L1 and L2 regularization
  • Cross-Validation: Hyperparameter tuning techniques
  • Feature Selection: Understanding which features matter
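
Ridge, Lasso, and Elastic Net are all special cases of the penalized objective that statsmodels minimizes in fit_regularized, which the preview below uses throughout. As a sketch, assuming the elastic-net form documented for OLS.fit_regularized:

  0.5 * RSS / n  +  alpha * ( L1_wt * ||beta||_1  +  (1 - L1_wt) * 0.5 * ||beta||_2^2 )

Setting L1_wt=1 gives pure Lasso (L1), L1_wt=0 gives pure Ridge (L2), and values in between give Elastic Net.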


📓 Notebook Preview

InΒ [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.regression.linear_model import OLS
from statsmodels.tools import add_constant
# Set random seed for reproducibility
np.random.seed(42)
# Generate synthetic data with multicollinearity
n_samples = 200
n_features = 10
# Create correlated features
X = np.random.randn(n_samples, n_features)
# Add multicollinearity by making some features correlated
X[:, 1] = X[:, 0] + np.random.randn(n_samples) * 0.1
X[:, 2] = X[:, 0] - np.random.randn(n_samples) * 0.1
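# Ridge will tend to spread the shared X1 signal across the collinear X1-X3
# block, while Lasso tends to keep only part of the block (see output below)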
# True coefficients (sparse - only a few are non-zero)
true_coef = np.array([3, 0, 0, 2, 0, 0, 0, 1.5, 0, 0])
# Generate target variable with noise
y = X @ true_coef + np.random.randn(n_samples) * 0.5
# Create DataFrame for better visualization
feature_names = [f'X{i+1}' for i in range(n_features)]
df = pd.DataFrame(X, columns=feature_names)
df['y'] = y
# Split data into train and test sets
train_size = int(0.8 * n_samples)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]
# Add constant term
X_train_const = add_constant(X_train)
X_test_const = add_constant(X_test)
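# Caveat: fit_regularized penalizes every parameter, including this constant.
# Per the statsmodels docs, alpha may also be a per-parameter array, so a 0 in
# the intercept slot would leave it unpenalized (a small effect here, since X
# and hence y are roughly zero-mean)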
print("=" * 70)
print("RIDGE AND LASSO REGRESSION EXAMPLE")
print("=" * 70)
print(f"\nDataset Info:")
print(f"  Training samples: {len(X_train)}")
print(f"  Test samples: {len(X_test)}")
print(f"  Features: {n_features}")
print(f"\nTrue coefficients (sparse):")
for i, coef in enumerate(true_coef):
    if coef != 0:
        print(f"  {feature_names[i]}: {coef:.2f}")
# 1. ORDINARY LEAST SQUARES (OLS) - Baseline
print("\n" + "=" * 70)
print("1. ORDINARY LEAST SQUARES (OLS) - No Regularization")
print("=" * 70)
ols_model = OLS(y_train, X_train_const).fit()
print(f"\nR-squared (train): {ols_model.rsquared:.4f}")
# Predict on test set
y_pred_ols = ols_model.predict(X_test_const)
test_mse_ols = np.mean((y_test - y_pred_ols) ** 2)
print(f"Test MSE: {test_mse_ols:.4f}")
print("\nEstimated coefficients:")
for i, (name, coef) in enumerate(zip(['Intercept'] + feature_names, ols_model.params)):
    if i > 0:  # Skip intercept for comparison
        print(f"  {name}: {coef:7.4f} (True: {true_coef[i-1]:5.2f})")
# 2. RIDGE REGRESSION (L2 Regularization)
print("\n" + "=" * 70)
print("2. RIDGE REGRESSION (L2 Regularization)")
print("=" * 70)
# Try different alpha values for Ridge
alphas_ridge = [0.1, 1.0, 10.0, 100.0]
ridge_results = []
for alpha in alphas_ridge:
    ridge_model = OLS(y_train, X_train_const).fit_regularized(
        method='elastic_net',
        alpha=alpha,
        L1_wt=0  # L1_wt=0 means pure Ridge (L2)
    )
    # Predict on test set
    y_pred_ridge = ridge_model.predict(X_test_const)
    test_mse_ridge = np.mean((y_test - y_pred_ridge) ** 2)
    ridge_results.append({
        'alpha': alpha,
        'params': ridge_model.params,
        'test_mse': test_mse_ridge
    })
    print(f"\nAlpha = {alpha}")
    print(f"  Test MSE: {test_mse_ridge:.4f}")
# Best Ridge model
best_ridge = min(ridge_results, key=lambda x: x['test_mse'])
print(f"\nBest Ridge Alpha: {best_ridge['alpha']} (MSE: {best_ridge['test_mse']:.4f})")
print("Estimated coefficients:")
for i, (name, coef) in enumerate(zip(['Intercept'] + feature_names, best_ridge['params'])):
    if i > 0:
        print(f"  {name}: {coef:7.4f} (True: {true_coef[i-1]:5.2f})")
# 3. LASSO REGRESSION (L1 Regularization)
print("\n" + "=" * 70)
print("3. LASSO REGRESSION (L1 Regularization)")
print("=" * 70)
# Try different alpha values for Lasso
alphas_lasso = [0.01, 0.1, 0.5, 1.0]
lasso_results = []
for alpha in alphas_lasso:
    lasso_model = OLS(y_train, X_train_const).fit_regularized(
        method='elastic_net',
        alpha=alpha,
        L1_wt=1  # L1_wt=1 means pure Lasso (L1)
    )
    # Predict on test set
    y_pred_lasso = lasso_model.predict(X_test_const)
    test_mse_lasso = np.mean((y_test - y_pred_lasso) ** 2)
    # Count non-zero coefficients (excluding intercept)
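    # (a small tolerance, rather than == 0, guards against floating-point fuzz)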
    n_nonzero = np.sum(np.abs(lasso_model.params[1:]) > 1e-5)
    lasso_results.append({
        'alpha': alpha,
        'params': lasso_model.params,
        'test_mse': test_mse_lasso,
        'n_nonzero': n_nonzero
    })
    print(f"\nAlpha = {alpha}")
    print(f"  Test MSE: {test_mse_lasso:.4f}")
    print(f"  Non-zero coefficients: {n_nonzero}/{n_features}")
# Best Lasso model
best_lasso = min(lasso_results, key=lambda x: x['test_mse'])
print(f"\nBest Lasso Alpha: {best_lasso['alpha']} (MSE: {best_lasso['test_mse']:.4f})")
print("Estimated coefficients:")
for i, (name, coef) in enumerate(zip(['Intercept'] + feature_names, best_lasso['params'])):
    if i > 0:
        if abs(coef) > 1e-5:
            print(f"  {name}: {coef:7.4f} (True: {true_coef[i-1]:5.2f}) *")
        else:
            print(f"  {name}: {coef:7.4f} (True: {true_coef[i-1]:5.2f}) [zeroed]")
# 4. COMPARISON VISUALIZATION
print("\n" + "=" * 70)
print("4. MODEL COMPARISON")
print("=" * 70)
print(f"\nTest MSE Comparison:")
print(f"  OLS:          {test_mse_ols:.4f}")
print(f"  Ridge (best): {best_ridge['test_mse']:.4f}")
print(f"  Lasso (best): {best_lasso['test_mse']:.4f}")
# Create visualization
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# Plot 1: Coefficient comparison
ax1 = axes[0, 0]
x_pos = np.arange(n_features)
width = 0.2
# Center the four bar groups on each x tick
ax1.bar(x_pos - 1.5*width, true_coef, width, label='True', alpha=0.8)
ax1.bar(x_pos - 0.5*width, ols_model.params[1:], width, label='OLS', alpha=0.8)
ax1.bar(x_pos + 0.5*width, best_ridge['params'][1:], width, label='Ridge', alpha=0.8)
ax1.bar(x_pos + 1.5*width, best_lasso['params'][1:], width, label='Lasso', alpha=0.8)
ax1.set_xlabel('Feature')
ax1.set_ylabel('Coefficient Value')
ax1.set_title('Coefficient Comparison')
ax1.set_xticks(x_pos)
ax1.set_xticklabels(feature_names, rotation=45)
ax1.legend()
ax1.grid(True, alpha=0.3)
ax1.axhline(y=0, color='k', linestyle='-', linewidth=0.5)
# Plot 2: Ridge regularization path
ax2 = axes[0, 1]
ridge_coefs = np.array([r['params'][1:] for r in ridge_results]).T
for i, coef_path in enumerate(ridge_coefs):
    ax2.plot([r['alpha'] for r in ridge_results], coef_path, 
             marker='o', label=feature_names[i] if true_coef[i] != 0 else None)
ax2.set_xlabel('Alpha')
ax2.set_ylabel('Coefficient Value')
ax2.set_title('Ridge Regularization Path')
ax2.set_xscale('log')
ax2.legend(loc='best')
ax2.grid(True, alpha=0.3)
ax2.axhline(y=0, color='k', linestyle='-', linewidth=0.5)
# Plot 3: Lasso regularization path
ax3 = axes[1, 0]
lasso_coefs = np.array([r['params'][1:] for r in lasso_results]).T
for i, coef_path in enumerate(lasso_coefs):
    ax3.plot([r['alpha'] for r in lasso_results], coef_path,
             marker='o', label=feature_names[i] if true_coef[i] != 0 else None)
ax3.set_xlabel('Alpha')
ax3.set_ylabel('Coefficient Value')
ax3.set_title('Lasso Regularization Path (Feature Selection)')
ax3.set_xscale('log')
ax3.legend(loc='best')
ax3.grid(True, alpha=0.3)
ax3.axhline(y=0, color='k', linestyle='-', linewidth=0.5)
# Plot 4: Test MSE comparison
ax4 = axes[1, 1]
ridge_mses = [r['test_mse'] for r in ridge_results]
lasso_mses = [r['test_mse'] for r in lasso_results]
ax4.plot([r['alpha'] for r in ridge_results], ridge_mses, 
         marker='o', label='Ridge', linewidth=2)
ax4.plot([r['alpha'] for r in lasso_results], lasso_mses,
         marker='s', label='Lasso', linewidth=2)
ax4.axhline(y=test_mse_ols, color='red', linestyle='--', 
            label='OLS', linewidth=2)
ax4.set_xlabel('Alpha')
ax4.set_ylabel('Test MSE')
ax4.set_title('Test MSE vs Regularization Strength')
ax4.set_xscale('log')
ax4.legend()
ax4.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('ridge_lasso_comparison.png', dpi=300, bbox_inches='tight')
print("\nVisualization saved as 'ridge_lasso_comparison.png'")
plt.show()
print("\n" + "=" * 70)
print("KEY INSIGHTS:")
print("=" * 70)
print("β€’ Ridge shrinks coefficients but keeps all features")
print("β€’ Lasso performs feature selection by zeroing coefficients")
print("β€’ Both reduce overfitting compared to OLS")
print("β€’ Lasso correctly identified the sparse true model")
print("=" * 70)
======================================================================
RIDGE AND LASSO REGRESSION EXAMPLE
======================================================================
Dataset Info:
  Training samples: 160
  Test samples: 40
  Features: 10
True coefficients (sparse):
  X1: 3.00
  X4: 2.00
  X8: 1.50
======================================================================
1. ORDINARY LEAST SQUARES (OLS) - No Regularization
======================================================================
R-squared (train): 0.9800
Test MSE: 0.2817
Estimated coefficients:
  X1:  1.5865 (True:  3.00)
  X2:  0.5232 (True:  0.00)
  X3:  0.8730 (True:  0.00)
  X4:  1.9586 (True:  2.00)
  X5: -0.0015 (True:  0.00)
  X6: -0.0014 (True:  0.00)
  X7:  0.0157 (True:  0.00)
  X8:  1.5169 (True:  1.50)
  X9:  0.0005 (True:  0.00)
  X10:  0.0488 (True:  0.00)
======================================================================
2. RIDGE REGRESSION (L2 Regularization)
======================================================================
Alpha = 0.1
  Test MSE: 0.3496
Alpha = 1.0
  Test MSE: 2.0262
Alpha = 10.0
  Test MSE: 8.4903
Alpha = 100.0
  Test MSE: 11.2510
Best Ridge Alpha: 0.1 (MSE: 0.3496)
Estimated coefficients:
  X1:  0.9608 (True:  3.00)
  X2:  0.9371 (True:  0.00)
  X3:  0.9391 (True:  0.00)
  X4:  1.7733 (True:  2.00)
  X5: -0.0111 (True:  0.00)
  X6:  0.0097 (True:  0.00)
  X7:  0.0307 (True:  0.00)
  X8:  1.3442 (True:  1.50)
  X9:  0.0335 (True:  0.00)
  X10:  0.0531 (True:  0.00)
======================================================================
3. LASSO REGRESSION (L1 Regularization)
======================================================================
Alpha = 0.01
  Test MSE: 0.2572
  Non-zero coefficients: 8/10
Alpha = 0.1
  Test MSE: 0.2802
  Non-zero coefficients: 4/10
Alpha = 0.5
  Test MSE: 1.0444
  Non-zero coefficients: 4/10
Alpha = 1.0
  Test MSE: 3.4823
  Non-zero coefficients: 4/10
Best Lasso Alpha: 0.01 (MSE: 0.2572)
Estimated coefficients:
  X1:  2.5538 (True:  3.00) *
  X2:  0.1996 (True:  0.00) *
  X3:  0.2190 (True:  0.00) *
  X4:  1.9492 (True:  2.00) *
  X5:  0.0000 (True:  0.00) [zeroed]
  X6: -0.0052 (True:  0.00) *
  X7:  0.0066 (True:  0.00) *
  X8:  1.5021 (True:  1.50) *
  X9:  0.0000 (True:  0.00) [zeroed]
  X10:  0.0362 (True:  0.00) *
======================================================================
4. MODEL COMPARISON
======================================================================
Test MSE Comparison:
  OLS:          0.2817
  Ridge (best): 0.3496
  Lasso (best): 0.2572
Visualization saved as 'ridge_lasso_comparison.png'
[Figure: ridge_lasso_comparison.png — coefficient comparison; Ridge and Lasso regularization paths; test MSE vs. regularization strength]
======================================================================
KEY INSIGHTS:
======================================================================
• Ridge shrinks all coefficients toward zero but keeps every feature
• Lasso performs feature selection by zeroing some coefficients exactly
• Here Lasso improves on OLS test MSE; none of the tried Ridge alphas do
• Lasso moves toward the sparse true model (4 non-zero at alpha=0.1 vs 3 true),
  though the collinear X1-X3 block splits the X1 signal across features
======================================================================
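
A closing note on hyperparameter choice: for brevity, the preview selects alpha by test-set MSE, which leaks test information into model selection. The notebook's Cross-Validation section tunes alpha on the training data instead. A minimal K-fold sketch against the same statsmodels API (the cv_mse helper, the seed, and the alpha grid are illustrative, not taken from the notebook):

import numpy as np
from statsmodels.regression.linear_model import OLS
from statsmodels.tools import add_constant

def cv_mse(X, y, alpha, L1_wt, k=5, seed=0):
    """Mean validation MSE across k folds for one (alpha, L1_wt) setting."""
    rng = np.random.default_rng(seed)
    idx = rng.permutation(len(y))
    folds = np.array_split(idx, k)
    mses = []
    for i in range(k):
        val = folds[i]
        train = np.concatenate([folds[j] for j in range(k) if j != i])
        model = OLS(y[train], add_constant(X[train])).fit_regularized(
            method='elastic_net', alpha=alpha, L1_wt=L1_wt)
        pred = model.predict(add_constant(X[val]))
        mses.append(np.mean((y[val] - pred) ** 2))
    return float(np.mean(mses))

# Reusing X_train and y_train from the cell above: pick the Lasso alpha
# with the lowest cross-validated error, never touching the test split
alphas = [0.01, 0.03, 0.1, 0.3, 1.0]
best_alpha = min(alphas, key=lambda a: cv_mse(X_train, y_train, a, L1_wt=1))
print(f"CV-selected Lasso alpha: {best_alpha}")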