This lab will cover implementations of regularization (penalization) for generalized linear models in sklearn.
As usual, we'll begin by loading several familiar libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import math
For regression examples, we'll use the superconductivity dataset from the UCI machine learning repository:
### Read superconductivity data
sc = pd.read_csv("https://remiller1450.github.io/data/super.csv")
## Train-test split
from sklearn.model_selection import train_test_split
train_sc, test_sc = train_test_split(sc, test_size=0.2, random_state=5)
## Split predictors and outcome
train_y_sc = train_sc['critical_temp']
train_X_sc = train_sc.drop(['critical_temp'], axis=1)
train_X_sc.shape
(17010, 81)
For some additional background, critical temperature is the temperature at which the electrical resistivity of a material vanishes. For most conductors, resistivity declines gradually as temperature decreases, but for superconductors it drops abruptly to zero at the critical temperature. The 81 predictors in this application are characteristics derived from the chemical formulas of various materials. We will aim to predict critical temperature using the chemical features of a material.
For classification examples, we'll return to the data from Sobar (2016), which was used in Lab #3, Part 1 and the corresponding lecture (on classifier performance metrics). Recall that this study aimed to accurately predict instances of cervical cancer using data from several non-invasive behavioral questionnaires.
## Read sobar data
sobar = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00537/sobar-72.csv')
train_sobar, test_sobar = train_test_split(sobar, test_size=0.2, random_state=5)
## Define outcome variable
train_sobar_y = train_sobar['ca_cervix']
## Define predictor matrix
train_sobar_X = train_sobar.drop('ca_cervix',axis=1)
Regularization describes a penalty imposed on a model's weight estimates that shrinks them toward zero. For ridge regression, the penalty function is $\alpha \sum_j w_j^2$, while the Lasso penalty function is $\alpha \sum_j |w_j|$.
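For reference, sklearn parameterizes these penalties slightly differently than the generic formulas above: per its documentation, Ridge minimizes $\|y - Xw\|_2^2 + \alpha \|w\|_2^2$, while Lasso minimizes $\frac{1}{2n}\|y - Xw\|_2^2 + \alpha \|w\|_1$. Because of the extra $\frac{1}{2n}$ factor on the Lasso's squared-error term, the $\alpha$ values used by the two functions are not directly comparable.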
Ridge and Lasso regression each have their own functions in sklearn, and both are similar to LinearRegression with an additional argument, alpha, that controls the amount of regularization. However, it is unwise to jump straight into a cross-validated grid search to tune $\alpha$ like we've done for other tuning parameters. To understand why, let's first look at a regularization path plot:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
## Standardize
train_Xs_sc = StandardScaler().fit_transform(train_X_sc)
## Set up sequence of alphas
n_alphas = 200
alphas = np.logspace(-1, 5, n_alphas)
## Find coefficients at each alpha (Ridge regression)
coefs = []
for a in alphas:
    ridge = Ridge(alpha=a)
    ridge.fit(train_Xs_sc, train_y_sc)
    coefs.append(ridge.coef_)
## Create Plot
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale("log")
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel("alpha")
plt.ylabel("weights")
plt.axis("tight")
plt.show()
For a given value of alpha, the Ridge estimates at nearby values of alpha are similar (i.e., the regularization path is smooth).
For computational efficiency, we don't want to fit hundreds of regressions "from scratch". Instead, it's much more efficient (and numerically stable) to use a "warm start" strategy that finds the weight estimates for the current value of alpha using the estimates from a slightly larger alpha as initial values. The specialized functions RidgeCV and LassoCV use warm starts to facilitate easier tuning of alpha:
## Import
from sklearn.linear_model import RidgeCV
## Set up sequence of alphas
n_alphas = 200
alphas = np.logspace(-1, 5, n_alphas)
## Evaluate using 5-fold CV and print the best alpha
ridge_cv_res = RidgeCV(cv=5, alphas = alphas, scoring='neg_mean_squared_error').fit(train_Xs_sc, train_y_sc)
best_alpha = ridge_cv_res.alpha_
print(best_alpha)
0.11489510001873092
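To see what the warm-start strategy looks like mechanically, below is a minimal sketch using Lasso, which exposes a warm_start argument (plain Ridge does not). This is only an illustration of the idea, not how RidgeCV is implemented internally:
## Trace a regularization path with warm starts (illustrative sketch)
from sklearn.linear_model import Lasso
lasso_ws = Lasso(warm_start=True)
ws_coefs = []
for a in sorted(alphas, reverse=True):       # strongest penalty first
    lasso_ws.set_params(alpha=a)
    lasso_ws.fit(train_Xs_sc, train_y_sc)    # initialized at the previous solution
    ws_coefs.append(lasso_ws.coef_.copy())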
After finding an optimal amount of regularization (value of alpha), we can compare the cross-validated performance of this model with ordinary linear regression:
## Print the best cross-validated score from RidgeCV
print(ridge_cv_res.best_score_)
## Fit linear regression and evaluate via 5-fold CV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
lm_score = cross_val_score(LinearRegression(), train_Xs_sc, train_y_sc, scoring='neg_mean_squared_error', cv = 5)
print(np.average(lm_score))
-309.56563223752676
-309.58888720629477
Here we see that by introducing a small penalty ($\alpha = 0.115$) on the L2 norm of the weight vector, we can achieve slightly better out-of-sample performance.
However, we might be curious whether L1 regularization is a better strategy. In situations where a large number of predictors are expected to be unrelated to the outcome, the Lasso tends to outperform ridge regression due to its variable selection properties. Let's see if that is the case for the superconductivity data:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
## Standardize
train_Xs_sc = StandardScaler().fit_transform(train_X_sc)
## Set up sequence of alphas
n_alphas = 50
alphas = np.logspace(-1, 2, n_alphas)
## Find coefficients at each alpha (Lasso regression)
coefs = []
for a in alphas:
    lasso = Lasso(alpha=a)
    lasso.fit(train_Xs_sc, train_y_sc)
    coefs.append(lasso.coef_)
## Create Plot
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale("log")
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel("alpha")
plt.ylabel("weights")
plt.axis("tight")
plt.show()
Notice how the Lasso regularization path shows many variables with weights estimated at exactly zero for many values of $\alpha$. For example, at alpha = 10 only 4 predictors are actively contributing to the model; the remainder have weight estimates of exactly zero.
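We can check this kind of claim directly by refitting at a single value of alpha and counting the nonzero weights (a quick sketch using the objects defined above):
## Count the predictors with nonzero weight estimates at alpha = 10
lasso_10 = Lasso(alpha=10).fit(train_Xs_sc, train_y_sc)
print(np.sum(lasso_10.coef_ != 0))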
In the question below, I'll ask you to formally compare the performance of Lasso regression with Ridge regression and ordinary least squares using cross-validation.
Question: Use the LassoCV function with k=5 folds to find the value of $\alpha$ that leads to the best cross-validated performance on the superconductivity training data. Because LassoCV does not currently support alternative scoring functions, you may use the default ($R^2$).
In Lab 4 (part 2) we used the argument penalty='none' whenever fitting a logistic (or softmax) regression model. This was because these functions are programmed to include a ridge penalty by default to provide numerical stability in their weight estimates.
In the LogisticRegression function, the argument C is the inverse of the regularization strength, so smaller values of C correspond to higher amounts of penalization (the opposite of how $\alpha$ works in the Ridge and Lasso functions we've used for numeric outcomes).
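For reference, the sklearn user guide writes the L2-penalized binary logistic objective (with labels coded $y_i \in \{-1, 1\}$) as $\min_{w, b} \frac{1}{2}\|w\|_2^2 + C \sum_{i=1}^{n} \log\left(1 + \exp\left(-y_i (x_i^T w + b)\right)\right)$. Here $C$ multiplies the data-fit term rather than the penalty, which is another way to see why larger values of $C$ correspond to weaker regularization.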
Below we demonstrate L1 regularized logistic regression on the Sobar data:
## Import
from sklearn.linear_model import LogisticRegression
## Standardize
train_sobar_Xs = StandardScaler().fit_transform(train_sobar_X)
## Set up sequence of "C" values
n_c = 30
c_vals = np.logspace(-2, 5, n_c)
## Initialize model
lr_mod = LogisticRegression(penalty="l1", solver="liblinear", warm_start=True, tol=1e-6)
coefs = []
for c in c_vals:
    lr_mod.set_params(C=c)
    lr_mod.fit(train_sobar_Xs, train_sobar_y)
    coefs.append(lr_mod.coef_.ravel())
## Create Plot
ax = plt.gca()
ax.plot(c_vals, coefs)
ax.set_xscale("log")
plt.xlabel("C")
plt.ylabel("weights")
plt.axis("tight")
plt.show()
Notice how we changed the optimization algorithm to liblinear, an algorithm that supports both Lasso and Ridge penalties (the default algorithm, lbfgs, only supports L2 regularization).
Additionally, we needed to tighten the solver's convergence tolerance (lowering the tol parameter from its default) to obtain stable weight estimates for the smaller amounts of penalization (i.e., large values of C). We can start to see signs of this estimation instability in the weight estimates toward the end of the regularization path.
If we wanted to use the Ridge penalty (L2 regularization) instead of the Lasso penalty (L1 regularization), we'd simply modify the penalty argument:
## Set up sequence of "C" values
n_c = 30
c_vals = np.logspace(-2, 5, n_c)
## Initialize model
lr_mod = LogisticRegression(penalty="l2", solver="liblinear", warm_start=True, tol=1e-6)
coefs = []
for c in c_vals:
    lr_mod.set_params(C=c)
    lr_mod.fit(train_sobar_Xs, train_sobar_y)
    coefs.append(lr_mod.coef_.ravel())
## Create Plot
ax = plt.gca()
ax.plot(c_vals, coefs)
ax.set_xscale("log")
plt.xlabel("C")
plt.ylabel("weights")
plt.axis("tight")
plt.show()
When choosing between regularization approaches, it is important to remember that L1 regularization promotes sparsity (weight estimates of exactly zero), while L2 regularization does not. However, L2 regularization can provide greater stability in the face of highly collinear predictors.
To use cross-validation to tune the hyperparameter C, we should rely upon the LogisticRegressionCV function. As discussed in the previous section, this specialized function uses the "warm start" strategy and is much more effective at tuning C than generic functions like GridSearchCV.
## Import
from sklearn.linear_model import LogisticRegressionCV
## Set up seq of c-values
n_c = 30
c_vals = np.logspace(-2, 5, n_c)
## Initialize and fit
cv_lr_mod = LogisticRegressionCV(cv=10, Cs = c_vals, penalty="l1", solver="liblinear", tol=1e-6, scoring = "f1").fit(train_sobar_Xs, train_sobar_y)
## Print the best value of C
print(cv_lr_mod.C_)
## Print the F1 score on the full training data
print(cv_lr_mod.score(train_sobar_Xs, train_sobar_y))
[4.52035366]
1.0
Notice how we can supply a sequence of C values to try via the Cs argument. We can then access the single value of C that resulted in the best cross-validated score using the .C_ attribute of our fitted model.
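If you'd like to see how the other candidate values of C fared, the fitted object also stores the per-fold scores in its scores_ attribute (a small sketch; scores_ is a dictionary keyed by class label, and each value has one row per fold and one column per candidate C):
## Average cross-validated F1 across folds for each candidate value of C
fold_scores = list(cv_lr_mod.scores_.values())[0]
print(fold_scores.mean(axis=0))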
Question: Use the LogisticRegressionCV model (fit in the previous code chunk) to construct and display a confusion matrix.
The Lasso penalty has the benefit of leading to sparse models; however, its behavior in scenarios with highly collinear predictors can be undesirable. Recall that if two predictors are very highly correlated, the Lasso penalty will tend to select only one of them, and their combined signal (impact on the outcome variable) will be absorbed into this single estimated weight. In these situations, introducing additional L2 regularization on top of the Lasso penalty can lead to both predictors being selected, with their shared signal being split between them (perhaps resulting in a more generalizable model).
This penalization scheme (combining L1 and L2 penalties) is known as the Elastic Net, and the property described above is known as the grouping effect of the Elastic Net. In the 2005 paper proposing the method, Zou and Hastie describe the elastic net as follows:
Similar to the lasso, the elastic net simultaneously does automatic variable selection and continuous shrinkage, and it can select groups of correlated variables. It is like a stretchable fishing net that retains ‘all the big fish’.
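To see the grouping effect in miniature, the sketch below uses a small synthetic data set (not the lab data) with two nearly identical predictors and compares the Lasso to sklearn's ElasticNet (its l1_ratio argument is explained below):
## Toy illustration of the grouping effect on two nearly identical predictors
from sklearn.linear_model import Lasso, ElasticNet
rng = np.random.default_rng(0)
x1 = rng.normal(size=200)
x2 = x1 + rng.normal(scale=0.01, size=200)      # x2 is nearly a copy of x1
X_toy = np.column_stack([x1, x2])
y_toy = 3*x1 + 3*x2 + rng.normal(size=200)
print(Lasso(alpha=0.5).fit(X_toy, y_toy).coef_)                     # often concentrates the weight on one predictor
print(ElasticNet(alpha=0.5, l1_ratio=0.5).fit(X_toy, y_toy).coef_)  # tends to split the weight between them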
In sklearn, Elastic Net models use a common $\alpha$ value and the argument l1_ratio to control the proportion of that value used in the L1 penalty (with the remainder being used as the L2 penalty parameter). For example, l1_ratio = 1 is equivalent to the Lasso penalty, while l1_ratio = 0 is equivalent to the Ridge penalty.
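More precisely, sklearn's documentation gives the ElasticNet objective as $\frac{1}{2n}\|y - Xw\|_2^2 + \alpha \cdot \text{l1\_ratio} \cdot \|w\|_1 + \frac{\alpha (1 - \text{l1\_ratio})}{2} \|w\|_2^2$, so l1_ratio simply splits the overall penalty budget $\alpha$ between the L1 and L2 terms.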
Similar to other regularized regression methods, elastic net has its own built-in cross-validation function that we can use to tune the parameters l1_ratio and alpha before going on to include an elastic net model in a larger comparison of different models.
## Import
from sklearn.linear_model import ElasticNetCV
## Set up sequence of alphas
n_alphas = 50
alphas = np.logspace(-1, 5, n_alphas)
## Set up a few different l1 ratios
rats = [.1, .5, .7, .9, .95, .99, 1]
## Evaluate using 5-fold CV and print the best alpha and l1_ratio
enet_cv_res = ElasticNetCV(cv=5, alphas = alphas, l1_ratio = rats).fit(train_Xs_sc, train_y_sc)
print([enet_cv_res.alpha_, enet_cv_res.l1_ratio_])
[0.1, 1.0]
Note that once ElasticNetCV has found the best-performing combination of alpha and l1_ratio via cross-validation, it will fit the corresponding model to the entire data set. Thus, printing the alpha_ and l1_ratio_ attributes of the fitted model tells us the best combination of tuning parameters.
In this example, we see that a Lasso model (l1_ratio = 1.0) with alpha = 0.1 was found to be optimal.
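If you're curious how close the competition was, the fitted object also stores the cross-validation error for every combination it tried in its mse_path_ attribute (a quick sketch, assuming the array is arranged with one entry per l1_ratio, alpha, and fold):
## Lowest average CV mean squared error achieved by each candidate l1_ratio
print(enet_cv_res.mse_path_.mean(axis=-1).min(axis=1))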
If we'd like to use elastic net regularization for classification, we simply need to change the penalty argument in LogisticRegression (or LogisticRegressionCV) to "elasticnet". When doing this, we must also be careful to adjust the optimization algorithm that is used, as only "saga" currently supports the elastic net penalty.
## Set up seq of c-values
n_c = 30
c_vals = np.logspace(-2, 5, n_c)
## Initialize
cv_elastic_lr = LogisticRegressionCV(cv=10, Cs = c_vals, l1_ratios = [0.1,0.5], penalty="elasticnet", solver="saga", tol=1e-4, max_iter = 1000, scoring = "f1")
## Fit
cv_elastic_lr.fit(train_sobar_Xs, train_sobar_y)
## Print the best value of C
print(cv_elastic_lr.C_)
## Print the best l1_ratio
print(cv_elastic_lr.l1_ratio_)
## Print the F1 score of the elastic net model on the full training data
print(cv_elastic_lr.score(train_sobar_Xs, train_sobar_y))
[72.78953844]
[0.1]
1.0
It's worth noting that the tolerance, tol, and the maximum number of iterations, max_iter, were changed from their defaults to reach convergence for these data (and tuning parameters).
In case you aren't familiar with these terms, the tolerance reflects the maximum change in the weights that is acceptable for the estimates to be considered optimized. For example, setting tol=1e-4 indicates that if the largest change in the weight estimates from one iteration to the next is less than 0.0001, the algorithm has reached a sufficiently accurate solution.
Question: Use LogisticRegressionCV to tune a penalized logistic regression model for the Sobar data, considering the ridge, lasso, and elastic net penalties (your choice of l1_ratios should achieve this). Report the best fitting model and state whether it uses the elastic net, lasso, or ridge penalty.
The Ames Housing data set includes comprehensive information on all residential properties sold in Ames, Iowa between 2006 and 2010. The goal of this application is to build a model that can accurately predict the sale price of a home in Ames that has yet to go on the market based upon the features of the property. Note: although it has become slightly dated, this data set remains one of the most comprehensive and interesting publicly available home price data sets.
A description of each variable contained in the Ames Housing data set can be found here.
A few early steps in your work with these data are given below:
## Read the data
ames = pd.read_csv("https://remiller1450.github.io/data/AmesHousing.csv")
## Train-test split
from sklearn.model_selection import train_test_split
train_ames, test_ames = train_test_split(ames, test_size=0.2, random_state=9)
## Separate y and X
train_ames_y = train_ames['SalePrice']
train_ames_X = train_ames.drop('SalePrice', axis=1)
## Note the presence of missing data
print(train_ames_X.isna().sum().sort_values(ascending=False))
## We'll remove any variables with more than 1000 missing values since they have too much missing data to reliably impute.
train_ames_X = train_ames_X[train_ames_X.columns[train_ames_X.isna().sum() < 1000]]
# train_ames_X.columns # In case you need to know some of the column names
Pool.QC           2333
Misc.Feature      2263
Alley             2193
Fence             1903
Fireplace.Qu      1136
                  ...
Heating.QC           0
Central.Air          0
X1st.Flr.SF          0
X2nd.Flr.SF          0
Sale.Condition       0
Length: 81, dtype: int64
The following parts of this question are meant to guide you through a start-to-finish application of machine learning on a new data set. This type of question should give you a reasonable starting point for the basic steps you'll need to engage in during your capstone project (if you choose to work with "flat" or "tabular" data; your steps might look very different if you're working with images or text). However, you should be aware that I'd expect you to try out more feature engineering strategies and modeling methods.
- Inspect train_ames_X and drop any columns that should not be used as predictors. For example, the property identification number can be assumed to have no predictive value since it's arbitrarily assigned. Additionally, variables like "Yr.Sold" will not be available for a home that hasn't yet gone on the market, so they also shouldn't be used. Further, you may also remove the variable "MS.SubClass" for simplicity.
- Identify the categorical predictors: you can use the .select_dtypes() method of pandas data frames with the argument include='object' to return only non-numeric columns, then store the columns attribute of this result.
- Use ElasticNetCV with 5-fold cross-validation and sequences of reasonably chosen alpha values and l1_ratio values (use approximately 30 different alpha values and 3-5 different l1_ratios).
- Use GridSearchCV to find a suitable combination of values for n_neighbors, weights, and p. You should try at least 5 different choices of n_neighbors.
- Set up ElasticNet() with the arguments you identified in Part F or KNeighborsRegressor() with the arguments you identified in Part G. Then, use GridSearchCV to find the cross-validated 'neg_root_mean_squared_error' of each model.