In this lab we'll explore regularized regression models to better understand how to handle the bias-variance trade-off in parameterized machine learning models.
Directions: Please read through the contents of this lab with your partner and try the examples. After you're both confident that you understand a topic you should attempt the associated exercise and record your answer in your own Jupyter notebook that you will submit for credit. The notebook you submit should only contain answers to the lab's exercises (so you should remove any code you ran for the examples, or use a separate notebook to test out the examples).
To begin, you'll need the following libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import math
This lab's initial examples and questions will use the superconductivity dataset from the UCI machine learning repository:
### Read superconductivity data
sc = pd.read_csv("https://remiller1450.github.io/data/super.csv")
## Train-test split
from sklearn.model_selection import train_test_split
train_sc, test_sc = train_test_split(sc, test_size=0.2, random_state=5)
## Split predictors and outcome
train_y_sc = train_sc['critical_temp']
train_X_sc = train_sc.drop(['critical_temp'], axis=1)
train_X_sc.shape
(17010, 81)
The goal in this application was to predict the critical temperature of a material based upon its chemical properties. Critical temperature is the temperature at which the electrical resistivity of a material effectively vanishes. For most conductors, electrical resistivity gradually declines with temperature, but for superconductors it drops abruptly. The predictive features in this application were derived from the chemical formula of the material in question.
Part 1: Ridge and Lasso Regression in sklearn

Ridge and Lasso regression each have their own functions in sklearn, Ridge() and Lasso(), both of which behave like LinearRegression() but accept an additional argument, alpha, that controls the amount of regularization. However, it is unwise to simply throw these models into GridSearchCV() to tune alpha like we've done for other tuning parameters. To understand why, let's first create a regularization path plot:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
## Standardize
train_Xs_sc = StandardScaler().fit_transform(train_X_sc)
## Set up sequence of alphas
n_alphas = 200
alphas = np.logspace(-1, 5, n_alphas)
## Find coefficients at each alpha (Ridge regression)
coefs = []
for a in alphas:
    ridge = Ridge(alpha=a)
    ridge.fit(train_Xs_sc, train_y_sc)
    coefs.append(ridge.coef_)
## Create Plot
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale("log")
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel("alpha")
plt.ylabel("weights")
plt.axis("tight")
plt.show()
Notice that the regularization path is smooth, meaning the estimated weights at nearby values of alpha are similar. Lasso and Elastic Net regression do not have closed-form solutions, so their optimal weights are found using iterative algorithms. Because the path is smooth, we'd like to avoid "starting from scratch" when estimating the optimal weights at each value of alpha, so we should use specialized functions (RidgeCV() and LassoCV()) that employ a warm-start strategy (initializing each fit using the results from an adjacent value of alpha):
from sklearn.linear_model import RidgeCV
## Set up sequence of alphas
n_alphas = 200
alphas = np.logspace(-1, 5, n_alphas)
## Evaluate using 5-fold CV and print the best alpha
ridge_cv_res = RidgeCV(cv=5, alphas = alphas, scoring='neg_mean_squared_error').fit(train_Xs_sc, train_y_sc)
best_alpha = ridge_cv_res.alpha_
print(best_alpha)
0.11489510001873092
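After the search, RidgeCV() refits the model on the full training set using the selected alpha, so the final weight estimates are available directly; a brief sketch:

## Weight estimates and intercept from the model refit at the selected alpha
print(ridge_cv_res.coef_[:5])
print(ridge_cv_res.intercept_)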
## Compare w/ what we'd get from LinearRegression() using 5-fold CV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
lm_score = cross_val_score(LinearRegression(), train_Xs_sc, train_y_sc, scoring='neg_mean_squared_error', cv = 5)
ridge_score = cross_val_score(Ridge(alpha=best_alpha), train_Xs_sc, train_y_sc, scoring='neg_mean_squared_error', cv = 5)
print("OLS:", np.average(lm_score), "Ridge:", np.average(ridge_score))
OLS: -309.5888872062948 Ridge: -309.5656322375192
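Because cross_val_score() reports negative mean squared error, you may find it convenient to convert these scores to RMSE, which is on the same scale as the outcome; a brief sketch reusing the scores computed above:

## Convert average negative MSE to RMSE for easier interpretation
print("OLS RMSE:", np.sqrt(-np.average(lm_score)),
      "Ridge RMSE:", np.sqrt(-np.average(ridge_score)))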
Question #1: The Lasso() function in sklearn is directly comparable to Ridge().

Part A: Create a sequence of alpha values using np.logspace() going from 0.1 to 100. Then, using the example in this section as a template, create the regularization path plot for a Lasso model applied to the superconductivity data.
Part B: Using your plot from Part A, comment on the estimated weights of the Lasso model when alpha = 10.
Part C: Use LassoCV() to perform 5-fold cross-validation in order to find the optimal value of alpha from the sequence you created in Part A. Because the function does not have a scoring argument, you may rely upon the default scoring metric ($R^2$) to select the best alpha.
Part D: Use GridSearchCV() to compare the best performing Lasso model from Part C with the best Ridge model (found in the example) and an ordinary linear regression model using mean squared error as the scoring metric. The purpose of this comparison is to ensure the exact same CV folds are used for each model. Report the cross-validated RMSE of each model.
Part E: Briefly comment upon why you believe the Lasso model at the value of alpha identified in Part C performed better/worse than the Ridge model at its best value of alpha.

Part 2: Regularization in Logistic Regression

In a previous lab, we noted that sklearn's implementation of logistic regression applies regularization by default, which is why we provided the argument penalty='none'.
To further understand how regularization is implemented in this model, as well as why it is useful, we'll use the MNIST handwritten digit data set. Specifically, we'll see if we can use pixel intensities as features in logistic regression to predict whether or not a digit is a four.
### Read flattened MNIST data
mnist = pd.read_csv("https://remiller1450.github.io/data/mnist_small.csv")
## Train-test split
from sklearn.model_selection import train_test_split
train_mnist, test_mnist = train_test_split(mnist, test_size=0.2, random_state=5)
### Separate the label column (outcome)
train_mnist_y = train_mnist['label']
train_mnist_X = train_mnist.drop(['label'], axis="columns")
### Create a binary label for 4's
train_mnist_y_binary = (train_mnist_y == 4).astype(int)
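Before modeling, it can also be worth checking how common the positive class is, since only about one in ten digits is a four; a quick check:

## Proportion (and counts) of training images labeled as a four
print(train_mnist_y_binary.mean())
print(train_mnist_y_binary.value_counts())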
To begin, let's consider a simple pipeline that fits a logistic regression model without any regularization:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ('model', LogisticRegression(penalty=None, solver='saga'))
])
log_mod = pipe.fit(train_mnist_X, train_mnist_y_binary)
C:\Users\millerry\Anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn(
This model fails to converge, meaning an optimal set of weight parameters, $\{w_0, w_1, \ldots, w_p\}$, could not be determined. There are a few reasons why this can happen that you should be aware of:

- The input features are on very different scales (or are highly correlated), which makes the optimization problem numerically difficult for the solver.
- The classes are perfectly (or nearly perfectly) separable, in which case the weight estimates can grow without bound and the algorithm never settles on a final solution.

As you might expect, both of these issues can be addressed via regularization, but your first strategy should be to rescale the input features and modify the optimization algorithm and its parameters (the type of solver, number of iterations, and tolerance).
## Modify and re-fit the logistic regression pipeline
from sklearn.preprocessing import MinMaxScaler
new_pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('model', LogisticRegression(penalty=None, solver='lbfgs', max_iter=150))
])
new_log_mod = new_pipe.fit(train_mnist_X, train_mnist_y_binary)
These changes were enough to allow convergence, meaning the optimization algorithm got close enough to an optimal set of weights that the weight estimates were no longer changing by more than a certain level of tolerance.
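If you'd like to verify this yourself, a simple check (a brief sketch using the fitted pipeline above) is the model's n_iter_ attribute, which records how many iterations the solver actually used; a value below max_iter suggests the algorithm stopped because the tolerance was reached:

## Number of iterations used by the solver (should be below max_iter=150)
print(new_log_mod.named_steps['model'].n_iter_)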
To further understand what is happening here it can be helpful to plot a logistic regression model's linear predictors vs. its predicted probabilities:
## Extract the weights and bias from the original fitted model
w = log_mod.named_steps['model'].coef_
b = log_mod.named_steps['model'].intercept_

## Calculate the linear predictor using the dot product of X and w + the bias term
## (this pipeline had no scaler, so we use the raw pixel features it was fit on)
lin_predictor = np.dot(train_mnist_X, np.transpose(w)) + b

## Apply the sigmoid transformation to get predicted probabilities
pi = 1/(1 + np.exp(-lin_predictor))

## Plot the original model (which failed to converge), colored by the binary label
plt.scatter(lin_predictor, pi, c=train_mnist_y_binary)
plt.show()
## Plot the new model (which converged); this pipeline scales the features, so we scale here too
w = new_log_mod.named_steps['model'].coef_
b = new_log_mod.named_steps['model'].intercept_
lin_predictor = np.dot(MinMaxScaler().fit_transform(train_mnist_X), np.transpose(w)) + b
pi = 1/(1 + np.exp(-lin_predictor))
plt.scatter(lin_predictor, pi, c=train_mnist_y_binary)
plt.show()
C:\Users\millerry\AppData\Local\Temp\ipykernel_27380\4165323997.py:5: RuntimeWarning: overflow encountered in exp pi = 1/(1 + np.exp(-lin_predictor))
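As an aside, computing the linear predictor and the sigmoid by hand is purely for illustration; the fitted pipeline's decision_function() and predict_proba() methods return essentially the same quantities. A quick sketch using the converged pipeline:

## The pipeline applies its scaler automatically before the model, so these
## should match the manual calculation above (up to numerical precision)
print(new_log_mod.decision_function(train_mnist_X)[:5])   # linear predictor
print(new_log_mod.predict_proba(train_mnist_X)[:5, 1])    # predicted P(y = 1)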
Question #2: Use cross_val_predict() to obtain cross-validated predictions using 3-fold cross-validation for each training sample, using the pipeline that had converged with the full training set. What is the cross-validated accuracy score of this model? Why might it not be as high as the accuracy score you found in Parts A and B?
Now let's consider regularization as a way to handle the issues we encountered in this section. The LogisticRegression() function uses the argument C to control the inverse of the regularization strength. Thus, unlike how alpha works in Lasso and Ridge regression, smaller values of C impose a greater penalty upon the weight vector.
The code below demonstrates how we can use this argument to add a Ridge penalty to logistic regression and show the regularization path plot:
## Set up sequence of "C" values
n_c = 20
c_vals = np.logspace(-5, 0, n_c)
ridge_pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('model', LogisticRegression(penalty="l2", solver='lbfgs', warm_start=True, max_iter=150))
])

## Fit the pipeline to each value of C
coefs = []
for c in c_vals:
    ridge_pipe.set_params(model__C=c)
    ridge_pipe.fit(train_mnist_X, train_mnist_y_binary)
    coefs.append(ridge_pipe.named_steps['model'].coef_.ravel())
## Create Plot
ax = plt.gca()
ax.plot(c_vals, coefs)
ax.set_xscale("log")
plt.xlabel("C")
plt.ylabel("weights")
plt.axis("tight")
plt.show()
Similar to Part 1 of the lab, sklearn has a specialized function, LogisticRegressionCV(), that uses the "warm start" strategy for more efficient tuning of C. This function is demonstrated below:
## Perform cross-validated search for C
c_vals = np.logspace(-4, -1, 6)
from sklearn.linear_model import LogisticRegressionCV
cv_lr_mod = LogisticRegressionCV(cv=3, Cs = c_vals, penalty="l2", solver="lbfgs", max_iter=150).fit(train_mnist_X, train_mnist_y_binary)
## Print the best value of C
print(cv_lr_mod.C_)
## Print the accuracy of the refit model on the training data
print(cv_lr_mod.score(train_mnist_X, train_mnist_y_binary))
C:\Users\millerry\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. Increase the number of iterations (max_iter) or scale the data as shown in: https://scikit-learn.org/stable/modules/preprocessing.html Please also refer to the documentation for alternative solver options: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression n_iter_i = _check_optimize_result(
[0.0001]
0.9991666666666666
You can see that some of the models didn't converge, but we were still able to find an amount of regularization that produced a model with very good accuracy (higher than our unpenalized model from earlier).
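If you'd like to examine the search itself, the fitted LogisticRegressionCV object stores the per-fold accuracy for every candidate value of C in its scores_ attribute (a dictionary keyed by class label); a brief sketch:

## Per-fold accuracy at each candidate C for the positive class (label 1)
fold_scores = cv_lr_mod.scores_[1]
print(fold_scores.shape)           # (n_folds, number of candidate C values)
print(fold_scores.mean(axis=0))    # average cross-validated accuracy at each C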
Question #3:

Part A: Create a regularization path plot for a logistic regression model that uses an L1 (Lasso) penalty on the MNIST data; note that you will need to change the solver. Consider 25 different logarithmically spaced values of C ranging from $10^{-4}$ to 1 when creating your plot.
Part B: How many variables have non-zero weights in this L1-penalized model when C = $10^{-3}$? How many variables have non-zero weights in the model using L2 regularization when C = $10^{-3}$?
Part C: Use LogisticRegressionCV() to select an optimal value of C for the model you considered in Part A using 3-fold cross-validation. Report the optimal value and the corresponding cross-validated accuracy score.
Part D: LogisticRegressionCV() stores the estimated weights of the model with the best performing value of C. Use these estimated weights to report the sparsity of the best model. That is, what proportion of input features had weight estimates that were exactly zero?

Part 3: Application

This application will use many of the core machine learning steps we've engaged with throughout the semester up to this point.
You will work with the Ames Housing data set, which is slightly dated (it contains homes sold between 2006 and 2010) but remains one of the most comprehensive publicly available housing data sets.
The goal of the application is to accurately predict sale prices based upon features that would be available before a home is sold. The steps involved in Question #4 will also introduce a few new functions that are minor extensions of what we've seen in this lab and previous ones.
## Read the data
ames = pd.read_csv("https://remiller1450.github.io/data/AmesHousing.csv")
## Train-test split
from sklearn.model_selection import train_test_split
train_ames, test_ames = train_test_split(ames, test_size=0.2, random_state=9)
## Separate y and X
train_ames_y = train_ames['SalePrice']
train_ames_X = train_ames.drop('SalePrice', axis=1)
## Note the presence of missing data
print(train_ames_X.isna().sum().sort_values(ascending=False))
## We'll also remove any variables with more than 1000 missing values,
## since they have too much missing data to reliably impute
train_ames_X = train_ames_X[train_ames_X.columns[train_ames_X.isna().sum() < 1000]]
Pool.QC           2333
Misc.Feature      2263
Alley             2193
Fence             1903
Fireplace.Qu      1136
                  ...
Heating.QC           0
Central.Air          0
X1st.Flr.SF          0
X2nd.Flr.SF          0
Sale.Condition       0
Length: 81, dtype: int64
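One pandas method worth knowing before starting Question #4 is select_dtypes(), which filters a DataFrame's columns by data type; a minimal illustration on the Ames predictors:

## Numeric predictors (integer/float columns)
num_cols = train_ames_X.select_dtypes(include='number').columns
## Remaining predictors (object columns holding categorical features)
cat_cols = train_ames_X.select_dtypes(exclude='number').columns
print(len(num_cols), len(cat_cols))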
Question #4:

Part A: Inspect the variables in train_ames_X and drop anything that should not be used as a predictor. For example, you may assume that the property identification number has no predictive value since it is arbitrarily assigned by the county. Relatedly, variables like Yr.Sold would not be available for a home that has just gone on the market, so it isn't relevant for prediction in our context. Finally, you may remove the variable MS.SubClass for simplicity.
Part B: Separate the remaining numeric and categorical predictors so that each type can be preprocessed appropriately. You may use the select_dtypes() method to facilitate this.
Part C: Use ElasticNetCV() with 5-fold cross-validation and sequences of reasonably chosen alpha values and l1_ratio values (aim to use approximately 30 different alpha values and 3-5 different l1_ratio values).
Part D: Report the values of alpha and l1_ratio selected in Part C.
Part E: Use GridSearchCV() to find appropriate values of the n_neighbors and weights hyperparameters for a k-nearest neighbors model.