This lab covers cross-validation tools for model evaluation and hyperparameter tuning in sklearn.
## Standard Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
# KNN will warn you about future updates, but we'll turn these off
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
We will continue using the Iowa City Home Sales data (introduced in our previous lab) in our examples:
## Read IC homes data
ic = pd.read_csv("https://remiller1450.github.io/data/IowaCityHomeSales.csv")
## Train-test split
from sklearn.model_selection import train_test_split
train, test = train_test_split(ic, test_size=0.2, random_state=7)
## Separate the target from the predictors (only using numeric predictors)
train_y = train['sale.amount']
test_y = test['sale.amount']
train_X = train.select_dtypes("number").drop('sale.amount', axis = 1)
test_X = test.select_dtypes("number").drop('sale.amount', axis = 1)
Most of the time we'll want to use cross-validation in conjunction with a pipeline. Given below is a simple pipeline that applies a standardization step followed by a model fitting step:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
pipe = Pipeline([
('scaler', StandardScaler()),
('model', KNeighborsRegressor())
])
Here we did not modify any of the default values in either step. This is because we will be supplying our own combinations of tuning parameters stored in a dictionary object.
from sklearn.preprocessing import RobustScaler, MaxAbsScaler
parms = {'scaler': [StandardScaler(), RobustScaler(), MaxAbsScaler()],
'model__n_neighbors': [10,20,30],
'model__weights': ['uniform','distance'],
'model__p': [1,2]
}
Recall that dictionaries are used to store key:value pairs. In this example, you should notice that the keys use the same names as those given to the steps in our pipeline, and the double underscore is used to reference an argument or parameter within a named step. That is, the name model__p references the argument p in the step named model, or the power parameter for Minkowski distance in KNeighborsRegressor().
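If you're ever unsure which names are valid, a pipeline's get_params() method lists every step__argument combination it will accept. A quick check using the pipe object defined above (just a sketch; the printed values are whatever defaults your sklearn version uses):
## Every key returned by get_params() is a valid key for the parameter dictionary
print('model__n_neighbors' in pipe.get_params())
## Look up the current value of the Minkowski power parameter for the 'model' step
print(pipe.get_params()['model__p'])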
After setting up a pipeline and a dictionary of parameters we'd like to search over, we can use the GridSearchCV() function to find the best combination of tuning parameters as determined using cross-validation.
from sklearn.model_selection import GridSearchCV
grid_res = GridSearchCV(pipe, parms, cv=5).fit(train_X, train_y)
print(grid_res.best_estimator_)
Pipeline(steps=[('scaler', StandardScaler()), ('model', KNeighborsRegressor(n_neighbors=10, p=1, weights='distance'))])
In this example we find that the best performing approach uses a standard scaler, $k=10$ neighbors, p=1 (Manhattan distance), and inverse-distance weighting. Notice that parameters left at their default values are not printed in this output; for instance, if p=2, the default used by KNeighborsRegressor(), had been selected, it would not be explicitly shown.
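Besides best_estimator_, the fitted search object exposes the chosen parameter combination and its cross-validated score directly. A short look using the grid_res object created above:
## Dictionary of the selected tuning parameter values
print(grid_res.best_params_)
## Mean cross-validated score of that combination (R-squared by default for regression models)
print(grid_res.best_score_)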
The complete grid search results are accessible through the cv_results_ attribute. However, these results are stored in a dictionary of numpy arrays, so we'll first convert them to a pandas DataFrame for easier use. The code given below shows the five best performing tuning parameter combinations and their cross-validated scores:
full_grid_results = pd.DataFrame(grid_res.cv_results_)
parms_to_show = ['param_model__n_neighbors', 'param_model__p', 'param_model__weights', 'param_scaler', 'mean_test_score']
full_grid_results.sort_values('mean_test_score', ascending=False)[parms_to_show].head(5)
| | param_model__n_neighbors | param_model__p | param_model__weights | param_scaler | mean_test_score |
|---|---|---|---|---|---|
| 3 | 10 | 1 | distance | StandardScaler() | 0.731008 |
| 9 | 10 | 2 | distance | StandardScaler() | 0.722000 |
| 5 | 10 | 1 | distance | MaxAbsScaler() | 0.718766 |
| 0 | 10 | 1 | uniform | StandardScaler() | 0.704226 |
| 6 | 10 | 2 | uniform | StandardScaler() | 0.698957 |
Question 1: Notice that the values of mean_test_score in the results shown above do not appear to be in the same units as the outcome variable (recall we're predicting home prices of hundreds of thousands of dollars). Use the documentation for GridSearchCV() found here and the list of predefined model evaluation metrics found here to redo this parameter search using out-of-sample RMSE as the performance evaluation criterion.
Finally, you should know that the character string 'passthrough' can be used to skip over a data preparation step. For example, we could consider models with and without PCA for dimension reduction:
from sklearn.decomposition import PCA
## Pipeline
pipe = Pipeline([
('scaler', StandardScaler()),
('reducer', PCA()),
('model', KNeighborsRegressor())
])
## Parameters to try
parms = {'scaler': [StandardScaler(), RobustScaler(), MaxAbsScaler()],
'reducer': ['passthrough', PCA(n_components=4), PCA(n_components=8)],
'model__n_neighbors': [10,20,30]
}
## Results
GridSearchCV(pipe, parms, cv=5).fit(train_X, train_y).best_estimator_
Pipeline(steps=[('scaler', StandardScaler()), ('reducer', 'passthrough'), ('model', KNeighborsRegressor(n_neighbors=10))])
Here we see that dimension reduction via PCA was not used in the best estimator.
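If you'd like to see how the PCA options fared overall rather than only which one won, one approach is to store the fitted search and average the cross-validated scores by reducer. A rough sketch (grid_res2 is simply a name introduced here; the scores will depend on your data split):
## Refit the grid search, this time keeping the fitted object
grid_res2 = GridSearchCV(pipe, parms, cv=5).fit(train_X, train_y)
## Average cross-validated score for each dimension reduction option
pca_res = pd.DataFrame(grid_res2.cv_results_)
pca_res['param_reducer'] = pca_res['param_reducer'].astype(str)
print(pca_res.groupby('param_reducer')['mean_test_score'].mean())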
Question 2: The code below creates a new binary outcome variable that records whether a home was assessed at a higher value than what it sold for. We can use this variable to identify homes whose value was "over-assessed" by the county. Using this binary outcome, perform a cross-validated grid search over the following parameters/steps:
- a yeo-johnson normalization transformation or no normalizing transformation
## Create the binary outcome variable indicating over-assessment
train_y_binary = (train['assessed'] > train['sale.amount']).astype(int)
Grid search can be computationally expensive when the number of tuning parameters is large and you have little insight into which areas of the parameter space will be most promising. Randomized search is an alternative that allows you to search over a wide parameter space with a fixed computational cost.
The example search below will randomly sample tuning parameter values from the specified distributions. When a list of values is given, values will be sampled with equal probability from that list. When a distribution is given, such as the Poisson distribution with a mean of 20 given for the parameter n_neighbors, the search will randomly sample values from that distribution at each iteration of the search.
from scipy.stats import poisson
from sklearn.model_selection import RandomizedSearchCV
## Parameter dictionary, notice the use of 'poisson(20)'
parms = {'scaler': [StandardScaler(), RobustScaler(), MaxAbsScaler()],
'model__n_neighbors': poisson(20),
'model__weights': ['uniform','distance']
}
## Results
RandomizedSearchCV(pipe, parms, cv =5, n_iter = 30, random_state=0).fit(train_X, train_y).best_estimator_
Pipeline(steps=[('scaler', StandardScaler()), ('reducer', PCA()), ('model', KNeighborsRegressor(n_neighbors=13, weights='distance'))])
Here we found $k=13$ to be ideal, which is a value we'd have been unlikely to try in a grid search using equally spaced values.
The diagram below explains how random searching may be able to produce better results than grid search in certain circumstances:
## Diagram comparing grid search and random search over two tuning parameters
from IPython.display import Image
Image("C:\\Users\\millerry\\OneDrive - Grinnell College\\Documents\\STA-395_Intro_ML\\Spring24\\Labs\\rs.PNG")
In this hypothetical example, both searches consider 9 different tuning parameter combinations, but the uniform spacing of the grid search renders it unable to find a good value of the important parameter.
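To make this concrete with the distributions used above: nine random draws from the Poisson distribution will typically visit many distinct values of n_neighbors, whereas a nine-combination grid built from three scalers and three neighbor values only ever tries those three neighbor values. A small illustration (the random_state here is arbitrary):
## Values of n_neighbors that nine iterations of random search might sample
print(poisson(20).rvs(size=9, random_state=0))
## Values of n_neighbors a 3x3 grid search (three scalers by three k values) would try
print([10, 20, 30])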
Question 3: Modify the search you performed in Question 2 to now perform 20 iterations of random searching over the following parameters/steps:
- a yeo-johnson normalization transformation or no normalizing transformation
Briefly comment on how the best estimator from this search compares with the one you identified in Question 2.
Most of the time we'll want to use cross-validation within the context of a data preparation pipeline. However, occasionally we might want to perform cross-validation using a single model and set of data preparation steps for the purpose of creating data visualizations or doing our own "by hand" calculations (i.e., making a results table for a publication, etc.).
The code displayed below demonstrates how to obtain cross-validated predictions:
## Functions to perform k-fold CV, LOOCV, and cross-validated predictions
from sklearn.model_selection import KFold, LeaveOneOut, cross_val_predict
## Initialize CV functions
k_folds = KFold(n_splits = 5)
loo = LeaveOneOut()
## Initialize a knn model
knn_mod = KNeighborsRegressor(n_neighbors=13)
## Get cross-validated predictions
kfold_pred = cross_val_predict(knn_mod, train_X, train_y, cv = k_folds)
loo_pred = cross_val_predict(knn_mod, train_X, train_y, cv = loo)
## Example visualization, predicted prices vs. cross-validated residual errors
plt.scatter(kfold_pred, train_y-kfold_pred)
plt.show()
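These cross-validated predictions also support the kind of "by hand" summaries mentioned above; for instance, a minimal sketch of the 5-fold cross-validated RMSE using the objects just created:
## Cross-validated RMSE from the k-fold predictions
print(np.sqrt(np.average((train_y - kfold_pred)**2)))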
A final thing to know about the GridSearchCV() and RandomizedSearchCV() functions is that they will automatically refit the best estimator to the entire training data set after finding it. Thus, we can evaluate the performance of the best approach selected by our tuning parameter search on the test set using the predict() method:
## Pipeline
pipe = Pipeline([
('scaler', StandardScaler()),
('model', KNeighborsRegressor())
])
## Parameters to try
parms = {'scaler': [StandardScaler(), RobustScaler(), MaxAbsScaler()],
'model__n_neighbors': [10,20,30]
}
## Predictions
test_preds = GridSearchCV(pipe, parms, cv=5).fit(train_X, train_y).predict(test_X)
## Test set RMSE using the best model/steps (calculated "by hand"):
np.sqrt(np.average((test_y - test_preds)**2))
47298.88262157997
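Equivalently, one of sklearn's predefined metrics could replace the manual calculation; a minimal sketch using mean_squared_error (this should reproduce the same RMSE):
## Same test set RMSE via a built-in metric
from sklearn.metrics import mean_squared_error
print(np.sqrt(mean_squared_error(test_y, test_preds)))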
In Homework 1 you were introduced to the MNIST database, a collection of thousands of handwritten digits (0-9) recorded as 28x28 pixel grayscale images.
The code below loads a random sample of 6000 digits and displays a few of them:
### Read flattened, processed data
mnist = pd.read_csv("https://remiller1450.github.io/data/mnist_small.csv")
### Separate the label column (outcome)
label = mnist['label']
mnist = mnist.drop(['label'], axis=1)
### Convert to numpy array and reshape to 28 by 28
mnist_unflattened = mnist.to_numpy()
mnist_unflattened = mnist_unflattened.reshape(6000,28,28)
## Plot some examples
import matplotlib.cm as cm
fig, axs = plt.subplots(ncols=7)
for i in range(7):
axs[i].imshow(mnist_unflattened[i], cmap=cm.Greys)
axs[i].title.set_text(f'label={label[i]}')
plt.show()
In this application you will create a machine learning pipeline to classify these digits.
Question 4: Create a machine learning pipeline to classify these digits, using random_state=10 wherever a random state is required.
Note: You'd normally want to consider standardization in a pipeline like this one, but because the features of the MNIST data set all involve the same units (grayscale color intensity of pixels) this isn't necessary.