Lab 3 - Pipelines and Cross-Validation¶

This lab covers two main topics: pipelines, a tool used in sklearn to chain together multiple data handling steps (like preprocessing and modeling), and cross-validation, a method used to estimate out-of-sample model performance and assist in hyperparameter selection.

Directions: Please read through the contents of this lab with your partner and try the examples. After you're both confident that you understand a topic you should attempt the associated exercise and record your answer in your own Jupyter notebook that you will submit for credit. The notebook you submit should only contain answers to the lab's exercises (so you should remove any code you ran for the examples, or use a separate notebook to test out the examples).

To begin, you'll need the following libraries:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

## KNN functions may produce FutureWarnings; the command below suppresses them.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  

The examples in this lab will continue using the Iowa City Home Sales data from our previous two labs. We'll begin by splitting it into training and testing segments:

In [2]:
from sklearn.model_selection import train_test_split
ic_homes = pd.read_csv("https://remiller1450.github.io/data/IowaCityHomeSales.csv")
ic_train, ic_test = train_test_split(ic_homes, test_size=0.2, random_state=7)

Part 1 - Pipelines¶

As mentioned in the introduction, pipelines chain together several data handling steps so that they can more easily be run sequentially, meaning the output from a step is used as the input to the subsequent step. Below is an example pipeline that includes two steps:

In [3]:
## Imports
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer

## Set up the steps of the pipeline
my_pipeline = Pipeline([
    ('transformer',  PowerTransformer(method = 'yeo-johnson')),
    ('scaler', MinMaxScaler())
])

Each pipeline step is given as a tuple, a native Python data type that is similar to a list but immutable, meaning its elements cannot be altered or removed. Each tuple's first element is the name you give to the step, and the second element is the actual transformer or model estimator.

Like data preprocessing and modeling functions, pipelines have fit() and transform() methods:

In [4]:
## Fit the pipeline
my_variables = ['area.lot','area.bsmt']
fitted_pipe = my_pipeline.fit(ic_train[my_variables])

## Apply the fitted pipeline to visualize a transformation followed by min-max scaling
pd.DataFrame(fitted_pipe.transform(ic_train[my_variables]), columns = my_variables).hist()
plt.show()

In the above example the variables "area.lot" and "area.bsmt" were first normalized using a power transformer, then re-scaled using min-max scaling, all in a single command.

We can take our pipeline a step further by adding a model as the final step. You should note that pipelines can accommodate any class with fit() and transform() methods as intermediate steps, and the final step only needs to have a fit() method.

In [5]:
## New pipeline with a modeling step at the end
from sklearn.neighbors import KNeighborsRegressor
my_pipeline = Pipeline([
    ('transformer',  PowerTransformer(method = 'yeo-johnson')),
    ('scaler', MinMaxScaler()),
    ('model', KNeighborsRegressor(n_neighbors = 12))
])

## Fit to training data (using only area.lot and area.bsmt as predictive features)
ic_train_X = ic_train[my_variables]
ic_train_y = ic_train['sale.amount']
fitted_pipe = my_pipeline.fit(X = ic_train_X, y = ic_train_y)

## Predict on test data
from sklearn.metrics import mean_squared_error
ic_test_X = ic_test[my_variables]
ic_test_y = ic_test['sale.amount']
np.sqrt(mean_squared_error(ic_test_y, fitted_pipe.predict(ic_test_X)))
Out[5]:
68356.27227228682

A few additional things to know about pipelines:

  1. Steps in a pipeline can be accessed using the named_steps attribute, for example my_pipeline.named_steps['scaler']
  2. The steps of an existing pipeline can be modified using the set_params() method.
  3. Arguments within a step can be accessed using double underscores, for example my_pipeline.set_params(transformer__method = 'box-cox') would change the power transformer to a box-cox transformation.
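
As a quick illustration of these features, here is a minimal sketch using the pipeline defined above (the printed output will depend on your sklearn version):

In [ ]:
## Access the scaler step by its name
print(my_pipeline.named_steps['scaler'])

## Use the step name, two underscores, then the argument name to modify an argument within a step
my_pipeline.set_params(transformer__method = 'box-cox')
print(my_pipeline.named_steps['transformer'])

## Switch back to the original transformation before continuing with the later examples
my_pipeline.set_params(transformer__method = 'yeo-johnson')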

Question #1:

  • Part A: Using the same data and variables as this section's example, create a pipeline that first applies quantile mapping to a uniform distribution then fits a KNN regressor with $k=5$ and distance weighting.
  • Part B: Use a for loop to iteratively change the value of $k$ in your pipeline's model fitting step, checking all values in the set [4,8,12,16,20]. Print the RMSE on the test set for each value of $k$.

Part 2 - Cross-Validation and Pipelines¶

Most of the time we'll use cross-validation in conjunction with a pipeline to compare the performance of different combinations of pre-processing steps and models. To accomplish this we will store the hyperparameter choices and models we'd like to evaluate in a dictionary:

In [6]:
## Dictionary of hyperparameters to try
from sklearn.preprocessing import StandardScaler, RobustScaler, MaxAbsScaler
my_params = {'scaler': [StandardScaler(), RobustScaler(), MaxAbsScaler()],
         'model__n_neighbors': [10,20,30],
         'model__weights': ['uniform','distance'],
         'model__p': [1,2]}

Recall that dictionaries use key:value pairs to store data. Here, the keys are given names that reference specific entities within our pipeline: for example, 'scaler' references the scaler step, while 'model__n_neighbors' references the n_neighbors argument within the modeling step.
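
If you're unsure which key names are valid, every pipeline can list them for you via its get_params() method. A minimal sketch:

In [ ]:
## Print the parameter names a search dictionary can reference
## (the list also includes entries like 'steps', 'memory', and 'verbose')
print(list(my_pipeline.get_params().keys()))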

After setting up a pipeline and an associated dictionary of parameters/models to evaluate, we can use the GridSearchCV() function to perform a grid search to find the best combination as determined by cross-validated performance.

In [7]:
## Perform grid search using 5-fold cross-validation to assess performance
from sklearn.model_selection import GridSearchCV
grid_results = GridSearchCV(my_pipeline, my_params, cv=5, scoring='neg_root_mean_squared_error').fit(ic_train_X, ic_train_y)

In this example you should note that the argument cv=5 was used to execute 5-fold cross-validation, and the scoring argument was used to set the performance metric that was tracked and returned.
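
Before digging into the full results, note that the fitted search object also stores the best cross-validated score and the hyperparameter combination that achieved it. A minimal sketch (your printed values will differ):

In [ ]:
## Best mean score across the 5 folds (negative RMSE, so values closer to zero are better)
print(grid_results.best_score_)

## Hyperparameter combination that achieved this score
print(grid_results.best_params_)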

Results of the grid search can be accessed via the cv_results_ attribute; however, it is often helpful to convert these results to a pandas Data Frame before doing anything with them:

In [8]:
## Example showing the top 5 combinations of hyperparameters/models
params_to_print = ['rank_test_score','param_scaler','param_model__n_neighbors', 'param_model__weights', 'param_model__p', 'mean_test_score']
pd.DataFrame(grid_results.cv_results_).sort_values(by='rank_test_score')[params_to_print].head(5)
Out[8]:
rank_test_score param_scaler param_model__n_neighbors param_model__weights param_model__p mean_test_score
23 1 MaxAbsScaler() 20 distance 2 -65998.837551
17 2 MaxAbsScaler() 20 distance 1 -66044.547147
11 3 MaxAbsScaler() 10 distance 2 -66227.991350
15 4 StandardScaler() 20 distance 1 -66298.240297
29 5 MaxAbsScaler() 30 distance 1 -66357.280200

Sometimes we might not be sure whether a preprocessing step is beneficial. We can explore what happens when a step is skipped by including the string 'passthrough' among the options for that step in our dictionary, as demonstrated below for the 'scaler' step:

In [9]:
## Example using passthrough
my_params = {'scaler': ['passthrough',StandardScaler(), RobustScaler()],
         'model__n_neighbors': [10,20,30],
         'model__weights': ['uniform','distance'],
         'model__p': [1,2]}
grid_results = GridSearchCV(my_pipeline, my_params, cv=5, scoring='neg_root_mean_squared_error').fit(ic_train_X, ic_train_y)
pd.DataFrame(grid_results.cv_results_).sort_values(by='rank_test_score')[params_to_print].head(5)
Out[9]:
rank_test_score param_scaler param_model__n_neighbors param_model__weights param_model__p mean_test_score
16 1 StandardScaler() 20 distance 1 -66298.240297
15 2 passthrough 20 distance 1 -66298.240297
9 3 passthrough 10 distance 2 -66466.373189
10 3 StandardScaler() 10 distance 2 -66466.373189
3 5 passthrough 10 distance 1 -66468.831129

A final thing you should know about GridSearchCV() is that the best combination is automatically refit to the entire training set, and this fitted model can be accessed via the best_estimator_ attribute. The code below illustrates how sklearn displays a neat interactive flowchart of your best estimator:

In [10]:
## Diagram of best estimator - click on each step for hyperparameter values
grid_results.best_estimator_
Out[10]:
Pipeline(steps=[('transformer', PowerTransformer()),
                ('scaler', StandardScaler()),
                ('model',
                 KNeighborsRegressor(n_neighbors=20, p=1, weights='distance'))])

And the code below demonstrates how you can use your GridSearchCV() results to see how the best estimator performs on the test set without needing to re-fit the pipeline/model:

In [11]:
## Performance of best estimator on the test set
np.sqrt(mean_squared_error(ic_test_y, grid_results.best_estimator_.predict(ic_test_X)))
Out[11]:
66086.58814048783

Question #2:

  • Part A: Create a pipeline that includes a re-scaling step and a model-fitting step that uses a decision tree.
  • Part B: Perform a cross-validated grid search using 10-fold cross-validation to check all combinations of the parameters described below. Print a sorted Data Frame showing the top 5 best performing approaches (using RMSE as the scoring criterion).
    • The scaler could be min-max scaling, max-absolute scaling, or no re-scaling
    • The maximum tree depth could be 2, 4, or 6
    • The minimum samples in a node for it to be split could be either 100 or 2
  • Part C: Calculate the RMSE on the test set for the best performing approach found during your grid search in Part B.
  • Part D: Given your knowledge of the decision tree algorithm, how could you reduce the complexity of the grid search performed in Part B without reducing the chances of finding a well-fitting modeling pipeline?

Part 3 - Randomized Search¶

The computational cost of grid search can be problematic when you are considering many different modeling approaches and hyperparameter values. Randomized search provides an alternative that allows you to perform a rough search over a wide parameter space with a fixed computational cost.

The example below demonstrates how to search the hyperparameter n_neighbors using values drawn from a Poisson distribution with a mean of 20. Additionally, you should note that if a list of values is given, each will be sampled with equal probability during the search.

In [12]:
from scipy.stats import poisson
from sklearn.model_selection import RandomizedSearchCV

## Parameter dictionary, notice the use of 'poisson(20)'
my_params = {'scaler': [StandardScaler(), RobustScaler(), MaxAbsScaler()],
         'model__n_neighbors': poisson(20),
         'model__weights': ['uniform','distance']
        }

## Results
RandomizedSearchCV(my_pipeline, my_params, cv=5, n_iter=30, random_state=0).fit(ic_train_X, ic_train_y).best_estimator_
Out[12]:
Pipeline(steps=[('transformer', PowerTransformer()), ('scaler', MaxAbsScaler()),
                ('model',
                 KNeighborsRegressor(n_neighbors=18, weights='distance'))])
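
You should also know that RandomizedSearchCV() draws candidate values by calling the rvs() method of any scipy distribution object it is given; a minimal sketch of such draws:

In [ ]:
## Five example draws from the frozen distribution used for n_neighbors above
print(poisson(20).rvs(size=5, random_state=0))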

Question #3: In DecisionTreeRegressor(), the hyperparameter min_samples_split is set up to expect either integer or float input, where a float represents the fraction of the overall sample size that must be present in a node in order for it to be eligible for splitting. Modify your pipeline from Question #2 to remove the re-scaling step, then perform 25 iterations of randomized search, sampling max_depth from a Poisson distribution with a mean of 4 and sampling min_samples_split from a uniform distribution with endpoints of 0 and 0.8. Print a sorted Data Frame showing the top 5 best performing combinations (using RMSE as the scoring criterion).

Part 4 - Comparing Models While Tuning Hyperparameters¶

A strength of GridSearchCV() is that it allows us to compare models and parameters using a consistent set of cross-validation folds, thereby minimizing a potential source of variability in our estimates of out-of-sample performance. Thus, we should strive to use a single call to GridSearchCV() whenever possible.
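
If separate searches are unavoidable (for example, because two candidate models require entirely different pipelines), one way to keep the comparison fair is to pass the same cross-validation splitter object to each call. The sketch below is only an illustration; the two parameter dictionaries are hypothetical, not part of this lab's examples:

In [ ]:
## Hypothetical illustration: two separate searches evaluated on identical folds
from sklearn.model_selection import KFold
shared_folds = KFold(n_splits=5, shuffle=True, random_state=0)
params_a = {'model__n_neighbors': [10, 20]}
params_b = {'model__weights': ['uniform', 'distance']}
search_a = GridSearchCV(my_pipeline, params_a, cv=shared_folds, scoring='neg_root_mean_squared_error').fit(ic_train_X, ic_train_y)
search_b = GridSearchCV(my_pipeline, params_b, cv=shared_folds, scoring='neg_root_mean_squared_error').fit(ic_train_X, ic_train_y)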

We can handle scenarios where we'd like to evaluate different models and tune their hyperparameters in the same search by providing a list of dictionaries to GridSearchCV(). The example below compares decision tree and KNN models while tuning the hyperparameters of each as well as the scaling method for KNN models.

In [13]:
## In the parameter list provide two distinct model dictionaries
from sklearn.tree import DecisionTreeRegressor
my_params = [
    {"model": [DecisionTreeRegressor()],
        "model__max_depth": [2, 4, 6]},
    {"scaler": [MinMaxScaler(), RobustScaler()],
        "model": [KNeighborsRegressor()],
        "model__n_neighbors": [15, 30]}
]

## Simplified pipeline for this example
my_pipeline = Pipeline([
    ('scaler', 'passthrough'),
    ('model', KNeighborsRegressor())
])

## Perform grid search and print some of the results
grid_results = GridSearchCV(my_pipeline, my_params, cv=5, scoring='neg_root_mean_squared_error').fit(ic_train_X, ic_train_y)
pd.DataFrame(grid_results.cv_results_).sort_values(by='rank_test_score').head(5)
Out[13]:
mean_fit_time std_fit_time mean_score_time std_score_time param_model param_model__max_depth param_model__n_neighbors param_scaler params split0_test_score split1_test_score split2_test_score split3_test_score split4_test_score mean_test_score std_test_score rank_test_score
4 0.001403 0.000492 0.001391 0.000788 KNeighborsRegressor() NaN 15 RobustScaler() {'model': KNeighborsRegressor(), 'model__n_nei... -59109.156275 -70532.619049 -84820.363697 -48117.551024 -79083.658197 -68332.669648 13311.587487 1
6 0.002015 0.000020 0.000987 0.000021 KNeighborsRegressor() NaN 30 RobustScaler() {'model': KNeighborsRegressor(), 'model__n_nei... -56897.004249 -72368.464393 -84381.254113 -49428.363823 -82010.447426 -69017.106801 13760.314373 2
3 0.001613 0.000501 0.001179 0.000416 KNeighborsRegressor() NaN 15 MinMaxScaler() {'model': KNeighborsRegressor(), 'model__n_nei... -60327.869493 -72951.890271 -81030.432834 -50383.239622 -82301.617629 -69399.009970 12314.831320 3
1 0.001220 0.000392 0.000599 0.000489 DecisionTreeRegressor() 4 NaN NaN {'model': DecisionTreeRegressor(), 'model__max... -60868.253606 -73236.644144 -79723.617926 -57738.032191 -78215.168794 -69956.343332 9013.864060 4
5 0.001210 0.000413 0.000982 0.000018 KNeighborsRegressor() NaN 30 MinMaxScaler() {'model': KNeighborsRegressor(), 'model__n_nei... -60031.415423 -76715.241488 -84762.218723 -49547.016705 -85723.090527 -71355.796573 14274.028359 5

Question #4: Suppose we want to use cross-validation to evaluate our two re-scaling methods, robust scaling and min-max scaling, separately for the decision tree and KNN models considered in the previous example. Could we do this by adding another dictionary to our list, {"scaler": [MinMaxScaler(), RobustScaler()]}? Try this out and comment upon the results. If this approach does not produce the intended results explain what went wrong and modify the approach to properly evaluate the re-scaling choice separately for each candidate modeling method.

Part 5 - More Complex Preprocessing¶

Ideally we'd like to perform all of our data handling steps, including steps like one-hot encoding, using pipelines; however, this is complicated by the fact that OneHotEncoder() should only be applied to categorical features, while the pipeline examples we've seen so far apply the same preprocessing steps to every predictor passed to the modeling step at the end of the pipeline.

Column transformers, implemented via ColumnTransformer(), allow us to handle numeric and categorical features differently within a pipeline. The example given below applies a power transformer and standard scaler to numeric features, and a one-hot encoder to categorical features.

In [14]:
## Imports
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder

## Set up data w/ mixed feature types 
my_variables = ['area.lot', 'area.living', 'style']
ic_train_X = ic_train[my_variables]
ic_train_y = ic_train['sale.amount']

## Get names of numeric and categorical columns
num_cols = ic_train_X.select_dtypes(exclude=['object']).columns.tolist()
cat_cols = ic_train_X.select_dtypes(include=['object']).columns.tolist()

## Make a separate pipeline for numeric vs. cat vars
num_transformer = Pipeline([('transformer',  PowerTransformer(method = 'yeo-johnson')),
                            ("scaler", StandardScaler())])
cat_transformer = Pipeline([("encoder", OneHotEncoder(sparse=False, handle_unknown='ignore'))])

## Put these pipelines together using ColumnTransformer()
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
])

## Create the final pipeline
final_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", KNeighborsRegressor())
])

## Fit the entire pipeline
final_fitted = final_pipe.fit(ic_train_X, ic_train_y)

## Calculate test set performance
ic_test_X = ic_test[my_variables]
ic_test_y = ic_test['sale.amount']
np.sqrt(mean_squared_error(ic_test_y, final_fitted.predict(ic_test_X)))
Out[14]:
49036.37811937593

Question #5: For the data handling pipeline given in the example above, use a cross-validated grid search (5-fold CV) to compare performance using all combinations of:

  • Standard scaling or robust scaling
  • $k=10$ or $k=20$ neighbors

Hint: You will need to think about the names used to define each handling step, working backwards from 'preprocessor'. Print the best estimator.

Part 6 - Application¶

The data used in this application are a random sample of $n=6000$ grayscale images from the MNIST database, a large collection of handwritten digits (0-9). These are provided in a "flattened" format, meaning the grayscale pixel intensities for each 28x28 image are represented using a single row and 784 columns. Note that higher values indicate the presence of ink, and zeroes indicate the absence of ink.

The code below displays a few sample images and their labels:

In [16]:
### Read flattened, processed data
mnist = pd.read_csv("https://remiller1450.github.io/data/mnist_small.csv")

### Separate the target variable (label column)
label = mnist['label']
mnist = mnist.drop(['label'], axis=1)

### Convert to numpy array and reshape to 28 by 28
mnist_unflattened = mnist.to_numpy()
mnist_unflattened = mnist_unflattened.reshape(6000,28,28)

## Plot some examples
import matplotlib.cm as cm
fig, axs = plt.subplots(ncols=7)
for i in range(7):
    axs[i].imshow(mnist_unflattened[i], cmap=cm.Greys)
    axs[i].title.set_text(f'label={label[i]}')
plt.show()

Question #6:

  • Part A: Create a 90-10 training-testing split using random_state=10
  • Part B: Set up a pipeline that includes at least one preprocessing step and a modeling step. You can choose any steps and model for this part.
  • Part C: Use a grid search with classification accuracy as the scoring metric to find the best performing modeling approach while satisfying the following criteria:
    1. Compare at least two scalers while allowing the possibility of neither scaler to be used
    2. Compare both KNN and Decision Tree models
    3. Compare at least three values of n_neighbors and at least three values of max_depth that reflect different locations on the bias-variance continuum of each model
  • Part D: Report the classification accuracy of the best performing model on the test set created in Part A.
  • Part E: Find the first digit in the test set that was incorrectly classified and display it as an image with the predicted label and correct label as title text.