This lab covers the random forest algorithm, as well as the more general approach of ensemble learning, using sklearn.
## Standard Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
# turn off KNN future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
Examples throughout this lab will use the Wisconsin breast cancer data that were used in several of our previous labs:
## Read the data
wbc = pd.read_csv("https://remiller1450.github.io/data/wisc_bc.csv")
## Train-test split
from sklearn.model_selection import train_test_split
train, test = train_test_split(wbc, test_size=0.2, random_state=7)
## Separate the target from the predictors and re-label the target
train_y = train['Label'].map({'M': 1, 'B': 0})
test_y = test['Label'].map({'M': 1, 'B': 0})
train_X = train.drop(['ID','Label'], axis = 1)
test_X = test.drop(['ID','Label'], axis = 1)
The fundamental idea of an ensemble learner is to combine the prediction rules from several different models/estimators to produce a composite model with superior performance on new data. This works best when the models in the ensemble each learn different things from the training data, so that each base model is prone to making different types of errors.
There are two main approaches to creating an ensemble learner: aggregation, where several independently trained models vote on (or average) the final prediction, and stacked generalization, where an additional model is trained to combine the base models' predictions.
We can build our own aggregation ensemble using the VotingClassifier() function:
## Import each model
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
## Defining the individual models
model1 = Pipeline([('scaler', StandardScaler()),
('model', SVC(kernel = 'poly', probability=True))])
model2 = DecisionTreeClassifier(max_depth=5)
model3 = Pipeline([('scaler', StandardScaler()),
('model', KNeighborsClassifier())])
## Create the ensemble
my_ensemble = VotingClassifier(estimators=[('svm', model1),('tree', model2),('knn', model3)], voting='soft')
This example combines the predictions of three different classification models (a support vector machine, a decision tree, and a $k$-nearest neighbors model) using "soft" voting.
- voting = 'soft' sums the predicted probabilities for each outcome class produced by each model in the ensemble, with the final prediction being the class with the largest sum.
- voting = 'hard' uses a simple majority vote, where the final prediction is the most commonly predicted class. Unfortunately, if there's a tie the final prediction is assigned according to the ascending sort order of the class labels.

Thus, we'll generally prefer soft voting unless our scenario involves enough models and outcome classes that ties are unlikely to occur.
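To make these two schemes concrete, below is a minimal numpy sketch using made-up predicted probabilities (the values are purely illustrative and not produced by the models above):
## Illustration: how soft vs. hard voting combine three models' outputs for one observation
import numpy as np
## Hypothetical predicted probabilities of [class 0, class 1] from each of three models
probs = np.array([[0.60, 0.40],   ## model 1
                  [0.45, 0.55],   ## model 2
                  [0.40, 0.60]])  ## model 3
## Soft voting: sum the class probabilities and predict the class with the largest sum
print(probs.sum(axis=0))            ## [1.45, 1.55] -> class 1
## Hard voting: each model votes for its most probable class and the majority wins
votes = probs.argmax(axis=1)        ## [0, 1, 1]
print(np.bincount(votes).argmax())  ## -> class 1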
Once it is set up, our voting classifier (i.e., my_ensemble) has the same capabilities as the individual modeling functions in sklearn. As an example, we can find the ensemble's cross-validated F1-score and compare it to those of its individual base models:
## Imports
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
## Pipeline to compare models
model_pipe = Pipeline([('model', SVC())])
candidate_models = {'model': [my_ensemble, model1, model2, model3]}
## Cross-validated F1 scores
grid = GridSearchCV(model_pipe, candidate_models, cv=5, scoring = 'f1').fit(train_X, train_y)
pd.DataFrame(grid.cv_results_).sort_values('mean_test_score', ascending=False)[['param_model', 'mean_test_score']]
| | param_model | mean_test_score |
|---|---|---|
| 0 | VotingClassifier(estimators=[('svm', ... | 0.903979 |
| 3 | (StandardScaler(), KNeighborsClassifier()) | 0.898491 |
| 2 | DecisionTreeClassifier(max_depth=5) | 0.862679 |
| 1 | (StandardScaler(), SVC(kernel='poly', probabil... | 0.848937 |
This example demonstrates the primary reason to consider using an ensemble: the composite predictions tend to exhibit better out-of-sample performance than any individual model achieves by itself.
Another thing to know is that ensembles created using VotingClassifier() are fully compatible with pipelines and grid searches that optimize the hyperparameters of the individual models:
## Some tuning parameters to search over
params = {'svm__model__kernel': ['poly','linear'],
'tree__max_depth': [4,5,6],
'knn__model__n_neighbors': [5,10,15],
'knn__model__weights': ['distance','uniform'],
'voting': ['soft','hard']}
## Perform the grid search
grid = GridSearchCV(my_ensemble, param_grid=params, cv=5, scoring = 'f1').fit(train_X, train_y)
print(grid.best_estimator_)
VotingClassifier(estimators=[('svm', Pipeline(steps=[('scaler', StandardScaler()), ('model', SVC(kernel='linear', probability=True))])), ('tree', DecisionTreeClassifier(max_depth=6)), ('knn', Pipeline(steps=[('scaler', StandardScaler()), ('model', KNeighborsClassifier(n_neighbors=15))]))])
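Because this is an ordinary fitted GridSearchCV object, we can also pull out the winning parameter combination and its cross-validated F1-score directly:
## Best parameter combination and its cross-validated F1-score
print(grid.best_params_)
print(grid.best_score_)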
Finally, you should be aware that it's not required for every model in an ensemble to contribute equally to the ensemble's prediction. The weights argument can be used to adjust the relative contribution of each model in the ensemble:
## Example that gives less weight to SVM and more to KNN
weighted_ensemble = VotingClassifier(estimators=[('svm', model1),
('tree', model2),
('knn', model3)],
voting='soft', weights=[0.8,1,1.2])
## Example comparing some different weighting schemes using cross-validation
candidate_models = {'weights': [[0.8,1,1.2], [1.2,1,0.8], [1,0.8,1.2]]}
grid = GridSearchCV(weighted_ensemble, candidate_models, cv=5, scoring = 'f1').fit(train_X, train_y)
grid.best_estimator_
VotingClassifier(estimators=[('svm', Pipeline(steps=[('scaler', StandardScaler()), ('model', SVC(kernel='poly', probability=True))])), ('tree', DecisionTreeClassifier(max_depth=5)), ('knn', Pipeline(steps=[('scaler', StandardScaler()), ('model', KNeighborsClassifier())]))], voting='soft', weights=[1, 0.8, 1.2])
You should note that when voting = 'soft' these weights act as direct multipliers on the predicted class probabilities that are summed to make class predictions. When voting = 'hard', each model's vote is multiplied by its weight when tallying the majority, which also provides a way to break ties.
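Continuing the small numpy illustration from earlier (again with made-up probabilities), here is how the weights [0.8, 1, 1.2] would enter a soft-voting calculation:
## Illustration: weights multiply each model's predicted probabilities before the sum
import numpy as np
probs = np.array([[0.60, 0.40],   ## svm
                  [0.45, 0.55],   ## tree
                  [0.40, 0.60]])  ## knn
weights = np.array([0.8, 1, 1.2])
weighted_sum = (probs * weights[:, None]).sum(axis=0)
print(weighted_sum)               ## [1.41, 1.59] -> class 1 still wins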
Question 1:
- Part A: ... C = 1 or C = 0.1 ...
- Part B: Use GridSearchCV() to find a weighting scheme for aggregating the base models in your best performing ensemble from Part A that yields better classification accuracy than weighting each model's predictions equally.

Functions like VotingClassifier() use relatively simple strategies to aggregate the predictions of the individual models within the ensemble. A more complex strategy is to train another model on top of the ensemble's "base models" whose sole purpose is to optimally map the predictions of the base models to predicted classes. This approach is known as stacked generalization.
The example below uses a decision tree model to aggregate the output of our ensemble from Part 1 of the lab:
## Set up the base estimators (model1 - model3 defined previously)
my_base_models = [('svm', model1), ('tree', model2), ('knn', model3)]
## Set up the final estimator
my_final_model = DecisionTreeClassifier(max_depth=2)
## Create the stack
from sklearn.ensemble import StackingClassifier
my_stack = StackingClassifier(estimators = my_base_models,
final_estimator = my_final_model,
stack_method ='predict_proba', cv=5)
## Fit and Evaluate (note that CV is done internally in StackingClassifier)
fitted_stack = my_stack.fit(train_X, train_y)
cv_stacked_preds = fitted_stack.predict(train_X)
from sklearn.metrics import f1_score
print(f1_score(train_y, cv_stacked_preds))
0.9320987654320988
Here we can see that the stacked generalization combined the predicted probabilities from these base models in a way that leads to a higher F1-score than simple 'soft' voting achieved.
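If we want a score for the stack that is directly comparable to the cross-validated scores reported earlier, we can also pass the entire stack to cross_val_score (a sketch using the objects defined above):
## Cross-validate the entire stack (base models + final estimator)
from sklearn.model_selection import cross_val_score
stack_scores = cross_val_score(my_stack, train_X, train_y, cv=5, scoring='f1')
print(stack_scores.mean())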
Since the final model was a decision tree, we can see how it is using the predicted probabilities from the base models to generate final predictions:
from sklearn.tree import plot_tree
plt.figure(figsize=(8,4.5))
plot_tree(fitted_stack.final_estimator_, class_names=True)
plt.show()
Here the notation x[0] references the predicted probabilities of the first base model in the stack. Thus, we can see that our final estimator is using the output of KNN in its initial step, then it is using the SVM output (in both child branches) to further partition the data.
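If you want to see exactly what the final estimator receives as input, the fitted stack's transform() method returns the base models' predicted probabilities, which here should be one column per base model in the order the estimators were listed. The column labels below are just illustrative names:
## Peek at the features passed to the final estimator
stack_features = pd.DataFrame(fitted_stack.transform(train_X),
                              columns=['svm_prob', 'tree_prob', 'knn_prob'])  ## illustrative labels
print(stack_features.head())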
Question 2: Recognizing that the final estimator in this example is a decision tree, find the feature importance values of the stacked classifier used in the example given above. Briefly explain why the KNN base model exhibits the greatest importance.
Predicting a numeric outcome using ensembles or stacked generalizations is largely the same as the classification examples provided in the previous sections with only a few minor differences.
First, you should note that the relevant functions are VotingRegressor() and StackingRegressor(); a short sketch of both follows the list below.
- Predictions made by VotingRegressor() are a simple average of the predictions made by each individual base model. The weights argument can be used to adjust the relative contribution of each model's prediction.
- When using StackingRegressor(), you must use a regressor as the final estimator.
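Here is a minimal sketch of both functions. The base models, final estimator, and synthetic data below are purely illustrative choices and are not part of the lab's examples:
## Minimal sketch of regression ensembles on synthetic data (illustration only)
import numpy as np
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(200, 5))
y_demo = 3*X_demo[:, 0] + rng.normal(size=200)

## Voting: the final prediction is an (optionally weighted) average of each base model's prediction
vote_reg = VotingRegressor(estimators=[('lm', LinearRegression()),
                                       ('tree', DecisionTreeRegressor(max_depth=3))])

## Stacking: the final estimator must itself be a regressor
stack_reg = StackingRegressor(estimators=[('lm', LinearRegression()),
                                          ('tree', DecisionTreeRegressor(max_depth=3))],
                              final_estimator=LinearRegression(), cv=5)

print(cross_val_score(vote_reg, X_demo, y_demo, cv=5, scoring='neg_root_mean_squared_error').mean())
print(cross_val_score(stack_reg, X_demo, y_demo, cv=5, scoring='neg_root_mean_squared_error').mean())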
Question 3: For this question you should use the Iowa City home sales data provided below. You will use the numeric features in the data to predict the outcome 'sale.amount'.
- Use the 'neg_root_mean_squared_error' scoring metric to optimize the following tuning parameters within these base models:
  - C = 1 or C = 100
## Read the IC home sales data
ic = pd.read_csv("https://remiller1450.github.io/data/IowaCityHomeSales.csv")
## Split to training and testing sets
from sklearn.model_selection import train_test_split
train_ic, test_ic = train_test_split(ic, test_size=0.2, random_state=7)
## Split the outcome from the predictors
train_y_ic = train_ic['sale.amount']
train_X_ic= train_ic.select_dtypes("number").drop('sale.amount',axis=1)
The random forest algorithm is a popular ensemble approach that aggregates the results of many decision trees trained independently on bootstrapped samples (random samples drawn with replacement) of the training data. Usually, these trees are also forced to use only a subset of the predictors available in the original data set. Together, bootstrapping and feature sampling address two major weaknesses of single decision trees:
- Individual decision trees tend to overfit the training data, especially when grown deep.
- Because splits are chosen greedily, a few dominant predictors can drive most of a single tree's splits, leaving other potentially useful variables with little influence.
Bootstrapping addresses the first weakness by removing the need for deep trees and rewarding splits that generalize well to variations of the training data. Feature sampling addresses the second weakness by allowing more variables to contribute to the ensemble than would normally contribute to a single decision tree.
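For intuition about the bootstrapping piece, here is a minimal numpy sketch of drawing one bootstrapped sample of the training rows (some rows appear multiple times while others are left out entirely; the left-out rows are what the "out-of-bag" metrics discussed below are computed on):
## Illustration: one bootstrapped sample of the training rows
import numpy as np
rng = np.random.default_rng(0)
n = train_X.shape[0]
boot_idx = rng.choice(n, size=n, replace=True)    ## row indices sampled with replacement
oob_idx = np.setdiff1d(np.arange(n), boot_idx)    ## rows never selected ("out-of-bag")
print(len(np.unique(boot_idx)), "unique rows in the sample;", len(oob_idx), "rows out-of-bag")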
Overall, the success of a random forest is tied to its ability to build a diverse set of decision trees using slightly different training data and sets of predictors. Achieving this involves selecting appropriate tuning parameters, the most important being:
- max_depth - the maximum depth of the individual trees in the ensemble
- min_samples_split - the minimum number of samples a node must contain to be eligible for splitting (within a single tree in the ensemble)
- max_features - typically given as an int representing the number of randomly selected predictors considered at each split within an individual tree

You should also be aware of the n_estimators tuning parameter, which governs the number of trees in the forest. If this parameter is set too low, the forest may not be sufficiently flexible to capture all of the meaningful patterns in the data. However, there is little benefit to choosing a very large number of trees, as the performance of a random forest tends to stabilize after a certain number of trees and any additional trees only add computational burden.
Below is a quick demonstration of RandomForestClassifier(). You should note that there is an analogous function, RandomForestRegressor(), that is used for regression tasks.
from sklearn.ensemble import RandomForestClassifier
my_forest = RandomForestClassifier(max_depth=3, min_samples_split=10,
max_features=2, n_estimators=200,
random_state=0, oob_score=True)
fitted_forest = my_forest.fit(train_X, train_y)
print(fitted_forest.oob_score_) ## out-of-sample classification accuracy
0.9208791208791208
It's important to recognize that bootstrapping will naturally exclude some of the data-points from the sample used to train each tree. We can use these excluded data-points to calculate "out-of-bag" performance. Out-of-bag metrics are out-of-sample measures of performance, so we can use them in the same way that we use cross-validated measures of performance. Setting oob_score = True will use classification accuracy as the scoring metric, but you can also provide a callable function that takes the arguments y_true and y_pred (such as f1_score()).
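For example, assuming a recent version of scikit-learn (one where oob_score accepts a callable), we could score the out-of-bag predictions with F1 instead of accuracy:
## Use F1 rather than accuracy as the out-of-bag scoring metric (requires a recent sklearn)
from sklearn.metrics import f1_score
f1_forest = RandomForestClassifier(max_depth=3, min_samples_split=10,
                                   max_features=2, n_estimators=200,
                                   random_state=0, oob_score=f1_score)
print(f1_forest.fit(train_X, train_y).oob_score_)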
Unfortunately, there's no easy way to rely upon oob_score_ to efficiently tune hyperparameters, so you'll likely need to rely on GridSearchCV() despite its computational inefficiency with random forests.
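As a quick illustration of what that looks like (using the breast cancer training data from earlier and a deliberately small, arbitrary grid), tuning a random forest works the same way as tuning any other sklearn estimator:
## Small illustrative grid search for a random forest classifier
rf_params = {'max_depth': [2, 3],
             'max_features': [2, 4],
             'min_samples_split': [10, 20]}
rf_grid = GridSearchCV(RandomForestClassifier(n_estimators=200, random_state=0),
                       rf_params, cv=5, scoring='f1').fit(train_X, train_y)
print(rf_grid.best_params_, rf_grid.best_score_)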
Question 4:
- Part A: Use the 'neg_root_mean_squared_error' scoring metric to find a well-performing random forest model for the Iowa City home sales data (using all other features to predict sale.amount). Report the best model and its cross-validated score. Your search should explore the following tuning parameter values:
  - max_depth of either 2 or 3
  - min_samples_split of either 20 or 60
  - max_features of either 3, 5, or 7
  - n_estimators of 200
- Part B: Use GridSearchCV() to compare the best performing random forest from Part A with the best performing approach from Question 3. Briefly comment upon which method performed better (in terms of its cross-validated RMSE).
- Part C: ...
- Part D: Create a VotingRegressor() that averages the predictions of the two machine learning methods you compared in Part B. Calculate the RMSE of this ensemble on the test set and compare it with your results from Part C.