This lab will cover implementations of ensemble learning methods in sklearn, with random forests being a notable example.
As usual, we'll begin by loading several familiar libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import math
# Unfortunately, knn functions prompt "future warnings", so the commands below turn these off
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
Our classification examples will again use the SMS spam dataset. The code below creates several different features from the message text:
## Note that the text file containing these data uses a tab delimiter to separate the label and message
sms = pd.read_csv("https://remiller1450.github.io/data/sms_spam.txt", sep='\t', names=['Label','Message'])
## Train-test split
from sklearn.model_selection import train_test_split
train, test = train_test_split(sms, test_size=0.2, random_state=8)
## Split outcome from predictors
train_y = (train['Label'] == 'spam').astype(int)
train_msg = train['Message']
## Feature engineering functions
def get_num(text):
    return sum(map(str.isdigit, text))/len(text)    ## proportion of characters that are digits
def cap_percent(text):
    return sum(map(str.isupper, text))/len(text)    ## proportion of characters that are uppercase
def alpha_percent(text):
    return sum(map(str.isalnum, text))/len(text)    ## proportion of characters that are alphanumeric
## Define "first_word" function
def first_word(text):
    return text.split(sep=' ')[0].lower().replace('!','')
## Create data frame with these features
d = {'prop_num': train_msg.apply(get_num),
'prop_cap': train_msg.apply(cap_percent),
'prop_alp': train_msg.apply(alpha_percent),
'first': train_msg.apply(first_word)}
train_X = pd.DataFrame(d)
## Assemble final training X data
train_X_ohe = pd.get_dummies(train_X, columns=['first'])
train_X = train_X_ohe[['prop_num','prop_cap', 'prop_alp', 'first_urgent','first_free']]
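As a quick check (not part of the lab's original output), you might peek at the assembled features and the class balance before modeling:
print(train_X.head())
print(train_y.mean())  ## proportion of training messages labeled spam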
The fundamental idea of an ensemble is to combine predictions from several different models/estimators in order to produce a composite model with superior performance and generalizability to new data.
Ensemble modeling methods generally fall into one of two categories:
Aggregation ensembles, such as random forests, use multiple independent models/estimators and aggregate predictions from each using strategies like simple or weighted averaging.
Boosting ensembles use sequentially built models/estimators, where each new model attempts to correct the errors of the ones built before it.
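Boosting is not the focus of this lab, but as a rough illustration (a minimal sketch, not part of the original lab code), sklearn provides boosting ensembles such as GradientBoostingClassifier, which sequentially fits shallow trees that correct the current ensemble's errors:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
## Sequentially fit 100 shallow trees, each one correcting the current ensemble's mistakes
boost = GradientBoostingClassifier(n_estimators=100, max_depth=2, random_state=0)
print(np.average(cross_val_score(boost, train_X, train_y, scoring='f1', cv=5)))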
For classification tasks, we can construct our own ensemble for any set of base models using VotingClassifier. The example below demonstrates a simple ensemble consisting of three different models: k-nearest neighbors, logistic regression, and a decision tree.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
## Defining the models
model1 = LogisticRegression(penalty='none')
model2 = DecisionTreeClassifier(max_depth=5)
model3 = Pipeline([('scaler', StandardScaler()),
('model', KNeighborsClassifier())])
## Creating the ensemble
my_ensemble = VotingClassifier(estimators=[('logr', model1),
('tree', model2),
('knn', model3)],
voting='soft')
In VotingClassifier, the voting argument controls how each model in the ensemble contributes to the final prediction:
voting = 'soft' sums the predicted probability of each class across each model in the ensemble. The final prediction is the class with the largest sum.
voting = 'hard' uses each model's predicted class in a simple majority vote. For example, if 2 of 3 models predict a message is "spam", then the ensemble's prediction for that message is "spam".
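To make the distinction concrete, here is a small numerical sketch (the probabilities are made up, not produced by the lab's models):
## Hypothetical P(ham), P(spam) for one message from the three models
example_probs = np.array([[0.05, 0.95],   ## logistic regression
                          [0.55, 0.45],   ## decision tree
                          [0.60, 0.40]])  ## k-nearest neighbors
## Soft voting: sum the probabilities across models and predict the class with the largest sum
print(example_probs.sum(axis=0).argmax())                   ## 1 ("spam"), since the sums are [1.20, 1.80]
## Hard voting: each model votes for its most probable class and the majority wins
print(np.bincount(example_probs.argmax(axis=1)).argmax())   ## 0 ("ham"), since the votes are [1, 0, 0]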
We can use our VotingClassifier ensemble in many of the same ways we can use individual models. For example, we can evaluate its performance using cross-validation:
from sklearn.model_selection import cross_val_score
print(np.average(cross_val_score(my_ensemble, train_X, train_y, scoring='f1', cv=5)))
## Individual models
print([np.average(cross_val_score(model1, train_X, train_y, scoring='f1', cv=5)),
np.average(cross_val_score(model2, train_X, train_y, scoring='f1', cv=5)),
np.average(cross_val_score(model3, train_X, train_y, scoring='f1', cv=5))])
0.9128026222243044 [0.8453849485158784, 0.9053659207079692, 0.9084395064728428]
Here we can see the benefit of ensembles. The composite predictions from the ensemble result in a better cross-validated F1-score than any base model achieves by itself.
Ensembles created using VotingClassifier are compatible with pipelines and cross-validated grid search, providing us a useful set of tools for achieving strong predictive performance.
This process is demonstrated below. Pay special attention to the syntax used to specify values for the n_neighbors argument of the k-nearest neighbors component of the ensemble, which uses double underscores to reference arguments within a particular component (for example, knn__model__n_neighbors refers to the n_neighbors argument of the 'model' step inside the 'knn' pipeline).
params = {'logr__penalty': ['none','l2'],
'tree__max_depth': [4,5,6,7],
'knn__model__n_neighbors': [3,6,10,15],
'knn__model__weights': ['distance','uniform'],
'voting': ['soft','hard']}
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(my_ensemble, param_grid=params, cv=5, scoring = 'f1').fit(train_X, train_y)
print(grid.best_estimator_)
VotingClassifier(estimators=[('logr', LogisticRegression(penalty='none')), ('tree', DecisionTreeClassifier(max_depth=5)), ('knn', Pipeline(steps=[('scaler', StandardScaler()), ('model', KNeighborsClassifier(n_neighbors=3, weights='distance'))]))], voting='soft')
Another useful feature is the option to weight each individual model in an ensemble differently. In our previous example, the k-nearest neighbors classifier was the strongest model on its own, so we might think to give it slightly more weight in our ensemble:
weighted_ensemble = VotingClassifier(estimators=[('logr', model1),
('tree', model2),
('knn', model3)],
voting='soft',
weights=[0.8,1,1.2])
By setting weights = [0.8, 1, 1.2] and using voting = 'soft', the predicted class probabilities from the logistic regression model are multiplied by 0.8, so they contribute less to the ensemble's summed scores. Similarly, the class probabilities from the k-nearest neighbors model are multiplied by 1.2, thereby giving them more influence.
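As a rough numerical sketch of this calculation (again using made-up probabilities for a single message):
## Made-up P(ham), P(spam) from logistic regression, the tree, and knn
probs = np.array([[0.4, 0.6], [0.7, 0.3], [0.3, 0.7]])
w = np.array([0.8, 1, 1.2])
## Scale each model's probabilities by its weight before summing across models
print((probs * w[:, None]).sum(axis=0))   ## [1.38, 1.62] -> the ensemble predicts "spam"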
We can re-run our cross-validated grid search using this new ensemble to see if the weighting is beneficial:
params = {'logr__penalty': ['none','l2'],
'tree__max_depth': [4,5,6,7],
'knn__model__n_neighbors': [3,6,10,15],
'knn__model__weights': ['distance','uniform']}
from sklearn.model_selection import GridSearchCV
grid_weighted = GridSearchCV(weighted_ensemble, param_grid=params, cv=5, scoring = 'f1').fit(train_X, train_y)
print(grid.best_score_, grid_weighted.best_score_)
0.9167655692653746 0.916737416166477
We see essentially no change in performance with this weighting scheme (the two best scores are nearly identical). A few additional comments:
weights can be treated as a tuning parameter, and you can use methods like grid search to explore different combinations (a brief sketch of this is shown below).
weights = [0.5,0.5,2] effectively leads to ensemble predictions that are extremely similar to the predictions obtained using just the KNN model, so you might argue that it's more logical to simply use the KNN model by itself rather than this weighting scheme.
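As a rough sketch of treating the weights as tuning parameters (the candidate weight vectors below are arbitrary choices, not from the original lab):
## Candidate weight vectors can be supplied to GridSearchCV like any other parameter
params_w = {'weights': [[1, 1, 1], [0.8, 1, 1.2], [0.5, 0.5, 2]],
            'knn__model__n_neighbors': [3, 6, 10, 15]}
grid_w = GridSearchCV(weighted_ensemble, param_grid=params_w, cv=5, scoring='f1').fit(train_X, train_y)
print(grid_w.best_params_, grid_w.best_score_)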
The methods described in the previous section aggregate the predictions of multiple models using relatively simple strategies, such as simple majority voting (voting = 'hard'), the sums of predicted probabilities (voting = 'soft'), or the weighted sums of predicted probabilities (voting = 'soft' and weights = ...).
Another strategy is to use a formally defined model to aggregate the predictions from the base models. For example, we can take the predicted probabilities from each base estimator in an ensemble and use them as inputs into a new model, such as a decision tree or logistic regression, which translates them into final prediction scores and predicted classes.
This strategy is known as stacked generalization, and an example is demonstrated below:
## Create list of models in the ensemble
base_models = [('logreg', LogisticRegression(penalty='none')),
('tree', DecisionTreeClassifier(max_depth=5)),
('knn', KNeighborsClassifier(n_neighbors=20))]
## Final model used to aggregate predicted probs
my_final_estimator = DecisionTreeClassifier(max_depth=3)
## Create stack
from sklearn.ensemble import StackingClassifier
my_stack = StackingClassifier(estimators = base_models, final_estimator = my_final_estimator,
stack_method ='predict_proba', cv=5)
## Fit and evaluate (StackingClassifier uses internal cross-validation when generating the base-model predictions used to train the final estimator)
fitted_stack = my_stack.fit(train_X, train_y)
cv_stacked_preds = fitted_stack.predict(train_X)
from sklearn.metrics import f1_score
print(f1_score(train_y, cv_stacked_preds))  ## true labels first, then predictions
0.9242957746478873
Compared with our previous cross-validated F1-scores on the training data, this is an improvement, and it is substantially higher than the benchmark set for KNN back in Lab 3.
Since the model we used to aggregate the base estimators was a decision tree, we can use methods from our previous lab to explore it:
from sklearn.tree import plot_tree
plt.figure(figsize=(8,5.5))
plot_tree(fitted_stack.final_estimator_, class_names=True)
plt.show()
Note that the splitting rules of this model are based upon the predicted probabilities from individual models in the ensemble. For example, the first rule X[1] <= 0.685 uses predicted probabilities from the decision tree (the model defined in position [1]).
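If you'd like to inspect these inputs directly, StackingClassifier provides a transform method that returns the base-model predictions fed to the final estimator; for a binary problem with stack_method='predict_proba', each base model should contribute a single column (its predicted probability of spam):
## Columns correspond to the base models in order: logreg, tree, knn
meta_features = fitted_stack.transform(train_X)
print(meta_features.shape)
print(meta_features[:3])   ## predicted spam probabilities for the first few messages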
Because our final model was a decision tree, we could even look at the relative importance of each base model in the final tree:
fitted_stack.final_estimator_.feature_importances_
array([0.01446582, 0.95041708, 0.03511711])
However, this information can be misleading due to the greedy nature of decision trees. It's not that the decision tree among our base models was far superior to the others, but rather that it yielded the first split, which was highly discriminatory compared to the subsequent splits.
Ensembles and stacked generalizations can be applied to regression tasks with a few minor modifications.
VotingRegressor is analogous to VotingClassifier with the argument voting = 'soft'. It aggregates predictions using simple or weighted averaging of the predictions produced by individual models within the ensemble.
Similarly, StackingRegressor is analogous to StackingClassifier.
An example is shown below:
## Read IC home sales data
ic = pd.read_csv("https://remiller1450.github.io/data/IowaCityHomeSales.csv")
## Split to training and testing sets
from sklearn.model_selection import train_test_split
train_ic, test_ic = train_test_split(ic, test_size=0.2, random_state=7)
## Create outcome var
train_y_ic = train_ic['sale.amount']
## Create predictor matrix (numeric predictors only for simplicity, but we could use OHE if we wanted to)
train_X_ic = train_ic.select_dtypes("number").drop('sale.amount', axis=1)
## Create list of models in the ensemble
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
base_models = [('linreg', LinearRegression()),
('tree', DecisionTreeRegressor(max_depth=5)),
('knn', KNeighborsRegressor(n_neighbors=20))]
## Final model to aggregate base models in the ensemble
my_final_estimator = DecisionTreeRegressor(max_depth=3)
## Stacked regressor
from sklearn.ensemble import StackingRegressor
reg_stack = StackingRegressor(base_models, final_estimator = my_final_estimator, cv=5)
## Fit and evaluate (internal cross-validation is again used when training the final estimator)
fitted_stack = reg_stack.fit(train_X_ic, train_y_ic)
cv_stacked_preds = fitted_stack.predict(train_X_ic)
from sklearn.metrics import mean_squared_error
print(np.sqrt(mean_squared_error(train_y_ic, cv_stacked_preds)))
23905.17554946077
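For comparison, a VotingRegressor could be built from the same base models; the sketch below (not part of the original lab) uses simple averaging and evaluates it with cross-validated RMSE:
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import cross_val_score
## Average the predictions of the three base models (a weights argument could also be supplied)
vote_reg = VotingRegressor(estimators=base_models)
rmse_scores = -cross_val_score(vote_reg, train_X_ic, train_y_ic,
                               scoring='neg_root_mean_squared_error', cv=5)
print(np.average(rmse_scores))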
Random forests are an ensemble of decision trees built using bootstrapped samples of the training data and random selections of input features at each split. These strategies address two weaknesses of decision trees:
Individual trees are high-variance estimators, so small changes in the training data can lead to very different trees.
Greedy splitting tends to be dominated by a small number of strong predictors, so trees built on similar data are highly correlated with one another.
Using these strategies, random forests contain a diverse set of decision trees because each tree is fit to slightly different training data using different sets of predictors.
The most important tuning parameters for random forests are:
max_depth - the maximum depth of each individual tree.
min_samples_split - the minimum number of data-points in a node for it to be eligible for splitting.
max_features - the number (if an int is given) or the fraction of predictors that are randomly selected for consideration at each split.
Additionally, you should be aware of the n_estimators parameter, which determines the number of trees in the forest. If this value is too low, the forest may not perform optimally, but there is little benefit to increasing it beyond the point where its predictions become stable.
The code below fits a random forest classifier to the SMS spam data we've been working with:
from sklearn.ensemble import RandomForestClassifier
my_forest = RandomForestClassifier(max_depth=3, min_samples_split=10,
max_features=2, n_estimators=200,
random_state=0, oob_score=True)
fitted_forest = my_forest.fit(train_X, train_y)
print(fitted_forest.oob_score_) ## out-of-bag (out-of-sample) classification accuracy
0.9692618353152345
Because bootstrapping will exclude some data-points from each tree, we can use these "out of bag" data-points as new data for the corresponding trees. The result is a performance measure known as "out of bag accuracy", or the oob_score_. To reiterate, this is an out-of-sample performance measure, so we can view it the same way we'd look at a cross-validated performance measure.
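Because the OOB score is cheap to obtain, it also provides a quick way to check the earlier claim that there is little benefit to increasing n_estimators once predictions stabilize. The sketch below (the specific values of n_estimators are arbitrary choices) refits the same forest with different numbers of trees:
## Compare out-of-bag accuracy across forests of different sizes
for n in [10, 50, 200, 500]:
    rf = RandomForestClassifier(max_depth=3, min_samples_split=10, max_features=2,
                                n_estimators=n, random_state=0, oob_score=True)
    print(n, rf.fit(train_X, train_y).oob_score_)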
Unfortunately, classification accuracy is the only oob_score_ that RandomForestClassifier can calculate and return (at least in the current version of sklearn). So we must fall back on cross-validation for any other metric (despite this being an inefficient use of computational resources).
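For example, a cross-validated F1-score for this forest can be obtained in the usual way (a brief sketch reusing functions introduced earlier in the lab):
from sklearn.model_selection import cross_val_score
print(np.average(cross_val_score(my_forest, train_X, train_y, scoring='f1', cv=5)))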
For the exercises at the end of this lab:
Part A: Tune a random forest for the SMS spam training data using the parameters max_depth, min_samples_split, and max_features. You should explore at least 3 different values of max_depth and min_samples_split and at least 2 different values of max_features.
Then, use GridSearchCV to compare the best random forest from Part A and the stacked generalization you created in Part B using the F1-score as your performance metric.