Lab 7 - Boosting and xgboost¶

This lab introduces the concept of boosting, the xgboost library, and comparisons with other machine learning methods we've covered this semester. xgboost is currently viewed as a state-of-the-art machine learning algorithm for "flat" or "tabular" data, which generally refers to data that aren't images, text, audio, video, etc. In fact, many recent Kaggle competitions involving flat data were won by teams using xgboost (or ensembles that include it) combined with careful feature engineering.

Unlike previous libraries we've used, xgboost is not included in Anaconda, so you'll need to install it yourself. You can do this in the "Environment" tab of the Anaconda Navigator by choosing "Not Installed" libraries, searching for "xgboost", then clicking to install xgboost and its dependencies (there shouldn't be any conflicts with the default Anaconda packages). You may also install xgboost using pip or conda; see here for the installation guidelines.
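Alternatively, a quick way to install from within a notebook cell is shown below (assuming pip or conda is available in your environment; the leading ! runs the command in a shell):

!pip install xgboost
## or, using conda:
## !conda install -y -c conda-forge xgboost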

Once you've got the xgboost library installed, the following command should run without producing an error (warning messages are okay):

In [1]:
import xgboost as xgb
C:\Users\millerry\Anaconda3\lib\site-packages\xgboost\compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
  from pandas import MultiIndex, Int64Index

Next, we'll still need to load the other libraries we've been using throughout the semester:

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import math

## We'll ignore the warnings xgboost gives us
import warnings
warnings.simplefilter(action='ignore')

Examples in this lab will use the well-switching data found below. Each row in this dataset records information about a household in Bangladesh that had previously been using a well with unsafe levels of arsenic. After an intervention, some of these households opted to switch to a safe well, while others did not. The aim of our analysis is to accurately predict the likelihood of a household switching to a safe well.

In [3]:
### Read data
wells = pd.read_csv("https://remiller1450.github.io/data/Wells.csv")

## Train-test split
from sklearn.model_selection import train_test_split
train, test = train_test_split(wells, test_size=0.1, random_state=9)

### Separate the outcome and predictors
train_y = (train['switch'] == 'yes').astype(int)
train_X = train.drop(['switch', 'association'], axis="columns")

For the initial parts of the lab, we'll use only the numeric predictors (for simplicity).

Part 1 - AdaBoost¶

Boosting algorithms are a class of ensemble methods that aggregate the predictions of multiple sequentially trained models. The original boosting proposal, the AdaBoost algorithm, builds sequential models by re-weighting each data point. The first base model assigns each observation a weight of $\tfrac{1}{n}$ (equivalent to unweighted model fitting), then successive models increase the weights assigned to the observations with the largest prediction errors, thereby forcing later models to focus their attention on hard-to-predict observations.

While AdaBoost is no longer considered to achieve state-of-the-art performance, it's worth knowing about for its simplicity and history.
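To make the re-weighting idea concrete, here is a minimal sketch of a single AdaBoost-style iteration using a decision stump. The variable names (weights, stump, alpha) are our own, and the sketch simplifies the bookkeeping that sklearn performs internally:

from sklearn.tree import DecisionTreeClassifier

n = len(train_y)
weights = np.ones(n) / n                       ## every observation starts with weight 1/n

## Fit a "stump" (depth-1 tree) using the current weights
stump = DecisionTreeClassifier(max_depth=1)
stump.fit(train_X, train_y, sample_weight=weights)

## Weighted error rate of this stump (assumes 0 < err < 1)
miss = (stump.predict(train_X) != train_y.to_numpy())
err = np.sum(weights[miss]) / np.sum(weights)

## The stump's weight in the ensemble, and the updated observation weights
alpha = np.log((1 - err) / err)
weights[miss] *= np.exp(alpha)                 ## misclassified points get larger weights
weights /= np.sum(weights)                     ## renormalize before the next iteration

Repeating these steps, with each new stump fit using the updated weights, produces the boosted ensemble.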

We can fit an AdaBoost model with decision tree base learners using AdaBoostClassifier in sklearn:

In [4]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
ada_model = AdaBoostClassifier(n_estimators=100)
cv_res1 = cross_val_score(ada_model, train_X, train_y, cv=5)
np.average(cv_res1)
Out[4]:
0.6269188061965117

When using AdaBoost, we should be aware of the learning_rate argument, which can be used to shrink the contribution of subsequent models in the ensemble. The default learning rate is 1, which allows each base model's contribution to the ensemble to be determined by its weighted error rate (see the AdaBoost algorithm in our lecture notes).

In [5]:
## Setup two different AdaBoost models:
ada_model1 = AdaBoostClassifier(n_estimators=100, learning_rate=1.2, algorithm = 'SAMME')
ada_model2 = AdaBoostClassifier(n_estimators=100, learning_rate=0.3, algorithm = 'SAMME')

## Plot weights of each tree in the ensemble:
fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.plot(np.linspace(0, 100, 100), ada_model1.fit(train_X, train_y).estimator_weights_)
ax1.set_title('learning_rate = 1.2')
ax2.plot(np.linspace(0, 100, 100), ada_model2.fit(train_X, train_y).estimator_weights_)
ax2.set_title('learning_rate = 0.3')
plt.show()

As shown above, a smaller learning rate will generally allow for more of the estimators in a boosted ensemble to make noticeable contributions to the final predictions. In this regard, it has been suggested that a smaller learning rate and more iterations may result in better performance, which seems to be true in our example (though the minor difference might not be that compelling):

In [6]:
cv_res1 = cross_val_score(ada_model1, train_X, train_y, cv=5)
print(np.average(cv_res1))

cv_res2 = cross_val_score(ada_model2, train_X, train_y, cv=5)
print(np.average(cv_res2))
0.6214156104430723
0.6291321362799263

By default, AdaBoostClassifier will use decision trees with a maximum depth of 1, but this can be changed.

In fact, we can use AdaBoost with any base learner whose implementation allows sample weighting. For example, we can create an ensemble of boosted logistic regression models:

In [7]:
from sklearn.linear_model import LogisticRegression
ada_model_lr = AdaBoostClassifier(LogisticRegression(), n_estimators=100)
cv_res2 = cross_val_score(ada_model_lr, train_X, train_y, cv=5)
np.average(cv_res2)
Out[7]:
0.6114810150579568

Or we can change the maximum depth of the decision trees that are fit at each iteration of the algorithm:

In [8]:
from sklearn.tree import DecisionTreeClassifier
ada_model_depth2 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=100)
cv_res3 = cross_val_score(ada_model_depth2, train_X, train_y, cv=5)
np.average(cv_res3)
Out[8]:
0.6008002924926876

Notice how increasing the maximum depth to 2 actually decreased the cross-validated accuracy. This result is not unique or surprising, as boosting algorithms tend to work best with simple base models like decision tree "stumps" with a maximum depth of 1 (i.e., single splits of the data).

Question #1¶

  • Using classification accuracy as the scoring criterion, use a cross-validated grid search to tune the learning_rate and max_depth of an AdaBoost classifier that uses decision trees as its base models. Report the optimal values of each parameter and the accuracy achieved by that model. You may search over any range that you deem appropriate, but you should check at least three values for each tuning parameter.

Part 2 - Gradient Boosting¶

Gradient boosting was developed as a generalization of AdaBoost. The fundamental idea is to sequentially fit each base model to the negative gradient of the loss function, which corresponds to the residuals under squared error loss and to pseudo-residuals for other loss functions.

While AdaBoost was originally developed for binary classification, gradient boosting is easily applied to both classification and regression problems, and has been found to generally outperform AdaBoost. If exponential loss is used for a binary classification task, gradient boosting is equivalent to AdaBoost.
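To illustrate the idea, below is a minimal hand-rolled sketch of gradient boosting with log loss on our training data. The variable names are our own, and the sketch skips the leaf-value optimization that sklearn and xgboost perform, so it's meant for intuition rather than actual use:

from sklearn.tree import DecisionTreeRegressor

learning_rate = 0.1
y = train_y.to_numpy()
raw_pred = np.zeros(len(y))                   ## ensemble prediction on the log-odds scale

for b in range(100):
    prob = 1 / (1 + np.exp(-raw_pred))        ## current predicted probabilities
    pseudo_resid = y - prob                   ## negative gradient of the log loss
    tree = DecisionTreeRegressor(max_depth=1) ## fit a small regression tree to the pseudo-residuals
    tree.fit(train_X, pseudo_resid)
    raw_pred += learning_rate * tree.predict(train_X)  ## shrunken update to the ensemble

## Training accuracy of this sketch (for illustration only)
np.mean((raw_pred > 0).astype(int) == y)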

The functions and arguments used to fit a gradient boosted model in sklearn are very similar to those in the previous section. Below we fit a gradient boosted classifier and find its cross-validated accuracy:

In [9]:
from sklearn.ensemble import GradientBoostingClassifier
gb_mod = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)
gb_mod_res = cross_val_score(gb_mod, train_X, train_y, cv=5)
np.average(gb_mod_res)
Out[9]:
0.6394370869894919

GradientBoostingClassifier fits a decision tree at each boosting iteration. By default every predictor is considered at each split, but predictor subsampling can be enabled via the max_features argument. The function also allows subsampling of observations through the subsample argument, which fits each tree to a random sample of the data and can further reduce the potential for overfitting.
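For example, a model that uses both kinds of subsampling might be specified as shown below (the specific values are arbitrary illustrations, not recommendations):

gb_sub = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                    max_features=2,   ## consider 2 randomly chosen predictors per split
                                    subsample=0.5)    ## fit each tree to a random 50% of the observations
np.average(cross_val_score(gb_sub, train_X, train_y, cv=5))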

Question #2¶

  • Part A - Fit a gradient boosted model using 100 decision trees with a maximum depth of 1 and a learning rate of 0.1 to training data. Then, extract the importance of each predictor (note: feature importance is explained at the end of Lab 6, part 1, and you may also consult the documentation for details) and plot these values using a bar chart.
  • Part B - Introduce subsampling of predictors by using the argument max_features = 1. Then, find the importance of each predictor when the model is fit to the training data. How do these results differ from what you saw in Part A? Why does changing max_features to 1 lead to this type of change? Briefly explain.
  • Part C - Use a cross-validated grid search to assess whether subsampling of predictors, subsampling of observations, subsampling of both, or subsampling of neither yields the best classifier. You may keep the number of base estimators fixed at 100 and the learning rate fixed at 0.1.

Part 3 - Introduction to xgboost¶

The xgboost library provides an implementation of gradient boosting using decision tree base learners with a few additional features that can lead to improved performance (both predictive performance and computational efficiency). In terms of predictive performance, xgboost offers L1, L2, and elastic net regularization options. In terms of computational efficiency, xgboost was designed for parallel computation (with optional GPU support) and can exploit special cases that we haven't considered, such as sparse data matrices.

When training an xgboost model, we should consider the following tuning parameters (a brief example using several of them appears after this list):

  • learning_rate - The shrinkage applied to the estimator weights at each boosting iteration (learning rate), range: $[0,1]$
  • gamma (or min_split_loss) - The minimum improvement in cost required for a node to be split within a tree. Larger values prevent overfitting, range: $[0,\infty]$
  • reg_alpha and reg_lambda - Regularization applied to the estimator weights. reg_alpha provides L1 regularization, which encourages sparsity, while reg_lambda provides L2 regularization, which encourages weights to be closer to zero. Specifying both imposes the elastic net penalty. These parameters accept values in $[0, \infty]$, and the optimal value of each is highly specific to the application.
  • max_depth - The maximum depth of trees. Higher values increase the likelihood of overfitting but provide increased flexibility, range: $[0, \infty]$. Generally depths of 1 or 2 tend to perform best.
  • min_child_weight - The minimum sum of weight needed in a node for it to be further partitioned. If each data-point is weighted equally (which is often the case), this is equivalent to min_samples_split in other functions. Larger values prevent overfitting but reduce flexibility, range $[0, \infty]$
  • n_estimators - The number of trees in the ensemble (i.e., the number of boosting iterations). Larger values tend to work best when combined with smaller learning rates (and vice-versa).
  • colsample_bytree - The fraction of columns (predictors) to randomly sample for use in a tree.
  • colsample_bylevel - The fraction of columns (predictors) to randomly sample for use in a level (depth) of a tree.
  • colsample_bynode - The fraction of columns (predictors) to randomly sample for use in a single split (node) in a tree.

For advanced uses, additional tuning parameters are described here.
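As a point of reference, a sketch of how several of the parameters listed above might be specified is shown below; the particular values are arbitrary and would normally be chosen via cross-validation:

example_xgb = xgb.XGBClassifier(n_estimators=200, learning_rate=0.05,
                                max_depth=2, min_child_weight=5, gamma=0.1,
                                reg_alpha=0.0, reg_lambda=1.0,
                                colsample_bytree=0.8, eval_metric='error')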

To begin, we'll note that xgboost models are compatible with sklearn pipelines (and many other sklearn functions):

In [10]:
### Read data
wells = pd.read_csv("https://remiller1450.github.io/data/Wells.csv")

## Train-test split
from sklearn.model_selection import train_test_split
train, test = train_test_split(wells, test_size=0.1, random_state=9)

### Separate the outcome and predictors
train_y = (train['switch'] == 'yes').astype(int)
train_X = train.drop('switch', axis=1)

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score 
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer, make_column_selector

## Separate pipelines for numeric vs. categorical vars
num_transformer = Pipeline([("scaler", MinMaxScaler())])
cat_transformer = Pipeline([("encoder", OneHotEncoder(sparse=False, handle_unknown='ignore'))])

## Column ids
cat_cols = ['association']
num_cols = ['arsenic', 'distance', 'education']

## Pre-processor
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
])

## Final pipeline
pipe = Pipeline([
('preprocessor', preprocessor),
('model', xgb.XGBClassifier(eval_metric='error'))
])

## Accuracy with the default parms
np.average(cross_val_score(pipe, train_X, train_y, cv = 5))
Out[10]:
0.6004387390315242

We can apply familiar methods like cross-validated grid search using this pipeline.

For the purposes of this lab, we'll use this as an opportunity to explore the roles of different tuning parameters:

In [11]:
from sklearn.model_selection import GridSearchCV

## Tree depths to try:
pars = {'model__max_depth': [1,2,3,5,8,10]}

## Grid search
grid_results = GridSearchCV(pipe, pars, cv = 5).fit(train_X, train_y)

## Plot results
mean_accs = grid_results.cv_results_['mean_test_score']
plt.plot(list(pars.values())[0], mean_accs)
plt.show()

Here we see that cross-validated accuracy tends to decline significantly when deeper trees are used. We've seen this already in our other boosted classifiers, but it's nice to see the same finding when using xgboost.

Next, we might wonder whether we can offset the overfitting created by deeper trees by imposing regularization:

In [12]:
## New parm grid
pars = {'model__reg_alpha': [0,1.5, 5],
        'model__max_depth': [1,2,3,5,8]}

## Grid search
grid_results = GridSearchCV(pipe, pars, cv = 5).fit(train_X, train_y)
In [13]:
## Plot results
mean_accs = grid_results.cv_results_['mean_test_score']

## Plot accuracy vs. max_depth for each value of alpha
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 3))
depths = [1,2,3,5,8] 
ax1.plot(depths, mean_accs[range(0, len(mean_accs),3)])
ax1.set_title('Alpha = 0')
ax2.plot(depths, mean_accs[range(1, len(mean_accs),3)])
ax2.set_title('Alpha = 1.5')
ax3.plot(depths, mean_accs[range(2, len(mean_accs),3)])
ax3.set_title('Alpha = 5')
plt.show()

It doesn't seem like regularization is an effective strategy for overcoming the overfitting that occurs when deep trees are used. Overall, these results suggest we might be best sticking to shallow trees (depths of 1 or 2) combined with modest levels of regularization.

At this point we've seen a couple of examples of model tuning. It's important to recognize that xgboost provides a large number of tuning parameters, and optimizing them can improve performance. However, in most applications you'll see an even greater improvement by investing more time and effort in feature engineering, and worrying less about finding the perfect combination of tuning parameters.

Question #3 (the xgboost challenge)¶

  • Lab 5 (part 1) introduced the Ames Housing dataset. The goal of this challenge is to accurately predict sale prices on the test set created below. You are welcome to use any strategy that does not rely on information that could not be known before a home sells (such as the sale date).
  • You should use the code given below as a starting point. It will remove some variables with large amounts of missing data, and some variables that are not useful predictors.
  • You should try to find a model that achieves a benchmark RMSE of at least $16,600 (cross-validated on the training data) to expect full credit on this question. This benchmark was achieved using modest amounts of hyperparameter tuning. You are welcome to use feature engineering to help you beat the benchmark.
  • Additionally, you are expected to report the RMSE of your final model on the test data.
In [14]:
## Read the data
ames = pd.read_csv("https://remiller1450.github.io/data/AmesHousing.csv")

## Train-test split
from sklearn.model_selection import train_test_split
train_ames, test_ames = train_test_split(ames, test_size=0.2, random_state=9)

## Separate y and X
train_ames_y = train_ames['SalePrice']
train_ames_X = train_ames.drop('SalePrice', axis=1)

## We'll remove any variables with 1000 or more missing values since they have too much missing data to reliably impute.
train_ames_X = train_ames_X[train_ames_X.columns[train_ames_X.isna().sum() < 1000]]

## We'll also remove variables that shouldn't be used for prediction
train_ames_X = train_ames_X.drop(['Order','PID','MS.SubClass', 'Yr.Sold','Mo.Sold'],axis=1)
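When you evaluate your final model, you'll need the test set organized the same way. A minimal sketch (assuming you keep exactly the columns retained in the training data) might look like:

## Separate y and X in the test set, keeping only the columns used for training
test_ames_y = test_ames['SalePrice']
test_ames_X = test_ames[train_ames_X.columns]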