import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import math


## Load data sets
clickbait = pd.read_csv("https://remiller1450.github.io/data/clickbait_headlines.csv", encoding='latin-1')
act_ts = pd.read_csv("https://remiller1450.github.io/data/activity.csv")


## Train-Test split
from sklearn.model_selection import train_test_split
train_clickbait, test_clickbait = train_test_split(clickbait, test_size=0.2, random_state=7)
train_clickbait_y = train_clickbait['Label']
train_clickbait_X = train_clickbait['Headline']
test_clickbait_y = test_clickbait['Label']
test_clickbait_X = test_clickbait['Headline']


## Examples of clickbait
train_clickbait_X[train_clickbait_y == 1].sample(15, random_state=9)

3327    What's Your Craziest Bachelorette Party Experi...
3623                                 Spike Surprises Fans
3436    15 Brilliant And Creative Ways To Use Spaghett...
4026    19 Things You'll Just Get If You're Obsessed W...
5605    How Well Do You Know These Random, 16th Centur...
3595    Kanye West Auditioned For "American Idol" This...
4936    This Korean Pop Star Can't Stop Falling On Sta...
5563           Who Said It: Gene Belcher Or Chris Griffin
3639      Are You More Bailee Madison Or Hailee Steinfeld
4095     For Anyone Who Had A Crush On Sheik From "Zelda"
4518          Send This To The Person Who Gives Bad Gifts
5871    This Canadian Woman Opened Up A Can Of Thanksg...
4513    21 Gifts To Give Yourself When Adulting Is Too...
5534    17 Pairs Of Actually Cute Sweatpants That'll K...
4654        12 Charts Only Sephora Lovers Will Understand
Name: Headline, dtype: object


## Examples of non-clickbait
train_clickbait_X[train_clickbait_y == 0].sample(15, random_state=9)

873     Both sides of Kenya's constitution dispute are...
2456        Italy Seizes Millions in Assets of Four Banks
2603    Truck carrying explosives crashes, explodes in...
1348    Caloundra win Sunshine Coast, Australia cricke...
2159         Southern Gaza hit by new Israeli air strikes
832               Posted deadlines for Christmas delivery
340     Rumsfeld memo recognizes need for 'major adjus...
220                              Horse flu damage spreads
269                     Officials Identify Alabama Gunman
2619         MS-13 gang threatens the Arizona "minutemen"
2710    18 illegal Afghan and Burmese immigrants kille...
1855    Financial Safety Net of Nonprofit Organization...
1317    United Nations requests US$700 million in aid ...
77      English "Lady in the Lake" killer found dead i...
2059               Tomb discovered in Valley of the Kings
Name: Headline, dtype: object


def first_word_q(input_text):
    first_word = input_text.str.split(' ').str[0]
    return (first_word == "What") | (first_word == "Which") | (first_word == "Would") | (first_word == "How") | (first_word == "Can") | (first_word == "Are")


## Proportion of clickbait starting w/ a question word
np.average(first_word_q(train_clickbait_X)[train_clickbait_y == 1])

0.15651085141903173


## Proportion of non-clickbait starting w/ a question word
np.average(first_word_q(train_clickbait_X)[train_clickbait_y == 0])

0.0024958402662229617


## Function to count appearances of "US" or "UK" or "EU" in a given string
def count_US(text):
    return text.count('US') + text.count('U.S.') + text.count('UK') + text.count('U.K.') +  text.count('EU') + text.count('E.U.') 

## Print how many of each in the training data
train_clickbait_X.apply(count_US).value_counts()

0    4562
1     235
2       3
Name: Headline, dtype: int64


## Training Data
train_act_X = act_ts[act_ts['Subject'] == 1].drop(['Label'], axis = 1)
train_act_y = act_ts[act_ts['Subject'] == 1]['Label']

## Validation Data
test_act_X = act_ts[act_ts['Subject'] == 7].drop(['Label'], axis = 1)
test_act_y = act_ts[act_ts['Subject'] == 7]['Label']


## Create plots of select variables over time and color by activity (label)
my_vars = ['X_accel', 'Y_accel', 'Z_accel']
for cur_var in my_vars:
    plt.figure().set_figheight(2)
    plt.scatter(x = range(len(train_act_X[cur_var])), y=train_act_X[cur_var], c = train_act_y.map({'Work': 1, 'Talk': 0}))
    plt.title(cur_var)
    plt.show()


## Get the mean and std dev of each selected variable for each sample
train_X_FE = pd.concat([train_act_X.groupby('Sample_Num')[my_vars].mean(),
                        train_act_X.groupby('Sample_Num')[my_vars].std()], axis=1)
train_X_FE.columns = ['Mean_X', 'Mean_Y', 'Mean_Z', 'SD_X', 'SD_Y', 'SD_Z']
train_X_FE.head(5)


## Plot two of the engineered features and color by the target variable (label)
train_y_FE = act_ts[act_ts['Subject'] == 1].groupby('Sample_Num')['Label'].first()
plt.scatter(train_X_FE['Mean_X'], train_X_FE['SD_X'], c = train_y_FE.map({'Work': 1, 'Talk': 0}))
plt.show()


## Some example data
wbc = pd.read_csv("https://remiller1450.github.io/data/wisc_bc.csv")
train, test = train_test_split(wbc, test_size=0.2, random_state=7)
train_wbc_y = train['Label'].map({'M': 1, 'B': 0})
train_wbc_y = test['Label'].map({'M': 1, 'B': 0})
train_wbc_X = train.drop(['ID','Label'], axis = 1)
train_wbc_X = test.drop(['ID','Label'], axis = 1)

from sklearn.ensemble import RandomForestClassifier
my_forest = RandomForestClassifier(max_depth=3, min_samples_split=10, 
                                   max_features=2, n_estimators=200, 
                                   random_state=0, oob_score=True)

fitted_forest = my_forest.fit(train_wbc_X, train_wbc_y)
print(fitted_forest.oob_score_) ## out-of-sample classification accuracy

0.9385964912280702


## Import each base model plus a few extras
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler

## Defining the individual models
model1 = Pipeline([('scaler', StandardScaler()),
                  ('model', SVC(kernel = 'poly', probability=True))])
model2 = DecisionTreeClassifier(max_depth=5)
model3 = Pipeline([('scaler', StandardScaler()),
                  ('model', KNeighborsClassifier())])
                  
## Create the ensemble
my_ensemble = VotingClassifier(estimators=[('svm', model1),('tree', model2),('knn', model3)], voting='soft')


## A few more imports
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import GridSearchCV

## Pipeline to compare models 
model_pipe = Pipeline([('model', SVC())])
candidate_models = {'model': [my_ensemble, model1, model2, model3]}

## Cross-validated F1 scores
grid = GridSearchCV(model_pipe, candidate_models, cv=5, scoring = 'f1').fit(train_wbc_X, train_wbc_y)
pd.DataFrame(grid.cv_results_).sort_values('mean_test_score', ascending=False)[['param_model', 'mean_test_score']]


## Some tuning parameters to search over
params = {'svm__model__kernel': ['poly','linear'], 
          'tree__max_depth': [4,5,6],
         'knn__model__n_neighbors': [5,10,15],
         'knn__model__weights': ['distance','uniform'],
         'voting': ['soft','hard']}

## Perform the grid search
grid = GridSearchCV(my_ensemble, param_grid=params, cv=5, scoring = 'f1').fit(train_wbc_X, train_wbc_y)
grid.best_estimator_

VotingClassifier(estimators=[('svm',
                              Pipeline(steps=[('scaler', StandardScaler()),
                                              ('model',
                                               SVC(kernel='linear',
                                                   probability=True))])),
                             ('tree', DecisionTreeClassifier(max_depth=4)),
                             ('knn',
                              Pipeline(steps=[('scaler', StandardScaler()),
                                              ('model',
                                               KNeighborsClassifier(n_neighbors=15,
                                                                    weights='distance'))]))],
                 voting='soft')

VotingClassifier(estimators=[('svm',
                              Pipeline(steps=[('scaler', StandardScaler()),
                                              ('model',
                                               SVC(kernel='linear',
                                                   probability=True))])),
                             ('tree', DecisionTreeClassifier(max_depth=4)),
                             ('knn',
                              Pipeline(steps=[('scaler', StandardScaler()),
                                              ('model',
                                               KNeighborsClassifier(n_neighbors=15,
                                                                    weights='distance'))]))],
                 voting='soft')

StandardScaler()

SVC(kernel='linear', probability=True)

DecisionTreeClassifier(max_depth=4)

StandardScaler()

KNeighborsClassifier(n_neighbors=15, weights='distance')

	Mean_X	Mean_Y	Mean_Z	SD_X	SD_Y	SD_Z
Sample_Num
1	1897.030769	2292.657692	2064.600000	170.098461	143.059786	125.358252
2	1957.184615	2379.226923	2108.601923	11.067517	7.863383	16.358731
3	1962.921154	2378.396154	2112.584615	5.970345	3.332676	5.728319
4	1969.034615	2375.501923	2120.165385	14.418689	7.394212	15.416865
5	1976.882692	2373.157692	2130.376923	7.419165	6.416962	7.635497

	param_model	mean_test_score
3	(StandardScaler(), KNeighborsClassifier())	0.923235
0	VotingClassifier(estimators=[('svm',\n ...	0.891429
2	DecisionTreeClassifier(max_depth=5)	0.855798
1	(StandardScaler(), SVC(kernel='poly', probabil...	0.837216

Lab 8 - Ensemble Models and Feature Engineering¶

Part 1 - Feature Engineering Textual Data¶

Part 2 - Feature Engineering Time-Series Data¶

Part 3 - Random Forest¶

Part 4 - Custom Ensembles¶