import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import math

# The KNN functions used below emit "FutureWarning" messages. The filter
# installed here ignores the whole FutureWarning category, so it hides every
# warning of that category, not only the ones raised by the KNN functions.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


## The text file is tab-delimited (label, then message) and the column names
## are supplied here through `names`.
sms = pd.read_csv(
    "https://remiller1450.github.io/data/sms_spam.txt",
    sep="\t",
    names=["Label", "Message"],
)
sms.head(n=5)


from sklearn.model_selection import train_test_split

## Hold out 20% of the rows; the fixed random_state makes the split repeatable
train, test = train_test_split(sms, test_size=0.2, random_state=8)

## Split predictors from outcome: the raw message text, and a 0/1 indicator
## that is 1 when the label equals 'spam'
train_msg = train["Message"]
train_y = train["Label"].eq("spam").astype(int)


## Show 10 randomly selected spam messages
train_msg.loc[train_y == 1].sample(n=10)

4798    Santa calling! Would your little ones like a c...
5292    Urgent! Please call 09061213237 from landline....
3862    Free Msg: Ringtone!From: http://tms. widelive....
1022    Guess what! Somebody you know secretly fancies...
4162    Had your mobile 11 months or more? U R entitle...
3954    Refused a loan? Secured or Unsecured? Can't ge...
4676    Hi babe its Chloe, how r u? I was smashed on s...
3968    YOU HAVE WON! As a valued Vodafone customer ou...
4297    Please CALL 08712402578 immediately as there i...
3598    Congratulations YOU'VE Won. You're a Winner in...
Name: Message, dtype: object


## Function to measure the proportion of digit characters in a message
def get_num(text):
    """Return the fraction of characters in ``text`` that are digits.

    A character counts as a digit when ``str.isdigit`` is true for it.

    Parameters
    ----------
    text : str
        The message to measure.

    Returns
    -------
    float
        Number of digit characters divided by ``len(text)``. An empty string
        returns ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    # Guard the division: an empty message has no characters to count.
    if not text:
        return 0.0
    return sum(map(str.isdigit, text)) / len(text)

## Create a dictionary with one entry per feature and convert it to a Pandas
## DataFrame; the only feature so far is the digit proportion of each message
d = {"prop_num": train_msg.apply(get_num)}
train_X = pd.DataFrame(data=d)
train_X.head(n=6)


## Load the Iowa City home sales data
ic = pd.read_csv("https://remiller1450.github.io/data/IowaCityHomeSales.csv")

## One-hot encoding for style, bsmt and ac; the other categorical variables
## are dropped by the dtype selection below.
## The indicator dtype is pinned to uint8 because pandas 2.0 changed the
## get_dummies default to bool, and bool columns are not matched by
## select_dtypes("number"): without the explicit dtype every indicator column
## would be silently discarded on newer pandas.
from pandas import get_dummies
encode_list = ['style','bsmt','ac']
ic_ohe = get_dummies(ic, columns=encode_list, dtype=np.uint8)

## Drop non-numeric variables
ic_new = ic_ohe.select_dtypes("number")
print(ic_new.dtypes) # Check out the new columns

sale.amount                  int64
built                        int64
bedrooms                     int64
area.base                    int64
area.add                     int64
area.bsmt                    int64
area.garage1                 int64
area.garage2                 int64
area.living                  int64
area.lot                     int64
lon                        float64
lat                        float64
assessed                     int64
style_1 1/2 Story Frame      uint8
style_1 Story Brick          uint8
style_1 Story Condo          uint8
style_1 Story Frame          uint8
style_2 Story Brick          uint8
style_2 Story Condo          uint8
style_2 Story Frame          uint8
style_Split Foyer Frame      uint8
style_Split Level Frame      uint8
bsmt_1/2                     uint8
bsmt_3/4                     uint8
bsmt_Crawl                   uint8
bsmt_Full                    uint8
bsmt_None                    uint8
ac_No                        uint8
ac_Yes                       uint8
dtype: object


## Reference coding: drop_first=True omits the indicator for the first level
## of each encoded variable.
## The indicator dtype is pinned to uint8 because pandas 2.0 changed the
## get_dummies default to bool, which select_dtypes("number") does not match;
## without it the indicator columns would be dropped on newer pandas.
ic_ohe = get_dummies(ic, columns=encode_list, drop_first=True, dtype=np.uint8)
ic_new = ic_ohe.select_dtypes("number")
print(ic_new.dtypes) # Check out the new columns

sale.amount                  int64
built                        int64
bedrooms                     int64
area.base                    int64
area.add                     int64
area.bsmt                    int64
area.garage1                 int64
area.garage2                 int64
area.living                  int64
area.lot                     int64
lon                        float64
lat                        float64
assessed                     int64
style_1 Story Brick          uint8
style_1 Story Condo          uint8
style_1 Story Frame          uint8
style_2 Story Brick          uint8
style_2 Story Condo          uint8
style_2 Story Frame          uint8
style_Split Foyer Frame      uint8
style_Split Level Frame      uint8
bsmt_3/4                     uint8
bsmt_Crawl                   uint8
bsmt_Full                    uint8
bsmt_None                    uint8
ac_Yes                       uint8
dtype: object


## Define "first_word" function
def first_word(text):
    """Return the lower-cased text before the first space, with '!' removed.

    Only the space character acts as a separator. When ``text`` contains no
    space, the whole string is used; when it starts with a space, the result
    is the empty string.
    """
    head, _, _ = text.partition(' ')
    lowered = head.lower()
    return lowered.replace('!', '')

## Rebuild train_X so it holds both features: the digit proportion and the
## first word of each message
d = {
    "prop_num": train_msg.apply(get_num),
    "first_word": train_msg.apply(first_word),
}
train_X = pd.DataFrame(data=d)
train_X.head(n=6)


## Apply one-hot encoding to the 'first_word' column, then check the
## dimensions of the encoded frame
train_X_ohe = get_dummies(data=train_X, columns=["first_word"])
train_X_ohe.shape

(4457, 1174)


## Keep indicators for the words 'urgent' and 'free'
train_X2 = train_X_ohe[['prop_num','first_word_urgent','first_word_free']]

## Print the frequency of "1" for each of our new variables.
## The counts are reported in a dict keyed by column name rather than a set
## literal: a set is unordered and collapses two equal counts into a single
## element, so its printed numbers cannot be matched to their columns.
print({col: int(train_X2[col].sum())
       for col in ('first_word_urgent', 'first_word_free')})

{43, 28}


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier

## Separate pipelines for numeric vs. categorical variables.
## The encoder is left at its default output format: its `sparse=False`
## argument was renamed `sparse_output` in scikit-learn 1.2 and removed in
## 1.4, so neither spelling is accepted by every version. A dense matrix is
## requested from the ColumnTransformer below instead.
num_transformer = Pipeline([("scaler", StandardScaler())])
cat_transformer = Pipeline([("encoder", OneHotEncoder(handle_unknown='ignore'))])

## Get names of numeric and categorical columns
num_cols = train_X.select_dtypes(exclude=['object']).columns.tolist()
cat_cols = train_X.select_dtypes(include=['object']).columns.tolist()

## Preprocessing transformer allowing different actions for numeric and
## categorical variables; sparse_threshold=0 makes the combined output dense
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
], sparse_threshold=0)

## Combine the preprocessor and model into a final pipeline
final_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", KNeighborsClassifier())
])

## Fit on the training data and print the score on that same training data
fitted = final_pipe.fit(train_X, train_y)
print(fitted.score(train_X, train_y))

0.971729863136639


## Read a data set that contains missing values
hp = pd.read_csv(
    "https://remiller1450.github.io/data/HappyPlanet.csv"
)

## Count the missing values in each column
hp.isna().sum(axis=0)

Country           0
Region            0
Happiness         0
LifeExpectancy    0
Footprint         0
HLY               0
HPI               0
HPIRank           0
GDPperCapita      2
HDI               2
Population        0
dtype: int64


## Predictors: three columns of the Happy Planet data
hp_X = hp[["Region", "LifeExpectancy", "HDI"]]

## Outcome: the Happiness column
hp_y = hp["Happiness"]


## Import imputation functions
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.neighbors import KNeighborsRegressor

## Pipeline 1: mean imputation followed by a KNN regressor
simp_imp_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
    ("model", KNeighborsRegressor()),
])

## Fit, then print the score on the data used for fitting
simp_fit = simp_imp_pipe.fit(hp_X, hp_y)
print(simp_fit.score(hp_X, hp_y))

## Pipeline 2: distance-weighted KNN imputation (4 neighbors) followed by the
## same kind of KNN regressor
knn_imp_pipe = Pipeline(steps=[
    ("imputer", KNNImputer(missing_values=np.nan, n_neighbors=4, weights="distance")),
    ("model", KNeighborsRegressor()),
])

## Fit, then print the score on the data used for fitting
kp_fit = knn_imp_pipe.fit(hp_X, hp_y)
print(kp_fit.score(hp_X, hp_y))

0.8130099967617596
0.8128904952418757


## Separate pipelines for numeric vs. categorical variables.
## Numeric columns are scaled first and imputed second.
num_transformer = Pipeline([("scaler", StandardScaler()),
                           ("imputer", KNNImputer(missing_values=np.nan, n_neighbors=4, weights='distance'))
                           ])
## The encoder is left at its default output format: its `sparse=False`
## argument was renamed `sparse_output` in scikit-learn 1.2 and removed in
## 1.4, so neither spelling is accepted by every version. A dense matrix is
## requested from the ColumnTransformer below instead.
cat_transformer = Pipeline([("encoder", OneHotEncoder(handle_unknown='ignore'))])

## Names of the numeric and categorical columns
num_cols = ['LifeExpectancy', 'HDI']
cat_cols = ['Region']

## Preprocessing transformer allowing different actions for numeric and
## categorical variables; sparse_threshold=0 makes the combined output dense
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
], sparse_threshold=0)

## Combine the preprocessor and model into a final pipeline
knn_final_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", KNeighborsRegressor())
])

## Fit new pipeline and print the score
kpn_fit = knn_final_pipe.fit(hp_X, hp_y)
print(kpn_fit.score(hp_X, hp_y))

0.8525635060743124


## Read a data set that contains missing values
c19 = pd.read_csv(
    "https://remiller1450.github.io/data/Colleges2019.csv"
)

## Discard the rows whose outcome (Salary10yr_median) is missing
c19 = c19.dropna(subset=["Salary10yr_median"])

## Separate predictors and outcome
c19_X = c19[["Cost", "Avg_Fac_Salary", "Region", "ACT_median"]]
c19_y = c19["Salary10yr_median"]

## Count the missing values that remain in each predictor
c19_X.isna().sum(axis=0)

Cost               39
Avg_Fac_Salary      6
Region              0
ACT_median        432
dtype: int64

	Label	Message
0	ham	Go until jurong point, crazy.. Available only ...
1	ham	Ok lar... Joking wif u oni...
2	spam	Free entry in 2 a wkly comp to win FA Cup fina...
3	ham	U dun say so early hor... U c already then say...
4	ham	Nah I don't think he goes to usf, he lives aro...

	prop_num
3922	0.017857
2559	0.000000
2672	0.000000
4282	0.006579
987	0.000000
3323	0.000000

	prop_num	first_word
3922	0.017857	do
2559	0.000000	some
2672	0.000000	that's
4282	0.006579	wn
987	0.000000	i'm
3323	0.000000	ok

Lab #3 (part 2) - Feature Engineering and Missing Data

Part 1 - Feature Engineering

Question #1

Part 2 - One Hot Encoding

Question #2

Part 3 - Pipelines and One-Hot-Encoding

Part 4 - Missing Data Imputation

Question #3 (pipelines and imputation)

Part 5 - Application/Challenge

Question #4 (challenge)