Lab 11 - Feature Engineering¶

This lab covers the topic of "feature engineering" which is a broad term referring to the creation or extraction of predictive features from raw data to be used when training a downstream model.

In [1]:
## Standard Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

# turn off future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  

This lab will use several different data sets for the purposes of providing a diverse range of examples. Each data set will be introduced in the part of the lab where it is used.

Part 1 - One-hot Encoding¶

Up until now we've been avoiding the use of categorical predictors in our models; however, categorical predictors can be included in any model using a strategy known as "one-hot encoding" (or "dummy variables" in the language of statistical modeling).

The example below provides a quick demonstration of one-hot encoding on the 'style' and 'ac' variables in the Iowa City home sales data:

In [2]:
## Load the Iowa City home sales data
ic = pd.read_csv("https://remiller1450.github.io/data/IowaCityHomeSales.csv")

## one hot encoding for style and ac, dropping other categorical vars
from pandas import get_dummies
encode_list = ['style','ac']
ic_ohe = get_dummies(ic[['sale.amount','assessed','style','ac']], columns=encode_list)
ic_ohe.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   sale.amount              777 non-null    int64
 1   assessed                 777 non-null    int64
 2   style_1 1/2 Story Frame  777 non-null    uint8
 3   style_1 Story Brick      777 non-null    uint8
 4   style_1 Story Condo      777 non-null    uint8
 5   style_1 Story Frame      777 non-null    uint8
 6   style_2 Story Brick      777 non-null    uint8
 7   style_2 Story Condo      777 non-null    uint8
 8   style_2 Story Frame      777 non-null    uint8
 9   style_Split Foyer Frame  777 non-null    uint8
 10  style_Split Level Frame  777 non-null    uint8
 11  ac_No                    777 non-null    uint8
 12  ac_Yes                   777 non-null    uint8
dtypes: int64(2), uint8(11)
memory usage: 20.6 KB

Notice how the numeric variables, 'sale.amount' and 'assessed', were unchanged, but the variables 'style' and 'ac' were expanded into a collection of binary indicator variables, one for each unique value.

For some models, it's problematic to use linearly dependent sets of predictors, so we might opt to use "reference coding" via the drop_first argument to make the first category of each of the encoded variables the reference category, thereby ensuring the new columns are not linearly dependent:

In [3]:
## Reference coding
ic_ohe = get_dummies(ic[['sale.amount','assessed','style','ac']], columns=encode_list, drop_first=True)
ic_ohe.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   sale.amount              777 non-null    int64
 1   assessed                 777 non-null    int64
 2   style_1 Story Brick      777 non-null    uint8
 3   style_1 Story Condo      777 non-null    uint8
 4   style_1 Story Frame      777 non-null    uint8
 5   style_2 Story Brick      777 non-null    uint8
 6   style_2 Story Condo      777 non-null    uint8
 7   style_2 Story Frame      777 non-null    uint8
 8   style_Split Foyer Frame  777 non-null    uint8
 9   style_Split Level Frame  777 non-null    uint8
 10  ac_Yes                   777 non-null    uint8
dtypes: int64(2), uint8(9)
memory usage: 19.1 KB

Note that the style '1 1/2 Story Frame' and the value 'No' for the variable 'ac' are now encoded as reference categories, in the sense that a home in these categories corresponds to all of the associated dummy variables being zero.
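As a quick check (this snippet is illustrative and not part of the original example), homes whose style is '1 1/2 Story Frame' should now have zeros in every remaining style dummy column:

## Rows whose original style is the reference category
ref_rows = ic_ohe[ic['style'] == '1 1/2 Story Frame']

## All of the remaining style dummies should be zero for these homes
ref_rows.filter(like='style_').head(3)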

Using one-hot encoding in a data processing pipeline is somewhat complicated, as it requires us to define different actions for numeric and categorical predictors:

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

## Set up training data for this example
train_ic, test_ic = train_test_split(ic, test_size=0.2, random_state=7)
train_ic_y = train_ic['sale.amount']
train_ic_X = train_ic.drop('sale.amount', axis = 1)

## Separate pipelines for numeric vs. categorical vars
num_transformer = Pipeline([("scaler", StandardScaler())])
cat_transformer = Pipeline([("encoder", OneHotEncoder(sparse=False, handle_unknown='ignore'))])

## Get names of numeric and categorical columns
num_col_names = train_ic_X.select_dtypes(exclude=['object']).columns.tolist()
cat_col_names = train_ic_X.select_dtypes(include=['object']).columns.tolist()

## Preprocessing transformer allowing different actions for numeric and categorical vars
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_col_names),
    ('cat', cat_transformer, cat_col_names)
])

## Combine the preprocessor and model into a final pipeline
final_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", KNeighborsClassifier())
])

## We can fit this pipeline
fitted_pipe = final_pipe.fit(train_ic_X, train_ic_y)

A few things about this example that you should note:

  1. The default behavior of OneHotEncoder() is to return a sparse matrix, which is fine for some models but unnecessary for what we're doing.
  2. A major purpose of pipelines is to handle new data, and new data can contain categories that were not present in the training data. The argument handle_unknown='ignore' instructs the encoder to ignore unseen categories (encoding them as all zeros) rather than raising an error; a brief demonstration is given below.
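To see handle_unknown='ignore' in isolation, here is a minimal sketch (the category 'Window unit' is made up purely for illustration):

from sklearn.preprocessing import OneHotEncoder
import pandas as pd

## Fit an encoder that has only seen the categories 'No' and 'Yes'
enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
enc.fit(pd.DataFrame({'ac': ['No', 'Yes']}))

## An unseen category is encoded as a row of all zeros rather than raising an error
enc.transform(pd.DataFrame({'ac': ['Window unit']}))   # -> array([[0., 0.]])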

In the next section of the lab you'll need to use one-hot encoding to derive features from textual data. Importantly, you'll need to use one-hot encoding within a pipeline on this unit's homework.

Part 2 - Feature Engineering for Text Data¶

The data we'll use for the next portion of the lab come from the UC-Irvine Machine Learning Repository and contain 5574 SMS text messages that are labeled as either "spam" or "ham" (not spam).

In [5]:
## Note that the text file containing these data uses a tab delimiter to separate the label and message
sms = pd.read_csv("https://remiller1450.github.io/data/sms_spam.txt", sep='\t', names=['Label','Message'])
sms.head(5)
Out[5]:
Label Message
0 ham Go until jurong point, crazy.. Available only ...
1 ham Ok lar... Joking wif u oni...
2 spam Free entry in 2 a wkly comp to win FA Cup fina...
3 ham U dun say so early hor... U c already then say...
4 ham Nah I don't think he goes to usf, he lives aro...

These data have a clear outcome, Label, but the predictive features must be derived from the message.

You might immediately want to jump to tokenizing each unique word in a message into its own predictive feature, but this brute-force approach can often be beaten by domain knowledge and careful feature engineering. Tokenization creates a high-dimensional feature space in which many of the features are useless for prediction, so unless you have a massive amount of data, overfitting is a major problem with such an approach.
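To get a rough sense of the scale of the problem, here is a quick illustrative check (not part of the lab's modeling pipeline) of how many columns a naive word tokenization of these messages would create; the exact count depends on the tokenizer, but it is typically several thousand:

from sklearn.feature_extraction.text import CountVectorizer

## One column per unique token in the corpus
vec = CountVectorizer()
word_counts = vec.fit_transform(sms['Message'])
print(word_counts.shape)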

To see what domain knowledge can offer, let's start by creating a training set and looking at a few messages that are labeled "spam":

In [6]:
## Train/test split
train_sms, test_sms = train_test_split(sms, test_size=0.2, random_state=8)

## Separate outcome from message
train_sms_y = (train_sms['Label'] == 'spam').astype(int)
train_sms_X = train_sms['Message']

## Print some examples of 'spam'
train_sms_X[train_sms_y ==1].sample(10)
Out[6]:
3620    8007 25p 4 Alfie Moon's Children in Need song ...
4258    important information 4 orange user . today is...
1307    Enjoy the jamster videosound gold club with yo...
635     Dear Voucher Holder, 2 claim this weeks offer,...
5370    dating:i have had two of these. Only started a...
576     You have won ?1,000 cash or a ?2,000 prize! To...
709     To review and KEEP the fantastic Nokia N-Gage ...
1196    You have 1 new voicemail. Please call 08719181503
1518    Our brand new mobile music service is now live...
4967    URGENT! We are trying to contact U. Todays dra...
Name: Message, dtype: object

Inspecting these spam messages provides several ideas for feature engineering. For example, we can notice that spam messages tend to contain a lot of numbers and may also contain an inordinate number of exclamation marks.

Based upon these observations we might engineer two features: one measuring the proportion of characters in a message that are numeric, and another counting the number of exclamation marks in the message.

In [7]:
## Function to measure percentage of numbers
def get_num(text):
    return sum(map(str.isdigit, text))/len(text)

## Function to count exclamation marks
def count_exclamation_points(text):
    return text.count('!')

## Create a dictionary and convert to a Pandas dataframe
d = {'prop_num': train_sms_X.apply(get_num), 'num_exclam': train_sms_X.apply(count_exclamation_points)}
train_X = pd.DataFrame(d)
train_X.head(6)
Out[7]:
prop_num num_exclam
3922 0.017857 1
2559 0.000000 0
2672 0.000000 0
4282 0.006579 0
987 0.000000 0
3323 0.000000 1

We can see how these features might be useful in classifying each message:

In [8]:
## Scatter plot colored by label
plt.figure().set_figheight(3)
plt.scatter(x = train_X['prop_num'], y = train_X['num_exclam'], c = train_sms_y)
plt.show()

It's important to recognize that these are just two simple examples of features you might engineer from these messages. There are countless possibilities that could be even better predictors of "spam".

Additionally, it's beyond the scope of this course to thoroughly cover the topic of string processing in Python, but I encourage you to consult this reference for guidance and examples when trying to write your own functions to engineer features from text.
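As a small illustration of the kind of helper you might write (the function name and regular expression below are just examples, not something required by the lab), the re library makes it easy to count pattern matches within a message:

import re

## Example helper: count currency symbols appearing in a message
def count_currency_symbols(text):
    return len(re.findall(r'[$£€]', text))

count_currency_symbols('WINNER!! You have won £1000 cash!')   # returns 1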

Question 1: In this question you'll continue working with the SMS spam data. You are encouraged to use this as a starting point on this unit's homework assignment.

  • Part A: Import the "regular expression operations" library using import re, then use the findall() function with the regular expression r'[A-Z]' to create a function that computes the percentage of a message's characters that are capital letters.
  • Part B: The function first_word() that is defined below will return the first word in a message. It works by splitting the string at the character ' ' (space), keeping only the first word (position 0), converting this word to lower case, and then removing any exclamation points by replacing them with '' (an empty string). Use this function and the function you created in Part A to assemble a data frame containing two engineered features named "prop_cap" and "first_word".
  • Part C: Determine two words that appear in a large share of "spam" messages but do not appear in a similarly large share of "ham" messages. Hint: Use the value_counts() method on separate segments of the training data. You might also want to use the head() method to avoid printing a large table of counts.
  • Part D: Apply one-hot encoding to the "first_word" column in the data frame from Part B, then keep only the columns that encode the presence of the two words you identified in Part C.
  • Part E: Train a classification model of your choosing on a data frame containing 3 predictive features: the percentage of capital letters (Part A) and the two one-hot encoded first-word columns (Part D). Use a cross-validated grid search to find the optimal values of any hyperparameters. Report the best model and its cross-validated F1-score.
  • Part F: Provide a short explanation for why the F1-score is a good way to evaluate model performance in this application.
In [9]:
## Define "first_word" function
def first_word(text):
    return text.split(sep=' ')[0].lower().replace('!','')

Part 3 - Feature Engineering for Time Series Classification¶

The data below were collected using a chest-mounted accelerometer in a research study on activity recognition. The original study collected data on 15 participants performing 7 activities, but for demonstration purposes we will only use data for 2 study participants and 2 activities: "working at a computer", or 'Work', and "talking while standing", or 'Talk'.

These data are uncalibrated acceleration measurements recorded at a rate of 52 Hz (so 52 measurements are recorded each second). In our analysis we will look to classify 10-second samples of data, or collections of 520 inputs.

We will use the data from one participant (#1) for training, and we will evaluate the generalizability of our models using data from the other participant (#7) as a test set.

In [10]:
## Dataset
act_ts = pd.read_csv("https://remiller1450.github.io/data/activity.csv")

## Training Data
train_X = act_ts[act_ts['Subject'] == 1].drop(['Label'], axis = 1)
train_y = act_ts[act_ts['Subject'] == 1]['Label']

## Validation Data
test_X = act_ts[act_ts['Subject'] == 7].drop(['Label'], axis = 1)
test_y = act_ts[act_ts['Subject'] == 7]['Label']
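As a quick sanity check of the structure described above (each 10-second sample should contain roughly 520 rows at 52 Hz), we might tally the number of rows in each sample of the training data; this snippet is illustrative and not part of the original example:

## Number of accelerometer readings in each sample
train_X.groupby('Sample_Num').size().describe()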

Successful feature engineering begins with a thoughtful examination of the training data in order to find predictive patterns. It is essential that this exploration only involves the training data to avoid data leakage.

Below we'll create a few simple graphs of the three available predictors over time, colored by the label:

In [11]:
my_vars = ['X_accel', 'Y_accel', 'Z_accel']
for cur_var in my_vars:
    plt.figure().set_figheight(2)
    plt.scatter(x = range(len(train_X[cur_var])), y=train_X[cur_var], c = train_y.map({'Work': 1, 'Talk': 0}))
    plt.title(cur_var)
    plt.show()

There are many feature engineering ideas we can glean from this exploration; two that stand out are:

  1. In general, working tends to show a lower amount of variability in the accelerometer's values across all axes.
  2. The accelerometer tends to have higher values for working in the X and Z axes.

To translate these ideas into usable features we can use grouped summarization:

In [12]:
train_X_FE = pd.concat([train_X.groupby('Sample_Num')[my_vars].mean(),
                        train_X.groupby('Sample_Num')[my_vars].std()], axis=1)
train_X_FE.columns = ['Mean_X', 'Mean_Y', 'Mean_Z', 'SD_X', 'SD_Y', 'SD_Z']
train_X_FE.head(5)
Out[12]:
Mean_X Mean_Y Mean_Z SD_X SD_Y SD_Z
Sample_Num
1 1897.030769 2292.657692 2064.600000 170.098461 143.059786 125.358252
2 1957.184615 2379.226923 2108.601923 11.067517 7.863383 16.358731
3 1962.921154 2378.396154 2112.584615 5.970345 3.332676 5.728319
4 1969.034615 2375.501923 2120.165385 14.418689 7.394212 15.416865
5 1976.882692 2373.157692 2130.376923 7.419165 6.416962 7.635497

We can confirm that these simple features are incredibly predictive, at least on the training data:

In [13]:
train_y_FE = act_ts[act_ts['Subject'] == 1].groupby('Sample_Num')['Label'].first()
plt.scatter(train_X_FE['Mean_X'], train_X_FE['SD_X'], c = train_y_FE.map({'Work': 1, 'Talk': 0}))
plt.show()

Question 2: For this question you should continue using the data introduced in this section's example.

  • Part A: Train a support vector machine classifier using the 6 engineered features described in the example given above. Choose an appropriate kernel based upon the relationships you can see visually in the data, and verify that your choice is also justified by cross-validation. Report the cross-validated classification accuracy of this approach.
  • Part B: Add a standardization step to the approach you explored in Part A and adjust any tuning parameters as necessary. Does this change make any difference in the classification accuracy achieved on the test data? Briefly explain what you observe.

Question 3: The data below are a different portion of the activity recognition study described at the beginning of this section, covering 3 subjects and two different activities: "Walking", or 'Walking', and "Walking and Talking to Someone", or 'Walking_Talking'.

  • Part A: Split the data so that subjects #5 and #7 are used as training data and subject #1 as test data.
  • Part B: Create at least one data visualization and use it to engineer at least two features derived from the variables X_accel, Y_accel, and Z_accel. Your answer to this question should include your visualization and code that creates a version of your data with 1 row per sample with your engineered features as columns (similar to the format of the example given in this part of the lab).
  • Part C: Create a pipeline involving at least one preprocessing step (i.e., standardization/re-scaling, normalizing transformation, dimension reduction, etc.) followed by a classifier step. Use a cross-validated grid search to tune at least one hyperparameter involved in this pipeline.
  • Part D: Evaluate your best performing method on the test set and briefly comment upon how successful you were.

Hint: Classifying these activities is significantly more difficult than the earlier example distinguishing working from talking. The purpose of this exercise is for you to explore the creation of more sophisticated features (beyond simple means/standard deviations). Some possible ideas for features are rolling averages, ranges, rates of change, etc.; a minimal sketch of one such feature appears after the data-loading cell below. You should not be discouraged if you aren't able to achieve a very high degree of classification accuracy.

In [14]:
## Question 3 Dataset
act_ts_v2 = pd.read_csv("https://remiller1450.github.io/data/activity_v2.csv")
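As a starting point for the feature ideas mentioned in the hint above, here is a minimal, illustrative sketch (assuming act_ts_v2 has the same Subject, Sample_Num, and accelerometer columns as the earlier data set; the function name, the window of 26 observations, i.e. half a second at 52 Hz, and the use of X_accel are arbitrary choices, not a required solution). It summarizes each sample by the range of a rolling average of one axis:

## Illustrative sketch only: range of a rolling mean of one axis, computed per sample
def rolling_mean_range(df, col='X_accel', window=26):
    ## Smooth the signal within each sample using a rolling average
    smoothed = df.groupby('Sample_Num')[col].transform(lambda s: s.rolling(window, min_periods=1).mean())
    ## Summarize each sample by the range (max - min) of the smoothed signal
    return smoothed.groupby(df['Sample_Num']).agg(lambda s: s.max() - s.min())

## Example usage on one subject's data
rolling_mean_range(act_ts_v2[act_ts_v2['Subject'] == 5]).head()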

Part 4 - Handling Missing Data¶

While not directly related to the topic of feature engineering, it's worth knowing that pipelines provide an opportunity to handle missing data without introducing data leakage. This section of the lab introduces two different missing data imputation methods, simple imputation and nearest-neighbors imputation, but many other strategies exist.

The goal of imputation is to "fill in" the missing values in a data set so that all of its rows can be used during model training/evaluation; many machine learning algorithms in sklearn cannot handle data points with missing values.
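As a toy illustration of what mean imputation does (the array below is made up and not part of the Happy Planet data), each missing value is replaced by the mean of the observed values in its column:

import numpy as np
from sklearn.impute import SimpleImputer

toy = np.array([[1.0, 2.0],
                [np.nan, 4.0],
                [7.0, np.nan]])

## Column means of the observed values are 4.0 and 3.0
SimpleImputer(strategy='mean').fit_transform(toy)   # -> [[1., 2.], [4., 4.], [7., 3.]]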

To demonstrate imputation, we'll load the "Happy Planet" data set, which contains various measures for countries around the world. We're primarily interested in the fact that two countries have missing values for the variables GDPperCapita and HDI:

In [15]:
## Import data set w/ missing values
hp = pd.read_csv("https://remiller1450.github.io/data/HappyPlanet.csv")

## Tally number of missing values (by variable)
hp.isna().sum()
Out[15]:
Country           0
Region            0
Happiness         0
LifeExpectancy    0
Footprint         0
HLY               0
HPI               0
HPIRank           0
GDPperCapita      2
HDI               2
Population        0
dtype: int64

Now let's consider trying to predict the happiness score of a country using its region, life expectancy, and human development index (HDI).

In [16]:
## Outcome
hp_y = hp['Happiness']

## Predictors
hp_X = hp[['Region','LifeExpectancy', 'HDI']]

Below we create two different pipelines. The first imputes missing values using the mean of the variable containing the missing data; theoretically speaking, simple mean imputation is only a valid approach when missing values are "missing completely at random", or when the missing values appear entirely at random in the data with no relation to the values of other observed variables or the identity of the involved rows. The second imputes values using $k$-nearest neighbors, an approach that is valid when missing values are "missing at random", or when their occurrence can be explained using other variables observed in the data.

  • Under the "missing completely at random" (MCAR) definition, the United States and Afghanistan would be equally likely to have missing data (which probably isn't true).
  • Under the "missing at random" (MAR) definition, missing information is predictable using other observed variables, so the United States and Afghanistan can differ in their propensity for missing data so long as that difference can be explained by other observed variables.
  • There is another possibility, "missing not at random" (MNAR), where missing information is only predictable using unobserved factors. Under these circumstances imputing missing data isn't justifiable from a statistical perspective, and it might cause problems for the performance of machine learning models because the imputed values will be biased.

Determining whether your missing values are MCAR, MAR, or MNAR generally requires domain-specific knowledge about your data and how it was collected, thereby complicating the decision whether to include missing data imputation as part of a pre-processing pipeline.

In [17]:
## Import imputation functions
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.neighbors import KNeighborsRegressor

simp_imp_pipe = Pipeline([
    ("imputer", SimpleImputer(missing_values=np.nan, strategy='mean')),
    ("model", KNeighborsRegressor())
])

simp_fit = simp_imp_pipe.fit(hp_X, hp_y)
print(simp_fit.score(hp_X, hp_y))

knn_imp_pipe = Pipeline([
    ("imputer", KNNImputer(missing_values=np.nan, n_neighbors=4, weights='distance')),
    ("model", KNeighborsRegressor())
])

kp_fit = knn_imp_pipe.fit(hp_X, hp_y)

print(kp_fit.score(hp_X, hp_y))
0.8130099967617596
0.8128904952418757

Because there were only two missing values in this example, there's seemingly not much difference in the training scores ($R^2$) between these two strategies.

However, something to recognize is that we treated the "Region" variable as numeric, when it actually uses integer values to encode different categories. Additionally, we did not perform any standardization or scaling in our pipeline, which will impact KNNImputer().

Question 4: In this question you will improve upon the example pipeline from this section.

  • Part A: Modify the example pipeline using KNN imputation to include the following changes:
    • Handle categorical and numerical variables separately within the pipeline:
      • The categorical variable "Region" should be transformed using one-hot encoding
      • The numeric predictors "LifeExpectancy" and "HDI" should be standardized
    • Impute any missing values before using a support vector regressor as the final model
    • You may set any tuning parameters in these steps yourself so long as they do not lead to any computational issues
  • Part B: Fit your pipeline to the full training set and compare the method's score ($R^2$, the default metric) with the example. Does this suggest that your new approach is superior? Hint: Think about how both of these evaluations have used the training data.