Lab 4 - Machine Learning Workflow and $k$-nearest neighbors

This lab will cover essential concepts and steps in the machine learning workflow using $k$-nearest neighbors models and tools from the sklearn library.

In [1]:
## Standard Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

# sklearn's KNN functions may emit FutureWarnings, so we'll use the commands below to turn them off
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  

The lab's examples will use data scraped from the Johnson County assessor between 2005 and 2008. These data were originally used by a University of Iowa faculty member to evaluate whether newly listed homes had list prices below the values suggested by their attributes, thereby representing good deals.

In [2]:
ic = pd.read_csv("https://remiller1450.github.io/data/IowaCityHomeSales.csv")
ic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sale.amount   777 non-null    int64  
 1   sale.date     777 non-null    object 
 2   occupancy     777 non-null    object 
 3   style         777 non-null    object 
 4   built         777 non-null    int64  
 5   bedrooms      777 non-null    int64  
 6   bsmt          777 non-null    object 
 7   ac            777 non-null    object 
 8   attic         777 non-null    object 
 9   area.base     777 non-null    int64  
 10  area.add      777 non-null    int64  
 11  area.bsmt     777 non-null    int64  
 12  area.garage1  777 non-null    int64  
 13  area.garage2  777 non-null    int64  
 14  area.living   777 non-null    int64  
 15  area.lot      777 non-null    int64  
 16  lon           777 non-null    float64
 17  lat           777 non-null    float64
 18  assessed      777 non-null    int64  
dtypes: float64(2), int64(11), object(6)
memory usage: 115.5+ KB

Training vs. Testing

In order to prevent information leakage, it is important to split off a subset of data to be used for validation as soon as possible. All data exploration (visualization, descriptive stats, etc.), preprocessing (rescaling, transformation, dimension reduction, etc.), and modeling (model choice, hyperparameter tuning, etc.) should be done using only the training data.

To create separate training and testing data sets, we'll use the train_test_split() function from the model_selection module of sklearn. The code below reserves a randomly chosen subset containing 20% of the available data for validation.

In [3]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(ic, test_size=0.2, random_state=7)
print(train.shape)
print(test.shape)
(621, 19)
(156, 19)

The argument random_state=7 sets a randomization seed that ensures the same split will occur each time this code is run, thereby allowing all of us to work with the same validation data set. The number 7 is completely arbitrary.

As a final step we can separate the predictors from the outcome sale.amount. In this lab we'll focus exclusively on numeric predictors, and the next lab will cover strategies for including categorical predictors in methods like $k$-nearest neighbors.
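If you're unsure where to start, below is a minimal sketch of one common pattern (shown for the training set only, with placeholder names rather than the names requested in Question 1). It uses pandas' select_dtypes() method to isolate the numeric columns:

## Sketch: keep only numeric columns, then split off the outcome
numeric_train = train.select_dtypes(include='number')
example_X = numeric_train.drop(columns=['sale.amount'])  ## numeric predictors only
example_y = numeric_train['sale.amount']                 ## outcome only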

Question 1: Create objects train_X, train_y, test_X, and test_y such that train_y and test_y contain the outcome variable 'sale.amount' and train_X and test_X contain all numeric predictors from the training and testing data (respectively).

Data Preparation

Now that we've got a separate validation set, we're free to explore the training data and make any modeling choices that we deem appropriate. A good first step is to perform a quick assessment of the numeric predictors using data visualizations and descriptive statistics. Some code to do this is given below:

In [4]:
## Descriptive stats
print(train.describe())

## Distributions
train.hist()
plt.show()
         sale.amount        built    bedrooms    area.base     area.add  \
count     621.000000   621.000000  621.000000   621.000000   621.000000   
mean   180461.838969  1971.291465    3.033816   997.391304    76.402576   
std     90454.257944    32.012907    0.993763   359.994278   163.199173   
min     38250.000000  1873.000000    1.000000   240.000000     0.000000   
25%    130000.000000  1957.000000    2.000000   729.000000     0.000000   
50%    157000.000000  1979.000000    3.000000   947.000000     0.000000   
75%    207500.000000  1998.000000    4.000000  1200.000000    32.000000   
max    815000.000000  2007.000000    7.000000  3440.000000  1192.000000   

         area.bsmt  area.garage1  area.garage2  area.living       area.lot  \
count   621.000000    621.000000    621.000000   621.000000     621.000000   
mean    348.731079    210.342995     79.138486  1349.797101    8937.497585   
std     392.638897    251.577814    162.963058   508.534575    8966.536367   
min       0.000000      0.000000      0.000000   312.000000     137.000000   
25%       0.000000      0.000000      0.000000  1008.000000    5398.000000   
50%     250.000000      0.000000      0.000000  1232.000000    7500.000000   
75%     600.000000    440.000000      0.000000  1526.000000    9902.000000   
max    2500.000000   1065.000000    780.000000  4988.000000  158123.000000   

              lon         lat       assessed  
count  621.000000  621.000000     621.000000  
mean   -91.522192   41.652605  174009.645733  
std      0.033560    0.011406   84160.526541  
min    -91.604721   41.628040   38590.000000  
25%    -91.547302   41.645714  125970.000000  
50%    -91.515492   41.652497  154710.000000  
75%    -91.495742   41.658984  198450.000000  
max    -91.463069   41.690921  778000.000000  

Transformations

Nothing looks too out of the ordinary, but we might notice that some predictors, notably 'area.lot', are extremely right-skewed with some large outliers. We might consider applying a normalizing transformation to this predictor:

In [5]:
## Import PowerTransformer
from sklearn.preprocessing import PowerTransformer

## Box-Cox transformation
bc_trans = PowerTransformer(method = 'box-cox').fit(train[['area.lot']])
bc_trans_arealot = bc_trans.transform(train[['area.lot']])

## Histogram of area.lot after transformation
pd.DataFrame(bc_trans_arealot).hist()
plt.show()

This code demonstrates a Box-Cox transformation using PowerTransformer(). You should note that the example uses the fit() method to fit the transformation to the provided data, which amounts to determining and storing the parameters of the transformation. Next, the transform() method is used to apply these parameters to actually transform the input data.
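For reference, the Box-Cox transformation maps each strictly positive value $x$ to

$$x^{(\lambda)} = \begin{cases} \dfrac{x^{\lambda} - 1}{\lambda} & \text{if } \lambda \neq 0 \\ \ln(x) & \text{if } \lambda = 0 \end{cases}$$

where the parameter $\lambda$ is estimated from the provided data during the fit() step.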

If the histogram shown above were all that we desired, we could have used the fit_transform() method. However, when applying our final machine learning approach to new data, we'll want to fit the transformation on the training data and then apply it to new data.
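For example, the transformation we fit on the training data can be applied directly to the test homes without re-fitting:

## Apply the training-fitted Box-Cox transformation to the test data
## (the stored lambda is reused; nothing is re-estimated from the test set)
bc_test_arealot = bc_trans.transform(test[['area.lot']])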

Question 2: Consider the use of a Box-Cox transformation as part of a machine learning approach. Identify and briefly explain the problem that arises in each of the following scenarios:

  • Part A: We apply the transformation using fit_transform() to the entire data set before performing the training and testing split. Hint: Think about information leakage.
  • Part B: We apply the transformation to the training data using fit_transform() during our model training/selection process, and our final model applies fit_transform() separately to only the new data we seek to make predictions on. Hint: Think about what might happen if we want to predict on a single new data-point, such as a new listing that just came on the market.

Scaling

Based upon our initial data exploration, you should also have noticed that our predictors exist on very different scales. Thus, we must rescale them before they are used in $k$-nearest neighbors to ensure that variables measured on larger scales do not dominate the distance calculations.
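For reference, standardization converts each value $x$ of a predictor into a $z$-score using the predictor's mean $\bar{x}$ and standard deviation $s$, while min-max scaling maps each value onto the interval $[0, 1]$:

$$z = \frac{x - \bar{x}}{s} \qquad\qquad x_{scaled} = \frac{x - x_{min}}{x_{max} - x_{min}}$$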

Each of the rescaling approaches discussed in our lecture slides is briefly demonstrated below on a few of the available predictors:

In [6]:
## A few predictors to show
example_pred_vars = ['area.lot', 'area.living', 'area.garage1']

## Standardization via StandardScaler
from sklearn.preprocessing import StandardScaler
ss_trans = StandardScaler().fit(train[example_pred_vars])
ss_trans_out = ss_trans.transform(train[example_pred_vars])
pd.DataFrame(ss_trans_out, columns = example_pred_vars).hist(layout = (1,3))
plt.show()

## Min-Max Scaling
from sklearn.preprocessing import MinMaxScaler
rs_trans = MinMaxScaler().fit(train[example_pred_vars])
rs_trans_out = rs_trans.transform(train[example_pred_vars])
pd.DataFrame(rs_trans_out, columns = example_pred_vars).hist(layout = (1,3))
plt.show()

## Two more to know about that aren't shown - robust scaling and max absolute scaling
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler

Pipelines

One of the most attractive features of sklearn is the pipeline: a sequence of data handling steps that can easily be applied to a variety of machine learning tasks such as training, validation, and hyperparameter tuning.

The example below demonstrates a simple pipeline that applies a Yeo-Johnson transformation followed by min-max rescaling:

In [7]:
from sklearn.pipeline import Pipeline 

## Set up the steps of the pipeline
my_pipeline = Pipeline([
('transformer',  PowerTransformer(method = 'yeo-johnson')),
('scaler', MinMaxScaler())
])

## Fit the pipeline
fitted_pipe = my_pipeline.fit(train[example_pred_vars])

## Apply the fitted pipeline to visualize a transformation followed by min-max scaling
pd.DataFrame(fitted_pipe.transform(train[example_pred_vars]), columns = example_pred_vars).hist(layout = (1,3))
plt.show()

While the choices are not consequential in this simple example, notice that the pipeline's steps are given names ("transformer" and "scaler" in our case). These are names that we choose and can subsequently use to refer to specific steps within the pipeline.
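For example, an individual step can be retrieved by its name through the pipeline's named_steps attribute:

## Look up a step by the name we assigned it
print(my_pipeline.named_steps['scaler'])  ## prints: MinMaxScaler()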

Additionally, the intermediate steps of a pipeline must be "transforms", meaning they must have both a fit() and transform() method, while the final step of a pipeline can be a model training step. In the next section we'll add a $k$-nearest neighbors model to this pipeline.

$k$-nearest Neighbors

We're now ready to use a $k$-nearest neighbors model to predict the sale prices of homes. Because this is a regression task, we'll need to use KNeighborsRegressor(). You should note that KNeighborsClassifier() is used for classification tasks.

The code below initializes a KNeighborsRegressor using $k=10$ neighbors, uniform weighting, and Euclidean distance (Minkowski distance with $p=2$):

In [8]:
## Setup knn reg
from sklearn.neighbors import KNeighborsRegressor
knn_reg = KNeighborsRegressor(n_neighbors=10, weights='uniform', p=2)
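For reference, the Minkowski distance between two points $x$ and $y$ is

$$d(x, y) = \left( \sum_{j} |x_j - y_j|^p \right)^{1/p}$$

which reduces to the familiar Euclidean distance $\sqrt{\sum_j (x_j - y_j)^2}$ when $p = 2$.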

Like many of the other sklearn objects we've worked with, we can then fit this model to the training data, but this time we need to provide both the predictors and the target outcome:

In [9]:
## Fit the model
knn_reg.fit(X = train[example_pred_vars], y = train[['sale.amount']])

## Make predictions for the test data
knn_preds = knn_reg.predict(X = test[example_pred_vars])

In this example you'll notice that we did not need to store a separate fitted model; instead, the fit() method modifies our existing KNeighborsRegressor object. That said, it's not really an issue if you want to store the fitted model in a separate object, especially for the small models we'll be using early in this course.
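Because fit() returns the estimator itself, the two styles are interchangeable; for instance, the assignment below stores a reference to the same fitted object:

## fit() returns the (now fitted) estimator, so this stores a reference
## to the same object rather than a copy
stored_model = knn_reg.fit(X = train[example_pred_vars], y = train[['sale.amount']])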

Next, we can evaluate the performance of this model by looking at the root mean squared error (RMSE) on the test data.
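For the $n$ test observations with observed sale prices $y_i$ and predicted prices $\hat{y}_i$:

$$RMSE = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2}$$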

In [10]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_true = test[['sale.amount']], y_pred = knn_preds))
Out[10]:
54017.72706078141

We see that on average this model's predictions are off by around $54,000. Whether this is "good" or "bad" will depend upon the application. However, we can gain some additional perspective by looking at the amount of error present when we predict every new data-point as the average sale amount observed in the training data. This error rate provides a baseline level of performance that can be achieved by a model that did not learn anything meaningful.

In [11]:
## Baseline error
training_avg = np.average(train[['sale.amount']])  ## Average sale amount in training data

## RMSE using the average sale amount
np.sqrt(mean_squared_error(y_true = test[['sale.amount']], y_pred = np.repeat(training_avg, test.shape[0])))
Out[11]:
91453.58553396251

From this we can conclude that our model did appear to learn something, but being off by an average of approximately $54,000 might still reflect an unacceptable amount of error.

Finally, you should recognize that we can perform the same basic steps for a classification task. A simple example is shown below; the major difference is the use of classification accuracy to evaluate the model.

In [12]:
## Set up a classifier
from sklearn.neighbors import KNeighborsClassifier
knn_class = KNeighborsClassifier(n_neighbors=10, weights='uniform', p=2)

## Create a binary outcome - was the assessed value higher than the sale price?
train_y_binary = (train['assessed'] > train['sale.amount']).astype(int)
knn_class.fit(X = train[example_pred_vars], y = train_y_binary)

## Make predictions
knn_class_preds = knn_class.predict(X = test[example_pred_vars])

## Accuracy score
from sklearn.metrics import accuracy_score
test_y_binary = (test['assessed'] > test['sale.amount']).astype(int)
accuracy_score(test_y_binary, knn_class_preds)
Out[12]:
0.6410256410256411

We can go on to find a "baseline" level of accuracy by predicting each new sample as the most common category of the outcome in the training data:

In [13]:
## Baseline accuracy
most_common_y = np.argmax(np.bincount(train_y_binary))
accuracy_score(test_y_binary, np.repeat(most_common_y, test.shape[0]))

Should this baseline accuracy exceed the 64.1% achieved above, we would conclude that the KNN model performs worse than always guessing "under-assessed" (or the integer label 0) for every home in the validation data.

Question 3:

  • Part A: Considering the outcome 'sale.amount' and all numeric predictors available in the Iowa City home sales data, create a machine learning pipeline using a Yeo-Johnson normalizing transformation followed by Min-Max scaling and a KNeighborsRegressor as the final step. Use inverse-distance weighting and Euclidean distance.
  • Part B: Use looping to fit your pipeline for each of the following values of n_neighbors: [3,5,10,15]. Find the RMSE on the test set corresponding to each of these values of n_neighbors, and report the best performing choice. You may ignore any overflow warnings (they simply reflect precision difficulties due to our outcome values being large numbers). Hint: You can set a parameter of one of the pipeline's named steps using the syntax given below.
In [14]:
## Simple pipeline
my_pipeline = Pipeline([
('transformer',  PowerTransformer(method = 'yeo-johnson')),
('scaler', MinMaxScaler())
])

## Change the method argument of the 'transformer' step to the value 'box-cox'
my_pipeline.set_params(transformer__method = 'box-cox')
Out[14]:
Pipeline(steps=[('transformer', PowerTransformer(method='box-cox')),
                ('scaler', MinMaxScaler())])

Notice how a double underscore is used to access an argument within a named step of the pipeline.
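As a loose sketch of how set_params() can be combined with looping, consider the hypothetical pipeline below, whose final step is named 'knn' (the fitting and evaluation details are left for you to fill in):

## Hypothetical sketch for Question 3, Part B
hint_pipe = Pipeline([
('transformer', PowerTransformer(method = 'yeo-johnson')),
('scaler', MinMaxScaler()),
('knn', KNeighborsRegressor(weights = 'distance'))
])

for k in [3, 5, 10, 15]:
    hint_pipe.set_params(knn__n_neighbors = k)
    ## ... fit hint_pipe on the training data, then compute the test RMSE ...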

Application

To conclude this lab, you will apply the workflow from the example to a new dataset on your own. The data for this application come from the UC-Irvine machine learning repository; you can read more about these data at this link.

Each observation consists of the mean values for various cell characteristics of a patient. The goal is to predict the patient's diagnosis (recorded as "Label").

In [15]:
## Read data
wbc = pd.read_csv("https://remiller1450.github.io/data/wisc_bc.csv")
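Before answering, it may be helpful to inspect the columns and the class balance; a quick sketch is shown below (assuming the outcome column is named 'Label', as described above):

## Inspect column types and the balance of the diagnosis classes
wbc.info()
print(wbc['Label'].value_counts())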

Question 4:

  • Part A: Create a 70% training, 30% testing data split using random_state = 5.
  • Part B: Create histograms showing the distributions of the 10 numeric predictors, then decide upon whether a data preparation pipeline should include transformations, rescaling, or both.
  • Part C: Use the plotting.scatter_matrix() function in pandas (you may click here for documentation) to help you decide if dimension reduction via PCA is appropriate for these data.
  • Part D: Regardless of your decision in Part C, create a scree plot of the variance explained by various numbers of retained components when PCA is applied to the numeric predictors.
  • Part E: Set up a machine learning pipeline that includes the following steps:
    1. A normalizing transformation
    2. Rescaling
    3. Dimension reduction via PCA (selecting the number of components using the results found in Part D)
    4. A $k$-nearest neighbors model using $k=10$, Manhattan distance, and inverse-distance weighting
  • Part F: Find the accuracy of your approach on the test data set.
  • Part G: Compare the accuracy on the test set to the proportion of training data in the majority class. Briefly comment upon whether the model appears to have learned something meaningful.