Lab #4 (part 1) - Linear Regression and Optimization¶

The focus of this lab will be on linear regression. Since the method and its sklearn implementation are fairly straightforward, we'll devote much of our attention to understanding the optimization of structural model parameters and the gradient descent algorithm. We will also compare the performance of linear regression and k-nearest neighbors (kNN) using pipelines.

As usual, we'll begin by loading several familiar libraries:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import math

Our examples will use the Iowa City Home Sales data (first introduced in Lab 2, part 1):

In [2]:
## Read data
ic = pd.read_csv("https://remiller1450.github.io/data/IowaCityHomeSales.csv")

## Split to training and testing sets
from sklearn.model_selection import train_test_split
train, test = train_test_split(ic, test_size=0.2, random_state=7)

## Create target
train_y = train['sale.amount']

## Create predictor matrix (numeric predictors only for simplicity, but we could use OHE if we wanted to)
train_X = train.select_dtypes("number").drop('sale.amount',axis=1)

Part 1 - Linear Regression¶

To stay consistent with the workflow we developed when using kNN, we'll begin by setting up a pipeline.

Standardization/scaling aren't nearly as important for regression as they are for some methods, but we'll still add a scaling step to our pipeline since we'll later compare linear regression with kNN (where scaling is very important).

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline 

## Simple pipeline
pipe = Pipeline([
('scaler', StandardScaler()),
('model', LinearRegression())
])

Next, let's fit a linear regression model to the entire training set to predict sale price. Then, let's print the estimated weights (coefficients):

In [4]:
## Fit the pipeline to the training data
lin_mod = pipe.fit(train_X, train_y)

## print the model's coefficients
lin_mod.named_steps['model'].coef_
Out[4]:
array([-1.25805804e+03,  1.92884548e+03,  6.72865255e+02, -1.92085650e+03,
       -7.95482914e+01, -3.12181937e+03,  1.97731725e+03,  8.24129242e+02,
        1.82409979e+03,  3.74282983e+02,  2.29862180e+00,  8.78235016e+04])

Because coef_ is an attribute of the object created by LinearRegression() (which was a single step in our pipeline), we must use the named_steps of our pipeline to access the regression coefficients (recognize that we gave the name 'model' to the pipeline step that uses LinearRegression()).
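
The same named_steps pattern works for any step in the pipeline, not just the model. For instance, a fitted StandardScaler stores the column means and standard deviations it used, so (as a quick aside, not part of the lab workflow) we could inspect them like this:

## Inspect the fitted scaler through named_steps (mean_ and scale_ are StandardScaler attributes)
print(lin_mod.named_steps['scaler'].mean_)    ## per-column means used for centering
print(lin_mod.named_steps['scaler'].scale_)   ## per-column standard deviations used for scaling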

Next, notice that 12 weight estimates are returned, one for each of the 12 columns in our predictor matrix, train_X. To find the model's bias term (intercept), we must ask for it separately:

In [5]:
## print the estimated intercept (bias)
lin_mod.named_steps['model'].intercept_
Out[5]:
180461.83896940417

In this simple example we did not use our pipeline to optimize any tuning parameters, but it's important to understand the syntax needed to access model coefficients (or other attributes) when we do.

For example, when GridSearchCV() is used for model tuning, we need to use the best_estimator_ attribute of the search results as the first step in obtaining the estimated weights:

In [6]:
## Set up a simple grid to compare two scaling methods
from sklearn.preprocessing import RobustScaler
parms = {'scaler': [StandardScaler(), RobustScaler()]}

## Perform grid search
from sklearn.model_selection import GridSearchCV
lin_mod2 = GridSearchCV(pipe, parms, cv=5, scoring='neg_root_mean_squared_error').fit(train_X, train_y)

## Print the coefficients of the top performing model
lin_mod2.best_estimator_.named_steps['model'].coef_
Out[6]:
array([-1.25805804e+03,  1.92884548e+03,  6.72865255e+02, -1.92085650e+03,
       -7.95482914e+01, -3.12181937e+03,  1.97731725e+03,  8.24129242e+02,
        1.82409979e+03,  3.74282983e+02,  2.29862180e+00,  8.78235016e+04])

It's also worth knowing that GridSearchCV (and other search functions that use cross-validation) will fit the best model to the entire training set after evaluation has been done via cross-validation (so these weights match the ones we saw earlier).
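
Because that refit has already happened, the search object itself can be used for prediction (it delegates to best_estimator_), and best_params_ shows which parameter combination won. A brief sketch (output not shown):

## Which parameter combination was selected by the grid search
print(lin_mod2.best_params_)

## Predictions from the refit best estimator (equivalent to lin_mod2.best_estimator_.predict)
print(lin_mod2.predict(train_X.head()))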

If we wanted to know the cross-validated root-mean squared error (RMSE) of our model, we could find it in the grid search results:

In [7]:
## Using the cv_results_ attribute of GridSearchCV
lin_mod2.cv_results_['mean_test_score'][0]
Out[7]:
-19833.449405441286

We could also find the same estimate using the cross_validate function:

In [8]:
## Estimate the cross-validated RMSE using cross_validate
from sklearn.model_selection import cross_validate
cv_res = cross_validate(lin_mod2, train_X, train_y, cv = 5, scoring='neg_root_mean_squared_error')
np.average(cv_res['test_score'])
Out[8]:
-19833.44940544128

Based upon these measures, we'd estimate this model's predictions will be off by an average of approximately $19,833 if it were applied to new data.
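
Keep in mind that 'neg_root_mean_squared_error' is reported as a negative number (so that larger values always mean better performance); flipping the sign recovers the RMSE on the original dollar scale. A small sketch using the cross_validate results from above:

## Flip the sign of the negated scores to get the RMSE in dollars
rmse = -np.average(cv_res['test_score'])
print(rmse)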

Question #1¶

  • Part A) Suppose we'd like to model a home's assessed value (rather than its sale price) using its attributes. Create the objects train_X_assessed and train_y_assessed where the latter is the home's assessed value and the former is a matrix containing the predictors: bedrooms, area.lot, and area.garage1.
  • Part B) Create a pipeline that applies the Yeo-Johnson normalizing transformation and min-max scaling prior to fitting a linear regression model that uses all of the features in train_X_assessed to predict the assessed value of a home.
  • Part C) Use 5-fold cross-validation to estimate the RMSE of the model produced by the pipeline you created in Part B.

Part 2 - Comparison with $k$-Nearest Neighbors¶

To compare different machine learning approaches, such as kNN and regression, we must use pipelines and cross-validation to ensure an equitable comparison:

In [9]:
## Set up simple pipeline that does scaling and modeling
from sklearn.neighbors import KNeighborsRegressor
pipe = Pipeline([
('scaler', StandardScaler()),
('model', KNeighborsRegressor())
])

## Set up parameter grid - notice we are considering two different models (and two different scalers)
parms = {'scaler': [StandardScaler(), RobustScaler()],
         'model': [KNeighborsRegressor(n_neighbors=15,weights='distance'), LinearRegression()],
        }

## Use cross-validated grid search and print the best model
mod_comp = GridSearchCV(pipe, parms, cv=5, scoring='neg_mean_squared_error').fit(train_X, train_y)
print(mod_comp.best_estimator_)
Pipeline(steps=[('scaler', StandardScaler()), ('model', LinearRegression())])

Since LinearRegression and KNeighborsRegressor have different tuning parameters, a fairer approach would first optimize the kNN tuning parameters and use the best combination in a subsequent comparison across model classes. For simplicity, our example used n_neighbors=15 and weights='distance' as if these were the optimal combination identified in an earlier grid search.
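
A sketch of that two-stage approach is shown below; the candidate values for n_neighbors and weights are illustrative choices, not values taken from an actual earlier search:

## Stage 1: tune the kNN hyperparameters on their own
knn_pipe = Pipeline([('scaler', StandardScaler()), ('model', KNeighborsRegressor())])
knn_parms = {'model__n_neighbors': [5, 10, 15, 25, 50],
             'model__weights': ['uniform', 'distance']}
knn_search = GridSearchCV(knn_pipe, knn_parms, cv=5,
                          scoring='neg_root_mean_squared_error').fit(train_X, train_y)

## Stage 2: compare the tuned kNN model against linear regression
comp_parms = {'model': [knn_search.best_estimator_.named_steps['model'], LinearRegression()]}
comp_search = GridSearchCV(knn_pipe, comp_parms, cv=5,
                           scoring='neg_root_mean_squared_error').fit(train_X, train_y)
print(comp_search.best_estimator_)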

Additionally, it's worth knowing that the default scoring measure for numeric outcomes is the coefficient of determination, $R^2$. If you do not specify the scoring argument in GridSearchCV, this default will be used to determine the best estimator.

In [10]:
## Default scorer:
mod_comp = GridSearchCV(pipe, parms, cv=5).fit(train_X, train_y)
mod_comp.best_score_
Out[10]:
0.9440104734478318

Question #2¶

  • Part A) Set up a parameter grid and use a grid search to tune the values of the parameters n_neighbors and weights in a $k$-nearest neighbors model that uses train_X_assessed to predict train_y_assessed (these were created in Question #1). Your search should consider at least 5 different choices of n_neighbors that are sensibly chosen. You should use 'neg_root_mean_squared_error' as your scoring criterion.
  • Part B) Use pipelines and 5-fold cross-validation to determine whether the top performing $k$-nearest neighbors model from Part A does better than linear regression. You should again use 'neg_root_mean_squared_error' as your scoring criterion.
  • Part C) Print the cross-validated RMSE of the top performing model found in Part B.

Part 3 - Optimization and Gradient Descent¶

Regression is our first example of a model with a structured set of unknown parameters that must be learned from the data.

These parameters are estimated such that they minimize a pre-determined cost function. For numeric outcomes, this will be squared error loss:

$$Cost = \tfrac{1}{n}\sum_{i=1}^{n}(y_i-\hat{y}_i)^2$$

Realize that we can write squared error loss in matrix form:

$$Cost = \tfrac{1}{n}(\mathbf{y} - \mathbf{X}\hat{\mathbf{w}})^T(\mathbf{y}-\mathbf{X}\hat{\mathbf{w}})$$

In this notation:

  • $\mathbf{y}$ is a vector of length $n$
  • $\mathbf{X}$ is an $n$ by $p$ matrix of predictors (including a column of 1s for the bias term; see the sketch below)
  • $\hat{\mathbf{w}}$ is a vector of length $p$ representing the estimated weights
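
If we wanted the bias term to be estimated as part of $\hat{\mathbf{w}}$, we would need to append that column of 1s to the predictor matrix ourselves. A minimal sketch is below; in the examples that follow we standardize the predictors and omit this column, so only the 12 weights are estimated:

## A design matrix with an explicit leading column of 1s for the bias term
X_with_bias = np.column_stack([np.ones(len(train_X)), train_X.to_numpy()])
print(X_with_bias.shape)   ## (n, 13): the bias column plus the 12 predictors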

Below is a Python function that calculates the cost associated with a given dataset and vector of weights:

In [11]:
## Define cost function
def cost_function(X, y, w):
    n = len(y)
    pred_y = np.dot(X,w)
    err = y - pred_y
    cost = (1./n)*np.dot(np.transpose(err),err)                 
    return cost               

Recall that the gradient is a vector of partial derivatives with respect to each unknown parameter of a function. In our context, these unknowns are the weight parameters in the cost function defined above.

Gradient descent is an optimization algorithm that uses the gradient to sequentially update a model's unknown parameters in a way that reduces the cost function:

$$\hat{\mathbf{w}}^{(j)} = \hat{\mathbf{w}}^{(j-1)} - \alpha \tfrac{\partial}{\partial \mathbf{w}}\text{Cost}\big(\hat{\mathbf{w}}^{(j-1)}\big)$$

These updates are repeated (ie: $j$ is incremented) until the cost function reaches a minimum (within an acceptable tolerance). However, for simplicity, we will implement gradient descent for a fixed number of iterations.

The gradient (in matrix form) for linear regression looks like:

$$\tfrac{-2}{n}\mathbf{X}^T(\mathbf{y} - \hat{\mathbf{y}})$$

This is found using linear algebra and matrix calculus:

$$Cost = \tfrac{1}{n}(\mathbf{y}^T\mathbf{y}-2\mathbf{y}^T\hat{\mathbf{y}} + \hat{\mathbf{y}}^T\hat{\mathbf{y}})$$

$$Cost = \tfrac{1}{n}\mathbf{y}^T\mathbf{y} + \tfrac{1}{n}(-2\mathbf{y}^T\mathbf{X}\hat{\mathbf{w}} + (\mathbf{X}\hat{\mathbf{w}})^T\mathbf{X}\hat{\mathbf{w}})$$

$$Cost = \tfrac{1}{n}\mathbf{y}^T\mathbf{y} + \tfrac{1}{n}(-2\hat{\mathbf{w}}^T\mathbf{X}^T\mathbf{y} + \hat{\mathbf{w}}^T\mathbf{X}^T\mathbf{X}\hat{\mathbf{w}})$$

The steps shown above simply rearrange the cost function. After we've reorganized it, we can differentiate it with respect to $\hat{\mathbf{w}}$, which yields:

$$Gradient = \tfrac{-2}{n}\mathbf{X}^T(\mathbf{y} - \mathbf{X}\hat{\mathbf{w}})$$

I encourage you to review this matrix calculus cheatsheet if these steps don't make sense.
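
If you'd like to double-check the matrix result numerically, one option is to compare it against a finite-difference approximation of the gradient. The sketch below is only a sanity check (it fabricates a tiny random dataset and reuses cost_function); it isn't part of the lab workflow:

## Numerically verify the gradient formula on a small simulated dataset
rng = np.random.default_rng(0)
X_demo = rng.normal(size=(20, 3))
y_demo = rng.normal(size=20)
w_demo = np.zeros(3)

## Analytic gradient: (-2/n) * X^T (y - X w)
analytic = (-2./len(y_demo))*np.dot(np.transpose(X_demo), y_demo - np.dot(X_demo, w_demo))

## Central finite-difference approximation, one coordinate at a time
eps = 1e-6
numeric = np.zeros(3)
for j in range(3):
    w_plus, w_minus = w_demo.copy(), w_demo.copy()
    w_plus[j] += eps
    w_minus[j] -= eps
    numeric[j] = (cost_function(X_demo, y_demo, w_plus) - cost_function(X_demo, y_demo, w_minus))/(2*eps)

print(np.round(analytic - numeric, 6))   ## differences should be near zero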

Below is a function that implements gradient descent (for a fixed number of update iterations) using this result:

In [12]:
def grad_descent(X, y, w, alpha, n_iter):
    costs = np.zeros(n_iter)
    n = len(y) 
    
    for i in range(n_iter):
        pred_y = np.dot(X,w)
        err = y - pred_y
        grad = (-2./n)*np.dot(np.transpose(X), err)
        w = w - alpha*grad
        costs[i] = cost_function(X, y, w)
        
    return w, costs

Let's confirm that the method works as intended (ie: it successfully estimates weights that minimize the cost function):

In [13]:
## Run gradient descent
train_Xs = StandardScaler().fit_transform(train_X)
gdres = grad_descent(X=train_Xs,y=train_y,w=np.zeros(12),alpha=0.1, n_iter=100)

## Plot costs by iteration
plt.plot(np.linspace(0, 100, 100),gdres[1])
plt.xlabel("Epoch")
plt.ylabel("Cost")
plt.show()

The cost function appears to have reached a minimum. Let's now compare the weight estimates with those found by sklearn's linear regression implementation:

In [14]:
lmf = LinearRegression(fit_intercept=False).fit(train_Xs, train_y)
print(lmf.coef_)
print(gdres[0])
[-1.25805804e+03  1.92884548e+03  6.72865255e+02 -1.92085650e+03
 -7.95482914e+01 -3.12181937e+03  1.97731725e+03  8.24129242e+02
  1.82409979e+03  3.74282983e+02  2.29862181e+00  8.78235016e+04]
[-1853.74094527   423.88972963   648.24511176 -2379.89586956
  2067.56723688 -3190.55684732  2191.25895646  5493.89295523
  1897.97999934   489.9667822    140.69348536 83440.4859917 ]

Unfortunately, the weights didn't quite reach their closed-form solution (though many of them got fairly close).

Fortunately, we can run more iterations to confirm that they eventually will:

In [15]:
## Run gradient descent
train_Xs = StandardScaler().fit_transform(train_X)
gdres = grad_descent(X=train_Xs,y=train_y,w=np.zeros(12),alpha=0.1, n_iter=2000)
print(lmf.coef_)
print(gdres[0])
[-1.25805804e+03  1.92884548e+03  6.72865255e+02 -1.92085650e+03
 -7.95482914e+01 -3.12181937e+03  1.97731725e+03  8.24129242e+02
  1.82409979e+03  3.74282983e+02  2.29862181e+00  8.78235016e+04]
[-1.25805804e+03  1.92884548e+03  6.72865255e+02 -1.92085650e+03
 -7.95482914e+01 -3.12181937e+03  1.97731725e+03  8.24129242e+02
  1.82409979e+03  3.74282983e+02  2.29862181e+00  8.78235016e+04]

Now let's explore what happens if we double the learning rate, $\alpha$:

In [16]:
## Run gradient descent
gdres = grad_descent(X=train_Xs,y=train_y,w=np.zeros(12),alpha=0.2, n_iter=100)

## Plot costs by iteration
plt.plot(np.linspace(0, 100, 100),gdres[1])
plt.xlabel("Epoch")
plt.ylabel("Cost")
plt.show()

As shown in the graph above, setting alpha=0.2 allows us to find weights that minimize the cost function using fewer gradient descent iterations (epochs). In general, setting an appropriate learning rate can involve some trial and error, especially if the predictors haven't been standardized.
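
One simple way to handle that trial and error is to run a few candidate learning rates and compare the cost each one reaches after the same number of iterations. A brief sketch (the candidate values are arbitrary):

## Compare several candidate learning rates after 100 iterations each
for alpha in [0.01, 0.05, 0.1, 0.2]:
    w_a, costs_a = grad_descent(X=train_Xs, y=train_y, w=np.zeros(12), alpha=alpha, n_iter=100)
    print(alpha, costs_a[-1])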

To further illustrate this notion, suppose we increase the learning rate to something even more aggressive, such as $\alpha = 0.3$:

In [17]:
## Run gradient descent
gdres = grad_descent(X=train_Xs,y=train_y,w=np.zeros(12),alpha=0.3, n_iter=100)

## Plot costs by iteration
plt.plot(np.linspace(0, 100, 100),gdres[1])
plt.xlabel("Epoch")
plt.ylabel("Cost")
Out[17]:
Text(0, 0.5, 'Cost')

The graph above illustrates that alpha=0.3 is a learning rate that is too large. Each gradient descent update step overshoots the minimum by such a large degree that the new weights actually cause the cost function to increase. This problem compounds as more iterations are run, eventually culminating in the divergent behavior shown on the right side of the above graph.
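
Earlier we noted that, in practice, gradient descent would be run until the cost stops improving (within an acceptable tolerance) rather than for a fixed number of iterations. Below is a minimal sketch of that stopping rule; the names grad_descent_tol, tol, and max_iter are illustrative and not part of the lab code:

## Gradient descent that stops once the relative improvement in cost falls below tol
def grad_descent_tol(X, y, w, alpha, tol=1e-8, max_iter=10000):
    n = len(y)
    prev_cost = cost_function(X, y, w)
    for i in range(max_iter):
        err = y - np.dot(X, w)
        w = w - alpha*(-2./n)*np.dot(np.transpose(X), err)
        new_cost = cost_function(X, y, w)
        if abs(prev_cost - new_cost) < tol*prev_cost:
            break
        prev_cost = new_cost
    return w, i + 1

## Example usage with the standardized predictors from earlier:
## w_tol, n_used = grad_descent_tol(train_Xs, train_y, np.zeros(12), alpha=0.1)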

Question #3 (gradient descent)¶

Consider a linear regression model with a single predictor and bias term fixed at zero (ie: no intercept).

$\mathbf{y} = w_1\mathbf{x}_1 + \epsilon$

Gradient descent was demonstrated for this model in our lecture slides.

  • Part A) Write out the cost function (squared error loss) for this model, and find its derivative with respect to $w_1$
  • Part B) Using the previously given cost and gradient descent functions as examples, write your own Python functions that compute the cost and perform gradient descent for this model
  • Part C) Apply your gradient descent function to the Iowa City Home Sales dataset using assessed as the only predictor of sale price. You may use the code below to create this predictor. Print the final estimate of the only estimated weight, $\hat{w}_1$.
  • Part D) Compare your estimated weight from Part C with the results of LinearRegression(). Be sure to use the argument fit_intercept=False to avoid fitting an intercept/bias parameter. If these weights are substantially different, try manipulating your learning rate (or number of iterations) until they are similar.

Hint: The object X_assess created below is a pandas Series; you may want it to be a numpy array of a particular shape when using it in LinearRegression() in Part D.

In [18]:
## Standardized version of the only predictor "assessed"
X_assess = (train_X['assessed'] - np.average(train_X['assessed']))/np.std(train_X['assessed'])