Lab 11 - Gradient Descent Algorithms for Machine Learning¶

This lab focuses on gradient descent algorithms and related concepts. You'll need the following libraries:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import math

Part 1 - Gradient Descent¶

Consider an $n$-dimensional vector of numeric outcomes, $\mathbf{y}$, and an $n$ by $p$ matrix of predictive features, $\mathbf{X}$. Recall that the squared-error cost function can be expressed using the following matrix-vector notation:

$$ \text{Cost} = \frac{1}{n}(\mathbf{y} - \hat{\mathbf{y}})^T(\mathbf{y} - \hat{\mathbf{y}})$$

For linear regression, $\hat{\mathbf{y}} = \mathbf{X}\hat{\mathbf{w}}$, where $\hat{\mathbf{w}}$ is a $p$-dimensional vector of weight parameters that are estimated from the data with the goal of minimizing the cost function.

We can implement the squared-error cost function for linear regression ourselves:

In [2]:
## Define the cost function
def cost_function(X, y, w):
    n = len(y)
    pred_y = np.dot(X,w)
    err = y - pred_y
    cost = (1./n)*np.dot(np.transpose(err),err)                 
    return cost    
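
Before moving on, here is a quick sanity check of this function using a tiny made-up dataset (the numbers below are purely illustrative): with the weights that generated the outcomes the cost is exactly zero, and it grows as the weights move away from those values.

## Toy example (made-up numbers): 3 observations, 2 features
X_toy = np.array([[1., 2.], [3., 4.], [5., 6.]])
y_toy = np.array([5., 11., 17.])   # exactly X_toy @ [1, 2], so a perfect fit exists
print(cost_function(X_toy, y_toy, np.array([1., 2.])))  # 0.0
print(cost_function(X_toy, y_toy, np.array([0., 0.])))  # 145.0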

The following question aims to build intuition about how a model's current weight estimates relate to the cost function for a fixed set of data.

Question #1:

  • Part A: Consider an optimization of the squared-error cost function involving linear regression and the simple simulated data generated by the code provided below. What are the dimensions of $\mathbf{X}$, $\mathbf{y}$, and $\hat{\mathbf{w}}$ for this scenario?
  • Part B: Create a line graph displaying the cost (y-axis) in response to a sequence of 100 values of $\hat{w}_1$ ranging from 2 to 4 (you must create this sequence).
  • Part C: Suppose the current iteration of a gradient descent algorithm is using the value $\hat{w}_1 = 2$. Without calculating anything, what is the sign of the gradient at this value? How does this relate to whether the value of $\hat{w}_1$ should be increased or decreased to improve the cost?
  • Part D: Now suppose the current iteration of a gradient descent algorithm is using the value $\hat{w}_1 = 2.7$. Without calculating anything, is the magnitude of the gradient larger or smaller than it was at the value considered in Part C? Explain your answer.
In [3]:
## Simulated Data for Question 1
np.random.seed(0)
x = np.linspace(-10, 10, 100)
y = 3 * x + np.random.normal(0, 3, 100)
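
If you'd like to see the data you'll be working with before starting Question #1, an optional scatter plot (not required by the question) is:

## Optional: visualize the simulated data
plt.scatter(x, y, s=10)
plt.xlabel("x")
plt.ylabel("y")
plt.show()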

Gradient Descent¶

Recall that in machine learning the gradient is a vector of partial derivatives of the cost function with respect to each unknown parameter (weight or bias) in the model.

The gradient descent algorithm iteratively finds better estimates of these unknown parameters (i.e., estimates yielding a lower cost) using the following update scheme: $$\hat{\mathbf{w}}^{(j)} = \hat{\mathbf{w}}^{(j-1)} - \alpha \cdot \frac{\partial \text{Cost}(\hat{\mathbf{w}}^{(j-1)})}{\partial \mathbf{w}}$$

For linear regression, the gradient (in matrix notation) is: $$\frac{-2}{n}\mathbf{X}^T(\mathbf{y} - \mathbf{X}\hat{\mathbf{w}})$$

You should be comfortable obtaining this result yourself, either by using the chain rule or by expanding the cost function and applying matrix derivative shortcuts before rearranging terms.
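
If you'd rather first convince yourself numerically that this matrix form is correct, the sketch below (using small made-up data and a central-difference approximation) compares the analytic gradient against finite differences of the cost function defined earlier:

## Numerical check of the gradient formula (made-up data)
rng = np.random.default_rng(1)
Xc = rng.normal(size=(20, 3))
yc = rng.normal(size=20)
wc = rng.normal(size=3)

## Analytic gradient: (-2/n) X^T (y - Xw)
analytic = (-2./len(yc))*np.dot(np.transpose(Xc), yc - np.dot(Xc, wc))

## Central-difference approximation of each partial derivative
eps = 1e-6
numeric = np.zeros(3)
for j in range(3):
    w_hi, w_lo = wc.copy(), wc.copy()
    w_hi[j] += eps
    w_lo[j] -= eps
    numeric[j] = (cost_function(Xc, yc, w_hi) - cost_function(Xc, yc, w_lo))/(2*eps)

print(np.allclose(analytic, numeric))  # should print True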

Provided below is a crude gradient descent function that performs gradient descent updates for a fixed number of iterations:

In [4]:
## Define gradient descent function for linear regression
def grad_descent(X, y, w, alpha, n_iter):
    costs = np.zeros(n_iter)
    n = len(y) 
    
    for i in range(n_iter):
        pred_y = np.dot(X,w)
        grad = (-2./n)*np.dot(np.transpose(X), (y - pred_y))
        w = w - alpha*grad
        costs[i] = cost_function(X, y, w)
        
    return w, costs

We can try this function out on the Iowa City Home Sales data set, using all of the numeric features to predict sale.amount:

In [5]:
## Run gradient descent (for the original IC homes data)
from sklearn.preprocessing import StandardScaler
ic = pd.read_csv("https://remiller1450.github.io/data/IowaCityHomeSales.csv")
ic_y = ic['sale.amount']
ic_X = ic.select_dtypes("number").drop('sale.amount',axis=1)
ic_Xs = StandardScaler().fit_transform(ic_X)
gdres = grad_descent(X=ic_Xs,y=ic_y,w=np.zeros(12),alpha=0.1, n_iter=100)

## Plot costs by iteration
plt.plot(np.linspace(0, 100, 100),gdres[1])
plt.xlabel("Epoch")
plt.ylabel("Cost")
plt.show()

It seems like the cost is leveling out (reaching an optimal value) towards the end of this plot. So, let's check how our estimates compare with those given by the implementation of linear regression in sklearn:

In [6]:
## sklearn's linear regression
from sklearn.linear_model import LinearRegression
lmf = LinearRegression(fit_intercept=False).fit(ic_Xs, ic_y)
print(lmf.coef_)
print(gdres[0])
[-1571.29533112  2584.32172762  -225.92361186 -5778.1626521
  -608.85626531 -3022.00349719  1522.1430872    159.08016378
  1813.22175235   181.64294314  -520.80749986 90251.73234668]
[-2083.11880499  1076.37333675  -235.38323399 -6152.0121008
  1687.30812052 -3088.59226859  1724.58677251  5188.78489252
  1997.70355932   310.97837215  -466.9349181  85369.24932668]

You can see that our estimates are getting close to the least squares solution, but they aren't quite there yet. So, let's try again using more iterations of gradient descent:

In [7]:
## Run gradient descent (again)
gdres = grad_descent(X=ic_Xs,y=ic_y,w=np.zeros(12),alpha=0.1, n_iter=2000)
print(lmf.coef_)
print(gdres[0])
[-1571.29533112  2584.32172762  -225.92361186 -5778.1626521
  -608.85626531 -3022.00349719  1522.1430872    159.08016378
  1813.22175235   181.64294314  -520.80749986 90251.73234668]
[-1571.29533112  2584.32172762  -225.92361186 -5778.1626521
  -608.85626531 -3022.00349719  1522.1430872    159.08016378
  1813.22175235   181.64294314  -520.80749986 90251.73234668]

After 2000 iterations of gradient descent with a learning rate of $\alpha = 0.1$, you can see that our estimates match the least squares solution (at least to the precision displayed). However, you might be wondering if we could have gotten away with fewer iterations had we used a larger learning rate.

Below is what happens if we triple our learning rate to $\alpha = 0.3$:

In [8]:
## What happens with a larger learning rate?
gdres = grad_descent(X=ic_Xs,y=ic_y,w=np.zeros(12),alpha=0.3, n_iter=100)
plt.plot(np.linspace(0, 100, 100),gdres[1])
plt.xlabel("Epoch")
plt.ylabel("Cost")
plt.show()

When using $\alpha = 0.3$, each gradient descent update overshoots the minimum of the cost function by such a large degree that the new weights actually cause the cost function to increase. This problem compounds as more iterations are run, producing the divergent behavior seen in the cost curve.
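
The overshooting mechanism is easiest to see in one dimension. The sketch below uses the made-up cost function $w^2$ (gradient $2w$), for which the update is $w \leftarrow (1 - 2\alpha)w$: a small step size shrinks $w$ toward the minimum at 0, while a step size above 1 flips the sign and grows the magnitude at every update.

## One-dimensional illustration with the made-up cost w^2 (gradient 2w)
for alpha in [0.4, 1.1]:
    w = 1.0
    path = [w]
    for _ in range(5):
        w = w - alpha*(2*w)   # gradient descent update
        path.append(w)
    print(alpha, np.round(path, 3))   # converges for 0.4, diverges for 1.1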

Question #2: Consider a simplified model involving a single predictor and no bias/intercept (like what we saw in our lecture slides): $$\mathbf{y} = \mathbf{x}w_1 + \mathbf{\epsilon}$$

  • Part A: Write out the cost function (squared-error) and find its derivative with respect to $w_1$.
  • Part B: Create your own functions that calculate the cost and perform gradient descent for this scenario. You may use the examples in this section as a starting point, but you will have to make a few changes due to $w_1$ being a scalar.
  • Part C: Apply your gradient descent function to the Iowa City Home Sales dataset using assessed as the only predictor of sale price. Print the final weight estimate. Hint: For numerical stability you may want to use the standardized version of assessed created by the example code below.
In [9]:
## Standardizing just one predictor
X_assess = (ic_X['assessed'] - np.average(ic_X['assessed']))/np.std(ic_X['assessed'])
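
A quick check that the standardization behaved as intended (mean approximately 0, standard deviation approximately 1):

## Sanity check on the standardized predictor
print(np.average(X_assess), np.std(X_assess))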

Part 2 - Mini-batch Gradient Descent¶

In mini-batch gradient descent the main idea is to segment the training set into batches, then perform a gradient descent update using only a single batch at a time.

We can modify our previous gradient descent code to sample batches within the training loop:

In [10]:
## Slightly modified cost function
def cost_function(X, y, w):
    n = y.size
    pred_y = np.dot(X,w)
    err = y - pred_y
    cost = (1./n)*np.dot(np.transpose(err),err)                 
    return cost    

## Mini-batch gradient descent function
import random
def batch_grad_descent(X, y, w, alpha, n_iter, n_batches):
    costs = np.zeros(n_iter)
    n = len(y) 
    
    for i in range(n_iter):
        ## Shuffle all row indices, then split them into near-equal batches
        batch_idx = np.array_split(random.sample(range(n), n), n_batches)
        temp_cost = 0.0
        
        for bi in range(n_batches):
            batch_y = y.iloc[batch_idx[bi]]
            batch_X = X[batch_idx[bi],:]
            pred_y = np.dot(batch_X,w)
            grad = (-2./batch_y.size)*np.dot(np.transpose(batch_X), (batch_y - pred_y))
            w = w - alpha*grad
            temp_cost += cost_function(batch_X, batch_y, w)
        
        costs[i] = temp_cost
        
    return w, costs

## Run and plot the costs by epoch
bgdres = batch_grad_descent(X=ic_Xs,y=ic_y,w=np.zeros(12),n_batches = 5, alpha=0.001, n_iter=200)
plt.plot(np.linspace(0, 200, 200),bgdres[1])
plt.xlabel("Epoch")
plt.ylabel("Cost")
plt.show()
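
The batching line above relies on np.array_split, which divides a shuffled list of row indices into nearly equal pieces. A small standalone example (with made-up sizes) shows the idea:

## How the batching works: split a shuffled index list into 3 near-equal batches
idx = random.sample(range(10), 10)   # a random permutation of 0..9
print(np.array_split(idx, 3))        # three arrays of sizes 4, 3, 3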

Question #3: Using the functions provided in this section as a guide, modify the gradient descent functions you created in Question #2 to perform mini-batch gradient descent. Test these functions by displaying a cost curve and showing that it looks similar (with noise) to the one produced by the functions you created in Question #2. Note: You might need to modify the learning rates to get these curves to appear similar.

$~$

Part 3 - Stochastic Gradient Descent and Logistic Regression¶

Logistic regression is an example of a parametric model that we can easily optimize using stochastic gradient descent and the cross-entropy cost function. This approach is convenient because the cost function has the form: $$\text{Cost} = -\tfrac{1}{n}\sum_{i=1}^{n}\big(y_i\cdot \log(\hat{y}_i) + (1-y_i)\cdot \log(1-\hat{y}_i)\big)$$ Under stochastic gradient descent we do not need to worry about the summation, as only one training observation is considered at a time.

This scenario also affords us an opportunity to exploit the chain rule to simplify the gradient calculation, as $\hat{y}_i = g(z_i)$, where $g()$ is the sigmoid function and $z_i = \mathbf{x}_i^T\hat{\mathbf{w}}$.

Question #4: Use the chain rule to show that $\frac{\partial \text{Cost}_i}{\partial \mathbf{w}} = -y_i\mathbf{x}_i\big(\frac{g(z_i)(1-g(z_i))}{g(z_i)}\big) + (1-y_i)\mathbf{x}_i\big(\frac{g(z_i)(1-g(z_i))}{1-g(z_i)}\big)$. You may use the fact that the derivative of the sigmoid function, $g()$, is $g()\cdot(1-g())$ without establishing this result "from scratch". If you're curious, this Stack Exchange page provides a thorough derivation of the result.
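
Although Question #4 asks for a symbolic derivation, the sigmoid-derivative fact quoted above is easy to verify numerically. The sketch below (using an arbitrary evaluation point) compares a finite-difference estimate of $g'(z)$ to $g(z)(1-g(z))$:

## Numerical check: the derivative of the sigmoid g(z) is g(z)*(1 - g(z))
def g(z):
    return 1/(1 + np.exp(-z))

z0, eps = 0.7, 1e-6   # arbitrary evaluation point
numeric = (g(z0 + eps) - g(z0 - eps))/(2*eps)
print(np.isclose(numeric, g(z0)*(1 - g(z0))))  # should print True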

$~$

Implementing Stochastic Gradient Descent¶

The result you established in Question #4 simplifies to a gradient of $\mathbf{x}_i(\hat{y}_i - y_i)$ (you do not need to show this). This result is used to create the stochastic gradient descent function provided below:

In [11]:
## Cross-entropy cost for 1 data-point
def cost_function(x, y, w):
    z = np.dot(x,w)
    pred_y = 1/(1 + np.exp(-z))
    cost = -y*np.log(pred_y) - (1-y)*np.log(1-pred_y)               
    return cost               

## Stochastic Gradient Descent
def sgd(X, y, w, alpha, n_iter):
    n = len(y)
    costs = np.zeros(n_iter)
    
    for i in range(n_iter):
        idx = random.sample(range(n), n)
        temp_cost = 0.0
        
        for si in range(n):
            random_y = y.iloc[idx[si]]
            random_x = np.append(1,X[idx[si],:])
            z = np.dot(random_x,w)
            pred_y = 1/(1 + np.exp(-z))
            grad = random_x*(pred_y - random_y)
            w = w - alpha*grad
            temp_cost += cost_function(random_x, random_y, w)
        
        costs[i] = temp_cost
        
    return w, costs

We can test this function on a binary outcome derived from the Iowa City Home Sales data, and we can see that the cost is effectively minimized after relatively few training epochs.

In [12]:
## Set up binary outcome and scaled predictors
ic_y_bin = (ic['assessed'] > ic['sale.amount']).astype(int)
ic_Xs_bin = StandardScaler().fit_transform(ic.select_dtypes("number").drop(['sale.amount', 'assessed'],axis=1))

## Some prelims to input into SGD
p = np.shape(ic_Xs_bin)[1]+1
n = len(ic_y_bin)
nit = 100

## Run 100 epochs of SGD and display the results
gdres = sgd(X=ic_Xs_bin,y=ic_y_bin,w=np.zeros(p),alpha=0.0025, n_iter = nit)
plt.plot(np.linspace(0, nit, nit),gdres[1])
plt.show()
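
As an optional follow-up (a sketch, not part of the lab's required output), the fitted weights can be turned into predicted probabilities by prepending the same leading 1 that sgd() appends to each observation:

## Convert the fitted weights into predicted probabilities and a training accuracy
X_with_bias = np.column_stack([np.ones(n), ic_Xs_bin])   # leading 1 matches np.append(1, ...) in sgd()
pred_prob = 1/(1 + np.exp(-np.dot(X_with_bias, gdres[0])))
pred_class = (pred_prob > 0.5).astype(int)
print(np.mean(pred_class == ic_y_bin))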

Question #5: For this question you should use the sgd() function provided earlier in this section.

  • Part A: Consider an observation belonging to the positive class (so $y_i = 1$). What is the cost associated with this observation if the current model estimates its prediction score as $z_i = 2.5$? If the model's weights are updated such that the prediction score increases to $z_i = 3$, how would this impact the cost?
  • Part B: In the function provided in this section, sgd(), what is returned by random.sample(range(n), n)? Why doesn't the sgd() function just randomly sample the index of a single data point to use for each gradient update?
  • Part C: Notice the sgd() function includes the command np.append(1,X[idx[si],:]). Why is the value 1 being appended to the front of the vector of data?
  • Part D: This section's example ran 100 epochs of stochastic gradient descent. Will the resulting weight estimates perfectly match those produced by the LogisticRegression() function (without any regularization)? If not, will running more training epochs produce closer estimates? Briefly explain.
  • Part E: Change the learning rate from alpha = 0.0025 to alpha = 0.001 and notice the results. Why might this make the cost more consistent/smooth towards the right of the graph? Could there be a downside to this? Briefly explain.