This lab will cover logistic regression and softmax regression. It will also touch on a few built-in optimization options, and provide a demonstration of stochastic gradient descent.
As usual, we'll begin by loading familiar libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import math
import scipy
Several of our examples will use the MNIST handwritten digit dataset (introduced in Lab 2-B):
### Read flattened, processed MNIST data
mnist = pd.read_csv("https://remiller1450.github.io/data/mnist_small.csv")
## Train-test split
from sklearn.model_selection import train_test_split
train, test = train_test_split(mnist, test_size=0.2, random_state=5)
### Separate the label column (outcome)
train_y = train['label']
train_X = train.drop(['label'], axis="columns")
### Create a binary label for 8's
train_y_binary = (train_y == 8).astype(int)
For illustrative purposes, we create an additional binary outcome, train_y_binary,
to reflect whether a digit is an eight, one of the more commonly misclassified digits in our earlier work with these data.
The sklearn
implementation of logistic regression operates similarly to LinearRegression
with a few noteworthy differences. The code below sets up a pipeline and attempts to fit an ordinary logistic regression model to the training data:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
pipe = Pipeline([
('model', LogisticRegression(penalty='none'))
])
log_mod = pipe.fit(train_X, train_y_binary)
C:\Users\millerry\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
As you can see, the default optimizer, "lbfgs", was unable to converge.
We won't go too far into the details of why this may have happened, but two strategies you should be aware of are scaling the data (to provide more numerical stability) and/or using a different optimization algorithm.
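As the warning message itself suggests, another option is simply to allow the solver more iterations via the max_iter argument. A minimal sketch of that approach is shown below (the value of 1000 and the names pipe_iter and log_mod_iter are arbitrary choices for illustration, and convergence still isn't guaranteed without scaling):
## Give the default "lbfgs" solver more iterations
pipe_iter = Pipeline([
    ('model', LogisticRegression(penalty='none', max_iter=1000))
])
log_mod_iter = pipe_iter.fit(train_X, train_y_binary)
In what follows, though, we'll focus on the two strategies described above.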
The code below implements both strategies using MinMaxScaler()
and the "newton-cg" algorithm, a method that calculates the entire hessian matrix of second order partial derivatives rather than approximating the matrix (as done by "lbfgs").
from sklearn.preprocessing import MinMaxScaler
## Try a different solver w/ scaling
pipe = Pipeline([
('scaler', MinMaxScaler()),
('model', LogisticRegression(penalty='none', solver='newton-cg'))
])
log_mod = pipe.fit(train_X, train_y_binary)
log_mod.score(train_X, train_y_binary)
1.0
We're now able to achieve convergence, but the in-sample classification accuracy of 100% suggests extreme overfitting (which is likely why "lbfgs" struggled to converge).
Fortunately, cross-validation allows us to obtain a more reliable estimate of the model's performance on data it wasn't trained on:
from sklearn.model_selection import cross_validate
cv_res = cross_validate(log_mod, train_X, train_y_binary, cv = 5, scoring = 'accuracy')
np.average(cv_res['test_score'])
0.9227083333333332
Like all of the other models we've covered so far, logistic regression has a .fit
method, so we can still utilize all of the functions from the model_selection
and metrics
modules in sklearn
that were covered in Lab 3-A.
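As a brief illustration, the sketch below uses cross_val_score (a more compact relative of cross_validate) with precision as an arbitrarily chosen scoring metric:
from sklearn.model_selection import cross_val_score
## Cross-validated precision for the binary (eight vs. not eight) model
cross_val_score(log_mod, train_X, train_y_binary, cv = 5, scoring = 'precision').mean()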
Question #1: Use cross_val_predict and confusion_matrix to create a cross-validated confusion matrix for the logistic regression model described in the sections above. Then, calculate this model's F1 score "by hand" using this matrix. When calculating your score, consider the outcome representing the digit being 8 as the positive class.
Question #2: Use cross_val_predict and RocCurveDisplay to create a cross-validated ROC curve. Briefly comment on the performance of logistic regression as a classifier in this application using the ROC curve.
To perform softmax regression, we simply use LogisticRegression
with the additional argument multi_class='multinomial':
## Modify pipeline to fit softmax regression
pipe = Pipeline([
('scaler', MinMaxScaler()),
('model', LogisticRegression(penalty='none', multi_class='multinomial'))
])
sm_mod = pipe.fit(train_X, train_y)
sm_mod.score(train_X, train_y)
1.0
As we saw in the previous section, softmax regression overfits the training data and achieves an in-sample classification accuracy of 100%.
However, cross-validation estimates the model's classification accuracy on new data will be substantially lower:
cv_res = cross_validate(sm_mod, train_X, train_y, cv = 5, scoring = 'accuracy')
np.average(cv_res['test_score'])
0.8758333333333332
Since softmax regression is used in classification problems with multiple outcome classes, we might consider using alternative performance metrics described in the "Multiple Classes" section of Lab 3-A (Part 5 of the lab).
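As a brief sketch of one such alternative (balanced accuracy, the macro-average of recall across classes, is an arbitrary choice here), we might compute:
## Cross-validated balanced accuracy for the softmax model
cv_bal = cross_validate(sm_mod, train_X, train_y, cv = 5, scoring = 'balanced_accuracy')
np.average(cv_bal['test_score'])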
Question #3: Compare softmax regression with a k-nearest neighbors classifier that uses n_neighbors=20, distance weighting, and Euclidean distance calculations on the MNIST data. Use min-max scaling as a preprocessing step in your pipeline to promote numerical stability. Report a cross-validated F1-score calculated using macro-averaging for each model.
In logistic regression, the prediction scores (i.e., $\mathbf{x}_i^T\mathbf{w}$ for the $i^{th}$ observation) are linked to probabilities of the positive class via the sigmoid function:
$$log\big(\tfrac{\pi_i}{1-\pi_i}\big) = \mathbf{x}_i^T\mathbf{w}$$
For future calculations, we can denote $\eta_i = \mathbf{x}_i^T\mathbf{w}$ and recognize:
$$\pi_i = g(\eta_i) = \tfrac{1}{1 + exp(-\mathbf{x}_i^T\mathbf{w})}$$
We can verify this relationship by manually calculating prediction scores/probabilities and comparing them with the output returned by LogisticRegression
:
## Refit logistic regression
pipe = Pipeline([
('scaler', MinMaxScaler()),
('model', LogisticRegression(penalty='none', solver='newton-cg'))
])
log_mod = pipe.fit(train_X, train_y_binary)
## Store the weights and bias
w = log_mod.named_steps['model'].coef_
b = log_mod.named_steps['model'].intercept_
## Calculate eta manually using the dot product of X and w + the bias term
train_Xs = MinMaxScaler().fit_transform(train_X)
eta = np.dot(train_Xs, np.transpose(w)) + b
## Use eta to calculate probabilities of the positive class
pi = 1/(1 + np.exp(-eta))
## Display relationship between eta and pi
plt.scatter(eta, pi, c=train_y_binary.astype('category'))
plt.show()
C:\Users\millerry\AppData\Local\Temp\ipykernel_15620\1742544608.py:17: RuntimeWarning: overflow encountered in exp
pi = 1/(1 + np.exp(-eta))
## Now let's extract sklearn's predicted probabilities using `predict_proba`:
pi_m = log_mod.named_steps['model'].predict_proba(train_Xs)
plt.scatter(eta, pi_m[:,1], c=train_y_binary.astype('category'))
plt.show()
Comparing the two graphics created above, we can see that our manual calculation of $\eta$ and $\pi$ is correct (since both graphs are identical). However, our results in this example are somewhat atypical.
You might notice that we receive a warning message when trying to manually calculate $\pi$. This is because the estimated weights produce very large scores, pushing all of our probability estimates extremely close to zero or one, a reflection of (near) perfect separation in the data.
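If we wanted to avoid this overflow warning, one option (a side note, not something we rely on later in this lab) is SciPy's numerically stable sigmoid, scipy.special.expit(); the name pi_stable below is just an illustrative choice:
from scipy.special import expit
## expit(eta) evaluates 1/(1 + exp(-eta)) without overflowing for large-magnitude eta
pi_stable = expit(eta)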
Normally, we'd expect to see the sigmoid function's characteristic S-shape in plots like these. To demonstrate the relationship between $\eta$ and $\pi$ that we'd typically expect, let's quickly re-do this example using the Iowa City Home Sales data:
## Load data
ic = pd.read_csv("https://remiller1450.github.io/data/IowaCityHomeSales.csv")
train_ic, test_ic = train_test_split(ic, test_size=0.2, random_state=7)
## Create X and y
train_num = train_ic.select_dtypes("number")
train_ic_y = (train_num['assessed'] > train_num['sale.amount']).astype(int)
train_ic_X = train_num.drop(['assessed','sale.amount'],axis=1)
## Fit logistic regression
pipe = Pipeline([
('scaler', MinMaxScaler()),
('model', LogisticRegression(penalty='none', solver='newton-cg'))
])
ic_log_mod = pipe.fit(train_ic_X, train_ic_y)
## Store the weights and bias
w = ic_log_mod.named_steps['model'].coef_
b = ic_log_mod.named_steps['model'].intercept_
## Calculate eta manually using the dot product of X and w + the bias term
train_Xs = MinMaxScaler().fit_transform(train_ic_X)
eta = np.dot(train_Xs, np.transpose(w)) + b
## Use eta to calculate probabilities of the positive class
pi = 1/(1 + np.exp(-eta))
## Display
plt.scatter(eta, pi, c=train_ic_y.astype('category'))
plt.xlabel("Eta")
plt.ylabel("Pi")
plt.show()
In this new example, we see that higher values of $\eta_i$ (higher scores) correspond with higher predicted probabilities of a home being over-assessed, and this relationship follows a sigmoid curve.
A strength of logistic regression relative to other binary classification models is the ability to interpret and understand each of the estimated weights. By virtue of the model's form, we see:
$$log\big(\tfrac{\pi}{1-\pi}\big) = w_0 + w_1X_1 + \ldots + w_pX_p$$
$$\implies \tfrac{\pi}{1-\pi} = exp(w_0 + w_1X_1 + \ldots + w_pX_p)$$
$$\implies \tfrac{\pi}{1-\pi} = exp(w_0)*exp(w_1X_1)* \ldots *exp(w_pX_p)$$
The left-hand side of this equation, $\tfrac{\pi}{1-\pi}$, is the odds of an observation belonging to the positive class.
For example, suppose a predictor's weight is 0.44. We can calculate $e^{0.44} \approx 1.55$, indicating that a 1-unit change in this predictor corresponds with an estimated 55% increase in the odds of an observation belonging to the positive class.
When interpreting weights, it is important to consider whether you've rescaled the data during the model-building process. If you've applied standardization, a 1-unit change is actually a 1-standard-deviation change in the predictor in terms of its original units. If you've applied min-max scaling, a 1-unit change reflects going from the minimum observed value to the maximum observed value.
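As a quick numerical check of the example above (and of a hypothetical negative weight), we might compute:
## Exponentiate the example weight of 0.44 from the text
np.exp(0.44)   ## roughly 1.55, i.e., about a 55% increase in the odds per 1-unit change
## A negative weight shrinks the odds; for example:
np.exp(-0.44)  ## roughly 0.64, i.e., about a 36% decrease in the odds per 1-unit change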
Question #4: Using the logistic regression model fit to the Iowa City Home Sales data (ic_log_mod), find the estimated weight for the predictor 'bedrooms' and interpret this weight after exponentiation.
Unlike linear regression, neither logistic nor softmax regression has a closed-form solution, so weights must be estimated using iterative algorithms. As you've already seen in this lab, there are many different algorithms capable of solving for an optimal set of weights (note that an optimal set does exist, since the cross-entropy cost function has a single minimum).
In this section of the lab, we'll explore the process of solving for these weights. To begin, we need to define the cost function:
$$Cost = -\tfrac{1}{n}\sum_{i=1}^{n}\big(y_i log(g(\eta_i)) + (1-y_i)log(1-g(\eta_i))\big)$$
In our lecture slides, we saw how to differentiate this function using only a single observation, which is the basis for stochastic gradient descent:
$$Gradient = (g(\eta_i)-y_i)\mathbf{x}_i$$
Note that $\mathbf{x}_i$ is the row vector corresponding to the single observation we're considering.
This leads to the following update scheme:
$$\mathbf{w}^{(j)} = \mathbf{w}^{(j-1)} - \alpha(g(\eta_i^{(j-1)})-y_i)\mathbf{x}_i$$
We can implement this update scheme by randomly reordering the data, looping through the observations one at a time, and repeating as necessary.
The code below demonstrates stochastic gradient descent for the Iowa City Home Sales data. You should reference this code during Question #5 (which appears at the end of this section).
## First, reload IC data
ic = pd.read_csv("https://remiller1450.github.io/data/IowaCityHomeSales.csv")
## Train-test split
train_ic, test_ic = train_test_split(ic, test_size=0.2, random_state=7)
## Recreate X and Y
train_num = train_ic.select_dtypes("number")
train_ic_y = (train_num['assessed'] > train_num['sale.amount']).astype(int)
train_ic_X = train_num.drop(['assessed','sale.amount'],axis=1)
train_Xs = MinMaxScaler().fit_transform(train_ic_X)
## Define Functions
import random
def cost_function(x, y, w):
    eta = np.dot(x,w)
    pred_y = 1/(1 + np.exp(-eta))
    cost = -y*np.log(pred_y) - (1-y)*np.log(1-pred_y)
    return cost
def grad_descent(X, y, w, alpha, n_iter):
    n = len(y)
    costs = np.zeros(n_iter)
    for i in range(n_iter):
        ## Randomly reorder the observations for this pass through the data
        idx = random.sample(range(n), n)
        temp_cost = 0.0
        for si in range(n):
            ## Select a single observation (prepending a 1 for the bias term)
            random_y = y.iloc[idx[si]]
            random_x = np.append(1,X[idx[si],:])
            eta = np.dot(random_x,w)
            pred_y = 1/(1 + np.exp(-eta))
            err = (pred_y - random_y).item()
            ## Update the weights using the gradient for this single observation
            grad = random_x*err
            w = w - alpha*grad
            temp_cost += cost_function(random_x, random_y, w)
        costs[i] = temp_cost
    return w, costs
## Plot of results
p = np.shape(train_Xs)[1]+1
n = len(train_ic_y)
nit = 500
gdres = grad_descent(X=train_Xs,y=train_ic_y,w=np.zeros(p),alpha=0.005, n_iter = nit)
plt.plot(np.linspace(0, nit, nit),gdres[1])
plt.show()
Here we see that stochastic gradient descent effectively optimizes the cost function despite using just a single observation each time it performs an update.
It's worth knowing that stochastic gradient descent is one of the most widely used algorithms for estimating the weights in neural networks, as the gradient can be computed using the chain rule.
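For reference, sklearn also offers a built-in stochastic gradient descent classifier. The sketch below (which assumes SGDClassifier with the logistic 'log_loss' objective and otherwise default settings, including a small L2 penalty that our earlier fits did not use) shows how it might be applied to the Iowa City data:
from sklearn.linear_model import SGDClassifier
## Built-in stochastic gradient descent with the logistic (cross-entropy) loss
## Note: older sklearn versions name this loss 'log' rather than 'log_loss'
sgd_pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('model', SGDClassifier(loss='log_loss', random_state=0))
])
sgd_mod = sgd_pipe.fit(train_ic_X, train_ic_y)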
Question #5: The stochastic gradient descent demonstration above relies on several key pieces of code. Answer the following:
Part A: Consider the command np.dot(x, w). Briefly explain what this command calculates and what the dimension of the output is.
Part B: What is the purpose of random.sample(..)?
Part C: What is the purpose of np.append(1,X[idx[si],:])? Why is a '1' included here?
Part D: How do the results of our implementation compare with those of LogisticRegression()? Briefly explain.