Lab #3 (part 1) - Other Performance Metrics

This lab will build upon topics from last week, namely pipelines and cross-validation in sklearn, by introducing several additional tools and methods for analyzing classification performance.

We'll begin by loading many familiar libraries:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import math

# Unfortunately, some of the kNN functions we'll use raise FutureWarnings; the commands below suppress them
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

This lab's examples will use the cervical cancer dataset from Sobar (2016), which was briefly introduced in today's lecture.

In [2]:
sobar = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00537/sobar-72.csv')

Our work will always begin with a training/testing split to avoid data leakage:

In [3]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(sobar, test_size=0.2, random_state=4)

Part 1 - Confusion Matrices

To learn about confusion matrices in sklearn, we must first build a classifier and use it to obtain predictions.

We'll use this as an opportunity to review pipelines:

In [4]:
## Define outcome variable
train_y = train['ca_cervix']

## Define predictor matrix
train_X = train.drop('ca_cervix',axis=1)

## Imports used in the pipeline
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.neighbors import KNeighborsClassifier

## Pipeline: normalize -> scale -> fit classifier
pipe = Pipeline([
('transformer', PowerTransformer(method = 'yeo-johnson')),
('scaler', StandardScaler()),
('classifier', KNeighborsClassifier(n_neighbors = 8))
])

## Model fit
pf = pipe.fit(train_X, train_y)

## Predictions (using 5-fold CV)
from sklearn.model_selection import cross_val_predict
train_y_pred = cross_val_predict(pf, train_X, train_y, cv=5)

We can compare these predictions with the observed outcomes in the training data using a confusion matrix:

In [5]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true = train_y, y_pred = train_y_pred)
Out[5]:
array([[41,  0],
       [12,  4]], dtype=int64)

The confusion_matrix function sorts the class labels, so the rows of this matrix (the true classes) and its columns (the predicted classes) both correspond to the labels "0" and "1", in that order.
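If you'd rather see a particular class first, the optional labels argument to confusion_matrix reorders the rows and columns explicitly. Below is a small optional sketch (not part of the lab's required steps) that places the label "1" in the first position:

## Optional sketch: put the label '1' in the first row/column
confusion_matrix(y_true = train_y, y_pred = train_y_pred, labels = [1, 0])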

Question #1 (confusion matrices)

  • Part A) Suppose we denote having cervical cancer as the positive class. Using the confusion matrix above, identify the number of true positives and false positives.
  • Part B) Calculate the true positive rate (TPR) and false positive rate (FPR) using the information you identified in Part A.

Part 2 - ROC Analysis

ROC analysis allows for a nuanced view of the trade-off between true positives and false positives by considering different classification thresholds (for mapping model estimates to class labels).

The first step in an ROC analysis is obtaining classification scores (usually predicted probabilities for the positive class):

In [6]:
## Cross-validated predicted probabilities from our pipeline (the pipeline is refit within each of the 5 folds)
train_y_probs = cross_val_predict(pf, train_X, train_y, cv=5, method='predict_proba')

## Select only the 2nd column (predicted probs of label=1)
train_y_score = train_y_probs[:,1] 
In [7]:
## ROC curve plotting function:
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_predictions(train_y, train_y_score, pos_label=1)
Out[7]:
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x1cf04b81be0>

A couple of things to note:

  • We used predicted probabilities from cross_val_predict, so these results should generalize to the test data fairly well.
  • The cross-validated AUC is close to 1.0, indicating this classifier is reasonably good (a sketch for computing the exact value appears below).
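If you'd like the AUC as a number rather than reading it from the plot's legend, the roc_auc_score function can compute it from the same cross-validated scores. A brief optional sketch:

## Optional sketch: numeric cross-validated AUC
from sklearn.metrics import roc_auc_score
roc_auc_score(train_y, train_y_score)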

If we want more precise details about the TPR and FPR at various classification thresholds, we can use the roc_curve function:

In [8]:
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(train_y, train_y_score, pos_label=1)
print(pd.DataFrame({'t': thresholds, 'TPR': tpr,'FPR': fpr}))
       t     TPR       FPR
0  1.875  0.0000  0.000000
1  0.875  0.1875  0.000000
2  0.750  0.2500  0.000000
3  0.500  0.5000  0.000000
4  0.375  0.6875  0.048780
5  0.250  0.8125  0.170732
6  0.125  0.8750  0.365854
7  0.000  1.0000  1.000000

Note: the code above creates a pandas DataFrame from a dictionary object. Dictionaries are created using curly braces and key: value pairs.
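To see how a particular threshold maps these scores to hard labels, you can dichotomize the scores yourself. The sketch below uses an arbitrary cutoff of 0.5 purely for illustration (it is not a recommendation for Question #2):

## Optional sketch: convert scores to labels at an arbitrary cutoff of 0.5
labels_at_05 = (train_y_score >= 0.5).astype(int)

## The resulting confusion matrix corresponds to the TPR/FPR reported at that threshold
confusion_matrix(train_y, labels_at_05)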

Question #2 (ROC analysis)

  • Consider the practical differences between false negatives and false positives in the context of this application (cervical cancer diagnosis). What classification threshold (for a label of '1') would you recommend? Report the TPR and FPR at this threshold and a brief rationale for it.

Part 3 - Precision-Recall Analysis

Some applications are highly focused on the positive class, which makes Precision (the fraction of predicted positives that are actually positive) an important quantity. The functions used to perform Precision-Recall analysis are similar to the functions for ROC analysis described in the previous section:

In [9]:
# Functions for PR-analysis
from sklearn.metrics import precision_recall_curve, PrecisionRecallDisplay
PrecisionRecallDisplay.from_predictions(train_y, train_y_score, pos_label=1)
pre, rec, thresholds = precision_recall_curve(train_y, train_y_score, pos_label=1)
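As a reminder, the F1 score is the harmonic mean of precision and recall, $F_1 = 2PR/(P + R)$, so it can be computed element-wise from the arrays returned above. A small optional sketch (keep in mind that thresholds contains one fewer entry than pre and rec):

## Optional sketch: element-wise F1 from the precision and recall arrays
## (assumes no point has precision and recall both equal to zero)
f1_values = 2 * pre * rec / (pre + rec)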

Question #3 (PR analysis)

  • Calculate the F1 score at the smallest classification threshold with a cross-validated Precision of 1. Use the exact values (rather than trying to infer them from the PR curve displayed above). Hint: I encourage you to view the documentation for the precision_recall_curve function.

Part 4 - Performance Metrics and Pipelines/Grid Search

Our previous labs foreshadowed the use of different performance measures in a pipeline. We'll briefly revisit that topic here since we've now learned a few new performance metrics.

The code below demonstrates how to use the F1 score in a cross-validated grid-search.

In [10]:
## Simple pipeline
pipe = Pipeline([
('scaler', StandardScaler()),
('classifier', KNeighborsClassifier())
])

## Parameters to try via grid search
from sklearn.preprocessing import RobustScaler, MaxAbsScaler
parms = {'scaler': [StandardScaler(), RobustScaler(), MaxAbsScaler()],
         'classifier__n_neighbors': [4,6,8],
         'classifier__weights': ['uniform','distance'],
         'classifier__p': [1,2]
        }

## Conduct grid search and print best estimator/score
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, parms, cv=5, scoring='f1').fit(train_X, train_y)
print(grid.best_estimator_)
print(grid.best_score_)
Pipeline(steps=[('scaler', MaxAbsScaler()),
                ('classifier',
                 KNeighborsClassifier(n_neighbors=4, weights='distance'))])
0.6933333333333334

A complete list of common scoring metrics that are compatible with GridSearchCV is available here.
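Other built-in metrics can be requested by name through the scoring argument, and make_scorer lets you wrap a metric function with non-default settings. The sketch below is illustrative only; the F-beta scorer is an example, not something this lab requires:

## Optional sketch: alternative scoring choices for GridSearchCV
from sklearn.metrics import make_scorer, fbeta_score

## A different built-in metric, requested by name
grid_bacc = GridSearchCV(pipe, parms, cv=5, scoring='balanced_accuracy')

## A custom scorer that weights recall more heavily than precision (beta = 2)
f2_scorer = make_scorer(fbeta_score, beta=2)
grid_f2 = GridSearchCV(pipe, parms, cv=5, scoring=f2_scorer)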

Part 5 - Multiple Classes

In the final section of this lab we'll briefly look at performance metrics for multi-class applications. To do so, we'll revisit the MNIST data from our previous lab:

In [11]:
### Read flattened, processed MNIST data
mnist = pd.read_csv("https://remiller1450.github.io/data/mnist_small.csv")

### Separate the label column (outcome)
label = mnist['label']
mnist = mnist.drop(['label'], axis="columns")

### Convert to numpy array and reshape to 28 by 28
mnist_unflattened = mnist.to_numpy()
mnist_unflattened = mnist_unflattened.reshape(6000,28,28)

### Import grayscale color map
import matplotlib.cm as cm

## Plot the first five samples (to refresh your memory)
fig, axs = plt.subplots(ncols=5)
for i in range(5):
    axs[i].imshow(mnist_unflattened[i], cmap=cm.Greys)
    axs[i].title.set_text(f'label={label[i]}')
plt.show()

Recall that these data have already been preprocessed such that the pixel grayscale intensities (the only type of feature in these data) are already on a standardized measurement scale.

For illustrative purposes, let's fit a simple kNN model and display its confusion matrix:

In [12]:
## Set up knn model
knn_mnist = KNeighborsClassifier(n_neighbors=20)

## Use 5-fold CV to get predicted labels
mnist_label_pred = cross_val_predict(knn_mnist, mnist, label, cv=5)

## Display confusion matrix
confusion_matrix(label, mnist_label_pred)
Out[12]:
array([[604,   1,   2,   1,   0,   2,   7,   0,   1,   0],
       [  0, 693,   1,   0,   0,   0,   1,   1,   0,   0],
       [ 10,  25, 510,   6,   4,   2,   3,  17,   8,   3],
       [  3,  15,   3, 548,   1,   7,   2,   4,   8,   7],
       [  0,  17,   0,   0, 538,   0,   4,   5,   0,  28],
       [  5,  20,   0,  23,   3, 437,   7,   1,   3,  12],
       [ 10,  10,   0,   0,   0,   5, 558,   0,   0,   0],
       [  0,  26,   0,   0,   3,   1,   0, 596,   0,  17],
       [  8,  32,   0,  25,   7,  18,   4,   5, 459,  22],
       [  4,   7,   0,   9,   6,   0,   1,  17,   1, 546]], dtype=int64)

Because this confusion matrix is relatively large, we might choose to visualize it:

In [13]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(label, mnist_label_pred)
Out[13]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1cf7f2fadf0>

The confusion matrix itself is useful in this application, as we can see that 5's and 8's are the most difficult digits to classify. This information might prompt us to collect more data belonging to those categories, or to devise new strategies to improve performance.
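One way to quantify which digits are hardest is to compute per-class recall directly from the confusion matrix, since each row corresponds to a true digit. A brief optional sketch:

## Optional sketch: per-class recall (diagonal divided by row totals)
cm = confusion_matrix(label, mnist_label_pred)
print(pd.Series(cm.diagonal() / cm.sum(axis=1), index=range(10)))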

For multi-class classification tasks, we can still calculate performance metrics like ROC-AUC or the F1 score using the functions introduced earlier in the lab, but we'll need to supply a few additional arguments:

  1. average controls whether we'd like calculations to use micro-averaging (which gives each observation equal weight) or macro-averaging (which gives each class equal weight).
  2. multi_class controls whether we'd like calculations to use one-vs-rest or one-vs-one comparison schemes.
In [14]:
from sklearn.metrics import f1_score, roc_auc_score

## Macro-averaging F1
f1_score(label, mnist_label_pred, average='macro')

## Micro-averaging F1
f1_score(label, mnist_label_pred, average='micro')

## Macro-averaging ROC-AUC w/ one-vs-one
mnist_label_prob = cross_val_predict(knn_mnist, mnist, label, cv=5, method='predict_proba')
roc_auc_score(label, mnist_label_prob, average='macro', multi_class='ovo')

## Macro-averaging ROC-AUC w/ one-vs-rest
roc_auc_score(label, mnist_label_prob, average='macro', multi_class='ovr')
Out[14]:
0.992861360171459

Recognize that some performance metrics can only be calculated for certain combinations of the average and multi_class arguments. For example, the F1 score is inherently a one-vs-rest metric (by virtue of how precision and recall are defined), while ROC-AUC does not accept average='micro' in multi-class problems.
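You can also skip averaging entirely and inspect each class separately, either by passing average=None or by using classification_report. A short optional sketch:

## Optional sketch: per-class F1 scores (no averaging)
f1_score(label, mnist_label_pred, average=None)

## A formatted per-class summary of precision, recall, and F1
from sklearn.metrics import classification_report
print(classification_report(label, mnist_label_pred))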

Part 6 - Application

Can something as simple as your cell phone's Wi-Fi signal strength readings accurately predict your location within a building?

Researchers collected data inside a Pittsburgh office building where they stood at various locations in four different rooms and measured the Wi-Fi signal strength of the office's seven wireless routers on an Android cell phone. Their goal was to develop methods to predict the phone's location (room) based upon these measurements. You can view the paper at this link: Rohra et al 2017.

For your reference, Wi-Fi signal strength is measured in decibel milliwatts (dBm), with values typically ranging from -30 dBm, a "perfect" signal, to -90 dBm, effectively zero signal (and a common threshold for "not connected").

In [15]:
## Read data
wifi = pd.read_csv("https://remiller1450.github.io/data/wifi.csv")
wifi.head(5)
Out[15]:
S1 S2 S3 S4 S5 S6 S7 Room
0 -64 -56 -61 -66 -71 -82 -81 1
1 -68 -57 -61 -65 -71 -85 -85 1
2 -63 -60 -60 -67 -76 -85 -84 1
3 -61 -60 -68 -62 -77 -90 -80 1
4 -63 -65 -60 -63 -77 -81 -87 1

Question #4 (application)

  • Part A) Perform an 80-20 training/testing split. Separate the predictors and the outcome. Then set up a pipeline that will standardize/scale the data before applying a $k$-nearest neighbors classifier.
  • Part B) Use 5-fold cross-validation and randomized search, grid search, or a combination of the two search techniques to find a combination of scaler, number of neighbors, weighting, and distance measure ($p$) that achieves a high level of classification accuracy. Report this combination along with the cross-validated accuracy.
  • Part C) Use ConfusionMatrixDisplay to create a visualization of the confusion matrix for the classifier you found in Part B. Report the room number that is most commonly misclassified.
  • Part D) Re-run your tuning parameter search (Part B) using the macro-averaged F1 score as the model performance metric. Did the optimal combination of tuning parameters change?
  • Part E) Evaluate the final model from Part D on the test set and report its overall classification accuracy and macro-averaged F1 score. Do these values appear to indicate overfitting to the training data? Briefly explain.