This lab covers various approaches to understanding the performance of classification models.
## Standard Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
# KNN will warn you about future updates, but we'll turn these off
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
The examples in this lab will use the Wisconsin breast cancer data introduced in Lab 4. These data were collected in a study that explored using machine learning to diagnose breast cancer tissue samples as cancerous (a label of 'M'
for malignant) or non-cancerous (a label of 'B'
for benign). The features in this data set describe the average characteristics (radius, symmetry, etc.) of the cells collected in each tissue sample.
wbc = pd.read_csv("https://remiller1450.github.io/data/wisc_bc.csv")
To begin, we'll perform an 80-20 training-testing split, separate the target variable, and drop the ID
column.
## Train-test split
from sklearn.model_selection import train_test_split
train, test = train_test_split(wbc, test_size=0.2, random_state=7)
## Separate the target from the predictors
train_y = train['Label']
test_y = test['Label']
train_X = train.drop(['ID','Label'], axis = 1)
test_X = test.drop(['ID','Label'], axis = 1)
To generate a confusion matrix, we need a set of predicted labels to compare with the actual labels present in the data. Because we'd like the confusion matrix we're analyzing to reflect the performance of our methods on new data, we should use out-of-sample predictions found using cross-validation, which we can obtain using the cross_val_predict() function:
## Function imports
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict
## Pipeline: normalize -> scale -> fit classifier
pipe = Pipeline([
('transformer', PowerTransformer(method = 'yeo-johnson')),
('scaler', StandardScaler()),
('classifier', KNeighborsClassifier(n_neighbors = 8))
])
## Get out-of-sample predictions using 5-fold CV
train_y_pred = cross_val_predict(estimator = pipe, X = train_X, y = train_y, cv = 5)
You should note that cross_val_predict() will accept any estimator that has fit() and predict() methods, so we could provide a pipeline whose final step is a model, or just a model on its own.
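For example, here is a brief sketch (using the same training data and the functions imported above) that supplies a bare KNeighborsClassifier instead of a pipeline; without the transformation and scaling steps the resulting predictions will generally differ:
## Sketch: a bare model (no pipeline) also works with cross_val_predict()
knn_only = KNeighborsClassifier(n_neighbors = 8)
train_y_pred_knn_only = cross_val_predict(estimator = knn_only, X = train_X, y = train_y, cv = 5)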
The simplest way to produce a confusion matrix in sklearn is the confusion_matrix() function:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true = train_y, y_pred = train_y_pred, labels = ['M','B'])
array([[154,  18],
       [ 10, 273]], dtype=int64)
The labels argument governs the order of the rows and columns in the confusion matrix. In this example we provided the list ['M','B'] because it's most natural to view malignant tumors, or 'M', as the "positive" class. If labels is not provided, the rows and columns are ordered using the values that appear at least once in y_true or y_pred, sorted in alpha-numeric order.
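If you need the individual cell counts for calculations such as the true positive rate or true negative rate, the array returned by confusion_matrix() can be unpacked directly. Below is a small sketch that assumes the ['M','B'] ordering used above, where rows are true labels and columns are predicted labels:
## Unpack the cell counts (assumes labels = ['M','B'])
cm = confusion_matrix(y_true = train_y, y_pred = train_y_pred, labels = ['M','B'])
TP, FN, FP, TN = cm[0,0], cm[0,1], cm[1,0], cm[1,1]
print('TPR:', TP/(TP + FN), 'TNR:', TN/(TN + FP))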
Another useful tool is ConfusionMatrixDisplay, whose from_predictions() method will create a graphical display of the confusion matrix:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(train_y, train_y_pred, labels = ['M','B'])
plt.show()
Question 1:
- Using 'M' as the positive class, compare the true positive rate of the approach outlined in Part A with the true positive rate achieved by the example pipeline used in this section.
- Using 'M' as the positive class, compare the true negative rate of the approach outlined in Part A with the true negative rate achieved by the example pipeline used in this section.

ROC curve analysis provides a more nuanced view of the performance of a classification model. Since the fundamental idea behind an ROC curve is to evaluate the trade-off between false positives and false negatives at various probability thresholds, our first step is to obtain predicted probabilities instead of predicted class labels. We can do this by setting the method argument in cross_val_predict() to 'predict_proba':
train_y_pred_prob = cross_val_predict(estimator = pipe, X = train_X, y = train_y, cv = 5, method = 'predict_proba')
It's important to recognize that this will produce an array containing a column for each category of the outcome (arranged in alpha-numeric order). Because of this, we'll use the second column (i.e., index position 1) when creating our ROC curve.
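If you're ever unsure of this ordering, you can check it directly; a quick sketch using numpy:
## Confirm the alpha-numeric class ordering ('B' comes first, so column 1 holds the probability of 'M')
print(np.unique(train_y))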
# ROC curve plotting:
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_predictions(train_y, train_y_pred_prob[:,1], pos_label='M')
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x1fde6c8ec10>
Unfortunately RocCurveDisplay()
is not easily amenable to showing multiple ROC curves on the same plot, which is a common task when summarizing the results of several competing approaches. Thus, it's worthwhile to know how to create a similar graphic yourself:
from sklearn.metrics import roc_curve, roc_auc_score
## Get ROC curve components and AUC for model 1
fpr, tpr, thresh = roc_curve(train_y, train_y_pred_prob[:,1], pos_label = 'M')
auc = roc_auc_score(train_y, train_y_pred_prob[:,1], labels = ['M','B'])
## Same for model 2, but let's assume our 2nd model is just random guessing (for illustration)
fpr2, tpr2, thresh2 = roc_curve(train_y, np.ones(len(train_y)), pos_label = 'M')
auc2 = roc_auc_score(train_y, np.ones(len(train_y)), labels = ['M','B'])
## Create the plot
plt.plot(fpr,tpr,label="Model 1, auc="+str(auc))
plt.plot(fpr2,tpr2,label="Guessing, auc="+str(auc2))
plt.legend(loc=0)
plt.show()
Finally, you should note that the values used to create these ROC curves can be accessed and explored numerically:
fpr, tpr, thresh = roc_curve(train_y, train_y_pred_prob[:,1], pos_label = 'M')
print(pd.DataFrame({'t': thresh, 'TPR': tpr,'FPR': fpr}))
       t       TPR       FPR
0    inf  0.000000  0.000000
1  1.000  0.715116  0.000000
2  0.875  0.802326  0.010601
3  0.750  0.854651  0.017668
4  0.625  0.895349  0.035336
5  0.500  0.930233  0.084806
6  0.375  0.947674  0.102473
7  0.250  0.976744  0.169611
8  0.125  0.982558  0.314488
9  0.000  1.000000  1.000000
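These arrays also make it straightforward to search for a threshold with desirable properties. As an illustration (not a required step in this lab), the sketch below finds the threshold that maximizes the gap between the true positive rate and false positive rate (Youden's J statistic):
## Find the threshold maximizing TPR - FPR (Youden's J)
best_idx = np.argmax(tpr - fpr)
print('Threshold:', thresh[best_idx], 'TPR:', tpr[best_idx], 'FPR:', fpr[best_idx])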
Question 2:
Some applications are highly focused on the positive class, which makes Precision, the fraction of predicted positives that are actually positive, an important quantity to explore at various probability thresholds. This metric can be graphed against Recall, another name for the true positive rate, to create a Precision-Recall (PR) curve. The functions used for this are directly analogous to those we used for ROC curve analysis:
from sklearn.metrics import precision_recall_curve, PrecisionRecallDisplay
PrecisionRecallDisplay.from_predictions(train_y, train_y_pred_prob[:,1], pos_label = 'M')
pre, rec, thresholds = precision_recall_curve(train_y, train_y_pred_prob[:,1], pos_label = 'M')
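A common single-number summary of a PR curve is average precision. As a brief sketch analogous to our earlier use of roc_auc_score(), it can be computed with average_precision_score():
## Average precision summarizes the PR curve with a single number
from sklearn.metrics import average_precision_score
print(average_precision_score(train_y, train_y_pred_prob[:,1], pos_label = 'M'))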
Question 3: Using the example results given above, calculate the F1 score at the smallest classification threshold with a cross-validated Precision above 0.97. Use the exact values rather than trying to infer them from the PR curve displayed above. Hint: I encourage you to view the documentation for precision_recall_curve() to learn about what this function returns, which might help you reconcile differences in object lengths that could lead to errors or misunderstandings.
A nice feature of pipelines and functions like GridSearchCV() is their compatibility with different model performance criteria. The code below demonstrates how to find the best modeling pipeline using the cross-validated F1 score as the performance criterion:
## Simple pipeline
pipe = Pipeline([
('scaler', StandardScaler()),
('classifier', KNeighborsClassifier())
])
## Parameters to try via grid search
from sklearn.preprocessing import RobustScaler
parms = {'scaler': [StandardScaler(), RobustScaler()],
'classifier__n_neighbors': [6,10],
'classifier__weights': ['uniform','distance']
}
## Conduct grid search and print best estimator/score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer
my_score = make_scorer(f1_score, pos_label = 'M')
grid = GridSearchCV(pipe, parms, cv=5, scoring=my_score).fit(train_X, train_y)
## Print results
print(grid.best_estimator_)
print(grid.best_score_)
Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier', KNeighborsClassifier(n_neighbors=10, weights='distance'))])
0.9114487539813068
Something to note in this example was the use of make_scorer(), which is required if you'd like to deviate from the default arguments associated with the built-in scoring methods in GridSearchCV(). This was necessary for our application because the label 'M' denotes the positive class, but the first alpha-numeric label in the data, which is the default positive class for F1 scoring, is 'B'.
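The same pattern applies to other metrics whose defaults need adjusting. For instance, a sketch of re-running the same grid search using recall for the 'M' class as the scoring criterion might look like this:
## Sketch: score the same grid search by recall for the 'M' class
from sklearn.metrics import recall_score
recall_scorer = make_scorer(recall_score, pos_label = 'M')
grid_recall = GridSearchCV(pipe, parms, cv=5, scoring=recall_scorer).fit(train_X, train_y)
print(grid_recall.best_estimator_)
print(grid_recall.best_score_)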
Question 4:
Many of the same functions introduced earlier in this lab are compatible with multi-class prediction tasks; however, their usage is slightly more complicated when compared to binary classification.
- The average argument dictates whether we'd like calculations to use micro-averaging (giving each observation equal weight) or macro-averaging (giving each class equal weight).
- The multi_class argument dictates whether calculations use a one-vs-rest or a one-vs-one comparison scheme.

It's essential to recognize that some performance metrics can only be calculated for certain combinations of the average and multi_class arguments. For example, the F1 score is inherently a one-vs-rest metric due to how precision and recall are defined, so functions like f1_score() do not involve a multi_class argument.
Below are a few examples of how these arguments can be used:
## Macro-averaging F1
a = f1_score(train_y, train_y_pred, labels = ['M','B'], average='macro')
## Micro-averaging F1
b = f1_score(train_y, train_y_pred, labels = ['M','B'], average='micro')
## Macro-averaging ROC-AUC w/ one-vs-one
c = roc_auc_score(train_y, train_y_pred_prob[:,1], average='macro', multi_class='ovo')
## Macro-averaging ROC-AUC w/ one-vs-rest
d = roc_auc_score(train_y, train_y_pred_prob[:,1], average='macro', multi_class='ovr')
## Results
print(a, b, c, d)
0.9339430894308942 0.9384615384615385 0.9771653381543266 0.9771653381543266
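Because our outcome has only two classes, the multi_class argument has no real effect in the example above (notice that c and d are identical). The sketch below uses a small synthetic three-class data set, purely for illustration, to show these arguments in a genuinely multi-class setting; note that multi-class ROC-AUC requires the full matrix of predicted probabilities (one column per class):
## Synthetic 3-class data (for illustration only; not part of this lab's data sets)
from sklearn.datasets import make_classification
Xm, ym = make_classification(n_samples = 300, n_classes = 3, n_informative = 5, random_state = 7)
## Cross-validated labels and probabilities from a KNN classifier
ym_pred = cross_val_predict(KNeighborsClassifier(), Xm, ym, cv = 5)
ym_prob = cross_val_predict(KNeighborsClassifier(), Xm, ym, cv = 5, method = 'predict_proba')
## Macro-averaged F1, plus one-vs-rest and one-vs-one macro-averaged ROC-AUC
print(f1_score(ym, ym_pred, average = 'macro'))
print(roc_auc_score(ym, ym_prob, multi_class = 'ovr', average = 'macro'))
print(roc_auc_score(ym, ym_prob, multi_class = 'ovo', average = 'macro'))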
Researchers collected data in a large Pittsburgh office building by standing in various locations inside four different rooms and measuring the Wi-Fi signal strength of the office's seven wireless routers using an Android cell phone. Their goal was to develop methods to predict the phone's location (room) based upon these measurements. You can view the paper at this link: Rohra et al 2017
For your reference, Wi-Fi signal strength is measured in decibel milliwatts (dBm), with values typically ranging from -30 dBm, a "perfect" signal, to -90 dBm, which is effectively zero signal and a common threshold for a device to display "not connected".
## The Wi-Fi positioning data
wifi = pd.read_csv("https://remiller1450.github.io/data/wifi.csv")
wifi.head(5)
|   | S1 | S2 | S3 | S4 | S5 | S6 | S7 | Room |
|---|----|----|----|----|----|----|----|------|
| 0 | -64 | -56 | -61 | -66 | -71 | -82 | -81 | 1 |
| 1 | -68 | -57 | -61 | -65 | -71 | -85 | -85 | 1 |
| 2 | -63 | -60 | -60 | -67 | -76 | -85 | -84 | 1 |
| 3 | -61 | -60 | -68 | -62 | -77 | -90 | -80 | 1 |
| 4 | -63 | -65 | -60 | -63 | -77 | -81 | -87 | 1 |
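Before fitting any models you'll want to separate the room labels from the signal-strength predictors. A minimal sketch, assuming the column names shown above (note that Room stores an integer code for a categorical outcome):
## Separate the target (Room) from the signal-strength predictors
wifi_y = wifi['Room']
wifi_X = wifi.drop('Room', axis = 1)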
Question 5: For the parts that follow you should use the wifi data described above.
- Use ConfusionMatrixDisplay() to create a visualization of the confusion matrix for the classifier you found in Part B. Be sure to use cross-validated predictions. Briefly describe the main highlights (i.e., successes and shortcomings) of the best model from Part B.