Lab 4 - Classifier Performance¶

This lab covers various ways to assess and report upon the performance of classification models.

Directions: Please read through the contents of this lab with your partner and try the examples. After you're both confident that you understand a topic you should attempt the associated exercise and record your answer in your own Jupyter notebook that you will submit for credit. The notebook you submit should only contain answers to the lab's exercises (so you should remove any code you ran for the examples, or use a separate notebook to test out the examples).

To begin, you'll need the following libraries:

In [1]:
## Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  

The examples during the lab will use the "Wisconsin breast cancer" data set from the UC Irvine machine learning data repository:

In [2]:
wisc_bc = pd.read_csv("https://remiller1450.github.io/data/wisc_bc.csv")

Our analysis of these data will build models to classify cells as either malignant (cancerous) or benign (non-cancerous), which is recorded in the variable Label. The predictive features are derived from images of the cell nuclei.

We will begin by performing an 80-20 train-test split, separating the outcome variable, then removing the ID column:

In [3]:
## Train-test split
from sklearn.model_selection import train_test_split
train, test = train_test_split(wisc_bc, test_size=0.2, random_state=7)

## Separate the target from the predictors
train_y = train['Label']
test_y = test['Label']
train_X = train.drop(['ID','Label'], axis = 1)
test_X = test.drop(['ID','Label'], axis = 1)

Part 1 - Scorers and Confusion Matrices¶

In our previous lab we saw that the scoring argument of GridSearchCV() allowed us to change how models were evaluated. The complete list of strings that can be given as scoring metrics is found at this link.

The default scoring metric used by GridSearchCV() is classification accuracy. For imbalanced data, accuracy tends to favor models that predict the majority class, since such models can score well without correctly identifying much of the minority class.

A simple way to investigate this favoritism is the confusion matrix, which can be created using the confusion_matrix() function:

In [4]:
## Function imports
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict

## Pipeline
pipe = Pipeline([
 ('scaler', StandardScaler()),
 ('classifier', KNeighborsClassifier(n_neighbors = 8))
])

## Get out-of-sample predictions using 5-fold CV
train_y_pred = cross_val_predict(estimator = pipe, X = train_X, y = train_y, cv = 5, method = 'predict')

## Confusion matrix:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true = train_y, y_pred = train_y_pred, labels = ['M','B'])
Out[4]:
array([[150,  22],
       [ 11, 272]], dtype=int64)

There are a few things you should pay attention to in this example:

  1. We wanted to use out-of-sample predictions in the confusion matrix, but we aren't ready to use the test set yet, so we use the cross_val_predict() function to get cross-validated predictions.
  2. We did not perform any hyperparameter tuning in this example, but if we did we could provide the best_estimator_ from GridSearchCV() as the estimator argument in cross_val_predict() (see the sketch after this list).
  3. The default argument method = 'predict' returns predicted classes based upon the highest probability outcome. However, it is useful to know that method = 'predict_proba' can be used to get the predicted probabilities themselves.
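
To make point 2 concrete, here is a minimal sketch of that workflow (the parameter grid below is hypothetical and only for illustration):

## Hypothetical grid search over the KNN step of the pipeline
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, param_grid = {'classifier__n_neighbors': [5, 10, 15]}, cv = 5)
grid.fit(train_X, train_y)

## Cross-validated predictions from the best pipeline found by the search
best_pred = cross_val_predict(estimator = grid.best_estimator_, X = train_X, y = train_y, cv = 5, method = 'predict')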

You should also notice that the raw confusion matrix output isn't very visually appealing or easy to read. The ConfusionMatrixDisplay() function provides a nicer output:

In [5]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(train_y, train_y_pred, labels = ['M','B'])
plt.show()

Many of the scoring functions used for classifiers expect the data to use a label of 1 for the positive class and 0 for the negative class. The Wisconsin breast cancer data set does not use this convention, so we might opt to create our own scoring function using make_scorer(), which allows us to provide our own label for the positive class:

In [6]:
from sklearn.metrics import f1_score, make_scorer
f1_scorer = make_scorer(f1_score, average='binary', pos_label='M') 

This scorer can then be given to functions like GridSearchCV() in place of the string name referencing a pre-built scorer.
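
For instance, a grid search that evaluates candidate models with this custom scorer might look like the following sketch (the tuning grid here is hypothetical and only for illustration):

## Hypothetical grid search scored with our custom F1 scorer
from sklearn.model_selection import GridSearchCV
params = {'classifier__n_neighbors': [3, 7, 11]}
grid = GridSearchCV(pipe, param_grid = params, scoring = f1_scorer, cv = 5)
grid.fit(train_X, train_y)
print(grid.best_params_, round(grid.best_score_, 3))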

Question #1:

  • Part A: Set up a parameter grid that considers either standardization or robust scaling, values of n_neighbors ranging from 5 to 25 by increments of 5, and uniform or distance weighting.
  • Part B: Display a confusion matrix showing the errors made by the best model identified in Part A.
  • Part C: Considering 'M' (malignant) to be the positive class, calculate the true positive and false positive rates from the confusion matrix you created in Part B.
  • Part D: Modify your grid search from Part B to use the F1 score as your performance metric. Make sure that you're using the label 'M' as the positive class. Report the cross-validated F1 score of the best model.

Part 2 - Receiver Operating Characteristic (ROC) Analysis¶

Recall that the ROC curve displays the trade-off between the true positive rate and the false positive rate across the range of thresholds used to map the predicted probability of the positive class to a predicted label. As alluded to earlier in the lab, we will need to use the argument method = 'predict_proba' in cross_val_predict() as the first step in an ROC analysis:

In [7]:
train_y_pred_prob = cross_val_predict(estimator = pipe, X = train_X, y = train_y, cv = 5, method = 'predict_proba')

It is essential to recognize that train_y_pred_prob is an array with two columns, each representing the predicted probability of one category of the outcome variable (arranged in alphabetical order, so the first column corresponds to 'B' and the second to 'M'). Because we want malignant observations to be the positive class, we will only provide the second column to RocCurveDisplay, which plots the ROC curve:

In [8]:
# ROC curve plot
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_predictions(train_y, train_y_pred_prob[:,1], pos_label='M')
plt.show()
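
If you're ever unsure which column of the probability array corresponds to which class, one quick check (our own addition, not part of the original example) is to fit the pipeline and inspect the classifier's classes_ attribute, whose order matches the columns returned by predict_proba:

## The order of classes_ matches the columns of the predicted probability array
pipe.fit(train_X, train_y)
print(pipe.named_steps['classifier'].classes_)  ## should be ['B' 'M'], so column 1 corresponds to 'M'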

Unfortunately, it is relatively cumbersome to display the ROC curves of multiple models on the same plot using RocCurveDisplay. Consequently, you should know how to create an ROC curve on your own, which makes it easy to add the curves of as many models as you want to display:

In [9]:
## Functions to get TPR, FPR, and Thresholds for curves
from sklearn.metrics import roc_curve, roc_auc_score

## Get ROC curve components and AUC for model 1
fpr, tpr, thresh = roc_curve(train_y, train_y_pred_prob[:,1], pos_label = 'M')
auc = roc_auc_score(train_y, train_y_pred_prob[:,1], labels = ['M','B'])

## Same for model 2, but let's assume our 2nd model always guesses the positive class (for illustration)
fpr2, tpr2, thresh2 = roc_curve(train_y, np.ones(len(train_y)), pos_label = 'M')
auc2 = roc_auc_score(train_y, np.ones(len(train_y)), labels = ['M','B'])

## Create the plot
plt.plot(fpr,tpr,label="Model 1, auc="+str(round(auc,3)))
plt.plot(fpr2,tpr2,label="Guessing, auc="+str(auc2))
plt.legend(loc=0)
plt.show()

It's worthwhile taking a minute to see the values that went into making these ROC curves so that you can better understand their nuances:

In [10]:
fpr, tpr, thresh = roc_curve(train_y, train_y_pred_prob[:,1], pos_label = 'M')
print(pd.DataFrame({'t': thresh, 'TPR': tpr,'FPR': fpr}))
       t       TPR       FPR
0    inf  0.000000  0.000000
1  1.000  0.691860  0.003534
2  0.875  0.784884  0.014134
3  0.750  0.848837  0.021201
4  0.625  0.872093  0.038869
5  0.500  0.918605  0.088339
6  0.375  0.947674  0.113074
7  0.250  0.965116  0.173145
8  0.125  0.988372  0.325088
9  0.000  1.000000  1.000000

Question #2:

  • Part A: Display the ROC curves of two different models using their cross-validated predictions. The first should be a KNN model with 15 neighbors and distance weighting using standardization to re-scale the data. The second should be a decision tree with a maximum depth of 4. You may use default arguments for any other parameters.
  • Part B: Print a Data Frame showing the decision thresholds and corresponding TPR/FPR of the KNN model you used in Part A. How does the number of rows (threshold values) compare to the example from this section? Briefly explain why the number of rows differs from the example. Hint: Think about the number of neighbors used by each.
  • Part C: Use the Data Frame you printed in Part B to determine the highest true positive rate that can be achieved with a false positive rate no larger than 5%.

Part 3 - Precision-Recall (PR) Analysis¶

While ROC analysis tends to be broadly applicable, some applications involve a rare positive class whose accurate identification is highly important. In these situations, precision-recall (PR) analysis is often more informative because it focuses on the positive class and ignores true negatives.

The functions used in PR analysis are direct analogues to those introduced in the previous section:

In [11]:
## Display a PR curve from predictions
from sklearn.metrics import precision_recall_curve, PrecisionRecallDisplay
PrecisionRecallDisplay.from_predictions(train_y, train_y_pred_prob[:,1], pos_label = 'M')
Out[11]:
<sklearn.metrics._plot.precision_recall_curve.PrecisionRecallDisplay at 0x219bd457ee0>
In [12]:
## Get precision/recall information to make our own PR curve or investigate different thresholds
pre, rec, thresholds = precision_recall_curve(train_y, train_y_pred_prob[:,1], pos_label = 'M')
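
A common single-number summary of a PR curve is the average precision, which is closely related to the area under the curve. One way to compute it directly (an aside we've added here, not part of the original example) is the average_precision_score() function:

## Average precision: a single-number summary of the PR curve
from sklearn.metrics import average_precision_score
ap = average_precision_score(train_y, train_y_pred_prob[:,1], pos_label = 'M')
print(round(ap, 3))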

Question #3:

  • Part A: Using code from the ROC analysis section as a template, try creating a Data Frame containing the threshold, precision, and recall values given by precision_recall_curve() in the example provided above. You should receive an error message explaining that the involved components have different lengths. Read the function documentation to learn why these lengths differ, then use this information to create and print a data frame showing the precision and recall at every threshold.
  • Part B: Use the data and model from this section's example to create another PR curve considering the other possible outcome, benign or 'B', as the positive class. Does the area under this curve differ compared to when malignant or 'M' was considered the positive class? Briefly explain.
  • Part C: Similar to Part B, modify code from the example from the previous section (ROC analysis) to create an ROC curve that uses benign or 'B' as the positive class. Does the area under this curve differ compared to when malignant or 'M' was considered the positive class? Briefly explain how this might influence your decision to use ROC analysis instead of PR analysis.

Part 4 - Multiclass Classification¶

Part 1 of this lab introduced scoring functions such as f1_score() in its discussion of custom scorers made using the make_scorer() function. Most of these scoring functions can accommodate multiclass outcomes using one or both of the following arguments:

  • multi_class - which dictates whether a one-vs-rest or one-vs-one comparison scheme should be used.
  • average - which dictates whether calculations should use micro-averaging (giving each observation equal importance) or macro-averaging (giving each class equal importance).

It is important to recognize that not every metric supports every combination of these arguments. For example, the F1 score is intrinsically a one-vs-rest metric because precision and recall are defined separately for each class, so f1_score() does not have a multi_class argument.

Below are a few different examples of how these arguments can be used for different scoring functions:

In [13]:
## Macro-averaging F1
a = f1_score(train_y, train_y_pred, labels = ['M','B'], average='macro')

## Micro-averaging F1
b = f1_score(train_y, train_y_pred, labels = ['M','B'], average='micro')

## Macro-averaging ROC-AUC w/ one-vs-one
c = roc_auc_score(train_y, train_y_pred_prob[:,1], average='macro', multi_class='ovo')

## Macro-averaging ROC-AUC w/ one-vs-rest
d = roc_auc_score(train_y, train_y_pred_prob[:,1], average='macro', multi_class='ovr')

## Neatly printed results
print(f"Macro F1: {a} | Micro F1: {b} | ROC-AUC (macro, ovo): {c} | ROC-AUC (macro, ovr): {d}")
Macro F1: 0.9218542632754072 | Micro F1: 0.9274725274725275 | ROC-AUC (macro, ovo): 0.974248089407511 | ROC-AUC (macro, ovr): 0.974248089407511
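
Because the outcome in this example only has two classes, the one-vs-one and one-vs-rest ROC-AUC values above are identical. For a genuinely multiclass outcome (like the rooms in Part 5), roc_auc_score() needs the full matrix of predicted probabilities rather than a single column. A minimal sketch, assuming hypothetical objects multi_y (the class labels) and multi_y_pred_prob (an array with one probability column per class, ordered like the classifier's classes_), might look like:

## Hypothetical multiclass ROC-AUC: pass every probability column, not just one
macro_ovr_auc = roc_auc_score(multi_y, multi_y_pred_prob, average = 'macro', multi_class = 'ovr')
print(round(macro_ovr_auc, 3))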

Part 5 - Application¶

In a 2017 paper by Rohra et al., researchers collected data in a large office building by standing at various locations in four different rooms and measuring the Wi-Fi signal strength from the office's seven wireless routers using an Android cell phone. Their goal was to be able to use these signal strength measurements to accurately classify the room in which the phone was located.

The first few rows of the data are shown below. You should note that Wi-Fi signal strength is recorded in decibel-milliwatts (dBm), with values generally ranging from -30 dBm (a "perfect" signal) to -90 dBm (a common threshold at which devices begin to display "not connected").

In [14]:
## The Wi-Fi positioning data
wifi = pd.read_csv("https://remiller1450.github.io/data/wifi.csv")
wifi.head(5)
Out[14]:
    S1   S2   S3   S4   S5   S6   S7  Room
0  -64  -56  -61  -66  -71  -82  -81     1
1  -68  -57  -61  -65  -71  -85  -85     1
2  -63  -60  -60  -67  -76  -85  -84     1
3  -61  -60  -68  -62  -77  -90  -80     1
4  -63  -65  -60  -63  -77  -81  -87     1

Question #5:

  • Part A: Perform an 80-20 training/testing split using random state 12. Then perform a brief exploratory analysis to assess whether the data contain any unexpected values or highly skewed distributions.
  • Part B: Perform a cross-validated grid search that considers KNN classifiers with at least 3 different values of $k$, uniform or distance weighting, and at least 2 different rescaling methods, as well as decision tree classifiers with at least 4 different maximum depths. Use classification accuracy as the scoring metric, and print a Data Frame showing the top-5 best performing methods.
  • Part C: Display the confusion matrix summarizing the performance of the best classification method you identified in Part B. Briefly describe when the performance of the classifier was best and worst based upon what you see in the confusion matrix.
  • Part D: Change the scoring metric in your grid search from accuracy to macro-averaged F1 score and print a Data Frame showing the top-5 best performing methods. Briefly comment upon whether the change in scorer influenced which models performed the best.
  • Part E: Report the classification accuracy and macro-averaged F1 score of the best performing model from Part D on the test set.