Lab 2 - Hierarchical Clustering and DBSCAN¶

In this lab we will shift our focus towards machine learning methods themselves and away from the inner workings of Python, numpy, and pandas. You should be prepared to use information from Lab 1, but also to consult published documentation to learn about useful functions, methods, and attributes on your own.

In [1]:
## Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

The data used in this lab's applications are loaded below. These data describe the workforce characteristics of college degree holders in a variety of fields:

In [2]:
jobs = pd.read_csv("https://remiller1450.github.io/data/majors.csv")
jobs.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Major                    34 non-null     object 
 1   Category                 34 non-null     object 
 2   Workforce                34 non-null     float64
 3   Per_Male                 34 non-null     float64
 4   Per_Female               34 non-null     float64
 5   Age_25_34                34 non-null     float64
 6   Age_35_44                34 non-null     float64
 7   Age_45_54                34 non-null     float64
 8   Age_55_64                34 non-null     float64
 9   Per_White                34 non-null     float64
 10  Per_Black                34 non-null     float64
 11  Per_Asian                34 non-null     float64
 12  Per_Hispanic             34 non-null     float64
 13  Per_Other_Race           34 non-null     float64
 14  Per_Bachelors            34 non-null     float64
 15  Per_Masters              34 non-null     float64
 16  Per_Professional_Degree  34 non-null     float64
 17  Per_PhD                  34 non-null     float64
 18  Per_Employed             34 non-null     float64
 19  Per_Unemployed           34 non-null     float64
 20  Per_Non_Labor            34 non-null     float64
 21  Bach_Med_Income          34 non-null     float64
 22  Grad_Med_Income          34 non-null     float64
dtypes: float64(21), object(2)
memory usage: 6.2+ KB

Examples throughout the lab will use the "moons" and "blobs" toy data sets that were introduced in Lab 1:

In [3]:
## Datasets module
from sklearn import datasets

## Toy Dataset #1 - "moons"
moons = datasets.make_moons(n_samples=500, noise=0.11, random_state=8)
moons_X = moons[0]         # features to use in clustering
moons_labels = moons[1]    # true cluster identities

## Toy Dataset #2 - "blobs"
blobs = datasets.make_blobs(n_samples=500, cluster_std=2, random_state=8)
blobs_X = blobs[0]         # features to use in clustering
blobs_labels = blobs[1]    # true cluster identities
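
If you'd like a quick visual reminder of these two data sets, the optional sketch below plots each one colored by its true cluster labels (this step isn't required anywhere in the lab):

## Optional: visualize the toy data sets colored by their true cluster labels
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
axes[0].scatter(moons_X[:, 0], moons_X[:, 1], c=moons_labels)
axes[0].set_title("moons")
axes[1].scatter(blobs_X[:, 0], blobs_X[:, 1], c=blobs_labels)
axes[1].set_title("blobs")
plt.show()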

Part 1 - Agglomerative clustering¶

Model Fitting and Dendrograms¶

Agglomerative clustering is implemented in the cluster module of sklearn. The basic workflow of many models implemented in sklearn is to first initialize a model, then fit it to your data using its fit() method. Later in the lab we will also use the fit_predict() method, which fits the model and additionally returns each sample's cluster label:

In [4]:
from sklearn.cluster import AgglomerativeClustering
agg_cl = AgglomerativeClustering(linkage = 'single', distance_threshold=0, n_clusters=None).fit(moons_X)

The arguments distance_threshold=0 and n_clusters=None are used to ensure that the full dendrogram is fit to the data.
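
If you're curious about what the fitted object stores, the attributes that the dendrogram-plotting function below relies on can be inspected directly (a quick exploratory sketch, not part of the required workflow):

## Attributes of the fitted model that describe the full merge tree
print(agg_cl.children_.shape)    # (n_samples - 1, 2): the two nodes merged at each step
print(agg_cl.distances_.shape)   # (n_samples - 1,): the distance at which each merge occurred
print(agg_cl.labels_[:10])       # cluster labels assigned to the first 10 samples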

Unfortunately, there is not currently an official function to plot dendrograms in sklearn, but the following function, which is described here, can be used to create a basic dendrogram:

In [5]:
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)
In [6]:
## Part of the dendrogram for "moons"
plot_dendrogram(model = agg_cl, truncate_mode="level", p=10) # Plot is truncated at "level 10"

A few things you should notice in this example:

  1. The function relies upon the dendrogram() function in the scipy library. This function assigns colors to the branches of the dendrogram via its default arguments; a short sketch showing how to adjust this coloring appears after this list. You can read more here. You might also note that $k=2$ is the default number of clusters for AgglomerativeClustering().
  2. Along the x-axis of the dendrogram, numbers appearing in parentheses indicate the number of samples in that branch, while a number without parentheses is the index position of the single sample represented by that branch.
  3. We used the single linkage criterion, which is what causes the "chaining" behavior that can be seen in this subsection of the dendrogram (notably on the right side of the graph, where samples are merged into the cluster one by one).
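
Because plot_dendrogram() forwards any extra keyword arguments to scipy's dendrogram(), the branch coloring mentioned in the first point can be adjusted via the color_threshold argument. A small illustrative sketch (the threshold value chosen here is arbitrary):

## Branches that merge below color_threshold are drawn in a shared cluster color
## (agg_cl is still the full-tree fit from above at this point)
plot_dendrogram(model = agg_cl, truncate_mode="level", p=10, color_threshold=0.2)
plt.show()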

Question #1:

  • Part A: Apply agglomerative clustering using ward linkage to the "moons" data set. Display the first 10 levels of the resulting dendrogram.
  • Part B: How does this dendrogram compare to the example that used single linkage? Highlight any noticeable differences you see.

Cluster labels¶

It is possible to "cut" a hierarchical clustering model to obtain a set of $k$ non-overlapping clusters.

This practice allows us to compare the performance of hierarchical methods to alternatives like $k$-means, at least when we know the true clusters like we do in the simulated "moons" and "blobs" examples. In real-world applications, we must use our own judgement to decide if a clustering algorithm is producing useful and appropriate results.

The number of clusters can be chosen based upon the context of the application, or by inspecting the dendrogram to find a location where the height differences between merges begin to become large (an approach analogous to the "elbow" approach in $k$-means).
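
One rough way to look for such a location is to inspect the merge heights stored by a full-tree fit. The sketch below (the variable name full_tree is introduced here only for illustration) uses the distances_ attribute; a large gap between consecutive heights suggests a natural place to cut:

## Refit the full tree and inspect the largest merge heights (sketch)
full_tree = AgglomerativeClustering(linkage='single', distance_threshold=0, n_clusters=None).fit(moons_X)
top_heights = np.sort(full_tree.distances_)[-10:]
print(top_heights)            # the 10 tallest merges in the dendrogram
print(np.diff(top_heights))   # gaps between consecutive merge heights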

Below we refit our single linkage agglomerative clustering using $k=3$ and display the results. We use the fit_predict() method to get the predicted cluster labels for the given data:

In [7]:
agg_cl = AgglomerativeClustering(linkage = 'single', n_clusters=3).fit_predict(moons_X)
plt.scatter(moons_X[:,0], moons_X[:,1], c = agg_cl) 
plt.show()

In this example we see the failure of the single linkage method. Other linkage criteria, such as 'complete', tend to perform a little better, but still not well:

In [8]:
agg_cl = AgglomerativeClustering(linkage = 'complete', n_clusters=3).fit_predict(moons_X)
plt.scatter(moons_X[:,0], moons_X[:,1], c = agg_cl) 
plt.show()

You might also have noticed that we did not standardize our features, so these clusters might be disproportionately influenced by our first feature (shown on the x-axis). Unfortunately, standardization isn't enough to yield the results we desire:

In [9]:
from sklearn.preprocessing import StandardScaler
moons_XS = StandardScaler().fit_transform(moons_X) 

agg_cl = AgglomerativeClustering(linkage = 'complete', n_clusters=3).fit_predict(moons_XS)
plt.scatter(moons_XS[:,0], moons_XS[:,1], c = agg_cl) 
plt.show()

Question #2:

  • Part A: Analyze the "blobs" data set using agglomerative clustering to obtain $k=3$ clusters. Use scatter plots and your own judgements to decide upon an acceptable linkage criterion. Display a scatter plot showing cluster label assignments of your choice. Standardize the data before you begin your analysis.
  • Part B: Analyze the "blobs" data set using $k$-means clustering to obtain $k=3$ clusters. Display a scatter plot showing cluster label assignments of your choice. Standardize the data before you begin your analysis.
  • Part C: Compare the percentage of labels that $k$-means correctly identified with the percentage that agglomerative clustering correctly identified. Be careful to note that these labels are exchangeable, so you'll need to pay careful attention to their alignment (see the sketch after this list for one way to think about this).
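
To see what "exchangeable" means here, note that an algorithm may recover the right groups but number them differently than the true labels. One possible (not required) approach is to compute the match rate under every relabeling and keep the best one, sketched below on a tiny made-up example rather than the "blobs" data:

from itertools import permutations

## Tiny made-up example: the same grouping, numbered differently
true_labels = np.array([0, 0, 1, 1, 2, 2])
found_labels = np.array([2, 2, 0, 0, 1, 1])

## Try every mapping of found labels onto true labels and keep the best match rate
best = 0
for perm in permutations(np.unique(true_labels)):
    relabeled = np.array([perm[lab] for lab in found_labels])
    best = max(best, np.mean(relabeled == true_labels))
print(best)   # 1.0 -- a perfect match once the labels are aligned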

Part 2 - DBSCAN¶

In this section we'll demonstrate DBSCAN clustering using the "moons" data. The code below fits DBSCAN clusters to these data using the default arguments:

In [10]:
from sklearn.cluster import DBSCAN
results_dbscan = DBSCAN().fit(moons_XS)
plt.scatter(moons_XS[:,0], moons_XS[:,1], c = results_dbscan.labels_)
Out[10]:
<matplotlib.collections.PathCollection at 0x1643349ce50>

We see that the default values of eps and min_samples are not well suited to our data: with the default eps, all of the points end up grouped into a single cluster. We can try some smaller values:

In [11]:
results_dbscan = DBSCAN(eps=0.1, min_samples=2).fit(moons_XS)
plt.scatter(moons_XS[:,0], moons_XS[:,1], c = results_dbscan.labels_)
Out[11]:
<matplotlib.collections.PathCollection at 0x164323ca850>

This time we see that our hyperparameter choices are too small. For a simple data set like "moons" we can manually tune them through trial and error to get some nice-looking results:

In [12]:
results_dbscan = DBSCAN(eps=0.28, min_samples=10).fit(moons_XS)
plt.scatter(moons_XS[:,0], moons_XS[:,1], c = results_dbscan.labels_)
Out[12]:
<matplotlib.collections.PathCollection at 0x16432442160>

We notice that we were able to get a reasonable fit with 3 detected outliers. Below we calculate the proportion of the data that was flagged as an outlier:

In [13]:
## Find predicted assignments
outliers_dbscan = DBSCAN(eps=0.28, min_samples=10).fit_predict(moons_XS)

## Outliers are denoted by the label '-1', so we can see that 0.6% of the data were deemed outliers
np.sum(outliers_dbscan == -1)/outliers_dbscan.shape[0]
Out[13]:
0.006
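
If we want the actual index positions of these outlying points (rather than just their proportion), np.where can recover them (a small follow-up sketch):

## Index positions of the points labeled as outliers (-1)
outlier_idx = np.where(outliers_dbscan == -1)[0]
print(outlier_idx)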

Question #3:

  • Part A: Write a simple function that accepts the inputs: eps, min_samples, data and applies DBSCAN clustering to data using the given parameters. The function should then return the percentage of data that was deemed to be an outlier for that combination of parameters.
  • Part B: Use looping and the function you created in Part A to explore a sequence of parameter values and find a combination of parameters that identifies 1% of the "moons" data set as outliers. You may choose to fix one parameter, such as setting min_samples=10, then search over the other parameter.
  • Part C: Display a scatter plot showing the DBSCAN results you obtained in Part B.

Part 3 - Application¶

Question #4:

Your task in this portion of the lab is to apply your knowledge of clustering to analyze the workforce data loaded at the start of the lab. Your goal is to prepare a brief report that groups the workforce in different degree fields into clusters based upon commonalities, such as similar salaries, similar racial/ethnic/gender distributions, similar rates of graduate degree attainment, etc.

You should explore at least two clustering algorithms (i.e., choose two of the following: $k$-means, agglomerative clustering, DBSCAN). You should be careful to standardize the data if necessary and only cluster using variables you deem relevant to the analysis. You must explain and justify all of your choices in a short "methods" paragraph, using graphs or numerical information as appropriate. You should then describe your final cluster assignments in a "results" paragraph, again using graphs or numerical information as appropriate.
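
As a possible starting point (a minimal sketch only; which columns to keep is a choice you must make and justify yourself), you might separate the numeric workforce characteristics from the identifying columns and standardize them before clustering:

from sklearn.preprocessing import StandardScaler

## Possible starting point: keep the numeric columns and standardize them
jobs_numeric = jobs.drop(columns=['Major', 'Category'])    # drop the identifier columns
jobs_scaled = StandardScaler().fit_transform(jobs_numeric)
print(jobs_scaled.shape)   # one row per major, standardized features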

This question is intentionally open-ended and is aimed at preparing you to use clustering methods on your own in the future. I encourage you to check-in with me if you ever feel unsure about what you've done.