Lab 1 - Introduction to Python and $k$-means clustering¶

While learning Python is not the primary focus of this course, a basic understanding of the language and its tools is necessary to succeed in a high-level machine learning project.

The first portion of this lab introduces a few basic concepts related to data handling in Python, and the second portion demonstrates the application of these concepts with $k$-means clustering. The lab is written under the assumption that you're using the Anaconda distribution, an open source Python distribution that assists in package and environment management.

Directions: Please record your responses to all embedded questions in a Jupyter Notebook that will be submitted via P-web. This notebook should contain only the code necessary to answer the lab's questions and should not include code from the given examples that isn't needed to answer these questions. I encourage you to try out the examples in a separate notebook.

Part 1 - Libraries¶

We will make extensive use of pre-built collections of code known as libraries. If you are using the Anaconda distribution, the code below will import two different libraries, pandas and numpy, each of which contains tools for data handling and manipulation. This code also assigns an alias to each library (i.e., pd and np), which reduces the amount of typing needed to reference functions and modules within these libraries.

In [1]:
import pandas as pd
import numpy as np

Python libraries are typically organized hierarchically into modules, which are pre-built bundles of related functions.

The code below uses the . operator to access the pyplot module within the matplotlib library and load it under the alias plt.

In [2]:
import matplotlib.pyplot as plt

Note: if you are working on your own Python installation (rather than the Anaconda distribution) you might need to download these libraries yourself. This guide provides a set of instructions for Windows, and this page provides a few different installation options for Mac.

Part 2 - Data Basics¶

Data Types¶

Python contains several built-in data types, four of which are particularly relevant to us:

  1. int - integer values like 5 or 9
  2. float - floating point real number values like 5.0 or 1.3145
  3. str - character strings like '5' or 'five'
  4. bool - boolean logical values like True or False

As with any language, passing the wrong data type into a function can lead to errors. Thus, the first thing we should know how to do is check the type of a variable and coerce it to another type if necessary.

Demonstrated below are a few examples:

In [3]:
x = '4'  # Define x as '4'
type(x)  # Notice the type is str
Out[3]:
str
In [4]:
float(x)  # Coerce to float
Out[4]:
4.0

int(), bool(), and str() are three additional coercion functions to be aware of. In particular, you might want to note how bool() handles different numerical values:

In [5]:
## Three different boolean conversions
print(bool(1.0), bool(-2), bool(0.0))
True True False

Question 1:

  • Part A: Coerce y (defined below) into the integer value 1, storing the result as yn. Confirm that yn is type int using an appropriate function. Hint: first coerce y to float, then coerce this result to int.
In [6]:
y = '1.0'
  • Part B: Briefly explain why the block of code given below prints int rather than float.
In [7]:
z = 5
float(z)
type(z)
Out[7]:
int

Data objects¶

There are many ways to store data in Python. For the moment we'll concern ourselves with 4 different data objects:

  1. lists (base Python)
  2. dictionaries (base Python)
  3. arrays (numpy)
  4. DataFrames (pandas)

Lists¶

Lists are the simplest of these. The code given below creates two different list objects:

In [8]:
## A simple list
my_list1 = [1,5,3,9]

## Two lists within a list
my_list2 = [[1,3,5], [2,4,6]]

We can access elements of a list using their index positions. Python starts indexing at 0:

In [9]:
## The first list in `my_list2`
my_list2[0]
Out[9]:
[1, 3, 5]
In [10]:
## Elements in positions 0 through 2 (first 3 positions) in my_list1
my_list1[:3]
Out[10]:
[1, 5, 3]

At first it might seem confusing that :3 does not include the element in index position 3, but this convention pairs nicely with the output of functions like len():

In [11]:
## All elements in my_list1
my_list1[:len(my_list1)]
Out[11]:
[1, 5, 3, 9]

Finally, you should be comfortable with more complex uses of indexing, such as:

In [12]:
## Element in position 1 of the first list in my_list2
my_list2[0][1]
Out[12]:
3

Question 2: Create a list named my_numbers consisting of three lists, the first being the character strings 'one', 'two', and 'three', the second being the integers 1, 2, and 3, and the third being the floats 1.0, 2.0, and 3.0. Then, starting with my_numbers, use indices to access and print the last element in the list of integers. Additionally, you should note that we'll generally try to avoid the use of magic numbers, but they're perfectly fine for this exercise.

Dictionaries¶

Dictionaries are used to store data consisting of key:value pairs.

The following is an example dictionary that uses the keys "brand", "year", and "colors", and the values "Ford", 1964, and ["red", "white", "blue"]:

In [13]:
my_dict = {
  "brand": "Ford",
  "year": 1964,
  "colors": ["red", "white", "blue"]
}

We'll primarily use dictionaries to organize preprocessing steps and tuning parameter combinations that we'd like to evaluate in supervised learning tasks. However, it is worth knowing that you can use the name of a key to access the corresponding values, as some models will use dictionaries to store information:

In [14]:
my_dict['colors']
Out[14]:
['red', 'white', 'blue']
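
For example, later in the course we might use a dictionary to organize a set of tuning parameter values to search over. A minimal sketch of this idea is shown below (the parameter names here are purely illustrative, not tied to any particular model):

## Hypothetical grid of tuning parameter values (illustrative names only)
param_grid = {
    'n_clusters': [2, 3, 4, 5],   # candidate numbers of clusters
    'max_iter': [100, 300]        # candidate iteration limits
}

## Values are accessed by key, just like any other dictionary
print(param_grid['n_clusters'])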

Arrays (numpy)¶

We previously created my_list2 and saw that the command my_list2[0][1] could be used to access the element in position 1 of the first list. However, it would be nice if the command my_list2[0,1] could access this element, which helps motivate numpy arrays.

In [15]:
my_list2 = [[1,3,5], [2,4,6]]   # Same list as before
my_array2 = np.array(my_list2)  # Convert to a numpy array, notice our use of the alias np
my_array2[0,1]                  # Now this works!
Out[15]:
3

While many features of numpy arrays might seem similar to lists, and many functions can use them interchangeably, arrays offer a few benefits:

  1. Arrays allow vectorized mathematical operations
  2. Arrays allow easier reshaping and subsetting
  3. Arrays use significantly less memory (a comparison is sketched after the example below)

Below is an example of how vectorized mathematical operations can be used with arrays, but not with lists:

In [16]:
## For lists, + will concatenate
my_list1 = [1,5,3,9]
print(my_list1 + my_list1)

## For arrays, + works as intended (vectorized addition)
my_array1 = np.array(my_list1)
print(my_array1 + my_array1)
[1, 5, 3, 9, 1, 5, 3, 9]
[ 2 10  6 18]
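
To illustrate the third benefit, the sketch below compares the memory used by a list of 1000 integers against an equivalent numpy array (exact byte counts vary by platform, so treat these numbers as rough):

import sys

big_list = list(range(1000))
big_array = np.arange(1000)

## The list stores references plus a separate int object for each element
print(sys.getsizeof(big_list) + sum(sys.getsizeof(v) for v in big_list))

## The array stores raw values in one contiguous buffer
print(big_array.nbytes)   # e.g., 8000 bytes if the dtype is 64-bit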

Methods and attributes¶

Arrays also provide us an opportunity to learn about two important aspects of Python objects:

  1. methods - actions an object can perform, referenced using the general syntax: object_name.method_name(method_arguments)
  2. attributes - characteristics of an object, referenced using the general syntax: object_name.attribute_name

The code below demonstrates these on a 2-d array:

In [17]:
## Create the 2-d array
my_array = np.array([[1,3,5], [2,4,6]]) 

my_array.shape   # arrays have a "shape" attribute
Out[17]:
(2, 3)
In [18]:
my_array.flatten()  # arrays have a "flatten" method
Out[18]:
array([1, 3, 5, 2, 4, 6])

I encourage you to consult this documentation page for information on some of the available attributes and methods of numpy arrays.
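
A few other commonly used ones are demonstrated below (all standard numpy attributes and methods):

print(my_array.ndim)    # attribute: number of axes (2)
print(my_array.dtype)   # attribute: data type of the elements (e.g., int64)
print(my_array.sum())   # method: sum of all elements (21)
print(my_array.T)       # attribute: the transpose, with shape (3, 2)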

Question 3: Use the np.linspace() function to create an array of 10 linearly spaced values between 0 and 90. Then use the reshape() method to reshape this array to have the new shape (2,5).

DataFrames (pandas)¶

We'll later see that numpy arrays have the advantage of being able to coherently store data along more than 2 axes. However, when we have conventionally formatted data (each example/observation as a row and each feature as a column) our preferred method of storage will be the pandas DataFrame due to the wider assortment of data manipulation options they offer.

The code below uses the read_csv() function from the pandas library to load a data set from the web. You should recall that we gave the alias pd to pandas earlier.

In [19]:
ic = pd.read_csv("https://remiller1450.github.io/data/IowaCityHomeSales.csv")

These data were scraped from the Johnson County (IA) Assessor between 2005 and 2007 and contain various characteristics of all homes sold in Iowa City, IA during that period.

Unlike numpy arrays, pandas DataFrames contain only 2 axes and named columns (note that a single-axis pandas object is called a Series). This consistent arrangement of data allows for a much wider variety of attributes and methods. A few examples are given below:

In [20]:
## Built-in descriptive summaries of the DataFrame's columns
ic.describe() 
Out[20]:
sale.amount built bedrooms area.base area.add area.bsmt area.garage1 area.garage2 area.living area.lot lon lat assessed
count 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000
mean 180098.329472 1972.566281 3.025740 986.942085 81.061776 345.971686 211.738739 76.873874 1365.144144 8886.574003 -91.522543 41.652816 174324.916345
std 90655.308636 31.252192 0.990604 353.324287 182.587922 386.380310 252.103302 162.755911 527.894470 8474.390873 0.033708 0.011389 85370.750928
min 38250.000000 1873.000000 1.000000 240.000000 0.000000 0.000000 0.000000 0.000000 312.000000 137.000000 -91.605747 41.628040 38590.000000
25% 130000.000000 1958.000000 2.000000 726.000000 0.000000 0.000000 0.000000 0.000000 1017.000000 5398.000000 -91.549255 41.645962 126030.000000
50% 157900.000000 1981.000000 3.000000 924.000000 0.000000 250.000000 0.000000 0.000000 1240.000000 7500.000000 -91.515802 41.652618 153930.000000
75% 205000.000000 1998.000000 4.000000 1184.000000 36.000000 600.000000 440.000000 0.000000 1560.000000 9960.000000 -91.496104 41.659200 198200.000000
max 815000.000000 2007.000000 7.000000 3440.000000 2261.000000 2500.000000 1065.000000 856.000000 4988.000000 158123.000000 -91.463069 41.690921 778000.000000
In [21]:
# Select just the 'sale.amount' column and print the first 5 values
ic['sale.amount'].head(n = 5)  
Out[21]:
0    172500
1     90000
2    168500
3    205000
4    121000
Name: sale.amount, dtype: int64
In [22]:
## Select only columns with numeric data types and print the resulting shape
ic.select_dtypes(include=['number']).shape
Out[22]:
(777, 13)
In [23]:
## Subset rows using a boolean condition and print the resulting shape.
ic[ic['sale.amount'] > 500000].shape
Out[23]:
(11, 19)

Again, these examples are only a few of the many methods and attributes of pandas DataFrames. I encourage you to browse this page to see a complete list.
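
As two more quick examples you might find useful in your own exploration (both are standard pandas methods; run each in its own cell to see its output):

## Sort by sale amount (descending) and show the 3 most expensive homes
ic.sort_values('sale.amount', ascending=False).head(3)

## Count the number of homes of each style
ic['style'].value_counts()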

Part 3 - Data manipulation and summarization using pandas¶

A major reason to use pandas DataFrames is the ease of data manipulation and cleaning. Below is an overview of a few key data manipulation operations in pandas. If you took Sta-230, these should be conceptually familiar.

First, grouped summarization:

In [24]:
## Group the data by the 'style' column, then find the mean of 'sale.amount' for each group
ic.groupby(by='style')['sale.amount'].mean()
Out[24]:
style
1 1/2 Story Frame    186644.000000
1 Story Brick        220225.416667
1 Story Condo        121246.600000
1 Story Frame        166179.740634
2 Story Brick        334985.000000
2 Story Condo        149766.666667
2 Story Frame        215038.451087
Split Foyer Frame    160058.333333
Split Level Frame    208351.612903
Name: sale.amount, dtype: float64

Next, merging and joining:

In [25]:
## Create an example dataframe for illustration
more_data = pd.DataFrame({'sale.date': ['1/3/2005','1/12/2005'],
        'new_variable': ['a','b']})

## Left join 'ic' onto this example dataframe according to the 'sale.date' variable
merged_data = more_data.merge(ic, on='sale.date', how='left')

## Print some of the merged data, notice the new columns from 'ic'
merged_data[['sale.date', 'new_variable', 'sale.amount', 'built']]
Out[25]:
sale.date new_variable sale.amount built
0 1/3/2005 a 172500 1993
1 1/12/2005 b 168500 1976
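
For comparison, if we instead wanted to keep every row from both DataFrames, we could request an outer join; a quick sketch:

## An outer join keeps the union of 'sale.date' values from both DataFrames
outer_data = more_data.merge(ic, on='sale.date', how='outer')
print(outer_data.shape)   # many more rows, since all sale dates in 'ic' are kept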

Now, inserting new columns:

In [26]:
## Add a new column containing the log-transformed sale amount as the 1st column
ic.insert(loc=0, column='new_col_name', value=np.log(ic['sale.amount']))
ic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   new_col_name  777 non-null    float64
 1   sale.amount   777 non-null    int64  
 2   sale.date     777 non-null    object 
 3   occupancy     777 non-null    object 
 4   style         777 non-null    object 
 5   built         777 non-null    int64  
 6   bedrooms      777 non-null    int64  
 7   bsmt          777 non-null    object 
 8   ac            777 non-null    object 
 9   attic         777 non-null    object 
 10  area.base     777 non-null    int64  
 11  area.add      777 non-null    int64  
 12  area.bsmt     777 non-null    int64  
 13  area.garage1  777 non-null    int64  
 14  area.garage2  777 non-null    int64  
 15  area.living   777 non-null    int64  
 16  area.lot      777 non-null    int64  
 17  lon           777 non-null    float64
 18  lat           777 non-null    float64
 19  assessed      777 non-null    int64  
dtypes: float64(3), int64(11), object(6)
memory usage: 121.5+ KB

Finally, pivoting:

In [27]:
## Load a data set that is in "wide" format
bluechip_stocks = pd.read_csv("https://remiller1450.github.io/data/bluechips.csv")
bluechip_stocks
Out[27]:
Year AAPL KO JNJ AXP
0 2010 7.643214 28.520000 64.680000 40.919998
1 2011 11.770357 32.610001 62.820000 43.400002
2 2012 14.686786 35.070000 65.879997 48.389999
3 2013 19.608213 37.599998 70.839996 58.750000
4 2014 19.754642 40.660000 91.029999 89.449997
5 2015 27.332500 42.139999 104.519997 93.019997
6 2016 26.337500 42.400002 100.480003 67.589996
7 2017 29.037500 41.799999 115.839996 75.349998
8 2018 43.064999 45.540001 139.229996 98.940002
9 2019 35.547501 46.639999 125.720001 93.430000
10 2020 74.357498 54.689999 144.279999 124.599998
11 2021 129.410004 52.759998 156.500000 118.040001
12 2022 182.009995 59.299999 171.539993 168.210007
In [28]:
## Pivot everything but 'Year' longer (print only the first 6 obs)
bluechip_stocks_long = bluechip_stocks.melt(id_vars=['Year'])
bluechip_stocks_long.head(6)
Out[28]:
Year variable value
0 2010 AAPL 7.643214
1 2011 AAPL 11.770357
2 2012 AAPL 14.686786
3 2013 AAPL 19.608213
4 2014 AAPL 19.754642
5 2015 AAPL 27.332500
In [29]:
## Pivot back to "wide" format
bluechip_stocks_wide = bluechip_stocks_long.pivot(index="Year", columns="variable", values="value")
bluechip_stocks_wide.reset_index(inplace=True)  # Extra step to make 'Year' a column rather than row names
print(bluechip_stocks_wide)
variable  Year        AAPL         AXP         JNJ         KO
0         2010    7.643214   40.919998   64.680000  28.520000
1         2011   11.770357   43.400002   62.820000  32.610001
2         2012   14.686786   48.389999   65.879997  35.070000
3         2013   19.608213   58.750000   70.839996  37.599998
4         2014   19.754642   89.449997   91.029999  40.660000
5         2015   27.332500   93.019997  104.519997  42.139999
6         2016   26.337500   67.589996  100.480003  42.400002
7         2017   29.037500   75.349998  115.839996  41.799999
8         2018   43.064999   98.940002  139.229996  45.540001
9         2019   35.547501   93.430000  125.720001  46.639999
10        2020   74.357498  124.599998  144.279999  54.689999
11        2021  129.410004  118.040001  156.500000  52.759998
12        2022  182.009995  168.210007  171.539993  59.299999

In these pivoting examples you should note:

  1. To pivot "long" data into a "wide" format, the argument index defines what should become the rows in the wide form, while columns defines the variable whose values should be converted to new columns.
  2. The pivot() method will assign the column referenced in index to the row axis names of the resulting object. We can move these values into the DataFrame as a variable using reset_index(inplace=True).

Question 4: The code given below reads data scraped from the RealClearPolitics website prior to the 2016 US Presidential election. In this question you'll use the Sample column to create two distinct columns for the number of people polled, N, and the population polled, Pop (either likely voters, LV, or registered voters, RV).

  • Part A - Use the str.split() method (click here for documentation) to split the variable 'Sample' into two new columns named 'N' and 'Pop'. You might notice that these quantities are separated by a space in the original variable.
  • Part B - Reshape these data into "long" format so that each row represents the polling percentage of a single candidate for a single poll.
  • Part C - Filter your reshaped data to include only polls with at least 600 participants and then find the maximum percentage reached by each candidate in a single poll. Print only these 4 maximum polling percentages as your final answer.
In [30]:
## Read polls data set
polls = pd.read_csv("https://remiller1450.github.io/data/polls2016.csv")

## Info on the dataset
polls.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Poll         7 non-null      object 
 1   Date         7 non-null      object 
 2   Sample       7 non-null      object 
 3   MoE          6 non-null      float64
 4   Clinton..D.  7 non-null      int64  
 5   Trump..R.    7 non-null      int64  
 6   Johnson..L.  7 non-null      int64  
 7   Stein..G.    7 non-null      int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 576.0+ bytes

Part 4 - Data visualization using matplotlib¶

There are many graphics packages available in Python. We'll largely stick to simple graphics that can be created using pandas methods or the library matplotlib.

The data visualizations that can be created through pandas are quick and useful for basic exploratory data analysis. In the context of machine learning, they can help us identify variables with unusual values or distributions, as well as marginal relationships between variables.

The code below imports the scatter_matrix() function from the plotting module of pandas and uses it to display a scatter plot matrix for all pairs of numeric variables in the Iowa City home sales data:

In [31]:
## Select only numeric vars
ic_num = ic.select_dtypes(include=['number'])

## Scatterplot matrix of all pairings of numeric vars
from pandas.plotting import scatter_matrix
scatter_matrix(ic_num)
plt.show()

Scatter plot matrices are useful for quickly reviewing the distributions and bivariate relationships between a large number of variables.

If an individual variable requires closer examination, pandas DataFrames have a more general plot() method that can be used to create the following graphics via the kind argument:

  • 'line' : line plot (default)
  • 'bar' : vertical bar plot
  • 'barh' : horizontal bar plot
  • 'hist' : histogram
  • 'box' : boxplot
  • 'kde' : Kernel Density Estimation plot
  • 'area' : area plot
  • 'pie' : pie plot
  • 'scatter' : scatter plot (DataFrame only)
  • 'hexbin' : hexbin plot (DataFrame only)

A simple example of this is shown below:

In [32]:
ic_num['built'].plot(kind = 'hist')
Out[32]:
<AxesSubplot:ylabel='Frequency'>

The plot() method of pandas DataFrames actually uses matplotlib as its backend plotting library. Sometimes we won't be working with DataFrame objects, so it's helpful to have some familiarity with the pyplot module of matplotlib (recall we've already loaded this module under the alias plt).

In the future we'll mostly use pyplot to create line charts, which also happen to be the default graphic created by plt.plot(). This function is flexible and can display several different lines of data in the same graphic with a simple syntax.

In [33]:
## Create some toy inputs
x = np.linspace(0, 4, num=20) # 20 equally spaced values b/w 0 and 4

## Display 3 different f(x)
plt.plot(x, x)
plt.plot(x, np.power(x,2))
plt.plot(x, np.exp(x))

## Add a legend
plt.legend(['y = x', 'y = x^2', 'y = e^x'], loc='upper left')

## Optional plt.show() 
plt.show()

We can see that plt.plot() naturally adds layers to the current plot while keeping track of what's already there. You should note that plt.show() is not required to display a plot in Jupyter Notebooks, but it has the nice benefit of suppressing some of the metadata that would otherwise be printed by code cells containing pyplot graphics.

Question 5: Considering only the columns of type object in the Iowa City home sales data set, use the plot() method of pandas to create an appropriate visualization of the distribution of the variable stored in the last column of this subset of data. Hint: consider using groupby() and size() before using plot() with an appropriate input to the kind argument.

Part 5 - Functions and iteration¶

There will be several instances where I'll ask you to program a simplified implementation of an algorithm yourself in order to help you better understand it. This will require some familiarity with the basic mechanics of two programming concepts in Python:

  1. Functions - a function is a block of code with an associated name and arguments that carries out a specific, repeatable task
  2. Looping - an instruction that is repeated multiple times, either by iterating over a sequence (for loop) or by repeating as long as a condition holds (while loop)

Functions are created using the def keyword followed by the name you give the function, its arguments in parentheses, and a : to initiate the function's code block. This code block must be indented by 4 spaces or 1 tab, and the function should use the return keyword to indicate which object(s) will be returned when the function is used.

Below is an example of a function that computes the sum of squared differences between two numeric vectors:

In [34]:
## Define function
def squared_diff(y, yh):
    diff = y - yh
    ss_diff = np.dot(diff, diff)
    return ss_diff

## Example use
squared_diff(y = np.array([1,1,1]), yh = np.array([1,0,2]))
Out[34]:
2

There are two types of looping you should know how to use in this course. The simpler of the two is the while loop, which repeats as long as a condition holds. Like functions, the code block used in a while loop is initiated by a : and must be indented by 4 spaces or a tab. Below is an example of a simple while loop:

In [35]:
## Example while loop
x = 0
while x <= 3:
    print(x)
    x = x +1
0
1
2
3

We can see that the code block within the loop executed 4 times. Prior to what would have been the 5th repetition, the loop's condition was no longer met because x now held the value 4, so the loop terminated.

One use of while loops in machine learning is to repeat an optimization procedure until a precision criterion is met. A toy example of this would be shrinking a vector towards zero until we get within a certain margin of error using the squared_diff() function we created earlier:

In [36]:
## More realistic use of a while loop
diff = float('inf')
tol = 0.1
yh = np.array([1,1.5,-2])
target = np.array([0,0,0])
while diff > tol:
    diff = squared_diff(yh, target)
    yh = yh/1.5    # shrink values by a factor of 1.5
    print(yh)
[ 0.66666667  1.         -1.33333333]
[ 0.44444444  0.66666667 -0.88888889]
[ 0.2962963   0.44444444 -0.59259259]
[ 0.19753086  0.2962963  -0.39506173]
[ 0.13168724  0.19753086 -0.26337449]
[ 0.0877915   0.13168724 -0.17558299]
[ 0.05852766  0.0877915  -0.11705533]

Question 6: Initialize a value to 1, then use a while loop to repeatedly divide this initial value by 2 until it is within a tolerance of 0.001 from zero. Print the current value in each iteration of the loop to confirm that the final result is indeed within 0.001 of zero.

The other type of looping you should be comfortable with is the for loop, which iterates through the members of an object in order.

The simplest way to use a for loop is to iterate through a sequence of integers set up using the range() function:

In [37]:
for i in range(3):
    print(i)
0
1
2

Many types of objects are iterable, meaning they can be looped over via a for loop. One place where we'll use this concept in the future will be looping through all images contained in a folder. You're welcome (but not expected) to try this by downloading and extracting this folder containing 50 images of cats: https://remiller1450.github.io/data/cats.zip

In [38]:
## Libraries
import os
import matplotlib.image as mpimg

## Root directory containing the folder (on my PC)
path = 'OneDrive - Grinnell College/Documents/cats/'
file_list = os.listdir(path)[:3]                   # We'll show only the first 3 images

## Loop through file list, notice we can concatenate the file path and name into a single string using '+'
for file in file_list:
    plt.imshow(mpimg.imread(path + file))
    plt.show()

Question 7: Use a for loop to iterate through the following list: ['apple', 'banana', 'watermelon'] printing the number of characters in the current fruit using the len() function at each iteration.

Part 6 - $k$-means clustering¶

$k$-means clustering, along with many other machine learning tools that we'll make extensive use of, is implemented in the sklearn or "scikit-learn" library:

In [39]:
import sklearn
from sklearn.cluster import KMeans # Import k-means function
import warnings
warnings.filterwarnings("ignore") ## We will turn off warnings since k-means will warn about future changes

This portion of the lab will explore $k$-means clustering and related concepts on two simulated data sets that exhibit very different patterns:

In [40]:
## Datasets module
from sklearn import datasets

## Dataset #1 - "moons"
moons = datasets.make_moons(n_samples=500, noise=0.11, random_state=8)
moons_X = moons[0]         # features to use in clustering
moons_labels = moons[1]    # true cluster identities

## Visualization
plt.scatter(moons_X[:,0], moons_X[:,1], c =moons_labels)
plt.show()
In [41]:
## Dataset #2 - "blobs"
blobs = datasets.make_blobs(n_samples=500, cluster_std=2, random_state=8)
blobs_X = blobs[0]         # features to use in clustering
blobs_labels = blobs[1]    # true cluster identities

plt.scatter(blobs_X[:,0], blobs_X[:,1], c = blobs_labels)
plt.show()

Preprocessing¶

In most applications the data you receive are not immediately ready to be input directly into machine learning algorithms. Instead, additional steps known as preprocessing are typically applied before machine learning algorithms are used.

Because $k$-means clustering relies upon distances between data-points and cluster prototypes, the unit scale of each feature is influential.

  • Humans know that heights that are 12 inches apart (e.g., 70 in. vs. 58 in.) and heights that are 1 foot apart (e.g., 6 ft vs. 5 ft) are equally far from each other
  • In the $k$-means algorithm, without standardization, a variable expressed in inches will exert more influence than an identical variable expressed in feet.
  • An easy way to equalize the influence of features with different scales is to standardize them via StandardScaler() in sklearn, which applies a $Z$-score transformation to each feature (a quick check of this appears after the example below):
In [42]:
from sklearn.preprocessing import StandardScaler
moons_XS = StandardScaler().fit_transform(moons_X) # use the fit_transform method of a newly instantiated StandardScaler object

## Plot the standardized features, notice the subtle change in the x/y axes
plt.scatter(moons_XS[:,0], moons_XS[:,1], c =moons_labels) 
Out[42]:
<matplotlib.collections.PathCollection at 0x1bd0204f610>
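
As a quick sanity check, StandardScaler() is simply applying the familiar $Z$-score formula $z = (x - \bar{x})/s$ to each column (using the population standard deviation). The sketch below verifies this on the moons features:

## Manual column-wise Z-score (np.std defaults to ddof=0, matching StandardScaler)
manual_XS = (moons_X - moons_X.mean(axis=0)) / moons_X.std(axis=0)

## Should print True (up to floating point error)
print(np.allclose(manual_XS, moons_XS))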

Choosing $k$¶

The $k$-means algorithm requires that $k$ be pre-specified. A simple strategy to help select an appropriate number of clusters is to search over a range of possible values of $k$ to find the point where the rate of improvement in the clustering objective function begins to diminish. You should note that the objective function will always improve as $k$ increases, so we are not looking for a minimum. Instead, we are looking for an inflection point or "elbow" in the curve produced by the loop given below:

In [43]:
## Loop through choices of k ranging from 2 to 11
scores = []
for k in range(2, 12):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(moons_XS)
    scores.append(kmeans.inertia_)

## Visualize
plt.plot(range(2, 12), scores)
plt.xlabel("k")
plt.ylabel("Objective")
plt.show()

For the moons data we see that this curve shows an inflection point (or elbow) around $k=4$ or $k=5$, suggesting we use that many clusters in our final analysis of these data.

Evaluating cluster fit¶

Silhouette scores provide a method for validating clustering performance. Recall that each data-point's silhouette compares $a$, its mean pairwise distance to the other points in its assigned cluster, with $b$, its mean pairwise distance to the points in the nearest neighboring cluster, using the ratio $\frac{b - a}{\max(a, b)}$.

A silhouette score approaching +1 indicates that a data-point is substantially closer to the data in its assigned cluster than it is to the data in the nearest neighboring cluster, a silhouette score of 0 indicates it is equally close to both, and a negative silhouette score indicates the point is, on average, closer to the members of a different cluster than the one it was assigned to.

The mean silhouette score across all data-points can be used to summarize the clustering performance of your chosen algorithm. To calculate it we can use the silhouette_score() function in the metrics module of sklearn:

In [44]:
from sklearn.metrics import silhouette_score
my_clusters = KMeans(n_clusters=4)   # init the k-means object

## silhouette_score() directly returns the mean silhouette across all data-points
my_sils = silhouette_score(moons_XS, my_clusters.fit_predict(moons_XS))

## Display the mean silhouette score
np.average(my_sils)
Out[44]:
0.42124198131633384
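
If we wanted the individual silhouettes (for example, to see which points are poorly clustered), we could use the related silhouette_samples() function from the same module; a brief sketch:

from sklearn.metrics import silhouette_samples

## Per-point silhouette values for the same clustering
sil_values = silhouette_samples(moons_XS, my_clusters.labels_)

## Averaging the per-point values reproduces silhouette_score()
print(np.average(sil_values))

## Points with negative silhouettes likely sit closer to a neighboring cluster
print(np.sum(sil_values < 0))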

An average silhouette score of 0.42 indicates modest to weak performance. In general, average silhouette scores between 0.25 and 0.5 indicate weak but potentially meaningful clustering, averages between 0.5 and 0.7 indicate moderate to strong clustering, and values above 0.7 indicate excellent performance. The weak performance of $k$-means for these data should be apparent when graphing the clustering results for $k=4$:

In [45]:
plt.scatter(moons_XS[:,0], moons_XS[:,1], c =my_clusters.predict(moons_XS)) 
plt.show()

Before moving on to answer the question that appears below, I encourage you to skim over the KMeans documentation provided by sklearn. Here you can find information not covered in this lab, such as how to find the coordinates of the cluster prototypes or change the $k$-means initialization algorithm that is being used.

Question 8: Earlier in this section we introduced a second simulated data set named blobs. Using this data set, perform a clustering analysis that includes the following steps:

  • Proper pre-processing of the data to account for possible differences in scaling.
  • Appropriate selection of $k$.
  • Use of Silhouette scores to evaluate clustering fit.
  • Visualization of the final clustering results.

In addition to providing code that accomplishes each step, provide a 3-4 sentence written summary of your methods and results using a professional scientific tone and an appropriate level of detail. In your summary you should clearly summarize the steps you used and how you interpret the success of the method.

Part 7 - Application of $k$-means - image segmentation¶

Image segmentation is the process of dividing an image into collections of pixels that are labeled by "masks". There are many approaches to image segmentation, but for a variety of applications simple $k$-means clustering can work surprisingly well.

The code below reads an image from the web and then displays it using functions contained in the skimage library:

In [46]:
from skimage import io
my_img = io.imread("https://remiller1450.github.io/data/beach_img.jpg")
io.imshow(my_img)
io.show()

To begin, you should notice that the image is stored in a 3-d array:

In [47]:
my_img.shape
Out[47]:
(525, 700, 3)

The first two dimensions of this array define a pixel's location, while the third dimension defines the pixel's color channel values under the RGB color model. Thus, each element in the array is a pixel intensity in the respective color channel, with values ranging from 0 to 255.

To apply $k$-means clustering we'll need to reshape the image into a 2-d array where the first axis of the array (i.e., rows) represents the pixels and the second axis (i.e., columns) represents the color channels. This is because we'd like to cluster all pixels, regardless of their location, based upon their color intensities.

The code below accomplishes this using reshape():

In [48]:
my_img_2d = my_img.reshape(my_img.shape[0]*my_img.shape[1], my_img.shape[2])
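
A quick check of the shapes confirms what this accomplished, since $525 \times 700 = 367500$ pixels:

print(my_img.shape)      # (525, 700, 3)
print(my_img_2d.shape)   # (367500, 3)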

While we'll rely upon the data to guide our choice of $k$ in many clustering applications, in image segmentation we might choose $k$ ourselves using domain-specific knowledge and goals.

For example, we might choose $k=2$ if our goal is to separate water from land:

In [49]:
kmeans_fit = KMeans(n_clusters=2, random_state=10, n_init=10).fit(my_img_2d)
plt.imshow(kmeans_fit.labels_.reshape(525,700).astype(int), cmap='gray')
Out[49]:
<matplotlib.image.AxesImage at 0x1bd07e77af0>

Alternatively, we can see that choosing $k=3$ will further segment the water to label a shallower sandy section and a deep water section.

In [50]:
from sklearn.cluster import KMeans
kmeans_fit = KMeans(n_clusters=3, random_state=10, n_init=10).fit(my_img_2d)
plt.imshow(kmeans_fit.labels_.reshape(525,700).astype(int), cmap='gray')
Out[50]:
<matplotlib.image.AxesImage at 0x1bd06f07100>
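
Beyond displaying the integer cluster labels, another common way to visualize a segmentation is to replace each pixel with the average color of its assigned cluster (its prototype). A minimal sketch of this idea, reusing the $k=3$ fit from above, is given below:

## Replace each pixel's RGB values with those of its cluster's prototype
centers = kmeans_fit.cluster_centers_.astype('uint8')   # one RGB row per cluster
segmented = centers[kmeans_fit.labels_]                 # shape (367500, 3)
plt.imshow(segmented.reshape(my_img.shape))             # back to (525, 700, 3)
plt.show()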

While this approach isn't perfect, especially compared to more modern image segmentation approaches, the fact that $k$-means is simple, efficient, and doesn't require extensive training or tuning makes it an attractive option.

Question 9: The aim of this question is for you to perform your own image segmentation using $k$-means and practice describing your methodology and results using an appropriate scientific tone. To receive full credit, your submitted response must include the following steps:

  1. Locate an image on the web (or anywhere else) that you believe is appropriate for segmentation. You are welcome to try an image from the "cats" data folder that was provided in the lab section on looping.
  2. Read your chosen image into Python (either directly from the web or from a local directory).
  3. Use $k$-means clustering with a meaningful choice of $k$ to segment the image. Pay careful attention to the color channels in your image, as not every image you find on the web will be stored using the RGB color model.
  4. Report your methods and results in a short paragraph (at least 4 sentences, written in a scientific tone) accompanied by visuals of the original and segmented images.