While Python programming is not the core focus of this course, software is an essential component of any machine learning project.
The goal of this lab is to provide a brief overview of Python concepts, syntax, and semantics that are relevant to machine learning applications. In particular, we'll cover topics related to data handling, preparation, and visualization in this lab.
The lab is written under the assumption that you're using the Anaconda distribution, an open source Python distribution that assists in package and environment management.
Most functions we'll use in this course are contained in collections of pre-built code known as libraries. Within Python libraries are modules, or bundles of related functions.
If you are using the Anaconda distribution, the code below will import two different libraries, pandas and numpy, that each contain functions for data handling and manipulation.
The code also assigns an alias to each library (i.e., `pd` and `np`), which will reduce the amount of typing needed to reference functions contained in these libraries.
import pandas as pd
import numpy as np
The code below imports the `pyplot` module from the `matplotlib` graphics library using the alias `plt`. As you'll soon see, the `.` operator has a variety of uses, one of which is to reference modules within a library.
import matplotlib.pyplot as plt
Note: If you are working outside of the Anaconda distribution, you may need to install these libraries manually. This guide provides a set of instructions for Windows, and this page provides a few different installation options for Mac.
To effectively utilize functions contained in libraries, we'll need to understand a few common data types and how to interact with them. For our purposes, there are 4 basic types of interest (note that technically there are many more types):

- Integers (e.g., `5`)
- Floats (e.g., `5.0`)
- Strings (e.g., `'five'` or `'5'`)
- Booleans (`True` or `False`)

For some functions, integers and floats can be used interchangeably, while for others, integers are expected as representations of distinct categorical outcomes.
If necessary, variable types can be converted using a conversion function:
## x contains the string '4'
x = '4'
type(x)
str
## Convert x from str to float
new_x = float(x)
print(new_x)
type(new_x)
4.0
float
Other useful conversion functions are `int()` and `bool()`.
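As a quick sketch of `int()` (the values here are made up for illustration), note that it truncates toward zero rather than rounding when applied to a float:

```python
## Convert a string to an integer, then floats to integers
print(int('7') + 1)  # 8
print(int(4.9))      # 4 (truncates toward zero, no rounding)
print(int(-4.9))     # -4
```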
Be aware that `bool()` will convert any nonzero number to `True`:
## Three different boolean conversions
print(bool(1.0), bool(-2), bool(0.0))
True True False
There are a wide variety of ways to store data in Python, but most of what we'll do will focus on three of them: lists, `numpy` arrays, and `pandas` DataFrames.

Lists are the simplest of these structures:
## Simple list
my_list1 = [1,5,3,9]
## Lists within a list
my_list2 = [[1,3,5], [2,4,6]]
In the example above, `my_list2` is a list that contains two lists within it.
We can access these lists using indices:
## The first list within `my_list2`
my_list2[0]
[1, 3, 5]
Note that Python indices begin at 0, so `my_list2[0]` returns the first item stored in `my_list2`, which is the list `[1,3,5]`.
Now suppose we want the first 2 elements of `my_list1`:
my_list1[:2]
[1, 5]
Recognize that the element in position `2` is not included; thus `:2` returns the elements in positions 0 and 1 (the list `[1,5]`). Also note that the syntax `0:2` yields the same result as `:2`.
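A quick check (reusing `my_list1` from above) confirms the two slice forms are equivalent; negative indices, which count backward from the end of the list, are also worth knowing:

```python
my_list1 = [1, 5, 3, 9]

print(my_list1[0:2] == my_list1[:2])  # True, the two forms are equivalent
print(my_list1[-1])                   # 9 (the last element)
print(my_list1[1:])                   # [5, 3, 9] (everything from position 1 onward)
```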
Next, suppose we want to extract the integer `3` from `my_list2` (where it is the second element in the first list):
my_list2[0][1]
3
This code first extracted the item in position 0 of `my_list2` (the list `[1,3,5]`), then it extracted the element in position 1 of the resulting list (the value `3`).
In our previous example, `my_list2[0][1]` allowed us to access the integer `3`; unfortunately, the command `my_list2[0,1]` will not achieve the same result:
my_list2[0,1]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_15364\3441573747.py in <module>
----> 1 my_list2[0,1]

TypeError: list indices must be integers or slices, not tuple
This shortcoming helps motivate our next data structure, `numpy` arrays:
my_array2 = np.array(my_list2)
my_array2[0,1]
3
While lists and arrays might seem similar, and some functions can handle either as input, it is important to recognize that arrays belong to the `numpy` package, while lists exist within base Python. In future labs, I'll frequently use the term "array" without an explicit reference to the `numpy` library.

`numpy` arrays can be viewed as a special case of lists with a few additional constraints (e.g., every element must share the same data type) that convey a number of benefits:

- Arithmetic operators like `+` act element-wise (vectorized operations)
- Multi-dimensional indexing using comma syntax, such as `[0,1]`
Below is an example demonstrating the first advantage. Notice what happens when we try to add two lists:
## For lists, + will concatenate
print(my_list1 + my_list1)
## For arrays, + works as intended (vectorized addition)
my_array1 = np.array(my_list1)
print(my_array1 + my_array1)
[1, 5, 3, 9, 1, 5, 3, 9] [ 2 10 6 18]
`numpy` arrays are also our first opportunity to learn about two important aspects of Python objects:

- Attributes, which store values describing the object
- Methods, which are functions belonging to the object

Attributes are called using the syntax `my_object.attribute_name`, while methods are called using the syntax `my_object.method_name()`.
For example, `numpy` arrays have a `shape` attribute and a `flatten()` method:
## The shape attribute describes the array's dimensions
my_array2.shape
(2, 3)
## The flatten method reshapes an n-dimensional array into a 1-dimensional array
my_array2.flatten()
array([1, 3, 5, 2, 4, 6])
A complete list of attributes and methods for n-dimensional `numpy` arrays can be found here.
Exercise: Create a `numpy` array. Then use the `reshape()` method to organize it such that it contains two rows and two columns. Print the resulting object's `shape` attribute to verify your manipulations worked as intended. Hint: Use the numpy reference guide linked above to learn more about `reshape()`.

To conclude our introduction to arrays, one last thing to know is that the term axis refers to one of an array's dimensions.
It's common for data to be organized into rows (observations/examples) and columns (variables/features). For such data, `0` is the index of the row axis, and `1` is the index of the column axis.
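Many `numpy` functions accept an `axis` argument. As a brief sketch using `my_array2` from earlier, summing along axis 0 collapses the rows (one result per column), while summing along axis 1 collapses the columns (one result per row):

```python
import numpy as np

my_array2 = np.array([[1, 3, 5], [2, 4, 6]])

print(my_array2.sum(axis=0))  # [ 3  7 11] -- sums down each column
print(my_array2.sum(axis=1))  # [ 9 12]    -- sums across each row
```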
Other types of data, such as colored images, are typically stored in higher dimensional arrays. For example, a single RGB image might be stored in an n by m by 3 array (where n and m are pixel dimensions, and 3 is the number of color channels).
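As a small sketch of this layout (the pixel values below are arbitrary, not from any real image), we can build a tiny "image" directly as a `numpy` array:

```python
import numpy as np

## A hypothetical 2-by-2 pixel RGB image (values are arbitrary)
img = np.zeros((2, 2, 3), dtype=np.uint8)
img[:, :, 0] = 255  # set the red channel of every pixel to its maximum

print(img.shape)  # (2, 2, 3)
print(img[0, 0])  # [255   0   0] -- the top-left pixel is pure red
```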
Now we're ready to look at some actual data. For this we'll use the `read_csv()` function in the `pandas` library to read a .csv file from the web (recall that we gave the alias `pd` to `pandas`):
ic = pd.read_csv("https://remiller1450.github.io/data/IowaCityHomeSales.csv")
ic.shape
(777, 19)
The dataset stored in `ic` was scraped from the Johnson County (IA) Assessor between 2005 and 2007, and it contains various characteristics of all homes sold in Iowa City, IA during that period. We can see that these data contain 777 examples and 19 features.
Unlike `numpy` arrays, `pandas` DataFrames are intended to operate as 2-dimensional objects with labeled rows and columns (note that the 1-dimensional analogue of a DataFrame is called a Series). We can use these labels for selection and subsetting:
## Print column labels attribute
print(ic.columns)
Index(['sale.amount', 'sale.date', 'occupancy', 'style', 'built', 'bedrooms', 'bsmt', 'ac', 'attic', 'area.base', 'area.add', 'area.bsmt', 'area.garage1', 'area.garage2', 'area.living', 'area.lot', 'lon', 'lat', 'assessed'], dtype='object')
## Store price as a separate object, using the label `sale.amount` to select that column
ic_price = ic['sale.amount']
print(ic_price.shape)
(777,)
## Drop the sale.amount column (axis 1) from the original DataFrame
ic_no_price = ic.drop('sale.amount', axis=1)
print(ic_no_price.shape)
(777, 18)
## Select only columns with numeric data types
ic_num = ic.select_dtypes(include=['number'])
print(ic_num.shape)
(777, 13)
## Subset rows using a boolean condition
ic_expensive = ic[ic['sale.amount'] > 500000]
print(ic_expensive.shape)
(11, 19)
You should make note of this page for a complete list of attributes and methods for `pandas` DataFrames.

Exercises:

1. Use the `select_dtypes()` method to select all non-numeric columns in the DataFrame `ic`, then print the dimensions of the resulting object. Hint: see the reference page for keywords and arguments.
2. Subset the column `style` for homes that sold for more than $300,000. Then use the `value_counts()` method to find the number of homes sold of each recorded style.

For future reference, here are a few useful DataFrame attributes/methods to be aware of:
## Print the first N rows of a DataFrame
ic.head(3)
## Briefly summarize the DataFrame's column types
ic.info()
## Built-in descriptive summaries of a DataFrame's columns
ic.describe()
## Drop rows with at least one missing value
ic.dropna()
## Number of unique values for each column
ic.nunique()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   sale.amount   777 non-null    int64
 1   sale.date     777 non-null    object
 2   occupancy     777 non-null    object
 3   style         777 non-null    object
 4   built         777 non-null    int64
 5   bedrooms      777 non-null    int64
 6   bsmt          777 non-null    object
 7   ac            777 non-null    object
 8   attic         777 non-null    object
 9   area.base     777 non-null    int64
 10  area.add      777 non-null    int64
 11  area.bsmt     777 non-null    int64
 12  area.garage1  777 non-null    int64
 13  area.garage2  777 non-null    int64
 14  area.living   777 non-null    int64
 15  area.lot      777 non-null    int64
 16  lon           777 non-null    float64
 17  lat           777 non-null    float64
 18  assessed      777 non-null    int64
dtypes: float64(2), int64(11), object(6)
memory usage: 115.5+ KB
sale.amount     391
sale.date       605
occupancy         3
style             9
built            95
bedrooms          7
bsmt              5
ac                2
attic             6
area.base       381
area.add        150
area.bsmt       111
area.garage1    127
area.garage2     47
area.living     442
area.lot        448
lon             701
lat             703
assessed        705
dtype: int64
Very often we'll need to extensively clean and manipulate our data before it's in a format that can be used by machine learning algorithms. The examples in this section will cover a few common data cleaning/manipulation operations.
In this example, the data are grouped by `style`, then the means of each column are calculated separately for these groupings:
## Group by a column then summarize (means here)
ic.groupby(by='style').mean()
style | sale.amount | built | bedrooms | area.base | area.add | area.bsmt | area.garage1 | area.garage2 | area.living | area.lot | lon | lat | assessed |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 1/2 Story Frame | 186644.000000 | 1927.320000 | 3.080000 | 744.480000 | 177.040000 | 123.800000 | 79.240000 | 273.280000 | 1499.160000 | 8348.440000 | -91.525070 | 41.654289 | 177443.600000 |
1 Story Brick | 220225.416667 | 1992.000000 | 2.333333 | 1304.000000 | 17.500000 | 244.041667 | 125.583333 | 33.333333 | 1321.500000 | 7325.708333 | -91.513483 | 41.654208 | 224732.916667 |
1 Story Condo | 121246.600000 | 2001.800000 | 2.111111 | 1041.777778 | 0.000000 | 106.000000 | 108.555556 | 193.111111 | 1041.777778 | 4232.644444 | -91.517162 | 41.653922 | 118141.777778 |
1 Story Frame | 166179.740634 | 1969.317003 | 3.002882 | 1115.057637 | 33.997118 | 412.340058 | 244.138329 | 80.020173 | 1188.129683 | 9057.158501 | -91.522871 | 41.652600 | 159532.074928 |
2 Story Brick | 334985.000000 | 1937.200000 | 4.100000 | 923.500000 | 142.600000 | 135.000000 | 20.900000 | 128.800000 | 2247.600000 | 14049.800000 | -91.519299 | 41.651382 | 305561.000000 |
2 Story Condo | 149766.666667 | 2003.555556 | 2.407407 | 721.740741 | 21.222222 | 113.592593 | 206.296296 | 0.000000 | 1537.925926 | 6579.888889 | -91.527477 | 41.650626 | 151150.740741 |
2 Story Frame | 215038.451087 | 1966.141304 | 3.271739 | 736.065217 | 189.891304 | 284.619565 | 248.597826 | 73.641304 | 1764.902174 | 10013.983696 | -91.523175 | 41.653444 | 210999.782609 |
Split Foyer Frame | 160058.333333 | 1982.023810 | 3.226190 | 1085.595238 | 13.464286 | 516.738095 | 78.785714 | 9.571429 | 1114.071429 | 8852.440476 | -91.523587 | 41.652011 | 151671.547619 |
Split Level Frame | 208351.612903 | 1984.870968 | 3.290323 | 896.548387 | 266.838710 | 381.387097 | 380.193548 | 0.000000 | 1614.096774 | 9119.483871 | -91.521810 | 41.652192 | 201474.838710 |
In this example, a new 2x2 DataFrame, `more_data`, is created such that `sale.date` is a key that can be used to link these data to the larger `ic` DataFrame. A left join is then used to attach information from homes in `ic` with matching sale dates to the 2 rows in `more_data`:
## Merge two dataframes
more_data = pd.DataFrame({'sale.date': ['1/3/2005','1/12/2005'],
'new_variable': ['a','b']})
merged_data = more_data.merge(ic, on='sale.date', how='left')
print(merged_data.shape)
(2, 20)
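To see why the `how` argument matters, here's a small sketch using two toy DataFrames (the column names and values are made up for illustration): a left join keeps every row of the left DataFrame, filling in missing values where no match exists, while an inner join keeps only the keys present in both:

```python
import pandas as pd

left = pd.DataFrame({'key': ['a', 'b'], 'x': [1, 2]})
right = pd.DataFrame({'key': ['a', 'c'], 'y': [10, 30]})

## Left join: both rows of `left` survive; 'b' gets a missing value for y
print(left.merge(right, on='key', how='left').shape)   # (2, 3)

## Inner join: only the shared key 'a' survives
print(left.merge(right, on='key', how='inner').shape)  # (1, 3)
```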
We've already seen how to subset using boolean conditions, but in many instances `query()` provides a simpler syntax. Additionally, notice how the use of quotes and backticks allows a column name that contains a period to be referenced:
## Find homes with sale amounts in a given range
ic_midprice = ic.query('200000 < `sale.amount` < 400000')
ic_midprice.shape
(183, 19)
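One convenient feature of `query()` worth knowing: Python variables can be referenced inside the query string using the `@` prefix. A brief sketch with a made-up DataFrame:

```python
import pandas as pd

df = pd.DataFrame({'price': [150000, 250000, 450000]})
cutoff = 200000

## Reference the Python variable `cutoff` inside the query string
expensive = df.query('price > @cutoff')
print(expensive.shape)  # (2, 1)
```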
This example adds a 20th column to the `ic` DataFrame containing random values. Notice how `ic.shape[0]` can be used to obtain the number of rows in a DataFrame, and `ic.shape[1]` can be used to obtain the number of columns:
## Add a new column to a DataFrame (random values from a standard normal dist in this example)
ic = pd.read_csv("https://remiller1450.github.io/data/IowaCityHomeSales.csv")
new_var = np.random.randn(ic.shape[0])
ic.insert(ic.shape[1],'my_new_column',new_var)
ic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   sale.amount    777 non-null    int64
 1   sale.date      777 non-null    object
 2   occupancy      777 non-null    object
 3   style          777 non-null    object
 4   built          777 non-null    int64
 5   bedrooms       777 non-null    int64
 6   bsmt           777 non-null    object
 7   ac             777 non-null    object
 8   attic          777 non-null    object
 9   area.base      777 non-null    int64
 10  area.add       777 non-null    int64
 11  area.bsmt      777 non-null    int64
 12  area.garage1   777 non-null    int64
 13  area.garage2   777 non-null    int64
 14  area.living    777 non-null    int64
 15  area.lot       777 non-null    int64
 16  lon            777 non-null    float64
 17  lat            777 non-null    float64
 18  assessed       777 non-null    int64
 19  my_new_column  777 non-null    float64
dtypes: float64(3), int64(11), object(6)
memory usage: 121.5+ KB
## Read data in "long" format
collegeAdm = pd.read_csv("https://remiller1450.github.io/data/college_adm.csv")
collegeAdm
 | Adm_Rate | Year | College |
---|---|---|---|
0 | 28.9 | 2018 | Grinnell |
1 | 24.4 | 2019 | Grinnell |
2 | 23.1 | 2020 | Grinnell |
3 | 21.2 | 2018 | Carlton |
4 | 19.8 | 2019 | Carlton |
5 | 19.1 | 2020 | Carlton |
6 | 33.7 | 2018 | Oberlin |
7 | 36.2 | 2019 | Oberlin |
8 | 36.4 | 2020 | Oberlin |
## Pivot to "wide" format
ca_wide = collegeAdm.pivot(index="College", columns="Year", values="Adm_Rate")
ca_wide
College | 2018 | 2019 | 2020 |
---|---|---|---|
Carlton | 21.2 | 19.8 | 19.1 |
Grinnell | 28.9 | 24.4 | 23.1 |
Oberlin | 33.7 | 36.2 | 36.4 |
## Pivot back to "long" format
ca_wide.reset_index(inplace=True)
ca_wide.melt(id_vars=['College'])
 | College | Year | value |
---|---|---|---|
0 | Carlton | 2018 | 21.2 |
1 | Grinnell | 2018 | 28.9 |
2 | Oberlin | 2018 | 33.7 |
3 | Carlton | 2019 | 19.8 |
4 | Grinnell | 2019 | 24.4 |
5 | Oberlin | 2019 | 36.2 |
6 | Carlton | 2020 | 19.1 |
7 | Grinnell | 2020 | 23.1 |
8 | Oberlin | 2020 | 36.4 |
A couple of notes on these examples:

- In `pivot()`, the `index` argument defines what should be the new rows in the wide format, while `columns` defines the variable whose values should be converted to new columns.
- To pivot back to the "long" format, `melt()` is used. As a preliminary step, we convert the row names into an actual column within the DataFrame using `reset_index()`.

The code given below reads data scraped from the RealClearPolitics website prior to the 2016 US Presidential election. It then uses the `Sample` column to create two distinct columns for the number of people polled, `N`, and the population polled, `Pop` (either likely voters, LV, or registered voters, RV).
## Read polls
polls = pd.read_csv("https://remiller1450.github.io/data/polls2016.csv")
## Split 'Sample' into 'N' and 'Pop' using the space
polls[['N', 'Pop']] = polls['Sample'].str.split(' ', n=1, expand=True)
## Info on the resulting dataset
polls.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Poll         7 non-null      object
 1   Date         7 non-null      object
 2   Sample       7 non-null      object
 3   MoE          6 non-null      float64
 4   Clinton..D.  7 non-null      int64
 5   Trump..R.    7 non-null      int64
 6   Johnson..L.  7 non-null      int64
 7   Stein..G.    7 non-null      int64
 8   N            7 non-null      object
 9   Pop          7 non-null      object
dtypes: float64(1), int64(4), object(5)
memory usage: 688.0+ bytes
Data visualization is an important tool in the early stages of a machine learning project. In this context, we can use data visualization to assist us with:
There are many different graphics libraries in Python. I will try to stick to the `pyplot` module in `matplotlib` (recall we gave this the alias `plt`), and `pandas` methods that come with DataFrames (occasionally we might also use the `seaborn` library).
In `pyplot`, each function call makes some type of change to a figure (e.g., creating a plotting area, displaying geometric elements, changing labels, etc.). Aspects of the figure are stored and preserved across different function calls.
The example below demonstrates this by creating a histogram, then adding a title, then changing the label of the x-axis:
## Histogram
plt.hist(ic['sale.amount'])
plt.title('Home Sales in Iowa City, IA (2005-2007)')
plt.xlabel('Sale Price')
plt.show()
In JupyterLab (and most other IDEs) the final command, `plt.show()`, is not actually needed to display the figure. However, it will suppress any output related to the creation of a figure that is not the figure itself.
Templates for a couple of other useful graphics are shown below:
## Scatterplot
plt.scatter(ic['sale.amount'], ic['assessed'])
plt.show()
## Boxplots (these are easier to create using another graphics library, 'seaborn')
import seaborn as sns
sns.boxplot(x=ic['style'], y=ic['sale.amount'])
plt.xticks(rotation=90)
plt.show()
## Line graph (the default of `plot`)
plt.plot(collegeAdm[collegeAdm['College'] == 'Grinnell']['Year'], collegeAdm[collegeAdm['College'] == 'Grinnell']['Adm_Rate'])
plt.plot(collegeAdm[collegeAdm['College'] == 'Carlton']['Year'], collegeAdm[collegeAdm['College'] == 'Carlton']['Adm_Rate'])
plt.show()
In these examples, note that `seaborn` and `matplotlib` are sometimes compatible with each other (we could use `plt.xticks()` on our `seaborn` boxplot).
For many machine learning applications we'll want to look at several similar visualizations (e.g., distributions of different predictors, different possible transformations, performance of different models, etc.). A grid containing several plots can be constructed using `plt.subplots()`:
## 2x2 grid of plots
fig, axs = plt.subplots(2,2)
fig.suptitle('2x2 plot grid')
axs[0,0].hist(ic['sale.amount'])
axs[0,0].set(xlabel='Sale Price')
axs[1,1].hist(ic['assessed'])
plt.show()
The first command might look a bit surprising, but Python allows functions to return multiple objects (`plt.subplots` returns two). We store these objects as `fig` and `axs`.
The object `fig` controls features of the entire grid, while `axs` allows you to control individual subfigures along either axis of the grid.
In general, I'd recommend using subplot grids for the display of final results. For data exploration purposes, `pandas` DataFrames have a few built-in graphics methods that are much easier than looping through and using subplots:
## Histograms of all numeric variables
ic_num = ic.select_dtypes(include=['number'])
ic_num.hist(figsize=(10, 12), layout=(3,5))
plt.show()
## Scatterplot matrix of all pairings of numeric vars
from pandas.plotting import scatter_matrix
scatter_matrix(ic_num)
plt.show()
## Boxplots (with groupings)
ic[['sale.amount','style']].boxplot(by='style')
plt.xticks(rotation=90)
plt.show()
## Bar chart for a categorical var (note that the data must be converted to counts first)
ic_style_counts = ic['style'].value_counts()
ic_style_counts.plot(kind='bar')
plt.show()
For additional examples and information on `pandas` graphics, you can use this reference page.
Finally, image data can be handled using the `skimage` library, which contains functions that will automatically read images into `numpy` arrays (and display appropriately formatted `numpy` arrays as images):
from skimage import io
my_img = io.imread("https://www.iowacollegefoundation.org/_assets/image/schools/grinnell-college/grinnell-college.png")
io.imshow(my_img)
io.show()
my_img.shape
(238, 400, 4)
In future labs we'll occasionally apply machine learning methods to image data, so it's worthwhile knowing how to display an image that is stored as a `numpy` array.
The code below reads a dataset containing 9-month salaries of faculty members at a large university (in 2008-09). You will use these data for this question.

Exercise: Summarize these data using `groupby`.

## Read data
profs = pd.read_csv("https://remiller1450.github.io/data/Salaries.csv")