Learning about Python is not the primary focus of this course, but a basic understanding of the language is necessary to carry out the types of machine learning tasks we will be exploring.
Directions: Please read through the contents of this lab with your partner and try the examples. After you're both confident that you understand a topic you should attempt the associated exercises and record your answers in your own Jupyter notebook that you will submit for credit. The notebook you submit should only contain answers to the lab's exercises (so you should remove any code you ran for the examples, or use a separate notebook to test out the examples).
We will make extensive use of libraries, or pre-built collections of code, throughout the semester. If you are using the Anaconda distribution you'll have numerous commonly used libraries pre-installed and simply need to load them in order to access their functions.
The code below imports two libraries that we'll use extensively, pandas and numpy. Each library is assigned an alias (i.e., pd and np) so that it may be referenced using fewer characters.
import pandas as pd
import numpy as np
The example below uses the read_csv() function from pandas to load a data set containing various attributes of homes sold in Iowa City, IA between 2005 and 2007. Notice the use of the alias, pd.
ic_homes = pd.read_csv("https://remiller1450.github.io/data/IowaCityHomeSales.csv")
Some libraries are organized into modules, where each module is a bundle of related functions. Modules within a library are referenced using the . character. We may import all functions in a module using the * character, but best practice is to import only the functions you plan on using.
Some examples involving scikit-learn, a library that is organized into many hierarchical modules, are shown below:
# Import the scikit-learn library
import sklearn
# Load all functions from the pre-processing module
from sklearn.preprocessing import *
# Load just the 'LinearRegression' function from the linear_model module
from sklearn.linear_model import LinearRegression
Question #1:
Part A: Import the pyplot module from the matplotlib library and assign it the alias plt (Hint: you can do this all in one step using the . character).
Part B: Import DecisionTreeClassifier, which is contained in the tree module of sklearn.

There are 4 native data types in Python that will be relevant to our course:
int - integer values like 5 or -9
float - floating point real numbers like 5.0 or 3.14
str - character strings like '5' or 'five'
bool - boolean logical values, namely True or False

A few things to note:
The type of an object can be checked using the type() function
The functions int(), float(), bool(), and str() can be used for coercion

Some examples are shown below:
# Example 1 - Define x as a str type and coerce to float
x = '4'
type(x)
float(x)
# Example 2 - Check the boolean coercions of a few different numerical values
print(bool(1.0), bool(-2), bool(0.0))
True True False
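As a further sketch of how these coercion functions behave (the values here are illustrative, not part of the lab's examples):

```python
# Coercion succeeds only when the value makes sense in the target type
print(int('4') + 1)    # the str '4' becomes the int 4, so this prints 5
print(float('3.14'))   # prints 3.14
print(str(5.0))        # prints '5.0'
# int('five') would raise a ValueError, since 'five' is not a numeric string
```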
There are many ways to store collections of data in Python. In the early stages of the course we'll focus on the following:
Lists and dictionaries (native Python data structures)
Arrays (from numpy)
DataFrames (from pandas)

Later on we'll also use PyTorch tensors, which are specialized multi-dimensional arrays.
Lists are the most basic data structure we will use. The examples below illustrate a few basic properties of lists:
# Create a simple list of integer values
my_first_list = [3,1,4,5]
# Create a list of lists
my_second_list = [[1,2,3], ['a','b','c']]
# Indexing starts at zero in Python
my_second_list[1]
['a', 'b', 'c']
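A few other indexing patterns are worth sketching, since the exercises rely on them (the lists here are made up for illustration):

```python
my_list = [3, 1, 4, 5]
print(my_list[-1])    # negative indices count from the end, so this prints 5
print(my_list[1:3])   # slicing returns a sub-list: [1, 4]

nested = [[1, 2, 3], ['a', 'b', 'c']]
print(nested[1][0])   # chained indexing reaches elements of an inner list: 'a'
```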
Question #2: Consider the list of lists given below:
Q2_list = [[1.0,2.0,3.0], ['a','b','c'], [True, 1, False]]
Use indexing to print the element of Q2_list whose type is int (Hint: in our example my_second_list[1] returns the list ['a', 'b', 'c'], which has its own indices).

Dictionaries store data via key:value pairs. We'll mostly use them to organize data pre-processing steps and parameter combinations when developing models.
Below is an example that uses the keys 'brand', 'year', and 'colors' with the values 'Ford', 1964, and ['red', 'white', 'blue'].
# Create the dictionary
my_dict = {'brand': 'Ford',
           'year': 1964,
           'colors': ['red', 'white', 'blue']}
# Access the 'colors' that are stored in this dictionary
my_dict['colors']
['red', 'white', 'blue']
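Dictionaries can also be modified after creation. A small sketch (reusing a dictionary like the one above):

```python
my_dict = {'brand': 'Ford', 'year': 1964}
my_dict['colors'] = ['red', 'white', 'blue']  # assigning to a new key adds it
my_dict['year'] = 1965                        # assigning to an existing key overwrites its value
print(my_dict['year'])                        # prints 1965
```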
Arrays are a data structure provided by the numpy library that share a number of similarities with lists, but have a few key distinctions: every element of an array must be of the same type, and arithmetic operators act elementwise on arrays rather than on the object as a whole.
Below is an example of a scenario where lists and arrays behave very differently:
# Two lists used in the example
my_list1 = [1,5,3,9]
my_list2 = [2,5,0,-4]
# The '+' operator will concatenate lists
my_list1 + my_list2
[1, 5, 3, 9, 2, 5, 0, -4]
# The '+' operator will perform vectorized addition on numpy arrays
my_array1 = np.array(my_list1)
my_array2 = np.array(my_list2)
my_array1 + my_array2
array([ 3, 10, 3, 5])
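The same distinction shows up with other arithmetic operators. For instance, '*' repeats a list but multiplies an array elementwise (a small sketch):

```python
import numpy as np

my_list = [1, 5, 3, 9]
print(my_list * 2)            # list repetition: [1, 5, 3, 9, 1, 5, 3, 9]
print(np.array(my_list) * 2)  # elementwise multiplication: [ 2 10  6 18]
```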
Arrays also provide us an opportunity to see two important aspects of Python objects: attributes and methods.
Below are a few examples involving numpy
arrays:
## Create a 2-d array
my_2d_array = np.array([[1,3,5], [2,4,6]])
## Use the "shape" attribute to see the dimensions of this array
my_2d_array.shape
(2, 3)
## Use the "flatten" method to make this 2-d array into a 1-d array
my_2d_array.flatten()
array([1, 3, 5, 2, 4, 6])
Notice that methods are designed to accept arguments (hence the empty parentheses in the second example) but attributes are fixed characteristics of an object and do not involve any arguments.
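As another sketch of this distinction (using the same array as above): dtype is an attribute accessed without parentheses, while reshape() is a method called with arguments:

```python
import numpy as np

my_2d_array = np.array([[1, 3, 5], [2, 4, 6]])
print(my_2d_array.dtype)          # attribute: the element type, no parentheses
print(my_2d_array.reshape(3, 2))  # method: called with arguments, returns a 3-by-2 array
```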
We will not exhaustively cover all of the attributes and methods of the objects used throughout this class, so you should be prepared to read documentation. Here is the link to the official documentation page for numpy
arrays.
Question #3:
Part A: Use the stack() function to create a single 3 by 2 array from these two arrays. After doing this, briefly explain the implications of changing the axis argument from 0 to 1 in the stack() function.
Part B: Print the ndim, size, and shape attributes of the new array you created in Part A. Briefly describe the information each of these attributes provides about an array.

While numpy arrays are the expected input for many machine learning methods, DataFrames can sometimes be more useful due to their broader set of built-in data manipulation tools.
Unlike numpy
arrays, which can have arbitrarily many axes, DataFrames store data along exactly 2 axes, with each row representing an observation/sample and each column representing a variable/feature. The analogue to a 1-dimensional DataFrame in pandas
is a "Series", but we'll try to avoid doing anything with series.
We've already created a pandas
DataFrame named ic_homes
using the pd.read_csv()
command earlier in the lab. The examples below demonstrate some basic capabilities of DataFrames:
## Example 1 - Printing the variable names and types in ic_homes
ic_homes.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   sale.amount   777 non-null    int64
 1   sale.date     777 non-null    object
 2   occupancy     777 non-null    object
 3   style         777 non-null    object
 4   built         777 non-null    int64
 5   bedrooms      777 non-null    int64
 6   bsmt          777 non-null    object
 7   ac            777 non-null    object
 8   attic         777 non-null    object
 9   area.base     777 non-null    int64
 10  area.add      777 non-null    int64
 11  area.bsmt     777 non-null    int64
 12  area.garage1  777 non-null    int64
 13  area.garage2  777 non-null    int64
 14  area.living   777 non-null    int64
 15  area.lot      777 non-null    int64
 16  lon           777 non-null    float64
 17  lat           777 non-null    float64
 18  assessed      777 non-null    int64
dtypes: float64(2), int64(11), object(6)
memory usage: 115.5+ KB
## Example 2 - selecting a single variable and describing it
ic_homes['sale.amount'].describe()
count       777.000000
mean     180098.329472
std       90655.308636
min       38250.000000
25%      130000.000000
50%      157900.000000
75%      205000.000000
max      815000.000000
Name: sale.amount, dtype: float64
## Example 3 - selecting three variables and printing the first 3 observations
ic_homes[['sale.date','sale.amount', 'assessed']].head(3)
   sale.date  sale.amount  assessed
0   1/3/2005       172500    173040
1   1/5/2005        90000     89470
2  1/12/2005       168500    164230
## Example 4 - filtering by a logical condition, selecting three variables, and reporting the dimensions of the resulting DataFrame
ic_homes.loc[ic_homes['sale.amount'] > 500000, ['sale.date','sale.amount', 'assessed']].shape
(11, 3)
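DataFrames also provide a sort_values() method for ordering rows by a column. A minimal sketch using a made-up DataFrame (the column names here are illustrative):

```python
import pandas as pd

toy = pd.DataFrame({'price': [200, 150, 300], 'beds': [3, 2, 4]})
# ascending=False orders the rows from largest to smallest 'price'
print(toy.sort_values(by='price', ascending=False))
```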
Again, this is a small set of examples. You should take a look at the pandas Cheat Sheet for a brief overview of other commonly used attributes and methods.
Question #4: Order the rows of the ic_homes
DataFrame from largest to smallest assessed value. Then, for the five homes with the highest assessed values and three or fewer bedrooms, print their sale amount and sale date.
Data Manipulation with pandas

Oftentimes we need to alter the format of a data set or combine it with another data source in order for it to be useful to a machine learning algorithm. This section will cover a few common types of data manipulation that you'll need to use on our first homework assignment; however, this is not an exhaustive data manipulation tutorial and you should be prepared to use the pandas documentation (and other resources) to figure out how to execute data manipulation not explicitly covered here.
Grouped summarization applies a method across the groups defined by a categorical variable specified in a prior use of the groupby()
method. Shown below is an example that groups homes in the Iowa City Homes data set by the variable 'style'
and calculates the mean sale amount within each group:
ic_homes.groupby(by='style')['sale.amount'].mean()
style
1 1/2 Story Frame    186644.000000
1 Story Brick        220225.416667
1 Story Condo        121246.600000
1 Story Frame        166179.740634
2 Story Brick        334985.000000
2 Story Condo        149766.666667
2 Story Frame        215038.451087
Split Foyer Frame    160058.333333
Split Level Frame    208351.612903
Name: sale.amount, dtype: float64
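The same groupby() pattern can produce several summaries at once via the agg() method. A sketch with made-up data (the column names here are illustrative):

```python
import pandas as pd

toy = pd.DataFrame({'style': ['A', 'A', 'B'], 'price': [100, 200, 300]})
# agg() applies each listed summary function to every group
print(toy.groupby(by='style')['price'].agg(['mean', 'count']))
```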
Two data frames can be joined using the merge()
method. The example below demonstrates a simple "left join", which adds columns from the second (right) data frame to the first while retaining all rows in the first (left) data frame.
## Create an example dataframe for illustration
more_data = pd.DataFrame({'sale.date': ['1/3/2005','1/12/2005'],
                          'new_variable': ['a','b']})
## Left join 'ic_homes' onto this example dataframe according to the 'sale.date' variable
merged_data = more_data.merge(ic_homes, on='sale.date', how='left')
## Print some of the merged data, notice the new columns from 'ic_homes'
merged_data[['sale.date', 'new_variable', 'sale.amount', 'built']]
   sale.date new_variable  sale.amount  built
0   1/3/2005            a       172500   1993
1  1/12/2005            b       168500   1976
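The how argument controls which rows survive a merge. A sketch contrasting 'left' and 'inner' joins on two made-up frames:

```python
import pandas as pd

left = pd.DataFrame({'key': ['a', 'b'], 'x': [1, 2]})
right = pd.DataFrame({'key': ['b', 'c'], 'y': [3, 4]})
print(left.merge(right, on='key', how='left'))   # keeps both rows of 'left'; y is NaN for key 'a'
print(left.merge(right, on='key', how='inner'))  # keeps only the matching key 'b'
```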
Note: Question #2 on Homework #1 involves concepts from this section, so you should ask questions about these examples if you have them.
In this class we will mainly construct graphics for two purposes: exploring data sets and displaying the results of machine learning methods. For the first of these we can rely upon the built-in data visualization methods of pandas DataFrames and Series.
The most basic of these is the plot() method, where the kind argument is used to determine the type of visualization. Common values of kind include 'line', 'bar', 'hist', 'box', and 'scatter'.
Next is an example showing how to display histograms of sale prices and assessed values in the Iowa City Home Sales data set:
ic_homes[['sale.amount', 'assessed']].plot(kind = 'hist', subplots = True)
array([<AxesSubplot:ylabel='Frequency'>, <AxesSubplot:ylabel='Frequency'>], dtype=object)
The additional argument subplots = True
places each column in its own subplot. If this argument is omitted both variables will be graphed in the same plot using shared axes.
Additional details can be found in the plot() method documentation.
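As a sketch of another kind value, the example below draws a scatterplot from a made-up DataFrame (the Agg backend is assumed here so the snippet runs outside a notebook):

```python
import matplotlib
matplotlib.use('Agg')  # non-interactive backend, assumed for scripted runs
import pandas as pd

toy = pd.DataFrame({'x': [1, 2, 3], 'y': [2, 4, 6]})
ax = toy.plot(kind='scatter', x='x', y='y')  # scatterplots require x and y columns
```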
Another useful graphic that we'll use in the future is a scatterplot matrix, which depicts all pairwise relationships between numeric variables. There is a separate pandas
function used to create this graphic that is demonstrated below:
## A few imports
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
## Create the scatterplot matrix after selecting only numeric columns
plot = scatter_matrix(ic_homes.select_dtypes(include=['number']))
plt.show()
Line charts created using the matplotlib
library are the final type of graphic that we will make extensive use of throughout the semester.
The example below creates a line chart showing three sequences of values for a common set of inputs:
## Setup
## Create some inputs
x = np.linspace(0, 4, num=20) # 20 equally spaced values b/w 0 and 4
## Display 3 different functions of x
plt.plot(x, x)
plt.plot(x, np.power(x,2))
plt.plot(x, np.exp(x))
## Add a legend
plt.legend(['y = x', 'y = x^2', 'y = e^x'], loc='upper left')
## Optional plt.show() - this will prevent superfluous printed output
plt.show()
Question #5: Create two series of values summarized by the variable built in the data set. Then use matplotlib to create a line chart displaying both series in the same figure, using a different colored line to represent each.

Two coding concepts that we will make extensive use of throughout the semester are user-created functions and loops. We've already seen and used functions that others have created. The example below creates a function that returns the sum of squared differences between two vectors, y and yh:
## Define the function
def squared_diff(y, yh):
    diff = y - yh
    ss_diff = np.dot(diff, diff)
    return ss_diff
## Example showing how it is used
squared_diff(y = np.array([1,1,1]), yh = np.array([1,0,2]))
2
Proper indentation is an essential piece of Python's syntax, and the body of a function should be indented by exactly 4 spaces.
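Function arguments can also be given default values, which callers may omit. A sketch using a hypothetical helper (mean_abs_diff is not part of the lab):

```python
import numpy as np

# 'scale' has a default value, so callers can omit it (hypothetical helper)
def mean_abs_diff(y, yh, scale=1.0):
    diff = np.abs(y - yh)     # elementwise absolute differences
    return scale * np.mean(diff)

print(mean_abs_diff(np.array([1, 2]), np.array([0, 0])))             # prints 1.5
print(mean_abs_diff(np.array([1, 2]), np.array([0, 0]), scale=2.0))  # prints 3.0
```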
Our next important concept is looping, which allows us to repeat a certain set of instructions multiple times. We will encounter two types of loops: while loops, which repeat a code block for as long as a given condition evaluates to True, and for loops, which repeat a code block once for each element of an iterable object.

Below is an example of a while loop:
## Example while loop
x = 0
while x <= 3:
    print(x)
    x = x + 1
0
1
2
3
We see that the code block inside the loop was executed four times. Prior to what would have been the fifth repetition, the loop's condition was no longer met because x contained the value 4, so the loop terminated.
A more realistic example of how while loops are used in machine learning is to perform a repeated update until a precision-based criterion is met. For example, we might use a while loop to repeatedly apply an algorithm to find optimal values of the parameters in a model within a certain tolerance or level of precision.
As a toy illustration, consider the following loop that shrinks a vector's elements towards zero until a certain squared difference is achieved:
## More realistic use of a while loop
diff = float('inf')
tol = 0.1
yh = np.array([1,1.5,-2])
target = np.array([0,0,0])
while diff > tol:
    diff = squared_diff(yh, target)
    yh = yh/1.5  # shrink each value to two-thirds of its previous size
    print(yh)
[ 0.66666667  1.         -1.33333333]
[ 0.44444444  0.66666667 -0.88888889]
[ 0.2962963   0.44444444 -0.59259259]
[ 0.19753086  0.2962963  -0.39506173]
[ 0.13168724  0.19753086 -0.26337449]
[ 0.0877915   0.13168724 -0.17558299]
[ 0.05852766  0.0877915  -0.11705533]
To motivate our second type of looping, we'll briefly address the topic of files and directories, as some of the data we'll work with throughout the semester is not easily stored in a single file.
To begin, download the following folder containing 50 images of cats and unzip it in an easily accessible directory on your PC. I have placed it in OneDrive - Grinnell College/Documents/cats/, a path you should modify when trying to run my examples.
## Libraries
import os
import matplotlib.image as mpimg
## Root directory containing the folder (on my PC)
path = 'OneDrive - Grinnell College/Documents/cats/'
## Display the first file
file_list = os.listdir(path) ## This is a list of all files in the directory
plt.imshow(mpimg.imread(path + file_list[0])) ## Notice how '+' combines strings
<matplotlib.image.AxesImage at 0x1e6b3d1d1c0>
Suppose we'd like to display several cats from the folder in a grid. We could manually load each image into a different object, but a better solution is to exploit the iterable nature of list objects using a for loop:
first_9_cats = file_list[0:9]
for file in first_9_cats:
    plt.imshow(mpimg.imread(path + file))
    plt.show()
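The loop above displays the images one at a time. One way to arrange panels in a single grid is plt.subplots(); a sketch is shown below (random arrays stand in for the images so the snippet is self-contained):

```python
import matplotlib
matplotlib.use('Agg')  # non-interactive backend, assumed for scripted runs
import matplotlib.pyplot as plt
import numpy as np

fig, axes = plt.subplots(3, 3)        # a 3-by-3 grid of axes
for ax in axes.flatten():             # axes is a 2-d array, so flatten it to loop
    ax.imshow(np.random.rand(8, 8))   # replace with mpimg.imread(path + file) for real images
    ax.axis('off')                    # hide tick marks on each panel
```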
Question #6: For this question you should download the zipped folder located at: https://remiller1450.github.io/data/experiment.zip This folder contains 4 example files that each contain two variables.
Part A: Write a function that returns the value of VDS.Veh.Speed for a user-specified percentile of the absolute values of the variable SCC.Lane.Deviation.2 for a given CSV contained in the "experiment" folder. Your function should accept the file name/path and percentile as its only inputs, and it should return a single value.
Part B: Demonstrate your function using the file run35_treatment.