-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathttest.py
134 lines (112 loc) · 6.83 KB
/
ttest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import pandas as pd
from scipy.stats import ttest_ind
# Maps two-letter postal abbreviations to full names. Besides the fifty states
# it includes DC, the US territories, and the 'NA' -> 'National' entry.
states = {
    'OH': 'Ohio',
    'KY': 'Kentucky',
    'AS': 'American Samoa',
    'NV': 'Nevada',
    'WY': 'Wyoming',
    'NA': 'National',
    'AL': 'Alabama',
    'MD': 'Maryland',
    'AK': 'Alaska',
    'UT': 'Utah',
    'OR': 'Oregon',
    'MT': 'Montana',
    'IL': 'Illinois',
    'TN': 'Tennessee',
    'DC': 'District of Columbia',
    'VT': 'Vermont',
    'ID': 'Idaho',
    'AR': 'Arkansas',
    'ME': 'Maine',
    'WA': 'Washington',
    'HI': 'Hawaii',
    'WI': 'Wisconsin',
    'MI': 'Michigan',
    'IN': 'Indiana',
    'NJ': 'New Jersey',
    'AZ': 'Arizona',
    'GU': 'Guam',
    'MS': 'Mississippi',
    'PR': 'Puerto Rico',
    'NC': 'North Carolina',
    'TX': 'Texas',
    'SD': 'South Dakota',
    'MP': 'Northern Mariana Islands',
    'IA': 'Iowa',
    'MO': 'Missouri',
    'CT': 'Connecticut',
    'WV': 'West Virginia',
    'SC': 'South Carolina',
    'LA': 'Louisiana',
    'KS': 'Kansas',
    'NY': 'New York',
    'NE': 'Nebraska',
    'OK': 'Oklahoma',
    'FL': 'Florida',
    'CA': 'California',
    'CO': 'Colorado',
    'PA': 'Pennsylvania',
    'DE': 'Delaware',
    'NM': 'New Mexico',
    'RI': 'Rhode Island',
    'MN': 'Minnesota',
    'VI': 'Virgin Islands',
    'NH': 'New Hampshire',
    'MA': 'Massachusetts',
    'GA': 'Georgia',
    'ND': 'North Dakota',
    'VA': 'Virginia',
}
def get_list_of_university_towns():
    """Return a DataFrame of university towns and the states they are in.

    Reads ``university_towns.txt`` from the current working directory. A line
    ending in ``[edit]`` names a state; every following non-blank line names a
    town in that state. Anything from the first ``(`` onward (the university
    name and any footnote marker) is dropped from the town name.

    Returns:
        pandas.DataFrame: columns ``["State", "RegionName"]``, one row per
        town, in file order.

    Raises:
        ValueError: if a town line appears before any state header.
    """
    state_marker = '[edit]'
    state = None
    state_town = []
    with open('university_towns.txt') as file:
        for raw_line in file:
            # strip() rather than slicing off one character: the last line of
            # the file may not end with a newline, and slicing would then eat
            # the final letter of the town name.
            line = raw_line.strip()
            if not line:
                # Blank lines carry no town; do not emit empty region names.
                continue
            if line.endswith(state_marker):
                state = line[:-len(state_marker)].rstrip()
                continue
            if state is None:
                raise ValueError(
                    "university_towns.txt lists town %r before any state header" % line
                )
            # Keep only the text before the first '(' and trim whatever
            # whitespace separates it from the parenthesis.
            town = line.partition('(')[0].rstrip()
            state_town.append([state, town])
    return pd.DataFrame(state_town, columns=["State", "RegionName"])
def get_recession_start(gdp=None):
    """Return the quarter label at which the first recession starts, e.g. ``2005q3``.

    The reported quarter is the first one whose GDP is higher than the next
    quarter's GDP, which is in turn higher than the GDP of the quarter after
    that (i.e. it is followed by two consecutive declines).

    Args:
        gdp: Optional DataFrame with a ``Quarter`` column of labels and a
            ``GDP`` column of values, rows in chronological order. When
            omitted, the table is loaded from ``gdplev.xls``.

    Returns:
        The ``Quarter`` value of the first matching row.

    Raises:
        IndexError: if the data contains no two consecutive declines.
    """
    if gdp is None:
        gdp = pd.read_excel('gdplev.xls', skiprows=7)
        # Row offset 212 presumably skips to the first quarter of interest
        # (the housing data starts in 2000) -- confirm against the workbook.
        gdp = gdp[['Unnamed: 4', 'Unnamed: 5']].iloc[212:]
        gdp.columns = ['Quarter', 'GDP']

    # Work on plain lists: indexing a string-labelled row Series with an
    # integer (``row[1]``) relies on deprecated positional fallback.
    quarters = gdp['Quarter'].tolist()
    values = gdp['GDP'].tolist()
    for quarter, current, following, after in zip(quarters, values, values[1:], values[2:]):
        if current > following > after:
            return quarter
    raise IndexError("no two consecutive quarterly GDP declines found")
def get_recession_end(gdp=None):
    """Return the quarter label at which the first recession ends, e.g. ``2005q3``.

    Looks for the first run of five consecutive quarters in which GDP falls
    twice and then rises twice, and reports the last quarter of that run (the
    second quarter of growth).

    Args:
        gdp: Optional DataFrame with a ``Quarter`` column of labels and a
            ``GDP`` column of values, rows in chronological order. When
            omitted, the table is loaded from ``gdplev.xls``.

    Returns:
        The ``Quarter`` value of the fifth row of the first matching run.

    Raises:
        IndexError: if the data contains no such decline-then-growth run.
    """
    if gdp is None:
        gdp = pd.read_excel('gdplev.xls', skiprows=7)
        # Row offset 212 presumably skips to the first quarter of interest
        # (the housing data starts in 2000) -- confirm against the workbook.
        gdp = gdp[['Unnamed: 4', 'Unnamed: 5']].iloc[212:]
        gdp.columns = ['Quarter', 'GDP']

    # Work on plain lists: indexing a string-labelled row Series with an
    # integer (``row[1]``) relies on deprecated positional fallback.
    quarters = gdp['Quarter'].tolist()
    values = gdp['GDP'].tolist()
    windows = zip(values, values[1:], values[2:], values[3:], values[4:], quarters[4:])
    for before, falling, trough, rising, recovered, end_quarter in windows:
        if before > falling > trough < rising < recovered:
            return end_quarter
    raise IndexError("no two declines followed by two rises found in quarterly GDP")
def get_recession_bottom():
    """Return the quarter label of the recession bottom, e.g. ``2005q3``.

    The bottom is the quarter with the smallest GDP value among the quarters
    from the recession start through the recession end, both inclusive.
    """
    first_quarter = get_recession_start()
    last_quarter = get_recession_end()

    table = pd.read_excel('gdplev.xls', skiprows=7)
    table = table[['Unnamed: 4', 'Unnamed: 5']].iloc[212:]
    table.columns = ['Quarter', 'GDP']

    # Label-based slicing on the Quarter index includes both endpoints.
    recession = table.set_index('Quarter').loc[first_quarter:last_quarter]
    return recession['GDP'].idxmin()
def convert_housing_data_to_quarters():
    """Return the housing data aggregated from months to quarterly means.

    Reads ``City_Zhvi_AllHomes.csv`` and averages the monthly columns of each
    quarter row by row. The result has columns ``2000q1`` through ``2016q3``
    and a multi-index of ``["State", "RegionName"]``, with state abbreviations
    replaced by full names from ``states``. ``2016q3`` is the mean of July and
    August only, and no ``2016q4`` column is produced.
    """
    monthly = pd.read_csv('City_Zhvi_AllHomes.csv')
    # Columns are discarded by position (0 and 3..50); presumably these are an
    # identifier plus metadata and pre-2000 months -- confirm against the CSV.
    discard = monthly.columns[[0] + list(range(3, 51))]
    monthly = monthly.drop(discard, axis=1)

    quarterly = pd.DataFrame(monthly[['State', 'RegionName']])
    month_suffixes = (
        ('q1', ['-01', '-02', '-03']),
        ('q2', ['-04', '-05', '-06']),
        ('q3', ['-07', '-08', '-09']),
        ('q4', ['-10', '-11', '-12']),
    )
    for year in range(2000, 2017):
        prefix = str(year)
        for quarter, suffixes in month_suffixes:
            if year == 2016:
                # The final year is only used through August.
                if quarter == 'q4':
                    continue
                if quarter == 'q3':
                    suffixes = suffixes[:2]
            month_columns = [prefix + suffix for suffix in suffixes]
            quarterly[prefix + quarter] = monthly[month_columns].mean(axis=1)

    quarterly['State'] = [states[abbreviation] for abbreviation in quarterly['State']]
    return quarterly.set_index(['State', 'RegionName'])
def run_ttest():
    """Test whether university towns fared differently in the recession.

    For every region the price ratio ``price in the quarter before the
    recession start / price at the recession bottom`` is computed. A
    two-sample t-test then compares the ratios of university towns with those
    of all other towns. The null hypothesis is that both groups have the same
    mean ratio.

    Returns:
        tuple: ``(different, p, better)`` where ``different`` is ``True`` when
        ``p < 0.01`` (the null hypothesis is rejected) and ``False``
        otherwise, ``p`` is the p-value of the test, and ``better`` is
        ``"university town"`` or ``"non-university town"``, whichever group
        has the lower mean price ratio (i.e. the smaller market loss).

    Raises:
        ValueError: if the recession start is the first quarter of the
            housing data, so there is no preceding quarter to compare with.
    """
    unitowns = get_list_of_university_towns()
    bottom = get_recession_bottom()
    start = get_recession_start()
    house = convert_housing_data_to_quarters()

    start_position = house.columns.get_loc(start)
    if start_position == 0:
        # Index -1 would silently wrap around to the last quarter.
        raise ValueError("no housing data for the quarter before %s" % start)
    before_start = house.columns[start_position - 1]

    prices = house[[bottom, before_start]].reset_index()
    prices['ratio'] = prices[before_start] / prices[bottom]

    # One left merge with an indicator flags the university towns directly,
    # without a second merge keyed on float-valued columns.
    tagged = prices.merge(
        unitowns[['State', 'RegionName']],
        how='left',
        on=['State', 'RegionName'],
        indicator=True,
    )
    is_university_town = tagged['_merge'] == 'both'
    university_ratios = tagged.loc[is_university_town, 'ratio'].dropna()
    other_ratios = tagged.loc[~is_university_town, 'ratio'].dropna()

    _, p = ttest_ind(university_ratios, other_ratios)
    different = bool(p < 0.01)
    if university_ratios.mean() < other_ratios.mean():
        better = "university town"
    else:
        better = "non-university town"
    return different, p, better
if __name__ == '__main__':
    # Script entry point: run the analysis and print the
    # (different, p, better) tuple returned by run_ttest().
    outcome = run_ttest()
    print(outcome)