-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean.py
143 lines (117 loc) · 5.97 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
'''
Different processes that have been followed to clean the data present in the aggregated data file
'''
def nomalize_percent(x):
    """Convert a percentage value (e.g. 85.0) into a fraction (0.85)."""
    fraction = x / 100
    return float(fraction)
# Load the aggregated dataset (one row per municipality, per the index comment below).
file_path = '../Aggregated_Data_v3.csv'
df = pd.read_csv(file_path)
# Columns to remove; per the comment below they duplicate information held elsewhere.
inc_data = ['vehicles_#', 'poverty_all_#', 'emp_Pop_Ratio_Wh', 'emp_Pop_Ration_Afam', 'emp_Pop_Ratio_Alaska', 'emp_Pop_Ratio_Asian', 'emp_Pop_Ratio_Hawaii', 'emp_Pop_Ratio_Other', 'emp_Pop_Ratio_Two', 'emp_Pop_Ratio_Hisp']
#columns to be dropped because of duplication
df = df.drop(inc_data, axis = 1)
#convert from current format to numpy array
household_no = np.array(list(df['Total # of households']))
df = df.drop('Total # of households', axis = 1)
# Restrict this cleaning pass to the first 65 columns of the frame.
columns = list(df)[:65]
sub_set = df[columns]
pop = np.array(list(df['Total population']))
#towns are the indices of this dataframe
munip = list(df['Unnamed: 0'])
# Population in thousands, used later to express counts per 1000 residents.
per_k = pop/1000
# Employment rate as a fraction; source column is presumably a percentage — TODO confirm.
emp_rate = np.array(list(df['emp_rate']))/100
# Bookkeeping: which columns received which treatment (reported at end of script).
sparse = []
clean = []
rate = []
norm = []
perc = []
#Find out which columns are sparsed, have to be cleaned, converted to a rate, normalized or percentage(/100)
for c in columns[1:]:
    # to find the number of sparse columns: more than 175 missing values -> drop.
    if (sub_set[c].isna().sum() > 175):
        sub_set = sub_set.drop([c], axis = 1)
        sparse.append(c)
    else:
        name = sub_set[c].name
        dtype = sub_set[c].dtype
        #check if numbers are in float format, if not, remove commas and convert to numeric from string/object type
        if(dtype != 'float64'):
            # errors='ignore' leaves unparseable entries unchanged so the loop
            # below can repair them individually.
            # NOTE(review): errors='ignore' is deprecated in pandas >= 2.2 —
            # confirm which pandas version this project pins.
            temp = pd.to_numeric(sub_set[c], errors='ignore')
            clean.append(c)
            # BUGFIX: Series.iteritems() was removed in pandas 2.0;
            # Series.items() is the identical, long-supported equivalent.
            for i, j in temp.items():
                try:
                    float(j)
                except ValueError:
                    if(j == '-'):
                        # '-' marks a missing value in this dataset.
                        temp[i] = np.nan
                    elif name == 'median_sale_homes_USD' or name == 'rent_income_USD':
                        # Presumably values like '1,234+': drop the trailing
                        # character and thousands separators — TODO confirm format.
                        temp[i] = float(j[:-1].replace(',',''))
                    else:
                        # Unexpected non-numeric value: surface it for manual review.
                        print(name)
                        print(j)
                        print(type(j))
            sub_set.loc[: ,c] = pd.to_numeric(temp)
        #convert to per 1000 of the population
        if c == 'homeless_Shelters':
            rate.append(c)
            sub_set.loc[:, c] = np.array(list(sub_set[c])/per_k)
            sub_set.rename(index=str, columns={c:c+'_per_1000'}, inplace = True)
        #have to be normalized (percentage -> fraction)
        if c == 'labor_force_rate' or c == 'emp_rate' or c == 'age_lt18' or c== 'house_lt1939' or c == 'house_1940to1999' or c == 'house_2000to2014' or c == 'poverty_child_%' or c == 'rent_income_USD' or c == 'comm_car_%' or c == 'comm_carpool_%' or c == 'comm_bus_%' or c == 'comm_walk_%' or c == 'comm_cycle_%' or c == 'comm_taxi_%' or c == 'comm_wfh_%' or c == '% no vehicle is available' or c == '% below poverty level':
            norm.append(c)
            sub_set.loc[:, c] = sub_set[c].apply(nomalize_percent)
            # Rename normalized columns so the '%' suffix reflects the new unit.
            if c == 'age_lt18' or c== 'house_lt1939' or c == 'house_1940to1999' or c == 'house_2000to2014':
                sub_set.rename(index=str, columns={c:c+'_%'}, inplace = True)
            if c == '% no vehicle is available':
                sub_set.rename(index=str, columns={c:'no_vehicle_avail_%'}, inplace = True)
            if c == '% below poverty level':
                sub_set.rename(index=str, columns={c:'below_poverty_level_%'}, inplace = True)
            if c == 'rent_income_USD':
                sub_set.rename(index=str, columns={c:'gross_rent_%'}, inplace = True)
        #string formatting and cleaning of the data: raw counts -> share of total population
        if c == 'race_Wh' or c == 'race_Afam' or c == 'race_Alaska' or c == 'race_Asian' or c == 'race_Hawaii' or c == 'race_Other' or c == 'race_Two' or c == 'race_Hisp' or c == 'eng' or c == 'owner_homes_#' or c == 'moved_lastyear_#':
            perc.append(c)
            sub_set.loc[:, c] = np.array(list(sub_set[c])/pop)
            replace = ''
            if c == 'owner_homes_#' or c == 'moved_lastyear_#':
                replace = c.replace('#', '%')
            else:
                replace = c + '_%'
            sub_set.rename(index=str, columns={c:replace}, inplace = True)
        # Occupation counts -> share of the employed population (emp_rate * pop).
        if c == 'emp_business' or c == 'emp_computer' or c == 'emp_legal' or c == 'emp_healthcare' or c == 'emp_healthsupport' or c == 'emp_protective' or c == 'emp_foodprepare' or c == 'emp_clean' or c == 'emp_personalcare' or c == 'emp_sales' or c == 'emp_admin' or c == 'emp_farm' or c == 'emp_construct' or c == 'emp_repair' or c == 'emp_production' or c == 'emp_transport' or c == 'emp_material':
            perc.append(c)
            sub_set.loc[:, c] = np.array(sub_set[c])/(emp_rate * pop)
            sub_set.rename(index=str, columns={c:c+'_%'}, inplace = True)
        # Household-level counts -> share of all households.
        if c == 'occ_lt0.5' or c == 'occ_0.5to1.0' or c == 'occ_1.01to1.5' or c == 'occ_1.51to2.0' or c == 'occ_gt2.01' or c == 'house_noplumbing' or c == 'house_nokitchen' or c == 'house_income_gt30_USD':
            perc.append(c)
            sub_set.loc[:,c] = np.array(sub_set[c])/household_no
            sub_set.rename(index=str, columns={c:c+'_%'}, inplace = True)
# Second stage: re-load the pre-built health-determinant table (overwriting the
# sub_set produced above) and mean-center every column.
sub_set = pd.read_csv('data/health_determinant.csv', index_col=0)
columns = list(sub_set)
#converting to either standardized format or mean centered
for c in columns:
    # Mean-center; dividing by std (commented out) would standardize instead.
    sub_set.loc[:,c] = (sub_set[c]-sub_set[c].mean())#/sub_set[c].std()
sub_set.to_csv('data/determinant_data_mn.csv')
#raw data that is either standardized or mean-centered
# NOTE(review): 'Unnamed: 0' was already consumed as the index by index_col=0
# above, so this set_index likely raises KeyError unless the CSV contains a
# second column with that name — verify against the actual file.
sub_set.set_index('Unnamed: 0', inplace = True)
sub_set.to_csv('../cleaned_sub_1_raw.csv')
#Print all columns for which necessary changes took place
report_sections = [
    ("COLUMNS DROPPED DUE TO INCORRECT DATA", inc_data),
    ("COLUMNS DROPPED DUE TO SPARSE DATA", sparse),
    ("COLUMNS THAT HAD TO BE CLEANED", clean),
    ("COLUMNS THAT HAD TO BE CONVERTED TO A RATE", rate),
    ("COLUMNS THAT HAD TO BE NORMALIZED", norm),
    ("COLUMNS THAT HAD TO BE CONVERTED TO PERCENT", perc),
]
for heading, affected in report_sections:
    print(heading)
    print(affected)
    print('\n')