forked from jgoldstein2/zillow_project
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathproperties_final_new.R
182 lines (124 loc) · 8.28 KB
/
properties_final_new.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
library(DT)
library(data.table)
library(dplyr)
library(lubridate)
library(Hmisc)
library(mice)
library(lattice)
###############################################################
# Reading files
###############################################################
# Load the raw properties CSV with fread() and convert the result to a plain
# data.frame so the base-R and dplyr steps below operate on a data.frame.
properties_df <- "properties_2016.csv" %>%
  fread() %>%
  as.data.frame()
###############################################################
# Dropping columns
###############################################################
# Column names of the raw data, captured before any column is removed.
name_list <- colnames(properties_df)

# Columns that are excluded from the cleaned data set.
cols_drop <- c(
  "assessmentyear",
  "architecturalstyletypeid",
  "basementsqft",
  "buildingclasstypeid",
  "calculatedbathnbr",
  "threequarterbathnbr",
  "finishedfloor1squarefeet",
  "finishedsquarefeet12",
  "finishedsquarefeet13",
  "finishedsquarefeet15",
  "finishedsquarefeet50",
  "finishedsquarefeet6",
  "fips",
  "fireplaceflag",
  "fullbathcnt",
  "numberofstories",
  "poolsizesum",
  "pooltypeid10",
  "pooltypeid2",
  "pooltypeid7",
  "propertycountylandusecode",
  "propertyzoningdesc",
  "rawcensustractandblock",
  "regionidcity",
  "censustractandblock",
  "regionidneighborhood",
  "roomcnt",
  "storytypeid",
  "typeconstructiontypeid",
  "yardbuildingsqft17",
  "yardbuildingsqft26",
  "taxdelinquencyyear",
  "garagetotalsqft"
)

# Keep every column whose name has no match in cols_drop.
cleanProperties <- properties_df[, is.na(match(names(properties_df), cols_drop))]
###############################################################
# Mutating/Adding Features
###############################################################
# Rescale the coordinates by 1e6 and derive the home age as 2017 minus the
# year built. age_of_home is appended as a new column.
cleanProperties <- cleanProperties %>%
  mutate(
    latitude = latitude / 1e6,
    longitude = longitude / 1e6,
    age_of_home = 2017 - yearbuilt
  )
###############################################################
# Imputation of Binary Variables
###############################################################
# Missing pool counts become 0 and missing unit counts become 1.
cleanProperties$poolcnt[is.na(cleanProperties$poolcnt)] <- 0
cleanProperties$unitcnt[is.na(cleanProperties$unitcnt)] <- 1

# Presence flags: any recorded value becomes 1, a missing value becomes 0.
cleanProperties$decktypeid <- ifelse(is.na(cleanProperties$decktypeid), 0, 1)
cleanProperties$fireplacecnt <- ifelse(is.na(cleanProperties$fireplacecnt), 0, 1)

# Blank-or-set flags: an empty string OR an NA becomes 0, anything else 1.
# The is.na() guard matters because `NA == ""` evaluates to NA, which ifelse()
# would pass through as NA instead of 0. NOTE(review): whether blanks arrive
# as "" or NA (and whether hashottuborspa is character or logical) depends on
# how fread parsed the CSV -- this form yields the same 0/1 result either way.
cleanProperties$taxdelinquencyflag <- ifelse(
  is.na(cleanProperties$taxdelinquencyflag) |
    cleanProperties$taxdelinquencyflag == "",
  0, 1
)
cleanProperties$hashottuborspa <- ifelse(
  is.na(cleanProperties$hashottuborspa) |
    cleanProperties$hashottuborspa == "",
  0, 1
)

# Air conditioning: missing is treated as 1; type id 5 maps to 0 and every
# other id to 1. (Presumably 5 is the "none" code -- confirm against the data
# dictionary.)
cleanProperties$airconditioningtypeid <- ifelse(
  is.na(cleanProperties$airconditioningtypeid), 1,
  ifelse(cleanProperties$airconditioningtypeid == 5, 0, 1)
)

# Heating: missing is treated as 1; type id 13 maps to 0 and every other id
# to 1. (Presumably 13 is the "none" code -- confirm against the data
# dictionary.)
cleanProperties$heatingorsystemtypeid <- ifelse(
  is.na(cleanProperties$heatingorsystemtypeid), 1,
  ifelse(cleanProperties$heatingorsystemtypeid == 13, 0, 1)
)
###############################################################
# Imputation by Mean/Mode
###############################################################
# Fill missing tax/value amounts with the column mean via impute(), then
# strip the result back to a plain numeric vector with as.numeric().
cleanProperties <- cleanProperties %>%
  mutate(
    taxvaluedollarcnt = as.numeric(impute(taxvaluedollarcnt, mean)),
    structuretaxvaluedollarcnt = as.numeric(impute(structuretaxvaluedollarcnt, mean)),
    landtaxvaluedollarcnt = as.numeric(impute(landtaxvaluedollarcnt, mean)),
    taxamount = as.numeric(impute(taxamount, mean))
  )
# Return the most frequent value of `vec` as a character label.
#
# Frequencies come from table(), so NA entries are not counted. When several
# values tie, the first one in table() order is returned.
mode_ <- function(vec) {
  counts <- table(vec)
  most_frequent <- which.max(counts)
  names(most_frequent)
}
# cleanProperties$bathroomcnt <- as.numeric(ifelse(cleanProperties$bathroomcnt == 0,
# mode_(cleanProperties$bathroomcnt),
# cleanProperties$bathroomcnt))
# cleanProperties$bathroomcnt <- as.numeric(impute(cleanProperties$bathroomcnt, mode_(cleanProperties$bathroomcnt)))
# Missing bathroom counts are recorded as 0 rather than imputed.
cleanProperties$bathroomcnt <- ifelse(
  is.na(cleanProperties$bathroomcnt), 0, cleanProperties$bathroomcnt
)

# Mode imputation. mode_() returns the modal value as a character label, so
# it is converted to numeric before being used as the fill value; passing the
# label itself would coerce the whole column to character and back.
cleanProperties$bedroomcnt <- as.numeric(impute(
  cleanProperties$bedroomcnt,
  as.numeric(mode_(cleanProperties$bedroomcnt))
))
cleanProperties$regionidcounty <- as.numeric(impute(
  cleanProperties$regionidcounty,
  as.numeric(mode_(cleanProperties$regionidcounty))
))
cleanProperties$longitude <- as.numeric(impute(
  cleanProperties$longitude,
  as.numeric(mode_(cleanProperties$longitude))
))
cleanProperties$latitude <- as.numeric(impute(
  cleanProperties$latitude,
  as.numeric(mode_(cleanProperties$latitude))
))
cleanProperties$regionidzip <- as.numeric(impute(
  cleanProperties$regionidzip,
  as.numeric(mode_(cleanProperties$regionidzip))
))

# Mean imputation; the home age is rounded to a whole number afterwards.
# yearbuilt is removed once age_of_home has been filled in.
cleanProperties$age_of_home <- round(
  as.numeric(impute(cleanProperties$age_of_home, mean)), 0
)
cleanProperties$yearbuilt <- NULL
cleanProperties$lotsizesquarefeet <- as.numeric(
  impute(cleanProperties$lotsizesquarefeet, mean)
)
cleanProperties$calculatedfinishedsquarefeet <- as.numeric(
  impute(cleanProperties$calculatedfinishedsquarefeet, mean)
)

# Mode imputation for garage count and building quality. The whole imputed
# column is assigned: assigning the full-length imputed vector into only the
# NA positions would recycle its first elements into those slots, filling the
# gaps with values from unrelated rows instead of the mode.
cleanProperties$garagecarcnt <- as.numeric(impute(
  cleanProperties$garagecarcnt,
  as.numeric(mode_(cleanProperties$garagecarcnt))
))
cleanProperties$buildingqualitytypeid <- as.numeric(impute(
  cleanProperties$buildingqualitytypeid,
  as.numeric(mode_(cleanProperties$buildingqualitytypeid))
))
###############################################################
# Remove blank zip code rows
###############################################################
# cleanProperties <- cleanProperties %>% filter(!(is.na(cleanProperties$regionidzip)))
###############################################################
# Changing variables types
###############################################################
# Column names remaining after the drops and feature steps above.
cols_reduced <- colnames(cleanProperties)

# Columns that are stored as factors from here on.
cols_factors <- c(
  'airconditioningtypeid',
  'buildingqualitytypeid',
  'decktypeid',
  'fireplacecnt',
  'hashottuborspa',
  'heatingorsystemtypeid',
  'poolcnt',
  'propertylandusetypeid',
  'regionidcounty',
  'regionidzip',
  'taxdelinquencyflag'
)

# Convert each listed column to a factor, one column at a time.
cleanProperties[cols_factors] <- lapply(
  cols_factors,
  function(col_name) factor(cleanProperties[[col_name]])
)
# cleanProperties$garagecarcnt = as.numeric(cleanProperties$garagecarcnt)
# unitcnt is stored as a numeric column.
cleanProperties[["unitcnt"]] <- as.numeric(cleanProperties[["unitcnt"]])
###############################################################
# Imputation by Random Sampling
###############################################################
# impDistr <- function(vec) {
# i=1; s=c()
# samples <- sort(unique(vec))
# numNAs <- sum(is.na(vec))
# x <- table(vec)/length(vec)
# while (i <= length(x)) {
# s[i] = x[[i]][1]
# i = i + 1
# }
# return(sample(samples, numNAs, prob = s, replace=T))
# }
# cleanProperties$garagecarcnt[is.na(cleanProperties$garagecarcnt)] = impDistr(cleanProperties$garagecarcnt)
#
# cleanProperties$airconditioningtypeid[is.na(cleanProperties$airconditioningtypeid)] = impDistr(cleanProperties$airconditioningtypeid)
#
# cleanProperties$buildingqualitytypeid[is.na(cleanProperties$buildingqualitytypeid)] = impDistr(cleanProperties$buildingqualitytypeid)
###############################################################
# Rename Binary Variables to Flags
###############################################################
# Give the 0/1 columns *flag names; rename() keeps each column in place.
cleanProperties <- cleanProperties %>%
  dplyr::rename(
    acflag = airconditioningtypeid,
    heatflag = heatingorsystemtypeid,
    deckflag = decktypeid,
    fireplaceflag = fireplacecnt,
    hottubflag = hashottuborspa,
    poolflag = poolcnt
  )
###############################################################
# Group Building Quality, Drop Source Columns, Save
###############################################################
# cleanProperties$property_group = as.factor(ifelse(cleanProperties$propertylandusetypeid %in% c(31,46,47), "Commercial",
# ifelse(cleanProperties$propertylandusetypeid %in% c(266,267,246,247,248), "Apartment",
# ifelse(cleanProperties$propertylandusetypeid %in% c(269,290,291,274,270), "Land", "House"))))
# Remove property_group if it is present.
cleanProperties$property_group <- NULL

# Bucket the building quality id into three labels: ids 1-4 are "Good",
# ids 5-8 are "Average", and everything else is "Bad".
cleanProperties$building_quality <- local({
  quality_id <- cleanProperties$buildingqualitytypeid
  quality_label <- rep("Bad", length(quality_id))
  quality_label[quality_id %in% c(5, 6, 7, 8)] <- "Average"
  quality_label[quality_id %in% c(1, 2, 3, 4)] <- "Good"
  as.factor(quality_label)
})

# The raw id columns are not kept in the saved data set.
cleanProperties[["propertylandusetypeid"]] <- NULL
cleanProperties[["buildingqualitytypeid"]] <- NULL

# Persist the cleaned data frame.
save(cleanProperties, file = "cleanProperties_final_new.Rda")