-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathDBNC.R
119 lines (104 loc) · 4.48 KB
/
DBNC.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
### load dependencies
library(readr)
library(dplyr)
library(tidyr)
library(Hmisc)
library(zoo)
library(VIM)
### Load raw data
# Read the train and test CSVs and stack them into one table so the cleaning
# below is applied to both. Test rows get SalePrice = NA, which is what later
# separates them from the training rows again.
df <- read_csv("rawData/train.csv")
test <- read_csv("rawData/test.csv")
test[["SalePrice"]] <- NA
df <- rbind(df, test)

# Column names and summary statistics of the combined table
names(df)
summary(df)

## Missing-value counts per column (text output only, no plot)
aggr(df, plot = FALSE)
# Clean and engineer features on the combined train + test table.
#
# * NA in the categorical columns below is replaced by an explicit "No ..."
#   label instead of being left missing.
# * Missing MasVnrArea becomes 0, and MasVnrType is forced to "No Vnr" wherever
#   the (already filled) area is 0.
# * Missing GarageYrBlt falls back to YearBuilt; missing Electrical is "SBrkr".
# * Derived columns: Age, Remodeled, Bsmt/Garage/Pool flags, TotalBath,
#   Condition.* flags, TotSF, DateSold, PorchTotSF and per-porch flags.
# * Id, Condition1 and Condition2 are dropped at the end.
#
# mutate() evaluates its arguments in order, so later expressions see the
# values filled in by earlier ones (e.g. Bsmt reads the imputed BsmtQual).
df <- df %>%
  mutate(
    Alley = ifelse(is.na(Alley), "No Alley", Alley),
    MasVnrArea = ifelse(is.na(MasVnrArea), 0, MasVnrArea),
    MasVnrType = ifelse(MasVnrArea == 0, "No Vnr", MasVnrType),
    BsmtQual = ifelse(is.na(BsmtQual), "No Bsmt", BsmtQual),
    BsmtCond = ifelse(is.na(BsmtCond), "No Bsmt", BsmtCond),
    BsmtExposure = ifelse(is.na(BsmtExposure), "No Bsmt", BsmtExposure),
    BsmtFinType1 = ifelse(is.na(BsmtFinType1), "No Bsmt", BsmtFinType1),
    BsmtFinType2 = ifelse(is.na(BsmtFinType2), "No Bsmt", BsmtFinType2),
    FireplaceQu = ifelse(is.na(FireplaceQu), "No FrPl", FireplaceQu),
    GarageType = ifelse(is.na(GarageType), "No Grge", GarageType),
    GarageFinish = ifelse(is.na(GarageFinish), "No Grge", GarageFinish),
    GarageQual = ifelse(is.na(GarageQual), "No Grge", GarageQual),
    GarageCond = ifelse(is.na(GarageCond), "No Grge", GarageCond),
    PoolQC = ifelse(is.na(PoolQC), "No Pool", PoolQC),
    Fence = ifelse(is.na(Fence), "No Fnce", Fence),
    MiscFeature = ifelse(is.na(MiscFeature), "No Feature", MiscFeature),
    GarageYrBlt = ifelse(is.na(GarageYrBlt), YearBuilt, GarageYrBlt),
    # Years between the last remodel and the sale
    Age = YrSold - YearRemodAdd,
    Remodeled = ifelse(YearBuilt == YearRemodAdd, "No", "Yes"),
    Bsmt = ifelse(BsmtQual == "No Bsmt", "No", "Yes"),
    Garage = ifelse(GarageType == "No Grge", "No", "Yes"),
    Pool = ifelse(PoolQC == "No Pool", "No", "Yes"),
    # Half baths count as 0.5
    TotalBath = BsmtFullBath + 0.5 * BsmtHalfBath + FullBath + 0.5 * HalfBath,
    # A condition flag is "Yes" when EITHER Condition1 or Condition2 carries
    # the value. (Previously Condition1 was compared twice, so Condition2 was
    # silently ignored before being dropped.)
    Condition.Norm = ifelse(Condition1 == "Norm" | Condition2 == "Norm", "Yes", "No"),
    Condition.Artery = ifelse(Condition1 == "Artery" | Condition2 == "Artery", "Yes", "No"),
    Condition.Feedr = ifelse(Condition1 == "Feedr" | Condition2 == "Feedr", "Yes", "No"),
    Condition.PosA = ifelse(Condition1 == "PosA" | Condition2 == "PosA", "Yes", "No"),
    Condition.PosN = ifelse(Condition1 == "PosN" | Condition2 == "PosN", "Yes", "No"),
    Condition.RRAe = ifelse(Condition1 == "RRAe" | Condition2 == "RRAe", "Yes", "No"),
    Condition.RRAn = ifelse(Condition1 == "RRAn" | Condition2 == "RRAn", "Yes", "No"),
    Condition.RRNe = ifelse(Condition1 == "RRNe" | Condition2 == "RRNe", "Yes", "No"),
    Condition.RRNn = ifelse(Condition1 == "RRNn" | Condition2 == "RRNn", "Yes", "No"),
    TotSF = GrLivArea + TotalBsmtSF,
    # Use the pipeline's own columns rather than reaching out to the global df
    DateSold = as.yearmon(paste(YrSold, MoSold), "%Y %m"),
    PorchTotSF = WoodDeckSF + OpenPorchSF + EnclosedPorch + `3SsnPorch` + ScreenPorch,
    WoodDeck = ifelse(WoodDeckSF == 0, "No", "Yes"),
    OpenPorch = ifelse(OpenPorchSF == 0, "No", "Yes"),
    EnclosePorch = ifelse(EnclosedPorch == 0, "No", "Yes"),
    ThreePorch = ifelse(`3SsnPorch` == 0, "No", "Yes"),
    SPorch = ifelse(ScreenPorch == 0, "No", "Yes"),
    Electrical = replace_na(Electrical, "SBrkr")
  ) %>%
  select(-Id, -Condition1, -Condition2)
# Split the combined table back apart: rows without a SalePrice are the test
# set, the rest are training data. `test` must be taken before `df` is
# overwritten.
test <- filter(df, is.na(SalePrice))
df <- filter(df, !is.na(SalePrice))
# Impute LotFrontage in the training data: a missing value is replaced by the
# mean LotFrontage of the row's Neighborhood (computed ignoring NAs).
df.neighborhoodFrontage <- summarise(
  group_by(df, Neighborhood),
  MeanLotf = mean(LotFrontage, na.rm = TRUE)
)
df <- left_join(df, df.neighborhoodFrontage, by = "Neighborhood")
df <- mutate(
  df,
  LotFrontage = ifelse(is.na(LotFrontage), MeanLotf, LotFrontage)
)
# The helper column is only needed for the fill above
df <- select(df, -MeanLotf)
# Convert MSSubClass, OverallQual and OverallCond to factors in the training
# data (the test set is not converted here).
for (col in c("MSSubClass", "OverallQual", "OverallCond")) {
  df[[col]] <- as.factor(df[[col]])
}
# Test set: missing-value counts per column (text output only, no plot)
aggr(test, plot = FALSE)

# Impute LotFrontage again, this time using the test set's own
# per-Neighborhood means (NAs ignored when averaging).
test.neighborhoodFrontage <- summarise(
  group_by(test, Neighborhood),
  MeanLotf = mean(LotFrontage, na.rm = TRUE)
)
test <- left_join(test, test.neighborhoodFrontage, by = "Neighborhood")
test <- mutate(
  test,
  LotFrontage = ifelse(is.na(LotFrontage), MeanLotf, LotFrontage)
)
# The helper column is only needed for the fill above
test <- select(test, -MeanLotf)
# Inspect the exterior-related columns of the test set: every column whose
# name contains "ext", plus Neighborhood.
Ext <- select(test, contains("ext"), Neighborhood)

# Show the rows that have a missing value in any of those columns
Ext[!complete.cases(Ext), ]

## The incomplete row belongs to Edwards, so narrow down to that neighborhood
## and tabulate its exterior materials.
Ext <- filter(Ext, Neighborhood == "Edwards")
table(Ext[["Exterior1st"]])
table(Ext[["Exterior2nd"]])
# Impute missing Exterior1st / Exterior2nd in the test set with "Wd Sdng", the
# mode picked from the Edwards tables above.
#
# Two fixes over the previous version:
#   * the result is assigned back to `test` (it used to be computed and then
#     thrown away, so nothing was imputed);
#   * replace_na() receives the column as its data argument and "Wd Sdng" as
#     the replacement, so only NA entries change.
test <- test %>%
  mutate(
    Exterior1st = replace_na(Exterior1st, "Wd Sdng"),
    Exterior2nd = replace_na(Exterior2nd, "Wd Sdng")
  )

# Re-check missing-value counts after the imputation (text output only)
aggr(test, plot = FALSE)

# Print the test set for inspection
test[, ]
# Note to self: RL for Mitchel, RM for IDOTRR
# (presumably the MSZoning values to impute for those neighborhoods in the
# test set -- TODO confirm; no such imputation is performed here).

# Persist the cleaned training data. Only `df` is written; `test` is not saved.
save(list = "df", file = "cleanDF.rdata")