-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdubbizle.R
210 lines (169 loc) · 7.39 KB
/
dubbizle.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
library('gdata')
library('moments')
library('Amelia')
library('extRemes')
library('dummies')
library('ggplot2')
library('randomForest')
library('miscTools')
library('car')
library('stats')
library('corpcor')
library('corrplot')
library('MASS')
library("usdm")
####### extracting data
data <- read.xls('~/Downloads/Data Scientist Initial Screening Test.xlsx', sheet = 3, verbose = TRUE)
View(data)
data$X<-NULL
summary(data)
attach(data)
######Check if any missing value############
missmap(data, main = "Missing values vs observed")
########working with categorical variables#########
sapply(data, class)
################# Univariate Analysis ##############
#Distribution of price variable
boxplot(price,horizontal = T,col = "blue",main = "Car Prices Distribution",xlab = "Car Prices")
# Transofrmation to log seems to
boxplot(log(price),horizontal = T,col = "blue",main = "Car Prices Distribution",xlab = "Car Prices")
hist(price,col = "blue",main = "Car Prices Distribution",xlab = "Car Prices",ylab = "Probability", prob= T, breaks = 15)
hist(log(price),col = "blue",main = "Car Prices Distribution",xlab = "Car Prices",ylab = "Probability", prob= T)
#curve plotting
curve(dnorm(x,mean(log(price)),sd(log(price))),add = T, col = "red", lwd = 3)
qqPlot(log(price),main = "QQ Price")
log_price <- log(price)
mean(log_price)
sd(log_price)
data$log_price<-log(data$price)
#Distribution of Windowed Make model count ##########
boxplot(Windowed_make_model_count,horizontal = T,col = "blue",main = "Windowed_make_model_count Distribution",xlab = "Windowed_make_model_count")
hist(Windowed_make_model_count,col = "blue",main = "Windowed_make_model_count Distribution",xlab = "Windowed_make_model_count",ylab = "Probability", prob= T)
hist(sqrt(Windowed_make_model_count),col = "blue",main = "Windowed_make_model_count Distribution",xlab = "Windowed_make_model_count",ylab = "Probability", prob= T)
#Distribution of count ##########
boxplot(count,horizontal = T,col = "blue",main = "count Distribution",xlab = "count")
hist(count,col = "blue",main = "count Distribution",xlab = "count",ylab = "Probability", prob= T)
#Distribution of car_age ##########
boxplot(car_age,horizontal = T,col = "blue",main = "car_age Distribution",xlab = "car_age")
hist(car_age,col = "blue",main = "car_age Distribution",xlab = "car_age",ylab = "Probability", prob= T)
mean(data$car_age<0)
#99.5 %
data$car_age[data$car_age<0] <- 0
#check again the percentage of negative values
mean(data$car_age<0)
hist(sqrt(data$car_age),col = "blue",main = "car_age Distribution",xlab = "car_age",ylab = "Probability", prob= T)
curve(dnorm(x,mean(sqrt(data$car_age)),sd(sqrt(data$car_age))),add = T, col = "red", lwd = 3)
qqPlot(sqrt(data$car_age),main = "QQ Age")
data$sqrt_car_age<-sqrt(data$car_age)
#Distribution of kms_bucket ##########
boxplot(kms_bucket,horizontal = T,col = "blue",main = "kms_bucket Distribution",xlab = "kms_bucket")
hist(kms_bucket,col = "blue",main = "kms_bucket Distribution",xlab = "kms_bucket",ylab = "Probability", prob= T)
hist(sqrt(data$kms_bucket),col = "blue",main = "car_age Distribution",xlab = "car_age",ylab = "Probability", prob= T)
curve(dnorm(x,mean(sqrt(data$kms_bucket)),sd(sqrt(data$kms_bucket))),add = T, col = "red", lwd = 3)
qqPlot(sqrt(data$kms_bucket),main = "QQ Age")
data$sqrt_kms_bucket<- sqrt(data$kms_bucket)
###########Bivariate Analysis##############
##### checking correlation between continuos variables
cor(data[,c(4,5,6,7,8)])
cor2pcor(cor(data[,c(4,5,6,7,8)]))
M <- cor(data[,c(4,5,6,7,8)])
corrplot(M, method="circle")
corrplot(M, method="number")
#for more https://cran.r-project.org/web/packages/corrplot/vignettes/corrplot-intro.html
#dividing the data into samples
sub <- sample(nrow(data), floor(nrow(data) * 1))
training <- data[sub, ]
testing <- data[-sub, ]
# create first linear regression model
lm_model1=lm(price~count+car_age+kms_bucket+Windowed_make_model_count,data=training)
summary(lm_model1)
"
Residual standard error: 46560 on 8471 degrees of freedom
Multiple R-squared: 0.1468, Adjusted R-squared: 0.1463
F-statistic: 364.2 on 4 and 8471 DF, p-value: < 2.2e-16
"
lm_model2=lm(log_price~count+sqrt_car_age+kms_bucket+Windowed_make_model_count,data=training)
summary(lm_model2)
"
Residual standard error: 0.5664 on 8471 degrees of freedom
Multiple R-squared: 0.3507, Adjusted R-squared: 0.3504
F-statistic: 1144 on 4 and 8471 DF, p-value: < 2.2e-16
"
#########creating dummy variable ########
training$price<-NULL
training$car_age<-NULL
training$kms_bucket<-NULL
testing$price<-NULL
testing$car_age<-NULL
testing$kms_bucket<-NULL
lm_model3=lm(log_price~.,data=training)
summary(lm_model3)
qqPlot(lm_model3, main="QQ Plot") #qq plot for studentized resid
# Assessing Outliers
exclude <- outlierTest(lm_model3,cutoff =0.05, n.max = 50, order = TRUE) # Bonferonni p-value for most extreme obs
#outliers where age and price seems to be not in sync after visual inspection of outliers
outliers<-data[c(1742,8234,9498,243,1953,589,247,20,6144,10002,6152,1851,2012,595,2027,236,2157,
2271,602,2405,145,2333,2033,6150,2046,144,627,2385,259,2290,2817,9335,9431,264,
9572,277,2550,265,2880,601,6425,6273,2284,1129,2255,2815,2005,2102,3063,6444),]
View(outliers)
training <- training[-c(1742,8234,9498,243,1953,589,247,20,6144,10002,6152,1851,2012,595,2027,236,2157,
2271,602,2405,145,2333,2033,6150,2046,144,627,2385,259,2290,2817,9335,9431,264,
9572,277,2550,265,2880,601,6425,6273,2284,1129,2255,2815,2005,2102,3063,6444),]
plot(data$price~data$car_age)
plot(data$log_price~data$sqrt_car_age)
##########Now building the model again after removing top 50 outliers
lm_model4=lm(log_price~.,data=training)
summary(lm_model4)
qqPlot(lm_model4, main="QQ Plot") #qq plot for studentized resid
leveragePlots(lm_model4) # leverage plots
# Influential Observations
# added variable plots
avPlots(lm_model4)
# Cook's D plot
# identify D values > 4/(n-k-1)
cutoff <- 4/((nrow(training)-length(lm_model4$coefficients)-2))
plot(lm_model4, which=4, cook.levels=0.02)
# Influence Plot
influencePlot(lm_model4, id.method="identify", main="Influence Plot",
sub="Circle size is proportial to Cook's Distance" )
# Normality of Residuals
# qq plot for studentized resid
qqPlot(lm_model4, main="QQ Plot")
# distribution of studentized residuals
sresid <- studres(lm_model4)
hist(sresid, freq=FALSE,main="Distribution of Studentized Residuals")
xfit<-seq(min(sresid),max(sresid),length=50)
yfit<-dnorm(xfit)
lines(xfit, yfit)
# Evaluate homoscedasticity
# non-constant error variance test
ncvTest(lm_model4)
# plot studentized residuals vs. fitted values
spreadLevelPlot(lm_model4)
# Evaluate Collinearity
vif(training) # variance inflation factors
sqrt(vif(training)) # problem?
# Evaluate Nonlinearity
# component + residual plot
crPlots(lm_model4)
# Ceres plots
ceresPlots(lm_model4)
# Test for Autocorrelated Errors
durbinWatsonTest(lm_model4)
# Global test of model assumptions
library('gvlma')
gvmodel <- gvlma(lm_model4)
summary(gvmodel)
##########Now building the model again after removing top 50 outliers
training$Windowed_make_model_count<-NULL
lm_model5=lm(log_price~.,data=training)
summary(lm_model5)
qqPlot(lm_model5, main="QQ Plot") #qq plot for studentized residual
# Global test of model assumptions
library('gvlma')
gvmodel <- gvlma(lm_model5)
summary(gvmodel)
plot(lm_model5)
lm_model6 <- lm(log_price~.,data=training)
step <- stepAIC(lm_model6, direction="both")
step$anova # display results