-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.R
186 lines (151 loc) · 7.05 KB
/
main.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# Load required packages ----
library(dplyr)
library(ggplot2)
library(car)     # vif()
library(MASS)    # stepAIC()
library(glmnet)
library(pROC)

# Install-if-missing, then load. The original pattern
#   if (!require(pkg)) { install.packages; library } else { library }
# loaded the package in both branches (require() already attaches it on
# success), so the else branch was redundant. requireNamespace() checks
# availability without attaching.
for (pkg in c("DAAG", "boot")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

options(scipen = 999)  # suppress scientific notation in printed output
# Working directory and helper functions ----
pwd <- getwd()
# Loads data_manipulate(), check_outlier(), feature_engineer(), my_cv_glmnet().
source(file.path(pwd, "src", "function.R"))

# Read data ----
# (file.path() replaces the paste(..., sep = '') construction and reuses pwd
# instead of calling getwd() again for each path.)
booktrain <- read.csv(file.path(pwd, "data", "booktrain.csv"))
orders <- read.csv(file.path(pwd, "data", "orders.csv"))

# Left-join so every order row is kept; logtarg is NA for ids that are
# not present in booktrain.
all <- merge(orders, booktrain, by = "id", all.x = TRUE)

# Train/test split ----
# Train covers 8224 customer ids instead of 8311: 87 ids in booktrain
# cannot be matched to any record in orders.
train_original <- all %>% filter(!is.na(logtarg))
train_original <- data_manipulate(train_original)
test_original <- all %>% filter(is.na(logtarg))
test_original <- data_manipulate(test_original)
# Exploratory data analysis ----

# Check for missing values (expected: none after data_manipulate()).
colSums(is.na(train_original))

# Distribution of the categorical column.
ggplot(data = train_original, aes(x = category)) + geom_histogram(binwidth = 1)

# Outlier checks for numeric columns. The flagged values look extreme but
# legitimate, so they are kept for now.
check_outlier(train_original, "qty", 10)
check_outlier(train_original, "price", 10)

# About 3.25% of rows have price == 0, and most of them (61%) come from
# category 99. Hoist the subset into a variable instead of recomputing the
# same filter three times.
zero_price <- train_original[train_original$price == 0, ]
View(zero_price)
table(zero_price$category) / nrow(zero_price)
########## Feature Engineering
# Build model features from the prepared order rows via the project helper
# defined in src/function.R. isTrain=FALSE presumably skips steps that need
# the logtarg label — confirm against the helper's definition.
train = feature_engineer(train_original)
test = feature_engineer(test_original, isTrain=FALSE)
# NOTE(review): `say` is macOS-only; on other systems this command fails.
# Consider guarding with Sys.info() or removing before sharing the script.
system("say Just finished!")
# Model building, version 1 ----

# Benchmark: multiple linear regression on all engineered features,
# excluding id (identifier, not a predictor) and total_duration.
fit_multiple <- lm(logtarg ~ . - id - total_duration, data = train)
summary(fit_multiple)
vif(fit_multiple)  # multicollinearity check
cv.lm(data = train, m = 10, form.lm = fit_multiple)  # 10-fold CV, ms ~0.375

# Linear regression on a hand-picked feature subset ----
# Earlier candidate set (kept for reference): days_first_purchase,
# order_count, avg_qty, slope, coe_va. Other tried-and-dropped features:
# cat_count, total_money, avg_qty, days_recent_purchase.
chosen_feature <- c("days_first_purchase", "order_count", "aug_orders",
                    "slope", "coe_va", "logtarg")
fit_custom <- lm(logtarg ~ ., data = train[chosen_feature])
summary(fit_custom)
cv.lm(data = train[chosen_feature], m = 10, form.lm = fit_custom)  # ms ~0.375
# Linear regression with all pairwise interactions ----
fit_interation <- lm(logtarg ~ . * ., data = train[, !(colnames(train) == "id")])  # adj-R^2 0.0327
summary(fit_interation)
cv.lm(data = train, m = 10, form.lm = fit_interation)  # ms ~0.375

# Backward stepwise selection on the interaction model ----
fit_int_stepback <- stepAIC(fit_interation, direction = "backward", trace = 0)
summary(fit_int_stepback)  # adj-R^2 0.0311, MSE 0.606
# BUG FIX: vif() was originally called before fit_int_stepback was defined,
# which errors on a clean run; it now follows the stepAIC() call.
vif(fit_int_stepback)
# BUG FIX: this line previously cross-validated fit_custom (copy-paste
# mistake); it now cross-validates the stepback model it documents.
cv.lm(data = train, m = 10, form.lm = fit_int_stepback)
# Refined interaction model: terms kept after inspecting the stepback run ----
# (The original formula was one very long line and contained a `++` typo
# between two interaction terms; R parsed it as a unary plus, so behavior
# is unchanged by removing it.)
fit_chosen <- lm(
  logtarg ~ days_recent_purchase + days_first_purchase + avg_qty +
    avg_ord_value + slope +
    days_recent_purchase:days_first_purchase +
    days_recent_purchase:order_count +
    days_first_purchase:avg_qty +
    days_first_purchase:coe_va +
    order_count:avg_ord_value +
    order_count:coe_va +
    slope:coe_va +
    cat_count:total_money,
  data = train
)
summary(fit_chosen)  # adj-R^2 0.017, MSE 0.61, leaderboard score 0.61925
cv.lm(data = train, m = 10, form.lm = fit_chosen)  # ms ~0.374
# Stepwise selection on the benchmark model ----

# Backward elimination from the full model.
# (direction takes a single string; wrapping it in c() was unnecessary.)
fit_stepback <- stepAIC(fit_multiple, direction = "backward")
summary(fit_stepback)  # adj-R^2 0.01663
cv.lm(data = train, m = 10, form.lm = fit_stepback)  # ms ~0.375

# Forward selection from the intercept-only model up to the full model.
fit_zero <- lm(logtarg ~ 1, data = train)
fit_stepforw <- stepAIC(fit_zero, direction = "forward",
                        scope = list(upper = fit_multiple, lower = fit_zero))
summary(fit_stepforw)  # adj-R^2 0.007776
# Lasso for feature selection ----
y <- train$logtarg
x <- model.matrix(logtarg ~ . - id - total_duration, train)
param_lasso <- my_cv_glmnet(y, x, 1)$small.lambda.betas  # alpha = 1 => lasso
param_lasso <- param_lasso[param_lasso != 0]  # coefficients kept at best lambda

# Refit OLS. An earlier hard-coded selection is kept for reference:
# lasso_feature = c('logtarg','days_recent_purchase','days_first_purchase',
#                   'order_count','avg_qty','6','8','12','20','30','38','50','coe_va')
# fit_lasso = lm(logtarg~., data = train[lasso_feature])
fit_lasso <- lm(logtarg ~ ., data = train)
summary(fit_lasso)  # adj-R^2 0.01626
# BUG FIX: the cv.lm() call referenced `lasso_feature`, whose definition is
# commented out above, so it errored at runtime. Cross-validate on the same
# data the model was actually fit to.
cv.lm(data = train, m = 10, form.lm = fit_lasso)
# Model selection ----
# Models are compared on adjusted R^2, feature significance, and CV MSE.
# Multiple, custom, and stepback give similar numbers; fit_chosen (the
# refined interaction model) is the one actually used below — the original
# comment still named fit_multiple, which no longer matched the code.

# Predict on the test set ----
predict_reg <- predict(fit_chosen, newdata = test)

# Generate submission file ----
test$logtarg <- predict_reg
file_version1 <- test %>% dplyr::select(id, logtarg)
# Targets are non-negative, so clamp negative predictions to zero.
# (pmax() is the vectorized equivalent of the original ifelse() clamp.)
file_version1$logtarg <- pmax(file_version1$logtarg, 0)
colnames(file_version1) <- c("id", "yhat")
write.csv(
  file_version1,
  file = paste0("./submission/model_", format(Sys.time(), "%b%d_%H%M%S"), ".csv"),
  row.names = FALSE
)
# ########## Model Building Version 2
# ##### Description: Fit a classification model first and then a regression model. Predict the final result by combining both models.
#
# ##### Classification
# train$logtarg_bol = ifelse(train$logtarg!=0, 1, 0)
#
# cla_feature = c('logtarg_bol',
# 'days_recent_purchase','days_first_purchase','order_count','coe_va',
# '12','38')
#
# fit_classification = glm(logtarg_bol~., family=binomial, data=train[cla_feature])
# summary(fit_classification)
#
# predict_fit = predict(fit_classification, newdata=train[cla_feature], type="response")
#
# real_response = train$logtarg_bol
# optimal_p = get_optimal_p(real_response, predict_fit, seq(0.05,0.38,0.01))
# calculate_metrics(train,real_response, predict_fit, optimal_p)
#
#
# ##### Regression
# # use 'fit_stepback' in the previous version
#
# ##### Make prediction
# predict_cla = predict(fit_classification, newdata=test, type="response")
#
#
# ##### Generating submitting file
# test$logtarg = predict_reg  # NOTE(review): was `predict_version1`, which is never defined; the version-1 regression predictions are stored in predict_reg
# test$prob = predict_cla
# file_version2 = test %>% mutate(yhat = logtarg*prob) %>% dplyr::select(id, yhat)
# # write.csv(file_version2, file=paste('./submission/model_',format(Sys.time(), "%b%d_%H%M%S"),'.csv',sep=''),row.names = FALSE)
#
### Repeated 10-fold cross-validation of the benchmark model ----
# BUG FIX: the original loop (marked "not working") indexed $delta on the
# result of DAAG::cv.lm(), but cv.lm() does not return a delta component —
# that is boot::cv.glm(). It also refit the identical lm inside the loop.
# Fit once with glm() (gaussian family by default, equivalent to lm here)
# and run cv.glm() ten times; each repeat draws different random folds, so
# the vector shows the fold-assignment variability of the CV estimate.
set.seed(17)
fit_glm <- glm(logtarg ~ . - id - total_duration, data = train)
cv.error.10 <- rep(0, 10)
for (i in seq_len(10)) {
  # delta[1] is the raw cross-validation estimate of prediction error.
  cv.error.10[i] <- cv.glm(train, fit_glm, K = 10)$delta[1]
}
cv.error.10