# Build the recipe and workflow, run hyperparameter tuning, and save the results
library(tidymodels)
library(doParallel)
set.seed(123)
load("samplesAllTidy_ACHR2.Rdata")
transportRxns <- readRDS(file = "./tmp/transportRxns.RDS")
# Helper: TRUE if a column contains at least one non-NA value
not_all_na <- function(x) any(!is.na(x))
dat <-
  dat %>%
  ## mutate(factor = factor(paste0(tissue, group))) %>%
  ## select(-c(tissue, group, source, `...1`)) %>%
  select(-c(source, `...1`)) %>%
  select(-any_of(transportRxns)) %>% # drop transport reactions
  select(where(not_all_na)) # %>%
  ## .[,1500:ncol(.)] %>% # FIXME to be removed
  ## with_groups(c(tissue, group), sample_n, size = 100) # FIXME to be removed
  # mutate_if(is.numeric, ~ replace(., is.na(.), 0)) # R needs missing values coded as NA, but a flux of zero is also a legitimate value, so zero-filling would conflate the two
# 1. Data Manipulation
## 1.1 Data splitting
# 80/20 train/test split, stratified by tissue
vb_split <- initial_split(dat, prop = 0.8, strata = tissue)
vb_split
# extract training and testing sets
vb_train <- training(vb_split)
vb_test <- testing(vb_split)
rm(dat); gc()
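## Optional sanity check (commented out, a sketch only): confirm the
## stratified split keeps tissue/group proportions comparable across
## train and test; count() is dplyr's, loaded via tidymodels.
## vb_train %>% count(tissue, group)
## vb_test %>% count(tissue, group)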
## 1.2 Setting Recipe
vb_recipe <-
  recipe(group ~ ., data = vb_train) %>%
  #step_impute_knn(all_numeric()) %>%
  #step_impute_median(all_numeric()) %>%
  step_nzv(all_numeric_predictors()) %>%       # drop near-zero-variance fluxes
  step_normalize(all_numeric_predictors()) %>% # center and scale
  ## step_unknown(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors())         # one-hot encode nominal predictors
vb_recipe
## vb_prep <- prep(vb_recipe)
## summary(vb_prep)
## train_trans <- juice(vb_prep)
## test_trans <- bake(vb_prep, new_data = vb_test)
# 2. Model Specification and Tuning
## 2.1 Model Specification
xgb_spec <- parsnip::boost_tree(
  trees = 200,
  tree_depth = tune(),     # model complexity
  min_n = tune(),          # model complexity
  loss_reduction = tune(), # model complexity
  sample_size = tune(),    # randomness
  mtry = tune(),           # randomness
  learn_rate = tune()      # step size
) %>%
  set_engine("xgboost",
             num_class = 4,                # four outcome classes
             objective = "multi:softprob",
             tree_method = "gpu_hist",     # GPU-accelerated training
             verbose = 1) %>%
  set_mode("classification")
## 2.2 Grid Search
# Latin hypercube: 10 candidate points spread across the parameter space
xgb_grid <- grid_latin_hypercube(
  min_n(),
  tree_depth(),
  loss_reduction(),
  sample_size = sample_prop(),
  finalize(mtry(), vb_train), # upper bound of mtry depends on predictor count
  learn_rate(),
  size = 10
)
xgb_grid
## 2.3 Model Workflow
xgb_wf <- workflow() %>%
  ## add_formula(group ~ .) %>%
  add_recipe(vb_recipe) %>%
  add_model(xgb_spec)
xgb_wf
## 2.4 Cross validation
# 5-fold CV on the training set, stratified by tissue
vb_folds <- vfold_cv(vb_train,
                     v = 5,
                     strata = tissue)
vb_folds
#################################
#################################
# Hyperparameter tuning
doParallel::registerDoParallel(2) # two parallel workers
# TODO: bake the data up front instead; the workflow would then not need the recipe
xgb_res <- tune_grid(
  xgb_wf,
  resamples = vb_folds,
  grid = xgb_grid,
  control = control_grid(save_pred = TRUE, # keep predictions for ROC curves
                         verbose = TRUE)
)
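# Release the parallel workers registered above; a small cleanup step,
# assuming nothing later in the pipeline reuses doParallel's implicit cluster.
doParallel::stopImplicitCluster()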
#xgb_res
saveRDS(xgb_res, file = "tmp/xgb_res.RDS")
saveRDS(xgb_wf, file = "tmp/xgb_wf.RDS")
saveRDS(vb_split, file = "tmp/vb_split.RDS")
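## Possible next step (a sketch, commented out; presumably the downstream
## script that reads tmp/xgb_res.RDS does something along these lines):
## pick the best hyperparameters by ROC AUC, finalize the workflow, and
## fit once on the full training set while evaluating on the test set.
## best_params <- select_best(xgb_res, metric = "roc_auc")
## final_wf <- finalize_workflow(xgb_wf, best_params)
## final_fit <- last_fit(final_wf, vb_split)
## collect_metrics(final_fit)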