-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimport_data.R
68 lines (62 loc) · 2.45 KB
/
import_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
get_cols_of_type = function (table_name, schema, type, data_or_col_type){
if(data_or_col_type == "data_type"){
strQuery <- paste0("SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS
WHERE table_name = '",table_name,"'
AND table_schema = '",schema,"'
AND Data_type = '",type,"'")
}else{
strQuery <- paste0("SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS
WHERE table_name = '",table_name,"'
AND table_schema = '",schema,"'
AND Column_type = '",type,"'")
}
df.cols_wanted = dbGetQuery(con, strQuery)
return(df.cols_wanted)
}
converted_import = function(table_name){
# table_name = "dim_icds"
#get full table
strQuery = paste0("SELECT * FROM dbo.",table_name)
df.table = dbGetQuery(con, strQuery)
#DateTime
cols_datetime = get_cols_of_type(table_name, "dbo", "datetime", "data_type")
if(nrow(cols_datetime) == 1){
df.table[,cols_datetime[,1]] = as.POSIXct(df.table[,cols_datetime[,1]])
}else{
df.table[,cols_datetime[,1]] = lapply(df.table[,cols_datetime[,1]], as.POSIXct)
}
#Integer : BUG!!
#cols_int = get_cols_of_type(table_name, "dbo", "int", "data_type")
#df.table[,cols_int[,1]] = lapply(df.table[,cols_int[,1]], as.integer)
#Boolean
cols_bool = get_cols_of_type(table_name, "dbo", "varchar(5)", "column_type")
if(nrow(cols_bool) <2){
df.table[,cols_bool[,1]] = as.logical(df.table[,cols_bool[,1]])
}else{
df.table[,cols_bool[,1]] = lapply(df.table[,cols_bool[,1]], as.logical)
}
#Remaining => Text (no difference between free text and factor variables)
df.text = df.table[lapply(df.table, class) == "character"]
#decide if it is a factor
levels = as.data.frame(unlist(sapply(names(df.text), get_levels, "dbo", table_name)))
levels$names = names(df.text)
colnames(levels) = c("num_lvl","names")
factors = levels[levels$num_lvl<50,]
factor_names = factors$names
#convert column format
if(length(factor_names) != 0 ){
if( length(factor_names) == 1 ){
df.table[,factor_names] = factor(df.table[,factor_names])
}else{
df.table[,factor_names] = lapply(df.table[,factor_names], factor)
}
}
str(df.table)
return(df.table)
}
get_levels = function(col_name, schema_name, table_name ){
strQuery = paste0("SELECT COUNT(DISTINCT ",col_name,") + COUNT(DISTINCT case when ",col_name ," is null then 1 end) as levels
FROM ",schema_name,".",table_name)
levels = dbGetQuery(con, strQuery)
return(levels)
}