diff --git a/Franzi/rmonize/data_dictionary/DD_DEGS1_FRANZI.xlsx b/Franzi/rmonize/data_dictionary/DD_DEGS1_FRANZI.xlsx new file mode 100644 index 0000000..4086d19 Binary files /dev/null and b/Franzi/rmonize/data_dictionary/DD_DEGS1_FRANZI.xlsx differ diff --git a/Franzi/DD_KARMEN_FRANZI.xlsx b/Franzi/rmonize/data_dictionary/DD_KARMEN_FRANZI.xlsx similarity index 100% rename from Franzi/DD_KARMEN_FRANZI.xlsx rename to Franzi/rmonize/data_dictionary/DD_KARMEN_FRANZI.xlsx diff --git a/Franzi/rmonize/data_proc_elem/DPE_DEGS1_FRANZI.xlsx b/Franzi/rmonize/data_proc_elem/DPE_DEGS1_FRANZI.xlsx new file mode 100644 index 0000000..c329f59 Binary files /dev/null and b/Franzi/rmonize/data_proc_elem/DPE_DEGS1_FRANZI.xlsx differ diff --git a/Franzi/DPE_KARMEN_FRANZI.xlsx b/Franzi/rmonize/data_proc_elem/DPE_KARMEN_FRANZI.xlsx similarity index 100% rename from Franzi/DPE_KARMEN_FRANZI.xlsx rename to Franzi/rmonize/data_proc_elem/DPE_KARMEN_FRANZI.xlsx diff --git a/Franzi/Dataschema_FRANZI.xlsx b/Franzi/rmonize/data_schema/Dataschema_FRANZI.xlsx similarity index 100% rename from Franzi/Dataschema_FRANZI.xlsx rename to Franzi/rmonize/data_schema/Dataschema_FRANZI.xlsx diff --git a/Franzi/scripts/DEGS1_FRANZI.R b/Franzi/scripts/DEGS1_FRANZI.R new file mode 100644 index 0000000..5c845ad --- /dev/null +++ b/Franzi/scripts/DEGS1_FRANZI.R @@ -0,0 +1,70 @@ +#### Script for harmonizing data for NFDI4Health + +####Installation of Rmonize and its dependent packages (necessary R Version > 3.4) +# install.packages("Rmonize") +# install.packages("readxl") +# install.packages("tidyverse") +# install.packages("here") +# install.packages("car") +# install.packages("writexl") + +#### Load the package in order to conduct +library(Rmonize) +library(readxl) +library(tidyverse) +library(here) +library(car) +library(writexl) + + +#### Step 0: Name of the study and creation of mock data +dataset_name <- 'DEGS1_FRANZI' +folder_name <- 'Franzi' + +source(here::here("create_mock_data", "mock_data_function.R")) +create_mock_data(studyname = dataset_name, folder_name = folder_name) + +#### Step 1: Import overall DataSchema +dataschema_1 <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_schema/", paste0("Dataschema_", toupper(folder_name), ".xlsx"))), sheet = 1)) +dataschema_2 <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_schema/", paste0("Dataschema_", toupper(folder_name), ".xlsx"))), sheet = 2)) + +dataschema <- list(Variables = dataschema_1, + Categories = dataschema_2) + +#### Step 2: Import Datasets +#### Import check provided in case the csv file is in German style (delim = ";", dec.point = ",") + +input_dataset <- readr::read_csv(here::here(paste0(folder_name, "/data/", paste0("DATA_", dataset_name, ".csv")))) + +if(dim(input_dataset)[2] == 1){ + input_dataset <- read.csv(here::here(paste0(folder_name, "/data", paste0("DATA_", dataset_name, ".csv"))), sep = ";", dec = ",") +} + +#### Step 3: Import Data Dictionaries of the study +dd_var <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_dictionary/"), paste0("DD_",dataset_name, ".xlsx")), sheet = 1)) +dd_cat <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name,"/rmonize/data_dictionary/"), paste0("DD_",dataset_name, ".xlsx")), sheet = 2)) + +dd <- list(Variables = dd_var, + Categories = dd_cat) + +#### Step 4: Import prepared Data Processing Elements (DPE) +data_proc_elem <- readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_proc_elem"), paste0("DPE_",dataset_name, ".xlsx")), sheet = 1) + +#### Step 5: Combine input datasets and data dictionaries into a dossier +dataset <- madshapR::data_dict_apply( + dataset = input_dataset, + data_dict = dd) + +dossier <- madshapR::dossier_create(dataset_list = list( + dataset)) + +#### Step 6: Create the harmonized dossier using the dossier, overall dataschema and DPE's +harmonized_dossier <- Rmonize::harmo_process( + dossier, + dataschema, + data_proc_elem) + +#### Step 9: Extract and save harmonized data into a pre-set folder +harmonized_dataset <- Rmonize::pooled_harmonized_dataset_create(harmonized_dossier) + + diff --git a/Franzi/scripts/KARMEN_FRANZI.R b/Franzi/scripts/KARMEN_FRANZI.R new file mode 100644 index 0000000..2751493 --- /dev/null +++ b/Franzi/scripts/KARMEN_FRANZI.R @@ -0,0 +1,70 @@ +#### Script for harmonizing data for NFDI4Health + +####Installation of Rmonize and its dependent packages (necessary R Version > 3.4) +# install.packages("Rmonize") +# install.packages("readxl") +# install.packages("tidyverse") +# install.packages("here") +# install.packages("car") +# install.packages("writexl") + +#### Load the package in order to conduct +library(Rmonize) +library(readxl) +library(tidyverse) +library(here) +library(car) +library(writexl) + + +#### Step 0: Name of the study and creation of mock data +dataset_name <- 'KARMEN_FRANZI' +folder_name <- 'Franzi' + +source(here::here("create_mock_data", "mock_data_function.R")) +create_mock_data(studyname = dataset_name, folder_name = folder_name) + +#### Step 1: Import overall DataSchema +dataschema_1 <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_schema/", paste0("Dataschema_", toupper(folder_name), ".xlsx"))), sheet = 1)) +dataschema_2 <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_schema/", paste0("Dataschema_", toupper(folder_name), ".xlsx"))), sheet = 2)) + +dataschema <- list(Variables = dataschema_1, + Categories = dataschema_2) + +#### Step 2: Import Datasets +#### Import check provided in case the csv file is in German style (delim = ";", dec.point = ",") + +input_dataset <- readr::read_csv(here::here(paste0(folder_name, "/data/", paste0("DATA_", dataset_name, ".csv")))) + +if(dim(input_dataset)[2] == 1){ + input_dataset <- read.csv(here::here(paste0(folder_name, "/data", paste0("DATA_", dataset_name, ".csv"))), sep = ";", dec = ",") +} + +#### Step 3: Import Data Dictionaries of the study +dd_var <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_dictionary/"), paste0("DD_",dataset_name, ".xlsx")), sheet = 1)) +dd_cat <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name,"/rmonize/data_dictionary/"), paste0("DD_",dataset_name, ".xlsx")), sheet = 2)) + +dd <- list(Variables = dd_var, + Categories = dd_cat) + +#### Step 4: Import prepared Data Processing Elements (DPE) +data_proc_elem <- readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_proc_elem"), paste0("DPE_",dataset_name, ".xlsx")), sheet = 1) + +#### Step 5: Combine input datasets and data dictionaries into a dossier +dataset <- madshapR::data_dict_apply( + dataset = input_dataset, + data_dict = dd) + +dossier <- madshapR::dossier_create(dataset_list = list( + dataset)) + +#### Step 6: Create the harmonized dossier using the dossier, overall dataschema and DPE's +harmonized_dossier <- Rmonize::harmo_process( + dossier, + dataschema, + data_proc_elem) + +#### Step 9: Extract and save harmonized data into a pre-set folder +harmonized_dataset <- Rmonize::pooled_harmonized_dataset_create(harmonized_dossier) + + diff --git a/Ines/rmonize/data_dictionary/DD_DEGS1_INES.xlsx b/Ines/rmonize/data_dictionary/DD_DEGS1_INES.xlsx new file mode 100644 index 0000000..807e36d Binary files /dev/null and b/Ines/rmonize/data_dictionary/DD_DEGS1_INES.xlsx differ diff --git a/Ines/DD_KARMEN_INES.xlsx b/Ines/rmonize/data_dictionary/DD_KARMEN_INES.xlsx similarity index 100% rename from Ines/DD_KARMEN_INES.xlsx rename to Ines/rmonize/data_dictionary/DD_KARMEN_INES.xlsx diff --git a/Ines/rmonize/data_proc_elem/DPE_DEGS1_INES.xlsx b/Ines/rmonize/data_proc_elem/DPE_DEGS1_INES.xlsx new file mode 100644 index 0000000..49e66c0 Binary files /dev/null and b/Ines/rmonize/data_proc_elem/DPE_DEGS1_INES.xlsx differ diff --git a/Ines/DPE_KARMEN_INES.xlsx b/Ines/rmonize/data_proc_elem/DPE_KARMEN_INES.xlsx similarity index 100% rename from Ines/DPE_KARMEN_INES.xlsx rename to Ines/rmonize/data_proc_elem/DPE_KARMEN_INES.xlsx diff --git a/Ines/Dataschema_INES.xlsx b/Ines/rmonize/data_schema/Dataschema_INES.xlsx similarity index 100% rename from Ines/Dataschema_INES.xlsx rename to Ines/rmonize/data_schema/Dataschema_INES.xlsx diff --git a/Ines/scripts/DEGS1_INES.R b/Ines/scripts/DEGS1_INES.R new file mode 100644 index 0000000..3d6aa09 --- /dev/null +++ b/Ines/scripts/DEGS1_INES.R @@ -0,0 +1,70 @@ +#### Script for harmonizing data for NFDI4Health + +####Installation of Rmonize and its dependent packages (necessary R Version > 3.4) +# install.packages("Rmonize") +# install.packages("readxl") +# install.packages("tidyverse") +# install.packages("here") +# install.packages("car") +# install.packages("writexl") + +#### Load the package in order to conduct +library(Rmonize) +library(readxl) +library(tidyverse) +library(here) +library(car) +library(writexl) + + +#### Step 0: Name of the study and creation of mock data +dataset_name <- 'DEGS1_INES' +folder_name <- 'Ines' + +source(here::here("create_mock_data", "mock_data_function.R")) +create_mock_data(studyname = dataset_name, folder_name = folder_name) + +#### Step 1: Import overall DataSchema +dataschema_1 <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_schema/", paste0("Dataschema_", toupper(folder_name), ".xlsx"))), sheet = 1)) +dataschema_2 <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_schema/", paste0("Dataschema_", toupper(folder_name), ".xlsx"))), sheet = 2)) + +dataschema <- list(Variables = dataschema_1, + Categories = dataschema_2) + +#### Step 2: Import Datasets +#### Import check provided in case the csv file is in German style (delim = ";", dec.point = ",") + +input_dataset <- readr::read_csv(here::here(paste0(folder_name, "/data/", paste0("DATA_", dataset_name, ".csv")))) + +if(dim(input_dataset)[2] == 1){ + input_dataset <- read.csv(here::here(paste0(folder_name, "/data", paste0("DATA_", dataset_name, ".csv"))), sep = ";", dec = ",") +} + +#### Step 3: Import Data Dictionaries of the study +dd_var <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_dictionary/"), paste0("DD_",dataset_name, ".xlsx")), sheet = 1)) +dd_cat <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name,"/rmonize/data_dictionary/"), paste0("DD_",dataset_name, ".xlsx")), sheet = 2)) + +dd <- list(Variables = dd_var, + Categories = dd_cat) + +#### Step 4: Import prepared Data Processing Elements (DPE) +data_proc_elem <- readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_proc_elem"), paste0("DPE_",dataset_name, ".xlsx")), sheet = 1) + +#### Step 5: Combine input datasets and data dictionaries into a dossier +dataset <- madshapR::data_dict_apply( + dataset = input_dataset, + data_dict = dd) + +dossier <- madshapR::dossier_create(dataset_list = list( + dataset)) + +#### Step 6: Create the harmonized dossier using the dossier, overall dataschema and DPE's +harmonized_dossier <- Rmonize::harmo_process( + dossier, + dataschema, + data_proc_elem) + +#### Step 9: Extract and save harmonized data into a pre-set folder +harmonized_dataset <- Rmonize::pooled_harmonized_dataset_create(harmonized_dossier) + + diff --git a/Ines/scripts/KARMEN_INES.R b/Ines/scripts/KARMEN_INES.R new file mode 100644 index 0000000..5bde453 --- /dev/null +++ b/Ines/scripts/KARMEN_INES.R @@ -0,0 +1,70 @@ +#### Script for harmonizing data for NFDI4Health + +####Installation of Rmonize and its dependent packages (necessary R Version > 3.4) +# install.packages("Rmonize") +# install.packages("readxl") +# install.packages("tidyverse") +# install.packages("here") +# install.packages("car") +# install.packages("writexl") + +#### Load the package in order to conduct +library(Rmonize) +library(readxl) +library(tidyverse) +library(here) +library(car) +library(writexl) + + +#### Step 0: Name of the study and creation of mock data +dataset_name <- 'KARMEN_INES' +folder_name <- 'Ines' + +source(here::here("create_mock_data", "mock_data_function.R")) +create_mock_data(studyname = dataset_name, folder_name = folder_name) + +#### Step 1: Import overall DataSchema +dataschema_1 <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_schema/", paste0("Dataschema_", toupper(folder_name), ".xlsx"))), sheet = 1)) +dataschema_2 <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_schema/", paste0("Dataschema_", toupper(folder_name), ".xlsx"))), sheet = 2)) + +dataschema <- list(Variables = dataschema_1, + Categories = dataschema_2) + +#### Step 2: Import Datasets +#### Import check provided in case the csv file is in German style (delim = ";", dec.point = ",") + +input_dataset <- readr::read_csv(here::here(paste0(folder_name, "/data/", paste0("DATA_", dataset_name, ".csv")))) + +if(dim(input_dataset)[2] == 1){ + input_dataset <- read.csv(here::here(paste0(folder_name, "/data", paste0("DATA_", dataset_name, ".csv"))), sep = ";", dec = ",") +} + +#### Step 3: Import Data Dictionaries of the study +dd_var <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_dictionary/"), paste0("DD_",dataset_name, ".xlsx")), sheet = 1)) +dd_cat <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name,"/rmonize/data_dictionary/"), paste0("DD_",dataset_name, ".xlsx")), sheet = 2)) + +dd <- list(Variables = dd_var, + Categories = dd_cat) + +#### Step 4: Import prepared Data Processing Elements (DPE) +data_proc_elem <- readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_proc_elem"), paste0("DPE_",dataset_name, ".xlsx")), sheet = 1) + +#### Step 5: Combine input datasets and data dictionaries into a dossier +dataset <- madshapR::data_dict_apply( + dataset = input_dataset, + data_dict = dd) + +dossier <- madshapR::dossier_create(dataset_list = list( + dataset)) + +#### Step 6: Create the harmonized dossier using the dossier, overall dataschema and DPE's +harmonized_dossier <- Rmonize::harmo_process( + dossier, + dataschema, + data_proc_elem) + +#### Step 9: Extract and save harmonized data into a pre-set folder +harmonized_dataset <- Rmonize::pooled_harmonized_dataset_create(harmonized_dossier) + + diff --git a/Tracy/rmonize/data_dictionary/DD_DEGS1_TRACY.xlsx b/Tracy/rmonize/data_dictionary/DD_DEGS1_TRACY.xlsx new file mode 100644 index 0000000..4086d19 Binary files /dev/null and b/Tracy/rmonize/data_dictionary/DD_DEGS1_TRACY.xlsx differ diff --git a/Tracy/DD_KARMEN_TRACY.xlsx b/Tracy/rmonize/data_dictionary/DD_KARMEN_TRACY.xlsx similarity index 100% rename from Tracy/DD_KARMEN_TRACY.xlsx rename to Tracy/rmonize/data_dictionary/DD_KARMEN_TRACY.xlsx diff --git a/Tracy/rmonize/data_proc_elem/DPE_DEGS1_TRACY.xlsx b/Tracy/rmonize/data_proc_elem/DPE_DEGS1_TRACY.xlsx new file mode 100644 index 0000000..1bb50a3 Binary files /dev/null and b/Tracy/rmonize/data_proc_elem/DPE_DEGS1_TRACY.xlsx differ diff --git a/Tracy/DPE_KARMEN_TRACY.xlsx b/Tracy/rmonize/data_proc_elem/DPE_KARMEN_TRACY.xlsx similarity index 100% rename from Tracy/DPE_KARMEN_TRACY.xlsx rename to Tracy/rmonize/data_proc_elem/DPE_KARMEN_TRACY.xlsx diff --git a/Tracy/Dataschema_TRACY.xlsx b/Tracy/rmonize/data_schema/Dataschema_TRACY.xlsx similarity index 100% rename from Tracy/Dataschema_TRACY.xlsx rename to Tracy/rmonize/data_schema/Dataschema_TRACY.xlsx diff --git a/Tracy/scripts/DEGS1_TRACY.R b/Tracy/scripts/DEGS1_TRACY.R new file mode 100644 index 0000000..f47495c --- /dev/null +++ b/Tracy/scripts/DEGS1_TRACY.R @@ -0,0 +1,70 @@ +#### Script for harmonizing data for NFDI4Health + +####Installation of Rmonize and its dependent packages (necessary R Version > 3.4) +# install.packages("Rmonize") +# install.packages("readxl") +# install.packages("tidyverse") +# install.packages("here") +# install.packages("car") +# install.packages("writexl") + +#### Load the package in order to conduct +library(Rmonize) +library(readxl) +library(tidyverse) +library(here) +library(car) +library(writexl) + + +#### Step 0: Name of the study and creation of mock data +dataset_name <- 'DEGS1_TRACY' +folder_name <- 'Tracy' + +source(here::here("create_mock_data", "mock_data_function.R")) +create_mock_data(studyname = dataset_name, folder_name = folder_name) + +#### Step 1: Import overall DataSchema +dataschema_1 <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_schema/", paste0("Dataschema_", toupper(folder_name), ".xlsx"))), sheet = 1)) +dataschema_2 <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_schema/", paste0("Dataschema_", toupper(folder_name), ".xlsx"))), sheet = 2)) + +dataschema <- list(Variables = dataschema_1, + Categories = dataschema_2) + +#### Step 2: Import Datasets +#### Import check provided in case the csv file is in German style (delim = ";", dec.point = ",") + +input_dataset <- readr::read_csv(here::here(paste0(folder_name, "/data/", paste0("DATA_", dataset_name, ".csv")))) + +if(dim(input_dataset)[2] == 1){ + input_dataset <- read.csv(here::here(paste0(folder_name, "/data", paste0("DATA_", dataset_name, ".csv"))), sep = ";", dec = ",") +} + +#### Step 3: Import Data Dictionaries of the study +dd_var <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_dictionary/"), paste0("DD_",dataset_name, ".xlsx")), sheet = 1)) +dd_cat <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name,"/rmonize/data_dictionary/"), paste0("DD_",dataset_name, ".xlsx")), sheet = 2)) + +dd <- list(Variables = dd_var, + Categories = dd_cat) + +#### Step 4: Import prepared Data Processing Elements (DPE) +data_proc_elem <- readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_proc_elem"), paste0("DPE_",dataset_name, ".xlsx")), sheet = 1) + +#### Step 5: Combine input datasets and data dictionaries into a dossier +dataset <- madshapR::data_dict_apply( + dataset = input_dataset, + data_dict = dd) + +dossier <- madshapR::dossier_create(dataset_list = list( + dataset)) + +#### Step 6: Create the harmonized dossier using the dossier, overall dataschema and DPE's +harmonized_dossier <- Rmonize::harmo_process( + dossier, + dataschema, + data_proc_elem) + +#### Step 9: Extract and save harmonized data into a pre-set folder +harmonized_dataset <- Rmonize::pooled_harmonized_dataset_create(harmonized_dossier) + + diff --git a/Tracy/scripts/KARMEN_TRACY.R b/Tracy/scripts/KARMEN_TRACY.R new file mode 100644 index 0000000..609153e --- /dev/null +++ b/Tracy/scripts/KARMEN_TRACY.R @@ -0,0 +1,70 @@ +#### Script for harmonizing data for NFDI4Health + +####Installation of Rmonize and its dependent packages (necessary R Version > 3.4) +# install.packages("Rmonize") +# install.packages("readxl") +# install.packages("tidyverse") +# install.packages("here") +# install.packages("car") +# install.packages("writexl") + +#### Load the package in order to conduct +library(Rmonize) +library(readxl) +library(tidyverse) +library(here) +library(car) +library(writexl) + + +#### Step 0: Name of the study and creation of mock data +dataset_name <- 'KARMEN_TRACY' +folder_name <- 'Tracy' + +source(here::here("create_mock_data", "mock_data_function.R")) +create_mock_data(studyname = dataset_name, folder_name = folder_name) + +#### Step 1: Import overall DataSchema +dataschema_1 <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_schema/", paste0("Dataschema_", toupper(folder_name), ".xlsx"))), sheet = 1)) +dataschema_2 <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_schema/", paste0("Dataschema_", toupper(folder_name), ".xlsx"))), sheet = 2)) + +dataschema <- list(Variables = dataschema_1, + Categories = dataschema_2) + +#### Step 2: Import Datasets +#### Import check provided in case the csv file is in German style (delim = ";", dec.point = ",") + +input_dataset <- readr::read_csv(here::here(paste0(folder_name, "/data/", paste0("DATA_", dataset_name, ".csv")))) + +if(dim(input_dataset)[2] == 1){ + input_dataset <- read.csv(here::here(paste0(folder_name, "/data", paste0("DATA_", dataset_name, ".csv"))), sep = ";", dec = ",") +} + +#### Step 3: Import Data Dictionaries of the study +dd_var <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_dictionary/"), paste0("DD_",dataset_name, ".xlsx")), sheet = 1)) +dd_cat <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name,"/rmonize/data_dictionary/"), paste0("DD_",dataset_name, ".xlsx")), sheet = 2)) + +dd <- list(Variables = dd_var, + Categories = dd_cat) + +#### Step 4: Import prepared Data Processing Elements (DPE) +data_proc_elem <- readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_proc_elem"), paste0("DPE_",dataset_name, ".xlsx")), sheet = 1) + +#### Step 5: Combine input datasets and data dictionaries into a dossier +dataset <- madshapR::data_dict_apply( + dataset = input_dataset, + data_dict = dd) + +dossier <- madshapR::dossier_create(dataset_list = list( + dataset)) + +#### Step 6: Create the harmonized dossier using the dossier, overall dataschema and DPE's +harmonized_dossier <- Rmonize::harmo_process( + dossier, + dataschema, + data_proc_elem) + +#### Step 9: Extract and save harmonized data into a pre-set folder +harmonized_dataset <- Rmonize::pooled_harmonized_dataset_create(harmonized_dossier) + + diff --git a/create_mock_data/mock_data_function.R b/create_mock_data/mock_data_function.R index 45d2310..8df25ad 100644 --- a/create_mock_data/mock_data_function.R +++ b/create_mock_data/mock_data_function.R @@ -2,8 +2,8 @@ #studyname <- "GINI_P2" create_mock_data <- function(studyname = NULL, single_dataset = TRUE, - vars_second_dataset = NULL){ - + vars_second_dataset = NULL, + folder_name = ""){ if(single_dataset == FALSE && is.null(vars_second_dataset)){ stop("You indicated that there is more than 1 dataset from which variables will be included @@ -13,8 +13,11 @@ create_mock_data <- function(studyname = NULL, library(tidyverse) library(readr) - study_variables_all <- tibble(readxl::read_excel(here::here("rmonize/data_dictionary", paste0("DD_",studyname,".xlsx")), sheet = 1)) - study_variables_categorical <- tibble(readxl::read_excel(here::here("rmonize/data_dictionary", paste0("DD_",studyname,".xlsx")), sheet = 2)) + input_path <- here::here(folder_name, "rmonize/data_dictionary") + output_path <- here::here(folder_name, "data") + + study_variables_all <- tibble(readxl::read_excel(file.path(input_path, paste0("DD_", studyname, ".xlsx")), sheet = 1)) + study_variables_categorical <- tibble(readxl::read_excel(file.path(input_path, paste0("DD_", studyname, ".xlsx")), sheet = 2)) study_variables <- study_variables_all |> select(name, valueType) @@ -51,30 +54,43 @@ create_mock_data <- function(studyname = NULL, dataset <- dataset_empty |> mutate(across(all_of(vars_decimal), ~ rnorm(n = 100, mean = 100, sd = 15))) |> mutate(across(all_of(vars_integer[!(vars_integer %in% vars_categorical)]), ~ as.integer(rnorm(n = 100, mean = 100, sd = 15)))) |> - mutate(across(all_of(vars_categorical), ~ 1)) |> mutate(across(all_of(vars_text[tolower(vars_text) %in% "id"]), ~ rep(1:100,1))) - dataset[c(1)] <- c(1:100) - for (i in 1:length(unique_categories)){ - - relevant_categories <- study_categories |> - filter(variable == vars_categorical[i]) |> - pull(name) - + if (length(vars_categorical) > 0) { dataset <- dataset |> - mutate(across(all_of(vars_categorical[i]), ~ rep(relevant_categories, 50)[1:100])) - + mutate(across(all_of(vars_categorical), ~ 1)) } + dataset[c(1)] <- c(1:100) + for (i in 1:length(vars_categorical)) { + # Ensure vars_categorical[i] is not empty or NULL + if (length(vars_categorical[i]) > 0 && vars_categorical[i] %in% colnames(dataset)) { + relevant_categories <- study_categories |> + filter(variable == vars_categorical[i]) |> + pull(name) + + # Only proceed if relevant_categories is non-empty + if (length(relevant_categories) > 0) { + dataset <- dataset |> + mutate(across(all_of(vars_categorical[i]), ~ rep(relevant_categories, 50)[1:100])) + } else { + dataset <- dataset |> + mutate(across(all_of(vars_categorical[i]), ~ NA)) + } + } + } + + #### Make 10% of the test dataset NA level_na <- 0.1 dataset[-c(1)] <- as.data.frame(lapply(dataset[-c(1)], function(cc) cc[ sample(c(TRUE, NA), prob = c(1-level_na, level_na), size = length(cc), replace = TRUE) ])) + if (!dir.exists(output_path)) { + dir.create(output_path, recursive = TRUE) + } readr::write_delim(x = dataset, - file = here::here("data", paste0("DATA_", studyname, ".csv")), + file = file.path(output_path, paste0("DATA_", studyname, ".csv")), delim = ",", na = "") - -} - +} diff --git a/workflow/individual_dataschema.R b/workflow/individual_docs.R similarity index 64% rename from workflow/individual_dataschema.R rename to workflow/individual_docs.R index 3190296..61af7c8 100644 --- a/workflow/individual_dataschema.R +++ b/workflow/individual_docs.R @@ -46,24 +46,49 @@ Dataschema_TRACY <- filter_dataschema(merged_dataschema,tracy_vars) Dataschema_INES <- filter_dataschema(merged_dataschema, ines_vars) Dataschema_FRANZI <- filter_dataschema(merged_dataschema, franzi_vars) -writexl::write_xlsx(Dataschema_TRACY, "Tracy/Dataschema_TRACY.xlsx") -writexl::write_xlsx(Dataschema_FRANZI, "Franzi/Dataschema_FRANZI.xlsx") -writexl::write_xlsx(Dataschema_INES, "Ines/Dataschema_INES.xlsx") +# writexl::write_xlsx(Dataschema_TRACY, "Tracy/rmonize/data_schema/Dataschema_TRACY.xlsx") +# writexl::write_xlsx(Dataschema_FRANZI, "Franzi/rmonize/data_schema/Dataschema_FRANZI.xlsx") +# writexl::write_xlsx(Dataschema_INES, "Ines/rmonize/data_schema/Dataschema_INES.xlsx") # DPEs based on Dataschema -studyname = "KARMEN" +studyname = "DEGS1" DPE_TRACY <- Dataschema_TRACY[["Variables"]] names(DPE_TRACY)[2] <- "dataschema_variable" DPE_TRACY[, c("input_dataset", "input_variables","Mlstr_harmo::rule_category","Mlstr_harmo::algorithm" ,"Mlstr_harmo::comment","Mlstr_harmo::status","Mlstr_harmo::status_detail")] <- NA -writexl::write_xlsx(DPE_TRACY, paste0("Tracy/DPE_", studyname, "_TRACY.xlsx")) +writexl::write_xlsx(DPE_TRACY, paste0("Tracy/rmonize/data_proc_elem/DPE_", studyname, "_TRACY.xlsx")) DPE_FRANZI <- Dataschema_FRANZI[["Variables"]] names(DPE_FRANZI)[2] <- "dataschema_variable" DPE_FRANZI[, c("input_dataset", "input_variables","Mlstr_harmo::rule_category","Mlstr_harmo::algorithm" ,"Mlstr_harmo::comment","Mlstr_harmo::status","Mlstr_harmo::status_detail")] <- NA -writexl::write_xlsx(DPE_FRANZI, paste0("Franzi/DPE_", studyname, "_Franzi.xlsx")) +writexl::write_xlsx(DPE_FRANZI, paste0("Franzi/rmonize/data_proc_elem/DPE_", studyname, "_FRANZI.xlsx")) DPE_INES <- Dataschema_INES[["Variables"]] names(DPE_INES)[2] <- "dataschema_variable" DPE_INES[, c("input_dataset", "input_variables","Mlstr_harmo::rule_category","Mlstr_harmo::algorithm" ,"Mlstr_harmo::comment","Mlstr_harmo::status","Mlstr_harmo::status_detail")] <- NA -writexl::write_xlsx(DPE_INES, paste0("Ines/DPE_", studyname, "_Ines.xlsx")) \ No newline at end of file +writexl::write_xlsx(DPE_INES, paste0("Ines/rmonize/data_proc_elem/DPE_", studyname, "_INES.xlsx")) + +dd <-list( + Variables = tibble::tibble( + index = integer(), + name = character(), + label = character(), + valueType = character() + + ), + Categories = tibble::tibble( + variable = character(), + name = integer(), + label = character() + ) +) + +writexl::write_xlsx(dd, paste0("Tracy/rmonize/data_dictionary/DD_", studyname, "_TRACY.xlsx")) +writexl::write_xlsx(dd, paste0("Franzi/rmonize/data_dictionary/DD_", studyname, "_FRANZI.xlsx")) +writexl::write_xlsx(dd, paste0("Ines/rmonize/data_dictionary/DD_", studyname, "_INES.xlsx")) + +# Scripts creation +source(here::here("workflow", "update_script.R")) +update_script(script_path = "workflow/script_template.R", dataset_name =paste0(studyname, "_TRACY"), folder_name = "Tracy") +update_script(script_path = "workflow/script_template.R", dataset_name =paste0(studyname, "_FRANZI"), folder_name = "Franzi") +update_script(script_path = "workflow/script_template.R", dataset_name =paste0(studyname, "_INES"), folder_name = "Ines") \ No newline at end of file diff --git a/workflow/script_template.R b/workflow/script_template.R new file mode 100644 index 0000000..93b0d0e --- /dev/null +++ b/workflow/script_template.R @@ -0,0 +1,70 @@ +#### Script for harmonizing data for NFDI4Health + +####Installation of Rmonize and its dependent packages (necessary R Version > 3.4) +# install.packages("Rmonize") +# install.packages("readxl") +# install.packages("tidyverse") +# install.packages("here") +# install.packages("car") +# install.packages("writexl") + +#### Load the package in order to conduct +library(Rmonize) +library(readxl) +library(tidyverse) +library(here) +library(car) +library(writexl) + + +#### Step 0: Name of the study and creation of mock data +dataset_name <- "" +folder_name <- "" + +source(here::here("create_mock_data", "mock_data_function.R")) +create_mock_data(studyname = dataset_name, folder_name = folder_name) + +#### Step 1: Import overall DataSchema +dataschema_1 <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_schema/", paste0("Dataschema_", toupper(folder_name), ".xlsx"))), sheet = 1)) +dataschema_2 <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_schema/", paste0("Dataschema_", toupper(folder_name), ".xlsx"))), sheet = 2)) + +dataschema <- list(Variables = dataschema_1, + Categories = dataschema_2) + +#### Step 2: Import Datasets +#### Import check provided in case the csv file is in German style (delim = ";", dec.point = ",") + +input_dataset <- readr::read_csv(here::here(paste0(folder_name, "/data/", paste0("DATA_", dataset_name, ".csv")))) + +if(dim(input_dataset)[2] == 1){ + input_dataset <- read.csv(here::here(paste0(folder_name, "/data", paste0("DATA_", dataset_name, ".csv"))), sep = ";", dec = ",") +} + +#### Step 3: Import Data Dictionaries of the study +dd_var <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_dictionary/"), paste0("DD_",dataset_name, ".xlsx")), sheet = 1)) +dd_cat <- tibble::tibble(readxl::read_excel(here::here(paste0(folder_name,"/rmonize/data_dictionary/"), paste0("DD_",dataset_name, ".xlsx")), sheet = 2)) + +dd <- list(Variables = dd_var, + Categories = dd_cat) + +#### Step 4: Import prepared Data Processing Elements (DPE) +data_proc_elem <- readxl::read_excel(here::here(paste0(folder_name, "/rmonize/data_proc_elem"), paste0("DPE_",dataset_name, ".xlsx")), sheet = 1) + +#### Step 5: Combine input datasets and data dictionaries into a dossier +dataset <- madshapR::data_dict_apply( + dataset = input_dataset, + data_dict = dd) + +dossier <- madshapR::dossier_create(dataset_list = list( + dataset)) + +#### Step 6: Create the harmonized dossier using the dossier, overall dataschema and DPE's +harmonized_dossier <- Rmonize::harmo_process( + dossier, + dataschema, + data_proc_elem) + +#### Step 9: Extract and save harmonized data into a pre-set folder +harmonized_dataset <- Rmonize::pooled_harmonized_dataset_create(harmonized_dossier) + + diff --git a/workflow/update_script.R b/workflow/update_script.R new file mode 100644 index 0000000..9f9e53e --- /dev/null +++ b/workflow/update_script.R @@ -0,0 +1,21 @@ +update_script <- function(script_path, dataset_name, folder_name) { + # Ensure the folder_name/scripts directory exists + scripts_dir <- file.path(folder_name, "scripts") + if (!dir.exists(scripts_dir)) { + dir.create(scripts_dir, recursive = TRUE) + } + + # Read the script into a character vector + lines <- readLines(script_path) + + # Update line 21 and 22 with the new values + lines[21] <- paste0("dataset_name <- '", dataset_name, "'") + lines[22] <- paste0("folder_name <- '", folder_name, "'") + + # Define the new script path + new_script_path <- file.path(scripts_dir, paste0(dataset_name, ".R")) + + # Write the modified script to the new location + writeLines(lines, new_script_path) +} +