Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add hashing #774

Merged
merged 15 commits into from
Nov 11, 2022
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

* Updated examples to use `scda.2022`.
* Added R session information into a link in the footer of `teal` applications.
* Added data hashing step using `rlang` instead of `digest` package to calculate the hash (which has been moved from `teal.data` and `teal.slice`). There is now an explicit hashing check in the reproducible code output

# teal 0.12.0

Expand Down
45 changes: 18 additions & 27 deletions R/get_rcode.R
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,9 @@ get_rcode <- function(datasets = NULL,
)
str_install <- paste(c(get_rcode_str_install(), ""), collapse = "\n")
str_libs <- paste(get_rcode_libraries(), "\n")
str_code <- get_datasets_code(datanames, datasets)

hashes <- calculate_hashes(datanames, datasets)
str_code <- c(get_datasets_code(datanames, datasets, hashes), teal.slice::get_filter_expr(datasets, datanames))
} else {
str_header <- get_rcode_header(title = title, description = description)
str_install <- character(0)
Expand Down Expand Up @@ -134,37 +136,29 @@ get_rcode <- function(datasets = NULL,
#' Get datasets code
#'
#' Get combined code from `FilteredData` and from `CodeClass` object.
#'
#' @param datanames (`character`) names of datasets to extract code from
#' @param datasets (`FilteredData`) object
#' @param hashes named (`list`) of hashes per dataset
#'
#' @return `character(3)` containing following elements:
#' - code from `CodeClass` (data loading code)
#' - hash of loaded objects
#' - filter panel code
#' - hash check of loaded objects
#'
#' @keywords internal
get_datasets_code <- function(datanames, datasets) {
get_datasets_code <- function(datanames, datasets, hashes) {
str_code <- datasets$get_code(datanames)
if (length(str_code) == 0 || (length(str_code) == 1 && str_code == "")) {
str_code <- paste0(c(
"#################################################################",
"# ___ ____ ____ ___ ____ ____ ____ ____ ____ ____ _ _ _ ____ #",
"# |__] |__/ |___ |__] |__/ | | | |___ [__ [__ | |\\ | | __ #",
"# | | \\ |___ | | \\ |__| |___ |___ ___] ___] | | \\| |__] #",
"# _ ____ ____ _ _ ___ ___ _ _ #",
"# | [__ |___ |\\/| |__] | \\_/ #",
"# | ___] |___ | | | | | #",
"#################################################################\n"
), collapse = "\n")
str_code <- "message('Preprocessing is empty')"
} else if (length(str_code) > 0) {
str_code <- paste0(str_code, "\n\n")
}

if (!datasets$get_check()) {
check_note_string <- paste0(
c(
"## NOTE: Reproducibility of data import and preprocessing was not",
"## explicitly checked (argument \"check = FALSE\" is set).",
"## The app developer has the choice to check the reproducibility",
"## and might have omitted this step for some reason. Please reach",
"## out to the app developer for details.\n"
"message(paste(\"Reproducibility of data import and preprocessing was not explicitly checked\",",
" \" ('check = FALSE' is set). Contact app developer if this is an issue.\n\"))"
),
collapse = "\n"
)
Expand All @@ -177,9 +171,9 @@ get_datasets_code <- function(datanames, datasets) {
datanames,
function(dataname) {
sprintf(
"# %s MD5 hash at the time of analysis: %s",
dataname,
datasets$get_filtered_dataset(dataname)$get_hash()
"stopifnot(%s == %s)",
deparse1(bquote(rlang::hash(.(as.name(dataname))))),
deparse1(hashes[[dataname]])
)
},
character(1)
Expand All @@ -189,11 +183,7 @@ get_datasets_code <- function(datanames, datasets) {
"\n\n"
)

str_filter <- teal.slice::get_filter_expr(datasets, datanames)
if (str_filter != "") {
str_filter <- paste0(str_filter, "\n\n")
}
c(str_code, str_hash, str_filter)
c(str_code, str_hash)
}

## Module ----
Expand All @@ -203,6 +193,7 @@ get_datasets_code <- function(datanames, datasets) {
#'
#' @inheritParams get_rcode
#' @inheritParams shiny::moduleServer
#'
#' @param modal_title optional, (`character`) title of the modal
#' @param code_header optional, (`character`) header inside R
#' @param disable_buttons optional, (`reactive`)
Expand Down
34 changes: 32 additions & 2 deletions R/module_nested_tabs.R
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ srv_nested_tabs.teal_modules <- function(id, datasets, modules, reporter) {
"module { deparse1(modules$label) }."
)
)

modules_reactive <- sapply(names(modules$children), USE.NAMES = TRUE, function(id) {
srv_nested_tabs(id = id, datasets = datasets, modules = modules$children[[id]], reporter = reporter)
})
Expand Down Expand Up @@ -232,13 +233,16 @@ srv_nested_tabs.teal_module <- function(id, datasets, modules, reporter) {
#'
#' Converts `FilteredData` object to `tdata` object containing datasets needed for a specific module.
#' Please note that if module needs dataset which has a parent, then parent will be also returned.
#' A hash per `dataset` is calculated internally and returned in the code.
#'
#' @param module (`teal_module`) module where needed filters are taken from
#' @param datasets (`FilteredData`) object where needed data are taken from
#' @return list of reactive datasets with following attributes:
#' - `code` (`character`) containing datasets reproducible code.
#' @keywords internal
#' - `join_keys` (`JoinKeys`) containing relationships between datasets.
#' - `metadata` (`list`) containing metadata of datasets.
#'
#' @keywords internal
.datasets_to_data <- function(module, datasets) {
datanames <- if (identical("all", module$filter) || is.null(module$filter)) {
datasets$datanames()
Expand All @@ -255,13 +259,39 @@ srv_nested_tabs.teal_module <- function(id, datasets, modules, reporter) {
}
)

hashes <- calculate_hashes(datanames, datasets)
metadata <- lapply(datanames, datasets$get_metadata)
names(metadata) <- datanames

new_tdata(
data,
reactive(get_datasets_code(datanames, datasets)),
reactive(
c(
get_rcode_str_install(),
get_rcode_libraries(),
get_datasets_code(datanames, datasets, hashes),
teal.slice::get_filter_expr(datasets, datanames)
)
),
datasets$get_join_keys(),
metadata
)
}

#' Get the hash of a dataset
#'
#' @param datanames (`character`) names of datasets
#' @param datasets (`FilteredData`) object holding the data
#'
#' @return A list of hashes per dataset
#' @keywords internal
#'
calculate_hashes <- function(datanames, datasets) {
sapply(
datanames,
simplify = FALSE,
function(x) {
rlang::hash(datasets$get_data(x, filtered = FALSE))
}
)
}
20 changes: 20 additions & 0 deletions man/calculate_hashes.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 3 additions & 7 deletions man/dot-datasets_to_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 4 additions & 3 deletions man/get_datasets_code.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions tests/testthat/test-get_rcode.R
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,16 @@ testthat::test_that("get_datasets_code returns code only for specified datanames
)
)

hashes <- calculate_hashes(datasets$datanames(), datasets)
testthat::expect_true(
!grepl(
"mtcars",
paste(get_datasets_code(datasets = datasets, dataname = "IRIS"), collapse = "\n"),
paste(get_datasets_code(datasets = datasets, dataname = "IRIS", hashes = hashes), collapse = "\n"),
ignore.case = TRUE
) &&
grepl(
"iris",
paste(get_datasets_code(datasets = datasets, dataname = "IRIS"), collapse = "\n"),
paste(get_datasets_code(datasets = datasets, dataname = "IRIS", hashes = hashes), collapse = "\n"),
ignore.case = TRUE
)
)
Expand Down
80 changes: 78 additions & 2 deletions tests/testthat/test-module_nested_tabs.R
Original file line number Diff line number Diff line change
Expand Up @@ -313,8 +313,22 @@ testthat::test_that(".datasets_to_data returns tdata object", {

# code
testthat::expect_equal(
isolate(get_code(data)[1]),
"d1 <- data.frame(id = 1:5, pk = c(2, 3, 2, 1, 4), val = 1:5)\nd2 <- data.frame(id = 1:5, value = 1:5)\n\n"
isolate(get_code(data)),
c(
"# Add any code to install/load your NEST environment here",
paste0(
"library(testthat)\nlibrary(shiny)\nlibrary(teal.data)\nlibrary(magrittr)\nlibrary(teal.transform)\n",
"library(teal)\nlibrary(matrixStats)\nlibrary(MatrixGenerics)\nlibrary(BiocGenerics)\nlibrary(S4Vectors)\n",
"library(IRanges)\nlibrary(GenomeInfoDb)\nlibrary(GenomicRanges)\nlibrary(Biobase)\n",
"library(SummarizedExperiment)\nlibrary(MultiAssayExperiment)"
),
"d1 <- data.frame(id = 1:5, pk = c(2, 3, 2, 1, 4), val = 1:5)\nd2 <- data.frame(id = 1:5, value = 1:5)\n\n",
paste0(
"stopifnot(rlang::hash(d1) == \"f6f90d2c133ca4abdeb2f7a7d85b731e\")\n",
"stopifnot(rlang::hash(d2) == \"6e30be195b7d914a1311672c3ebf4e4f\") \n\n"
),
""
)
)

# metadata
Expand Down Expand Up @@ -343,3 +357,65 @@ testthat::test_that(".datasets_to_data returns parent datasets for CDISC data",
data <- .datasets_to_data(module, datasets)
testthat::expect_setequal(isolate(names(data)), c("ADSL", "ADAE"))
})

testthat::test_that("calculate_hashes takes a FilteredData and vector of datanames as input", {
adsl <- data.frame(STUDYID = 1, USUBJID = 1)
adae <- data.frame(STUDYID = 1, USUBJID = 1, ASTDTM = 1, AETERM = 1, AESEQ = 1)
adtte <- data.frame(STUDYID = 1, USUBJID = 1, PARAMCD = 1)

datasets <- teal.slice::init_filtered_data(
teal.data::cdisc_data(
teal.data::cdisc_dataset("ADSL", adsl),
teal.data::cdisc_dataset("ADAE", adae),
teal.data::cdisc_dataset("ADTTE", adtte)
)
)

testthat::expect_error(calculate_hashes(datanames = c("ADSL", "ADAE", "ADTTE"), datasets = datasets), NA)
})

testthat::test_that("calculate_hashes returns a named list", {
adsl <- data.frame(STUDYID = 1, USUBJID = 1)
adae <- data.frame(STUDYID = 1, USUBJID = 1, ASTDTM = 1, AETERM = 1, AESEQ = 1)
adtte <- data.frame(STUDYID = 1, USUBJID = 1, PARAMCD = 1)

datasets <- teal.slice::init_filtered_data(
teal.data::cdisc_data(
teal.data::cdisc_dataset("ADSL", adsl),
teal.data::cdisc_dataset("ADAE", adae),
teal.data::cdisc_dataset("ADTTE", adtte)
)
)

hashes <- calculate_hashes(datanames = c("ADSL", "ADAE", "ADTTE"), datasets = datasets)
testthat::expect_identical(
hashes,
list(
"ADSL" = "e89f5271357822c78dd5cfddb60c0a95",
"ADAE" = "f71b576ecfd23075f7285841327515e0",
"ADTTE" = "c68c01c86b946a3dfe05150da040aa2a"
)
)
testthat::expect_is(hashes, "list")
testthat::expect_named(hashes)
})

testthat::test_that("calculate_hashes returns the hash of the non Filtered dataset", {
datasets <- teal.slice::init_filtered_data(
teal.data::teal_data(
teal.data::dataset("iris", iris)
)
)

fs <- list(
iris = list(
Sepal.Length = list(c(5.1, 6.4)),
Species = c("setosa", "versicolor")
)
)
datasets$set_filter_state(state = fs)

hashes <- calculate_hashes(datanames = c("iris"), datasets = datasets)
testthat::expect_identical(hashes, list("iris" = "34844aba7bde36f5a34f6d8e39803508"))
testthat::expect_false(hashes == rlang::hash(isolate(datasets$get_data("iris", filtered = TRUE))))
})
5 changes: 3 additions & 2 deletions vignettes/adding-support-for-reporting.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -323,8 +323,9 @@ example_reporter_module <- function(label = "Example") {

app <- init(
data = teal_data(
dataset("AIR", airquality, code = "AIR <- data(airquality)"),
dataset("IRIS", iris, code = "IRIS <- data(iris)")
dataset("AIR", airquality, code = "data(airquality); AIR <- airquality"),
dataset("IRIS", iris, code = "data(iris); IRIS <- iris"),
check = FALSE
),
modules = modules(
example_reporter_module(label = "with Reporter"),
Expand Down