epiverse-trace · Karim-Mane · Apr 5, 2024 · Apr 8, 2024 · Apr 8, 2024 · Apr 8, 2024
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -11,3 +11,4 @@
 ^doc$
 ^Meta$
 ^CITATION\.cff$
+^data-raw$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -22,7 +22,6 @@ Authors@R: c(
     person("Joshua W.", "Lambert", , "[email protected]", role = "rev",
            comment = c(ORCID = "0000-0001-5218-3046"))
   )
-Maintainer: Karim Mané <[email protected]>
 Description: cleanepi provides functions for cleaning and standardizing tabular data,
     tailored specifically for curating epidemiological data.
 License: MIT + file LICENSE
@@ -44,6 +43,7 @@ Imports:
     numberize,
     R.utils,
     readr,
+    rlang,
     snakecase,
     stringr,
     utils
@@ -53,7 +53,6 @@ Suggests:
     lintr,
     markdown,
     reactable,
-    rlang,
     rmarkdown,
     spelling,
     testthat (>= 3.0.0)

diff --git a/NAMESPACE b/NAMESPACE
@@ -11,7 +11,7 @@ export(convert_to_numeric)
 export(correct_subject_ids)
 export(find_duplicates)
 export(print_report)
-export(remove_constant)
+export(remove_constants)
 export(remove_duplicates)
 export(replace_missing_values)
 export(scan_data)
@@ -20,4 +20,5 @@ export(standardize_column_names)
 export(standardize_dates)
 importFrom(lubridate,"%--%")
 importFrom(magrittr,"%>%")
+importFrom(rlang,.data)
 importFrom(utils,browseURL)
diff --git a/R/check_date_sequence.R b/R/check_date_sequence.R
@@ -5,44 +5,49 @@
 #'
 #' @param data A data frame
 #' @param target_columns A vector of event column names. Users should specify at
-#'    least 2 column names in the expected order.
-#'    For example: target_columns = c("date_symptoms_onset",
-#'    "date_hospitalization", "date_death"). When the input data is a `linelist`
-#'    object, this parameter can be set to `linelist_tags` if you wish to
-#'    the date sequence across tagged columns only.
-#' @param remove A Boolean to specify if rows with incorrect order
-#'    should be filtered out or not. The default is FALSE
+#'    least 2 column names in the expected order. For example:
+#'    target_columns = c("date_symptoms_onset", "date_hospitalization",
+#'    "date_death").
+#'    When the input data is a `linelist` object, this parameter can be set to
+#'    `linelist_tags` if you wish to check the date sequence across tagged
+#'    columns only.
+#'    The values in these columns should be in the ISO 8601 format
+#'    (YYYY-MM-DD, e.g. 2024-12-31). Otherwise, use the `standardize_dates()`
+#'    function to standardize them first.
 #'
-#' @returns Rows of the input data frame with incorrect date sequence
-#'    if `remove = FALSE`, the input data frame without those
-#'    rows if not.
+#' @returns The input dataset. When found, the incorrect date sequences will be
+#'    stored in the report where they can be accessed using
+#'    `attr(data, "report")`.
 #' @export
 #'
 #' @examples
+#' # import the data
+#' data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi"))
+#'
+#' # standardize the date values
+#' data <- data |>
+#'   standardize_dates(
+#'     target_columns  = c("date_first_pcr_positive_test", "date.of.admission"),
+#'     error_tolerance = 0.4,
+#'     format          = NULL,
+#'     timeframe       = NULL
+#'   )
+#'
 #' good_date_sequence <- check_date_sequence(
-#'   data           = readRDS(system.file("extdata", "test_df.RDS",
-#'                                        package = "cleanepi")),
-#'   target_columns = c("date_first_pcr_positive_test", "date.of.admission"),
-#'   remove         = FALSE
+#'   data           = data,
+#'   target_columns = c("date_first_pcr_positive_test", "date.of.admission")
 #' )
-check_date_sequence <- function(data, target_columns,
-                                remove = FALSE) {
-
+check_date_sequence <- function(data, target_columns) {
   checkmate::assert_vector(target_columns, any.missing = FALSE, min.len = 1L,
                            max.len = dim(data)[2], null.ok = FALSE,
                            unique = TRUE)
   checkmate::assert_data_frame(data, null.ok = FALSE)
-  checkmate::assert_logical(remove, any.missing = FALSE, len = 1L,
-                            null.ok = FALSE)
 
-  # check if input is character string
-  if (all(grepl(",", target_columns, fixed = TRUE))) {
-    target_columns <- as.character(unlist(strsplit(target_columns, ",",
-                                                   fixed = TRUE)))
-    target_columns <- trimws(target_columns)
-  }
+  # get the correct names in case some have been modified - see the
+  # `retrieve_column_names()` function for more details
+  target_columns <- retrieve_column_names(data, target_columns)
   target_columns <- get_target_column_names(data, target_columns, cols = NULL)
 
+
   # check if all columns are part of the data frame
   if (!all(target_columns %in% names(data))) {
     idx            <- which(!(target_columns %in% names(data)))
@@ -54,14 +59,6 @@ check_date_sequence <- function(data, target_columns,
     }
   }
 
-  # check and convert to Date if required
-  for (cols in target_columns) {
-    if (!lubridate::is.Date(data[[cols]])) {
-      data <- standardize_dates(data, cols, timeframe = NULL,
-                                error_tolerance = 0.5)
-    }
-  }
-
   # checking the date sequence
   tmp_data   <- data %>% dplyr::select(dplyr::all_of(target_columns))
   order_date <- apply(tmp_data, 1L, is_date_sequence_ordered)
@@ -76,11 +73,6 @@ check_date_sequence <- function(data, target_columns,
             " incorrect date sequences at line(s): ",
             glue::glue_collapse(bad_order, sep = ", "),
             call. = FALSE)
-    if (remove) {
-      data  <- data[-bad_order, ]
-      warning("The incorrect date sequences have been removed.",
-              call. = FALSE)
-    }
   }
 
   return(data)
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,3 +11,4 @@ @@
     ^doc$
     ^Meta$
     ^CITATION\.cff$
+    ^data-raw$