Merge pull request #82 from matthiasgomolka/development

sdcLog_0.4.0. See NEWS.md for details.
matthiasgomolka · Oct 8, 2021 · e0cc217 · e0cc217
2 parents f466701 + a1015c0
commit e0cc217
Show file tree

Hide file tree

Showing 24 changed files with 390 additions and 126 deletions.
diff --git a/.gitignore b/.gitignore
@@ -38,3 +38,5 @@ vignettes/*.pdf
 inst/doc
 doc
 Meta
+/doc/
+/Meta/
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -31,21 +31,24 @@ Imports:
     checkmate (>= 2.0.0),
     crayon (>= 1.3.4),
     data.table (>= 1.12.8),
+    mathjaxr,
     stats,
     utils
 Suggests: 
-    tibble,
     knitr,
     lfe,
     rmarkdown,
     skimr,
     spelling,
-    testthat (>= 3.0.0)
+    testthat (>= 3.0.0),
+    tibble
 VignetteBuilder: 
     knitr
+RdMacros: 
+    mathjaxr
 Config/testthat/edition: 3
 Encoding: UTF-8
 Language: en-US
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.1.1
+RoxygenNote: 7.1.2
diff --git a/Meta/vignette.rds b/Meta/vignette.rds
diff --git a/NAMESPACE b/NAMESPACE
@@ -11,6 +11,7 @@ export(sdc_descriptives)
 export(sdc_log)
 export(sdc_min_max)
 export(sdc_model)
+import(mathjaxr)
 importFrom(broom,augment)
 importFrom(broom,tidy)
 importFrom(checkmate,assert_character)
@@ -28,6 +29,7 @@ importFrom(data.table,":=")
 importFrom(data.table,.N)
 importFrom(data.table,.SD)
 importFrom(data.table,as.data.table)
+importFrom(data.table,between)
 importFrom(data.table,data.table)
 importFrom(data.table,fifelse)
 importFrom(data.table,fintersect)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,18 @@
+# sdcLog 0.4.0
+
+### Improvements
+
+* Introduction of [mathjaxr](https://CRAN.R-project.org/package=mathjaxr) to
+  improve the quality and readability of the help files.
+* [lintr](https://CRAN.R-project.org/package=lintr) induced cleanup of the 
+  code base.
+
+### Bug Fixes
+
+* Fixed https://github.com/matthiasgomolka/sdcLog/issues/79.
+* Fixed https://github.com/matthiasgomolka/sdcLog/issues/83.
+* Fixed https://github.com/matthiasgomolka/sdcLog/issues/75.
+
 # sdcLog 0.3.0
 
 ### Possibly Breaking Changes

diff --git a/R/arguments.R b/R/arguments.R
@@ -1,6 +1,3 @@
-# pure technical settings
-.datatable.aware <- TRUE
-
 #' arguments
 #' @name common_arguments
 #' @param data [data.frame] from which the descriptive statistics are

diff --git a/R/check_distinct_ids.R b/R/check_distinct_ids.R
@@ -18,13 +18,8 @@ check_distinct_ids <- function(data, id_var, val_var = NULL, by = NULL) {
     order(distinct_ids)
   ]
 
-  # If there is no id, nothing needs to be protected. If distinct_ids == NA, no
-  # warning will be thrown.
-  distinct_ids[distinct_ids == 0, distinct_ids := NA_integer_]
-
   structure(
     distinct_ids,
     class = c("sdc_distinct_ids", "data.table", "data.frame")
   )
 }
-
diff --git a/R/check_dominance.R b/R/check_dominance.R
@@ -4,19 +4,22 @@
 #' @noRd
 check_dominance <- function(data, id_var, val_var = NULL, by = NULL) {
   # remove NSE notes in R CMD check
-  agg_val_var <- value_share <- id_na <- cum_value_share <- value_share_na <- NULL
+  agg_val_var <- value_share <- id_na <- cum_value_share <- value_share_na <-
+    NULL
 
 
   class <- c("sdc_dominance", "data.table", "data.frame")
   # handle the case where no val_var is provided
   if (is.null(val_var)) {
-    return(structure(data.table(value_share = NA_real_), class = class))
+    return(
+      structure(data.table::data.table(value_share = NA_real_), class = class)
+    )
   }
 
   # missing_id_var = "structural" ----
   # distinguish between NA and non-NA id's and calculate the value_share for
   # each id and the cumulative value_share
-  DT <- data[
+  dt <- data[
     j = list(agg_val_var = sum(abs(get(val_var)), na.rm = TRUE)),
     keyby = c(id_var, by)
   ][
@@ -30,33 +33,40 @@ check_dominance <- function(data, id_var, val_var = NULL, by = NULL) {
   ]
 
   # calculate the value share of NA id's in order to subtract it later
-  na_shares <- DT[
-    i = id_na == TRUE,
-    j = list(id_na, value_share_na = value_share),
-    keyby = by
-  ]
+  na_shares <- dt[id_na == TRUE, list(value_share_na = value_share), keyby = by]
 
   if (nrow(na_shares) > 0L) {
     # The following code may look unnecessarily complicated, but it's necessary
     # in order to handle by groups correctly.
-    # We first merge the value_share_na, ...
-    DT <- merge(DT, na_shares, by = c("id_na", by), all.x = TRUE, sort = FALSE)
-    # ... then we fill this value forward. ROW ORDER MATTERS! Rows in DT are
-    # ordered decreasingly by agg_val_var above.
-    data.table::setnafill(DT, type = "locf", cols = "value_share_na")
-    # Now we substract the share of NA from the cumulative value share.
-    DT[, cum_value_share := cum_value_share - value_share_na, keyby = by]
+    # We first bind / merge the value_share_na, ...
+    if (is.null(by)) {
+      dt <- cbind(dt, na_shares)
+    } else {
+      dt <- merge(dt, na_shares, by = by, all.x = TRUE, sort = FALSE)
+    }
+    # ... then we subtract the share of NA from the cumulative value share.
+    dt[, cum_value_share := cum_value_share - value_share_na, keyby = by]
     # Lastly, we delete rows where the id is NA.
-    DT <- DT[id_na == FALSE]
+    dt <- dt[id_na == FALSE]
   }
 
-  dominance <- DT[
-    j = .SD[getOption("sdc.n_ids_dominance", 2L)],
-    keyby = by,
-    .SDcols = c(value_share = "cum_value_share")
-  ]
+  if (nrow(dt) == 0L) { # handle the edge case with no ID's
+    cols_to_keep <- setdiff(
+      names(dt),
+      c("id_na", "value_share", "value_share_na")
+    )
+    dominance <- dt[, cols_to_keep, with = FALSE]
+
+  } else { # general case
+    dominance <- dt[
+      j = .SD[min(getOption("sdc.n_ids_dominance", 2L), .N)],
+      # min() necessary to handle the edge case with only a single ID
+      keyby = by,
+      .SDcols = "cum_value_share"
+    ]
+  }
   data.table::setnames(dominance, old = "cum_value_share", new = "value_share")
-  setorderv(dominance, "value_share", order = -1L)
+  data.table::setorderv(dominance, "value_share", order = -1L)
 
   structure(dominance, class = class)
 }
diff --git a/R/is_dummy.R b/R/is_dummy.R
@@ -6,11 +6,6 @@ is_dummy <- function(x) {
   if (!is.atomic(x)) {
     return(FALSE)
   }
-  # handling complex vectors specifically because they cause a problem in the
-  # last if clause of this function
-  # if (is.complex(x)) {
-  #   return(FALSE)
-  # }
 
   if (is.logical(x)) {
     return(TRUE)

diff --git a/R/print_methods.R b/R/print_methods.R
@@ -1,19 +1,28 @@
 #' @importFrom crayon bold red
-#' @importFrom data.table as.data.table
+#' @importFrom data.table as.data.table between
 #' @export
 print.sdc_distinct_ids <- function(x, ...) {
   distinct_ids <- NULL # removes NSE notes in R CMD check
+
+  var_names <- setdiff(names(x), "distinct_ids")
+  x_non_zero <- subset_zero(x, var_names)
+
   # with problems
-  if (nrow(x[distinct_ids < getOption("sdc.n_ids", 5L)]) > 0L) {
+  if (nrow(x_non_zero[data.table::between(
+    distinct_ids,
+    lower = 0L,
+    upper = getOption("sdc.n_ids", 5L),
+    incbounds = FALSE)]
+  ) > 0L) {
     cat(crayon::red("Not enough distinct entities:\n"))
     print(data.table::as.data.table(x))
 
     # withOUT problems
   } else if (getOption("sdc.info_level", 1L) > 1L) {
+
+    n_distinct_ids <- min(x[["distinct_ids"]])
     message(
-      "No problem with number of distinct entities (",
-      min(x[["distinct_ids"]]),
-      ")."
+      "No problem with number of distinct entities (", n_distinct_ids, ")."
     )
   }
 
@@ -23,7 +32,7 @@ print.sdc_distinct_ids <- function(x, ...) {
 #' @importFrom data.table as.data.table
 #' @export
 print.sdc_dominance <- function(x, ...) {
-  distinct_ids <- value_share <- NULL # removes NSE notes in R CMD check
+  value_share <- NULL # removes NSE notes in R CMD check
 
   # with problems
   if (nrow(x[value_share >= getOption("sdc.share_dominance", 0.85)]) > 0L) {
@@ -73,6 +82,7 @@ print.sdc_settings <- function(x, ...) {
   )
 }
 
+#' @importFrom data.table between
 #' @export
 print.sdc_descriptives <- function(x, ...) {
   distinct_ids <- value_share <- NULL # removes NSE notes in R CMD check
@@ -83,8 +93,14 @@ print.sdc_descriptives <- function(x, ...) {
   print(x[["distinct_ids"]])
   print(x[["dominance"]])
   no_problems <- sum(
-    nrow(x[["distinct_ids"]][distinct_ids < getOption("sdc.n_ids", 5L)]),
-    nrow(x[["dominance"]][value_share >= getOption("sdc.share_dominance", 0.85)])
+    nrow(x[["distinct_ids"]][data.table::between(
+      distinct_ids,
+      lower = 0L,
+      upper = getOption("sdc.n_ids", 5L),
+      incbounds = FALSE)]),
+    nrow(
+      x[["dominance"]][value_share >= getOption("sdc.share_dominance", 0.85)]
+    )
   ) == 0L
   if (no_problems & (getOption("sdc.info_level", 1L) > 0L)) {
     message("Output complies to RDC rules.")
@@ -111,7 +127,15 @@ print.sdc_model <- function(x, ...) {
 
   n_problems <- vapply(
     append(list(distinct_ids = x[["distinct_ids"]]), x[["terms"]]),
-    function(x) nrow(x[distinct_ids < getOption("sdc.n_ids", 5L)]),
+    function(x) {
+      var_names <- setdiff(names(x), "distinct_ids")
+      x_non_zero <- subset_zero(x, var_names)
+      nrow(x_non_zero[data.table::between(
+        distinct_ids,
+        lower = 0L,
+        upper = getOption("sdc.n_ids", 5L),
+        incbounds = FALSE)])
+    },
     FUN.VALUE = integer(1L)
   )
   no_problems <- sum(n_problems) == 0L

diff --git a/R/sdc_descriptives.R b/R/sdc_descriptives.R
@@ -1,9 +1,37 @@
 #' Disclosure control for descriptive statistics
-#' @description Checks if your descriptive statistics comply to statistical
-#'   disclosure control. Checks for number of distinct entities and dominance.
+#'
+#' @description Checks the number of distinct entities and the (n, k)
+#'   dominance rule for your descriptive statistics.
+#'
+#'   That means that `sdc_descriptives()` checks if there are at least 5
+#'   distinct entities and if the largest 2 entities account for 85% or more of
+#'   `val_var`. The parameters can be changed using options. For details see
+#'   `vignette("options", package = "sdcLog")`.
+#'
 #' @inheritParams common_arguments
+#'
 #' @importFrom data.table is.data.table as.data.table set
+#'
+#' @import mathjaxr
+#'
+#' @details
+#'   \loadmathjax
+#'   The general form of the \mjseqn{(n, k)} dominance rule can be formulated
+#'   as:
+#'
+#'   \mjsdeqn{\sum_{i=1}^{n}x_i > \frac{k}{100} \sum_{i=1}^{N}x_i}
+#'
+#'   where \mjseqn{x_1 \ge x_2 \ge \cdots \ge x_{N}}. \mjseqn{n} denotes the
+#'   number of largest contributions to be considered, \mjseqn{x_n} the
+#'   \mjseqn{n}-th largest contribution, \mjseqn{k} the maximal percentage these
+#'   \mjseqn{n} contributions may account for, and \mjseqn{N} is the total
+#'   number of observations.
+#'
+#'   If the statement above is true, the \mjseqn{(n, k)} dominance rule is
+#'   violated.
+#'
 #' @export
+#'
 #' @examples
 #' sdc_descriptives(
 #'   data = sdc_descriptives_DT,
@@ -39,10 +67,17 @@
 #'   by = c("sector", "year"),
 #'   zero_as_NA = FALSE
 #' )
+#'
 #' @return A [list] of class `sdc_descriptives` with detailed information about
 #'   options, settings, and compliance with the criteria distinct entities and
 #'   dominance.
-sdc_descriptives <- function(data, id_var = getOption("sdc.id_var"), val_var = NULL, by = NULL, zero_as_NA = NULL) {
+
+sdc_descriptives <- function(
+  data, id_var = getOption("sdc.id_var"),
+  val_var = NULL,
+  by = NULL,
+  zero_as_NA = NULL
+) {
   distinct_ids <- value_share <- NULL # removes NSE notes in R CMD check
 
   # input checks ----
@@ -56,6 +91,10 @@ sdc_descriptives <- function(data, id_var = getOption("sdc.id_var"), val_var = N
   checkmate::assert_subset(id_var, choices = col_names)
 
   checkmate::assert_string(val_var, null.ok = TRUE)
+  # assert that val_var is not "val_var" (which would lead to errors later on)
+  if (!is.null(val_var) && val_var == "val_var") {
+    stop("Assertion on 'val_var' failed: Must not equal \"val_var\".")
+  }
   checkmate::assert_subset(val_var, choices = setdiff(col_names, id_var))
 
   checkmate::assert_character(by, any.missing = FALSE, null.ok = TRUE)
@@ -89,7 +128,8 @@ sdc_descriptives <- function(data, id_var = getOption("sdc.id_var"), val_var = N
     na_idx <- which(data[[val_var]] == 0)
     data.table::set(data, i = na_idx, j = val_var, value = NA)
 
-    on.exit( # reset to zero in order to leave the data unchanged
+    # reset to zero in order to leave the data unchanged
+    on.exit(
       data.table::set(data, i = na_idx, j = val_var, value = 0)
     )
   }
@@ -105,7 +145,9 @@ sdc_descriptives <- function(data, id_var = getOption("sdc.id_var"), val_var = N
   dominance <- check_dominance(data, id_var, val_var, by)
 
   # warn about dominance if necessary
-  if (nrow(dominance[value_share >= getOption("sdc.share_dominance", 0.85)]) > 0L) {
+  if (nrow(
+    dominance[value_share >= getOption("sdc.share_dominance", 0.85)]
+  ) > 0L) {
     warning(
       crayon::bold("DISCLOSURE PROBLEM: "), "Dominant entities.",
       call. = FALSE
-Original file line number
+Diff line change
@@ Expand Up / @@ -38,3 +38,5 @@ vignettes/*.pdf @@
     inst/doc
     doc
     Meta
+    /doc/
+    /Meta/