Skip to content

Commit

Permalink
Merge pull request #82 from matthiasgomolka/development
Browse files Browse the repository at this point in the history
sdcLog_0.4.0. See NEWS.md for details.
  • Loading branch information
matthiasgomolka authored Oct 8, 2021
2 parents f466701 + a1015c0 commit e0cc217
Show file tree
Hide file tree
Showing 24 changed files with 390 additions and 126 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,5 @@ vignettes/*.pdf
inst/doc
doc
Meta
/doc/
/Meta/
9 changes: 6 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -31,21 +31,24 @@ Imports:
checkmate (>= 2.0.0),
crayon (>= 1.3.4),
data.table (>= 1.12.8),
mathjaxr,
stats,
utils
Suggests:
tibble,
knitr,
lfe,
rmarkdown,
skimr,
spelling,
testthat (>= 3.0.0)
testthat (>= 3.0.0),
tibble
VignetteBuilder:
knitr
RdMacros:
mathjaxr
Config/testthat/edition: 3
Encoding: UTF-8
Language: en-US
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
RoxygenNote: 7.1.2
Binary file modified Meta/vignette.rds
Binary file not shown.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ export(sdc_descriptives)
export(sdc_log)
export(sdc_min_max)
export(sdc_model)
import(mathjaxr)
importFrom(broom,augment)
importFrom(broom,tidy)
importFrom(checkmate,assert_character)
Expand All @@ -28,6 +29,7 @@ importFrom(data.table,":=")
importFrom(data.table,.N)
importFrom(data.table,.SD)
importFrom(data.table,as.data.table)
importFrom(data.table,between)
importFrom(data.table,data.table)
importFrom(data.table,fifelse)
importFrom(data.table,fintersect)
Expand Down
15 changes: 15 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# sdcLog 0.4.0

### Improvements

* Introduction of [mathjaxr](https://CRAN.R-project.org/package=mathjaxr) to
improve the quality and readability of the help files.
* [lintr](https://CRAN.R-project.org/package=lintr) induced cleanup of the
code base.

### Bug Fixes

* Fixed https://github.com/matthiasgomolka/sdcLog/issues/79.
* Fixed https://github.com/matthiasgomolka/sdcLog/issues/83.
* Fixed https://github.com/matthiasgomolka/sdcLog/issues/75.

# sdcLog 0.3.0

### Possibly Breaking Changes
Expand Down
3 changes: 0 additions & 3 deletions R/arguments.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
# pure technical settings
.datatable.aware <- TRUE

#' arguments
#' @name common_arguments
#' @param data [data.frame] from which the descriptive statistics are
Expand Down
5 changes: 0 additions & 5 deletions R/check_distinct_ids.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,8 @@ check_distinct_ids <- function(data, id_var, val_var = NULL, by = NULL) {
order(distinct_ids)
]

# If there is no id, nothing needs to be protected. If distinct_ids == NA, no
# warning will be thrown.
distinct_ids[distinct_ids == 0, distinct_ids := NA_integer_]

structure(
distinct_ids,
class = c("sdc_distinct_ids", "data.table", "data.frame")
)
}

54 changes: 32 additions & 22 deletions R/check_dominance.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,22 @@
#' @noRd
check_dominance <- function(data, id_var, val_var = NULL, by = NULL) {
# remove NSE notes in R CMD check
agg_val_var <- value_share <- id_na <- cum_value_share <- value_share_na <- NULL
agg_val_var <- value_share <- id_na <- cum_value_share <- value_share_na <-
NULL


class <- c("sdc_dominance", "data.table", "data.frame")
# handle the case where no val_var is provided
if (is.null(val_var)) {
return(structure(data.table(value_share = NA_real_), class = class))
return(
structure(data.table::data.table(value_share = NA_real_), class = class)
)
}

# missing_id_var = "structural" ----
# distinguish between NA and non-NA id's and calculate the value_share for
# each id and the cumulative value_share
DT <- data[
dt <- data[
j = list(agg_val_var = sum(abs(get(val_var)), na.rm = TRUE)),
keyby = c(id_var, by)
][
Expand All @@ -30,33 +33,40 @@ check_dominance <- function(data, id_var, val_var = NULL, by = NULL) {
]

# calculate the value share of NA id's in order to subtract it later
na_shares <- DT[
i = id_na == TRUE,
j = list(id_na, value_share_na = value_share),
keyby = by
]
na_shares <- dt[id_na == TRUE, list(value_share_na = value_share), keyby = by]

if (nrow(na_shares) > 0L) {
# The following code may look unnecessarily complicated, but it's necessary
# in order to handle by groups correctly.
# We first merge the value_share_na, ...
DT <- merge(DT, na_shares, by = c("id_na", by), all.x = TRUE, sort = FALSE)
# ... then we fill this value forward. ROW ORDER MATTERS! Rows in DT are
# ordered decreasingly by agg_val_var above.
data.table::setnafill(DT, type = "locf", cols = "value_share_na")
# Now we substract the share of NA from the cumulative value share.
DT[, cum_value_share := cum_value_share - value_share_na, keyby = by]
# We first bind / merge the value_share_na, ...
if (is.null(by)) {
dt <- cbind(dt, na_shares)
} else {
dt <- merge(dt, na_shares, by = by, all.x = TRUE, sort = FALSE)
}
# ... then we subtract the share of NA from the cumulative value share.
dt[, cum_value_share := cum_value_share - value_share_na, keyby = by]
# Lastly, we delete rows where the id is NA.
DT <- DT[id_na == FALSE]
dt <- dt[id_na == FALSE]
}

dominance <- DT[
j = .SD[getOption("sdc.n_ids_dominance", 2L)],
keyby = by,
.SDcols = c(value_share = "cum_value_share")
]
if (nrow(dt) == 0L) { # handle the edge case with no ID's
cols_to_keep <- setdiff(
names(dt),
c("id_na", "value_share", "value_share_na")
)
dominance <- dt[, cols_to_keep, with = FALSE]

} else { # general case
dominance <- dt[
j = .SD[min(getOption("sdc.n_ids_dominance", 2L), .N)],
# min() necessary to handle the edge case with only a single ID
keyby = by,
.SDcols = "cum_value_share"
]
}
data.table::setnames(dominance, old = "cum_value_share", new = "value_share")
setorderv(dominance, "value_share", order = -1L)
data.table::setorderv(dominance, "value_share", order = -1L)

structure(dominance, class = class)
}
5 changes: 0 additions & 5 deletions R/is_dummy.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,6 @@ is_dummy <- function(x) {
if (!is.atomic(x)) {
return(FALSE)
}
# handling complex vectors specifically because they cause a problem in the
# last if clause of this function
# if (is.complex(x)) {
# return(FALSE)
# }

if (is.logical(x)) {
return(TRUE)
Expand Down
42 changes: 33 additions & 9 deletions R/print_methods.R
Original file line number Diff line number Diff line change
@@ -1,19 +1,28 @@
#' @importFrom crayon bold red
#' @importFrom data.table as.data.table
#' @importFrom data.table as.data.table between
#' @export
print.sdc_distinct_ids <- function(x, ...) {
distinct_ids <- NULL # removes NSE notes in R CMD check

var_names <- setdiff(names(x), "distinct_ids")
x_non_zero <- subset_zero(x, var_names)

# with problems
if (nrow(x[distinct_ids < getOption("sdc.n_ids", 5L)]) > 0L) {
if (nrow(x_non_zero[data.table::between(
distinct_ids,
lower = 0L,
upper = getOption("sdc.n_ids", 5L),
incbounds = FALSE)]
) > 0L) {
cat(crayon::red("Not enough distinct entities:\n"))
print(data.table::as.data.table(x))

# withOUT problems
} else if (getOption("sdc.info_level", 1L) > 1L) {

n_distinct_ids <- min(x[["distinct_ids"]])
message(
"No problem with number of distinct entities (",
min(x[["distinct_ids"]]),
")."
"No problem with number of distinct entities (", n_distinct_ids, ")."
)
}

Expand All @@ -23,7 +32,7 @@ print.sdc_distinct_ids <- function(x, ...) {
#' @importFrom data.table as.data.table
#' @export
print.sdc_dominance <- function(x, ...) {
distinct_ids <- value_share <- NULL # removes NSE notes in R CMD check
value_share <- NULL # removes NSE notes in R CMD check

# with problems
if (nrow(x[value_share >= getOption("sdc.share_dominance", 0.85)]) > 0L) {
Expand Down Expand Up @@ -73,6 +82,7 @@ print.sdc_settings <- function(x, ...) {
)
}

#' @importFrom data.table between
#' @export
print.sdc_descriptives <- function(x, ...) {
distinct_ids <- value_share <- NULL # removes NSE notes in R CMD check
Expand All @@ -83,8 +93,14 @@ print.sdc_descriptives <- function(x, ...) {
print(x[["distinct_ids"]])
print(x[["dominance"]])
no_problems <- sum(
nrow(x[["distinct_ids"]][distinct_ids < getOption("sdc.n_ids", 5L)]),
nrow(x[["dominance"]][value_share >= getOption("sdc.share_dominance", 0.85)])
nrow(x[["distinct_ids"]][data.table::between(
distinct_ids,
lower = 0L,
upper = getOption("sdc.n_ids", 5L),
incbounds = FALSE)]),
nrow(
x[["dominance"]][value_share >= getOption("sdc.share_dominance", 0.85)]
)
) == 0L
if (no_problems & (getOption("sdc.info_level", 1L) > 0L)) {
message("Output complies to RDC rules.")
Expand All @@ -111,7 +127,15 @@ print.sdc_model <- function(x, ...) {

n_problems <- vapply(
append(list(distinct_ids = x[["distinct_ids"]]), x[["terms"]]),
function(x) nrow(x[distinct_ids < getOption("sdc.n_ids", 5L)]),
function(x) {
var_names <- setdiff(names(x), "distinct_ids")
x_non_zero <- subset_zero(x, var_names)
nrow(x_non_zero[data.table::between(
distinct_ids,
lower = 0L,
upper = getOption("sdc.n_ids", 5L),
incbounds = FALSE)])
},
FUN.VALUE = integer(1L)
)
no_problems <- sum(n_problems) == 0L
Expand Down
52 changes: 47 additions & 5 deletions R/sdc_descriptives.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,37 @@
#' Disclosure control for descriptive statistics
#' @description Checks if your descriptive statistics comply to statistical
#' disclosure control. Checks for number of distinct entities and dominance.
#'
#' @description Checks the number of distinct entities and the (n, k)
#' dominance rule for your descriptive statistics.
#'
#' That means that `sdc_descriptives()` checks if there are at least 5
#' distinct entities and if the largest 2 entities account for 85% or more of
#' `val_var`. The parameters can be changed using options. For details see
#' `vignette("options", package = "sdcLog")`.
#'
#' @inheritParams common_arguments
#'
#' @importFrom data.table is.data.table as.data.table set
#'
#' @import mathjaxr
#'
#' @details
#' \loadmathjax
#' The general form of the \mjseqn{(n, k)} dominance rule can be formulated
#' as:
#'
#' \mjsdeqn{\sum_{i=1}^{n}x_i > \frac{k}{100} \sum_{i=1}^{N}x_i}
#'
#' where \mjseqn{x_1 \ge x_2 \ge \cdots \ge x_{N}}. \mjseqn{n} denotes the
#' number of largest contributions to be considered, \mjseqn{x_n} the
#' \mjseqn{n}-th largest contribution, \mjseqn{k} the maximal percentage these
#' \mjseqn{n} contributions may account for, and \mjseqn{N} is the total
#' number of observations.
#'
#' If the statement above is true, the \mjseqn{(n, k)} dominance rule is
#' violated.
#'
#' @export
#'
#' @examples
#' sdc_descriptives(
#' data = sdc_descriptives_DT,
Expand Down Expand Up @@ -39,10 +67,17 @@
#' by = c("sector", "year"),
#' zero_as_NA = FALSE
#' )
#'
#' @return A [list] of class `sdc_descriptives` with detailed information about
#' options, settings, and compliance with the criteria distinct entities and
#' dominance.
sdc_descriptives <- function(data, id_var = getOption("sdc.id_var"), val_var = NULL, by = NULL, zero_as_NA = NULL) {

sdc_descriptives <- function(
data, id_var = getOption("sdc.id_var"),
val_var = NULL,
by = NULL,
zero_as_NA = NULL
) {
distinct_ids <- value_share <- NULL # removes NSE notes in R CMD check

# input checks ----
Expand All @@ -56,6 +91,10 @@ sdc_descriptives <- function(data, id_var = getOption("sdc.id_var"), val_var = N
checkmate::assert_subset(id_var, choices = col_names)

checkmate::assert_string(val_var, null.ok = TRUE)
# assert that val_var is not "val_var" (which would lead to errors later on)
if (!is.null(val_var) && val_var == "val_var") {
stop("Assertion on 'val_var' failed: Must not equal \"val_var\".")
}
checkmate::assert_subset(val_var, choices = setdiff(col_names, id_var))

checkmate::assert_character(by, any.missing = FALSE, null.ok = TRUE)
Expand Down Expand Up @@ -89,7 +128,8 @@ sdc_descriptives <- function(data, id_var = getOption("sdc.id_var"), val_var = N
na_idx <- which(data[[val_var]] == 0)
data.table::set(data, i = na_idx, j = val_var, value = NA)

on.exit( # reset to zero in order to leave the data unchanged
# reset to zero in order to leave the data unchanged
on.exit(
data.table::set(data, i = na_idx, j = val_var, value = 0)
)
}
Expand All @@ -105,7 +145,9 @@ sdc_descriptives <- function(data, id_var = getOption("sdc.id_var"), val_var = N
dominance <- check_dominance(data, id_var, val_var, by)

# warn about dominance if necessary
if (nrow(dominance[value_share >= getOption("sdc.share_dominance", 0.85)]) > 0L) {
if (nrow(
dominance[value_share >= getOption("sdc.share_dominance", 0.85)]
) > 0L) {
warning(
crayon::bold("DISCLOSURE PROBLEM: "), "Dominant entities.",
call. = FALSE
Expand Down
Loading

0 comments on commit e0cc217

Please sign in to comment.