Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug #449 tidyr complete #453

Merged
merged 12 commits into from
Feb 5, 2022
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: covidregionaldata
Title: Subnational Data for COVID-19 Epidemiology
Version: 0.9.2.3000
Version: 0.9.2.4000
Authors@R:
c(person(given = "Joseph",
family = "Palmer",
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ importFrom(dplyr,filter)
importFrom(dplyr,full_join)
importFrom(dplyr,group_by)
importFrom(dplyr,group_by_at)
importFrom(dplyr,group_vars)
importFrom(dplyr,if_else)
importFrom(dplyr,lag)
importFrom(dplyr,last_col)
Expand Down
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ This release is currently under development

## Bug fixes

- Fixed a bug in `fill_empty_dates_with_na()` caused by changes made in version `1.2.0` of `tidyr`.
- Fixed a bug in the data sourced from Germany so that instead of treating it as a line list of individuals it is treated as a relatively finely resolved count data which needs to be summed up (by @sbfnk).

# covidregionaldata 0.9.2
Expand Down
13 changes: 9 additions & 4 deletions R/processing.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,21 @@ set_negative_values_to_zero <- function(data) {
#' @family compulsory_processing
#' @concept compulsory_processing
#' @importFrom tidyr complete full_seq nesting
#' @importFrom dplyr starts_with
#' @importFrom dplyr starts_with group_by ungroup group_vars
#' @importFrom rlang !!! syms
fill_empty_dates_with_na <- function(data) {
regions <- select(data, starts_with("level_")) %>%
names()

groups <- group_vars(data)

data <- data %>%
ungroup() %>%
complete(
date = full_seq(data$date, period = 1),
nesting(!!!syms(regions))
)
) %>%
group_by(across(.cols = all_of(groups)))
return(data)
}

Expand Down Expand Up @@ -258,9 +263,9 @@ process_internal <- function(clean_data, level, group_vars,
"hosp_new", "hosp_total", "tested_new", "tested_total"
)),
everything()
) %>%
arrange(.data$date, all_of(group_vars_standard[1]))
)
}

dat <- ungroup(dat)

if (localise) {
Expand Down
3 changes: 2 additions & 1 deletion R/shared-methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -640,7 +640,8 @@ CountryDataClass <- R6::R6Class("CountryDataClass",
if (!is.null(self$target_regions)) {
self$target_regions <- countryname(
self$target_regions,
destination = "country.name.en"
destination = "country.name.en",
warn = FALSE
)
if (all(is.na(self$target_regions))) {
stop("No countries found with target names")
Expand Down
Binary file modified tests/testthat/custom_data/ecdc.rds
Binary file not shown.
Binary file modified tests/testthat/custom_data/mexico_level_1_snap.rds
Binary file not shown.
Binary file modified tests/testthat/custom_data/mexico_level_2_snap.rds
Binary file not shown.
Binary file modified tests/testthat/custom_data/who.rds
Binary file not shown.
18 changes: 14 additions & 4 deletions tests/testthat/custom_tests/mock_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -207,12 +207,16 @@ get_expected_data_for_fill_empty_dates_with_na_test <- function() {
# full data is data with all dates/regions + some NAs in the cases column
expected_data <- data.frame(expand.grid(dates, regions))
colnames(expected_data) <- c("date", "level_1_region")

expected_data$date <- as.Date(expected_data$date)
expected_data$level_1_region <- as.character(expected_data$level_1_region)
expected_data <- expected_data %>%
dplyr::arrange(date, level_1_region) %>%
dplyr::left_join(region_codes, by = c("level_1_region" = "region"))
expected_data$cases <- c(1:5, rep(NA, 4), 10:12)
expected_data <- dplyr::select(
expected_data, date, level_1_region, level_1_region_code, everything()
)
return(dplyr::tibble(expected_data))
}

Expand All @@ -234,10 +238,16 @@ get_expected_data_for_complete_cumulative_columns_test <- function() {
partial_data <- expected_data[-c(6:9), ]

# manually add cumulative cases to get expected data
full_data_with_cum_cases_filled <- covidregionaldata:::fill_empty_dates_with_na(partial_data)
full_data_with_cum_cases_filled <- dplyr::arrange(full_data_with_cum_cases_filled, level_1_region, date)
full_data_with_cum_cases_filled <- cbind(full_data_with_cum_cases_filled, as.integer(c(1, 5, 5, 15, 2, 7, 7, 18, 3, 3, 3, 15)))
full_data_with_cum_cases_filled <- partial_data %>%
covidregionaldata:::fill_empty_dates_with_na()
full_data_with_cum_cases_filled <-
dplyr::arrange(full_data_with_cum_cases_filled, level_1_region, date)
full_data_with_cum_cases_filled <-
cbind(
full_data_with_cum_cases_filled,
as.integer(c(1, 5, 5, 15, 2, 7, 7, 18, 3, 3, 3, 15))
)
colnames(full_data_with_cum_cases_filled)[5] <- "cases_total"

return(dplyr::tibble(full_data_with_cum_cases_filled))
return(full_data_with_cum_cases_filled)
}
17 changes: 13 additions & 4 deletions tests/testthat/test-processing.R
Original file line number Diff line number Diff line change
Expand Up @@ -80,19 +80,28 @@ test_that("fill_empty_dates_with_na fills empty dates with NA", {
expected_data <- get_expected_data_for_fill_empty_dates_with_na_test()
# partial data deletes some rows (i.e. gets rid of some dates - all the ones
# with NA in cases)
expected_data <- expected_data

partial_data <- expected_data[-c(6:9), ]
expect_equal(fill_empty_dates_with_na(partial_data), expected_data)
expect_equal(
fill_empty_dates_with_na(partial_data),
expected_data
)
expected_data <- dplyr::mutate(
expected_data,
level_2_region = level_1_region,
level_2_region_code = level_1_region_code
) %>%
group_by(level_2_region, level_2_region_code) %>%
select(
date, level_2_region, level_2_region_code,
level_1_region, level_1_region_code, cases
date, level_1_region, level_1_region_code,
level_2_region, level_2_region_code, cases
)
partial_data <- expected_data[-c(6:9), ]
expect_equal(fill_empty_dates_with_na(partial_data), expected_data)
expect_equal(
fill_empty_dates_with_na(partial_data),
expected_data
)
})

test_that("complete_cumulative_columns works", {
Expand Down
75 changes: 43 additions & 32 deletions tests/testthat/test-regional-datasets.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,40 +19,51 @@ if (identical(Sys.getenv("NOT_CRAN"), "true")) {
download <- getOption("testDownload")
}

# get datasets for testing
sources <- get_available_datasets() %>%
dplyr::filter(.data$type %in%
c("national", "regional")) %>%
dplyr::select(
source = class,
level_1_region, level_2_region, level_3_region
) %>%
tidyr::pivot_longer(
cols = -source,
names_to = "level",
values_to = "regions"
) %>%
dplyr::mutate(
level = stringr::str_split(level, "_"),
level = purrr::map_chr(level, ~ .[2])
) %>%
tidyr::drop_na(regions)

# filter out target datasets
if (!is.null(source_of_interest)) {
sources <- sources %>%
dplyr::filter(source %in% source_of_interest)
test_regions <- TRUE
}else{
test_regions <- FALSE
}
if (!is.null(getOption("testRegions"))) {
test_regions <- getOption("testRegions")
}

# apply tests to each data source in turn
sources %>%
dplyr::rowwise() %>%
dplyr::group_split() %>%
purrr::walk(
~ test_regional_dataset(
source = .$source[[1]],
level = .$level[[1]],
download = download
if (test_regions) {
# get datasets for testing
sources <- get_available_datasets() %>%
dplyr::filter(.data$type %in%
c("national", "regional")) %>%
dplyr::select(
source = class,
level_1_region, level_2_region, level_3_region
) %>%
tidyr::pivot_longer(
cols = -source,
names_to = "level",
values_to = "regions"
) %>%
dplyr::mutate(
level = stringr::str_split(level, "_"),
level = purrr::map_chr(level, ~ .[2])
) %>%
tidyr::drop_na(regions)

# filter out target datasets
if (!is.null(source_of_interest)) {
sources <- sources %>%
dplyr::filter(source %in% source_of_interest)
}

# apply tests to each data source in turn
sources %>%
dplyr::rowwise() %>%
dplyr::group_split() %>%
purrr::walk(
~ test_regional_dataset(
source = .$source[[1]],
level = .$level[[1]],
download = download
)
)
)
}
}