diff --git a/DESCRIPTION b/DESCRIPTION index fcb8852b..eb118233 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: covidregionaldata Title: Subnational Data for COVID-19 Epidemiology -Version: 0.9.2.3000 +Version: 0.9.2.4000 Authors@R: c(person(given = "Joseph", family = "Palmer", @@ -98,6 +98,7 @@ Imports: R6, readxl, rlang, + stringi, stringr, tidyr (>= 1.0.0), vroom, diff --git a/NAMESPACE b/NAMESPACE index 35d1f272..e1363830 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -61,6 +61,7 @@ importFrom(dplyr,filter) importFrom(dplyr,full_join) importFrom(dplyr,group_by) importFrom(dplyr,group_by_at) +importFrom(dplyr,group_vars) importFrom(dplyr,if_else) importFrom(dplyr,lag) importFrom(dplyr,last_col) @@ -116,12 +117,12 @@ importFrom(rlang,"!!") importFrom(rlang,":=") importFrom(rlang,.data) importFrom(rlang,syms) -importFrom(stringr,str_conv) +importFrom(stringi,stri_trans_general) +importFrom(stringi,stri_trim_both) importFrom(stringr,str_detect) importFrom(stringr,str_replace_all) importFrom(stringr,str_to_sentence) importFrom(stringr,str_to_title) -importFrom(stringr,str_trim) importFrom(tidyr,complete) importFrom(tidyr,drop_na) importFrom(tidyr,fill) diff --git a/NEWS.md b/NEWS.md index 16faffd2..8062c9c0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -15,6 +15,7 @@ This release is currently under development ## Bug fixes +- Fixed a bug in `fill_empty_dates_with_na()` caused by changes made in version `1.2.0` of `tidyr`. - Fixed a bug in the data sourced from Germany so that instead of treating it as a line list of individuals it is treated as a relatively finely resolved count data which needs to be summed up (by @sbfnk). # covidregionaldata 0.9.2 diff --git a/R/Vietnam.R b/R/Vietnam.R index 44062df8..258ced9c 100644 --- a/R/Vietnam.R +++ b/R/Vietnam.R @@ -64,7 +64,8 @@ Vietnam <- R6::R6Class("Vietnam", #' @importFrom dplyr filter select mutate rename tibble as_tibble full_join #' @importFrom tidyr replace_na drop_na separate #' @importFrom purrr map - #' @importFrom stringr str_conv str_trim str_to_title str_replace_all + #' @importFrom stringr str_to_title str_replace_all + #' @importFrom stringi stri_trans_general stri_trim_both #' @importFrom lubridate dmy clean_common = function() { # The first three elements of self$data$raw are the data @@ -77,21 +78,22 @@ Vietnam <- R6::R6Class("Vietnam", function(x) as_tibble(unlist(x), rownames = "date")), function(y) { - y %>% separate(date, sep = "[.]+", into = c(NA, "province", "date")) + separate(y, date, sep = "[.]+", into = c(NA, "province", "date")) } ) - self$data$clean <- full_join( - full_join( - flat_all$case_by_time, flat_all$death_by_time, - by = c("province", "date"), - suffix = c(".cases", ".deaths"), - copy = TRUE - ), - flat_all$recovered_by_time, - by = c("province", "date"), - suffix = c("", ".recovered"), - copy = TRUE - ) %>% + index_cols <- bind_rows( + select(flat_all$case_by_time, "date", "province"), + select(flat_all$death_by_time, "date", "province"), + select(flat_all$recovered_by_time, "date", "province")) %>% + unique() + + self$data$clean <- index_cols %>% + left_join(rename(flat_all$case_by_time, cases_total = value), + by = c("province", "date") ) %>% + left_join(rename(flat_all$death_by_time, deaths_total = value), + by = c("province", "date") ) %>% + left_join(rename(flat_all$recovered_by_time, recovered_total = value), + by = c("province", "date") ) %>% # The api uses integer codes for provinces which do not # line up with ISO 3166-2 (some of which are not numbers) # so we use this as a temporary code to line names up @@ -99,9 +101,10 @@ Vietnam <- R6::R6Class("Vietnam", select( ncsc_region_code = province, date, - cases_total = value.cases, - deaths_total = value.deaths, - recovered_total = value) %>% + cases_total, + deaths_total, + recovered_total + ) %>% mutate(ncsc_region_code = as.numeric(ncsc_region_code)) %>% left_join( self$data$raw$provinces %>% @@ -119,8 +122,8 @@ Vietnam <- R6::R6Class("Vietnam", # #tidyr::drop_na(date, region_name) %>% mutate( - level_1_region = str_conv(level_1_region, "ASCII"), - level_1_region = str_trim(level_1_region, side = "both"), + level_1_region = stri_trans_general(level_1_region, "ASCII"), + level_1_region = stri_trim_both(level_1_region), level_1_region = str_replace_all(level_1_region, "\\(.*\\)|-| ", ""), level_1_region = str_to_title(level_1_region), diff --git a/R/processing.R b/R/processing.R index 75b63ce1..81c5ec4f 100644 --- a/R/processing.R +++ b/R/processing.R @@ -51,16 +51,21 @@ set_negative_values_to_zero <- function(data) { #' @family compulsory_processing #' @concept compulsory_processing #' @importFrom tidyr complete full_seq nesting -#' @importFrom dplyr starts_with +#' @importFrom dplyr starts_with group_by ungroup group_vars #' @importFrom rlang !!! syms fill_empty_dates_with_na <- function(data) { regions <- select(data, starts_with("level_")) %>% names() + + groups <- group_vars(data) + data <- data %>% + ungroup() %>% complete( date = full_seq(data$date, period = 1), nesting(!!!syms(regions)) - ) + ) %>% + group_by(across(.cols = all_of(groups))) return(data) } @@ -258,9 +263,9 @@ process_internal <- function(clean_data, level, group_vars, "hosp_new", "hosp_total", "tested_new", "tested_total" )), everything() - ) %>% - arrange(.data$date, all_of(group_vars_standard[1])) + ) } + dat <- ungroup(dat) if (localise) { diff --git a/R/shared-methods.R b/R/shared-methods.R index 62a88ba8..0433831e 100644 --- a/R/shared-methods.R +++ b/R/shared-methods.R @@ -640,7 +640,8 @@ CountryDataClass <- R6::R6Class("CountryDataClass", if (!is.null(self$target_regions)) { self$target_regions <- countryname( self$target_regions, - destination = "country.name.en" + destination = "country.name.en", + warn = FALSE ) if (all(is.na(self$target_regions))) { stop("No countries found with target names") diff --git a/_pkgdown.yml b/_pkgdown.yml index da51ab70..9076ed31 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,8 +1,9 @@ url: https://epiforecasts.io/covidregionaldata/ template: bootstrap: 5 + package: preferably params: - bootswatch: lumen + toggle: manual docsearch: api_key: 721a43acf6af66699c04bd8b2af75ff1 index_name: epiforecasts diff --git a/tests/testthat/custom_data/ecdc.rds b/tests/testthat/custom_data/ecdc.rds index 9e6c2294..6edff880 100644 Binary files a/tests/testthat/custom_data/ecdc.rds and b/tests/testthat/custom_data/ecdc.rds differ diff --git a/tests/testthat/custom_data/mexico_level_1_snap.rds b/tests/testthat/custom_data/mexico_level_1_snap.rds index cc89f66d..71abefd0 100644 Binary files a/tests/testthat/custom_data/mexico_level_1_snap.rds and b/tests/testthat/custom_data/mexico_level_1_snap.rds differ diff --git a/tests/testthat/custom_data/mexico_level_2_snap.rds b/tests/testthat/custom_data/mexico_level_2_snap.rds index b6dfc25b..8b021b02 100644 Binary files a/tests/testthat/custom_data/mexico_level_2_snap.rds and b/tests/testthat/custom_data/mexico_level_2_snap.rds differ diff --git a/tests/testthat/custom_data/who.rds b/tests/testthat/custom_data/who.rds index 14b102ee..6f004077 100644 Binary files a/tests/testthat/custom_data/who.rds and b/tests/testthat/custom_data/who.rds differ diff --git a/tests/testthat/custom_tests/mock_data.R b/tests/testthat/custom_tests/mock_data.R index 4b09b694..006cffdf 100644 --- a/tests/testthat/custom_tests/mock_data.R +++ b/tests/testthat/custom_tests/mock_data.R @@ -207,12 +207,16 @@ get_expected_data_for_fill_empty_dates_with_na_test <- function() { # full data is data with all dates/regions + some NAs in the cases column expected_data <- data.frame(expand.grid(dates, regions)) colnames(expected_data) <- c("date", "level_1_region") + expected_data$date <- as.Date(expected_data$date) expected_data$level_1_region <- as.character(expected_data$level_1_region) expected_data <- expected_data %>% dplyr::arrange(date, level_1_region) %>% dplyr::left_join(region_codes, by = c("level_1_region" = "region")) expected_data$cases <- c(1:5, rep(NA, 4), 10:12) + expected_data <- dplyr::select( + expected_data, date, level_1_region, level_1_region_code, everything() + ) return(dplyr::tibble(expected_data)) } @@ -234,10 +238,16 @@ get_expected_data_for_complete_cumulative_columns_test <- function() { partial_data <- expected_data[-c(6:9), ] # manually add cumulative cases to get expected data - full_data_with_cum_cases_filled <- covidregionaldata:::fill_empty_dates_with_na(partial_data) - full_data_with_cum_cases_filled <- dplyr::arrange(full_data_with_cum_cases_filled, level_1_region, date) - full_data_with_cum_cases_filled <- cbind(full_data_with_cum_cases_filled, as.integer(c(1, 5, 5, 15, 2, 7, 7, 18, 3, 3, 3, 15))) + full_data_with_cum_cases_filled <- partial_data %>% + covidregionaldata:::fill_empty_dates_with_na() + full_data_with_cum_cases_filled <- + dplyr::arrange(full_data_with_cum_cases_filled, level_1_region, date) + full_data_with_cum_cases_filled <- + cbind( + full_data_with_cum_cases_filled, + as.integer(c(1, 5, 5, 15, 2, 7, 7, 18, 3, 3, 3, 15)) + ) colnames(full_data_with_cum_cases_filled)[5] <- "cases_total" - return(dplyr::tibble(full_data_with_cum_cases_filled)) + return(full_data_with_cum_cases_filled) } diff --git a/tests/testthat/test-processing.R b/tests/testthat/test-processing.R index 92f6410a..e13ce6ad 100644 --- a/tests/testthat/test-processing.R +++ b/tests/testthat/test-processing.R @@ -80,19 +80,28 @@ test_that("fill_empty_dates_with_na fills empty dates with NA", { expected_data <- get_expected_data_for_fill_empty_dates_with_na_test() # partial data deletes some rows (i.e. gets rid of some dates - all the ones # with NA in cases) + expected_data <- expected_data + partial_data <- expected_data[-c(6:9), ] - expect_equal(fill_empty_dates_with_na(partial_data), expected_data) + expect_equal( + fill_empty_dates_with_na(partial_data), + expected_data + ) expected_data <- dplyr::mutate( expected_data, level_2_region = level_1_region, level_2_region_code = level_1_region_code ) %>% + group_by(level_2_region, level_2_region_code) %>% select( - date, level_2_region, level_2_region_code, - level_1_region, level_1_region_code, cases + date, level_1_region, level_1_region_code, + level_2_region, level_2_region_code, cases ) partial_data <- expected_data[-c(6:9), ] - expect_equal(fill_empty_dates_with_na(partial_data), expected_data) + expect_equal( + fill_empty_dates_with_na(partial_data), + expected_data + ) }) test_that("complete_cumulative_columns works", { diff --git a/tests/testthat/test-regional-datasets.R b/tests/testthat/test-regional-datasets.R index 4a6061ca..f3f865ec 100644 --- a/tests/testthat/test-regional-datasets.R +++ b/tests/testthat/test-regional-datasets.R @@ -19,40 +19,51 @@ if (identical(Sys.getenv("NOT_CRAN"), "true")) { download <- getOption("testDownload") } - # get datasets for testing - sources <- get_available_datasets() %>% - dplyr::filter(.data$type %in% - c("national", "regional")) %>% - dplyr::select( - source = class, - level_1_region, level_2_region, level_3_region - ) %>% - tidyr::pivot_longer( - cols = -source, - names_to = "level", - values_to = "regions" - ) %>% - dplyr::mutate( - level = stringr::str_split(level, "_"), - level = purrr::map_chr(level, ~ .[2]) - ) %>% - tidyr::drop_na(regions) - - # filter out target datasets if (!is.null(source_of_interest)) { - sources <- sources %>% - dplyr::filter(source %in% source_of_interest) + test_regions <- TRUE + }else{ + test_regions <- FALSE + } + if (!is.null(getOption("testRegions"))) { + test_regions <- getOption("testRegions") } - # apply tests to each data source in turn - sources %>% - dplyr::rowwise() %>% - dplyr::group_split() %>% - purrr::walk( - ~ test_regional_dataset( - source = .$source[[1]], - level = .$level[[1]], - download = download + if (test_regions) { + # get datasets for testing + sources <- get_available_datasets() %>% + dplyr::filter(.data$type %in% + c("national", "regional")) %>% + dplyr::select( + source = class, + level_1_region, level_2_region, level_3_region + ) %>% + tidyr::pivot_longer( + cols = -source, + names_to = "level", + values_to = "regions" + ) %>% + dplyr::mutate( + level = stringr::str_split(level, "_"), + level = purrr::map_chr(level, ~ .[2]) + ) %>% + tidyr::drop_na(regions) + + # filter out target datasets + if (!is.null(source_of_interest)) { + sources <- sources %>% + dplyr::filter(source %in% source_of_interest) + } + + # apply tests to each data source in turn + sources %>% + dplyr::rowwise() %>% + dplyr::group_split() %>% + purrr::walk( + ~ test_regional_dataset( + source = .$source[[1]], + level = .$level[[1]], + download = download + ) ) - ) + } }