JSON-reading implementation of Vietnam

Uses download_json method for downloading. Adjustment to the DataClass tests to deal with tables of lists. Addition of stringi to imported packages, and of fromJSON and jsonlinte to the WORDLIST. Tests run and data stored.
RichardMN · Sep 19, 2021 · 2c20067 · 2c20067
1 parent f0cec88
commit 2c20067
Show file tree

Hide file tree

Showing 7 changed files with 64 additions and 73 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -95,6 +95,7 @@ Imports:
     R6,
     readxl,
     rlang,
+    stringi,
     stringr,
     tibble,
     tidyr (>= 1.0.0),

diff --git a/R/Vietnam.R b/R/Vietnam.R
@@ -28,82 +28,35 @@ Vietnam <- R6::R6Class("Vietnam",
     supported_region_codes = list("1" = "iso_3166_2"),
     #' @field common_data_urls List of named links to raw data.
     common_data_urls = list(
+      # nolint start
       "case_by_time" = "https://covid.ncsc.gov.vn/api/v3/covid/provinces?filter_type=case_by_time",
       "death_by_time" = "https://covid.ncsc.gov.vn/api/v3/covid/provinces?filter_type=death_by_time",
       "recovered_by_time" = "https://covid.ncsc.gov.vn/api/v3/covid/provinces?filter_type=recovered_by_time",
       "provinces" = "https://covid.ncsc.gov.vn/api/v3/covid/provinces"
+      # nolint end
     ),
     #' @field source_data_cols existing columns within the raw data
     source_data_cols = c(
       "cases_total", "deaths_total", "recovered_total"
     ),
     #' @field source_text Plain text description of the source of the data
-    source_text = "Public COVID-19 for Vietnam, curated by NCSC's COVID-19 team",
+    source_text =
+      "Public COVID-19 for Vietnam, curated by NCSC's COVID-19 team",
     #' @field source_url Website address for explanation/introduction of the
     #' data
-    source_url = "https://covid.ncsc.gov.vn", # nolint
+    source_url = "https://covid.ncsc.gov.vn",
 
     #' @description Set up a table of region codes for clean data
     #' @importFrom tibble tibble
     set_region_codes = function() {
       self$codes_lookup$`1` <- covidregionaldata::vietnam_codes
     },
 
-    #' @description download function to get raw data
-    #' @importFrom tidyr replace_na drop_na
-    #' @importFrom lubridate dmy
-    #' @importFrom jsonlite fromJSON
-    download = function(){
+    #' @description Download function to get raw data. Uses the
+    #' parent class JSON-specific method for downloads.
+    download = function() {
       super$download_JSON()
     },
-    #   function() {
-    #   bundles_urls <- list(
-    #     "case_by_time" = "https://covid.ncsc.gov.vn/api/v3/covid/provinces?filter_type=case_by_time",
-    #     "death_by_time" = "https://covid.ncsc.gov.vn/api/v3/covid/provinces?filter_type=death_by_time",
-    #     "recovered_by_time" = "https://covid.ncsc.gov.vn/api/v3/covid/provinces?filter_type=recovered_by_time"
-    #   )
-    #   Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 4) # Fix VROOM error
-    #   provines_url <- "https://covid.ncsc.gov.vn/api/v3/covid/provinces"
-    #   bundles <- names(bundles_urls)
-    #   provines_data <- fromJSON(provines_url)
-    #
-    #   get_bundles_data <- function(bundles) {
-    #     bundles_data <- list()
-    #     for (bundle in bundles) {
-    #       url <- paste0("https://covid.ncsc.gov.vn/api/v3/covid/provinces?filter_type=", bundle)
-    #       data <- fromJSON(url)
-    #       bundles_data <- c(bundles_data, setNames(list(data), bundle))
-    #     }
-    #     bundles_data
-    #   }
-    #
-    #   bundles_data <- get_bundles_data(bundles)
-    #
-    #   get_province <- function(id, data) {
-    #     row_dat <- provines_data[(id <- id), ]
-    #     death_by_time <- do.call(cbind, data$death_by_time[id])
-    #     case_by_time <- do.call(cbind, data$case_by_time[id])
-    #     recovered_by_time <- do.call(cbind, data$recovered_by_time[id])
-    #     if (!identical(row.names(death_by_time), row.names(death_by_time))) {
-    #       stop("Dates on case_by_time and death_by_time do not match!")
-    #     }
-    #     df <- tibble(
-    #       date = dmy(row.names(case_by_time)),
-    #       id = row_dat$id,
-    #       name = row_dat$name,
-    #       case_by_time = case_by_time,
-    #       death_by_time = death_by_time,
-    #       recovered_by_time = recovered_by_time
-    #     )
-    #     df
-    #   }
-    #
-    #   df <- do.call(rbind, lapply(provines_data$id, function(id) {
-    #     get_province(id, bundles_data)
-    #   }))
-    #   names(df) <- c("date", "id", "region_name", "cases_total", "deaths_total", "recovered_total")
-    #   self$data$raw[["main"]] <- df
-    # },
 
     #' @description Provincial Level Data
     #' cleaning
@@ -116,31 +69,64 @@ Vietnam <- R6::R6Class("Vietnam",
     #' @importFrom stringr str_to_title str_replace_all
     #' @importFrom lubridate dmy
     clean_common = function() {
+      # The first three elements of self$data$raw are the data
+      # tables downloaded and so these can be processed identically
+      #
       data_inputs <- self$data$raw[1:3]
-      flat_all <- map(map(vn_data_inputs, function(x) as_tibble(unlist(x), rownames="date")),
-                         function(x) {x %>% separate(date, sep="[.]+", into=c(NA, "province", "date"))})
+      flat_all <- map(
+        map(
+          data_inputs,
+          function(x) as_tibble(unlist(x),
+                                rownames = "date")),
+        function(y) {
+          y %>% separate(date, sep = "[.]+", into = c(NA, "province", "date"))
+        }
+      )
       self$data$clean <- full_join(
         full_join(
-          vn_flat_all$case_by_time, vn_flat_all$death_by_time, by=c("province","date"),suffix=c(".cases", ".deaths"),copy=TRUE),
-        vn_flat_all$recovered_by_time, by=c("province","date"), suffix=c("",".recovered"), copy=TRUE) %>%
-        select(level_1_region_code=province, date, cases_total = value.cases, deaths_total=value.deaths, recovered_total = value) %>%
-        mutate(level_1_region_code = as.numeric(level_1_region_code)) %>%
-        left_join(self$data$raw$provinces%>%select(level_1_region_code=id,level_1_region=name), by=c("level_1_region_code")) %>%
-        #select(date, region_name, cases_total, deaths_total, recovered_total) %>%
+          flat_all$case_by_time, flat_all$death_by_time,
+          by = c("province", "date"),
+          suffix = c(".cases", ".deaths"),
+          copy = TRUE
+        ),
+        flat_all$recovered_by_time,
+        by = c("province", "date"),
+        suffix = c("", ".recovered"),
+        copy = TRUE
+      ) %>%
+        # The api uses integer codes for provinces which do not
+        # line up with ISO 3166-2 (some of which are not numbers)
+        # so we use this as a temporary code to line names up
+        # with data.
+        select(
+          ncsc_region_code = province,
+          date,
+          cases_total = value.cases,
+          deaths_total = value.deaths,
+          recovered_total = value) %>%
+        mutate(ncsc_region_code = as.numeric(ncsc_region_code)) %>%
+        left_join(
+          self$data$raw$provinces %>%
+            select(ncsc_region_code = id, level_1_region = name),
+          by = c("ncsc_region_code")) %>%
+        select( -ncsc_region_code ) %>%
         mutate(
+          date = dmy(date),
           cases_total = as.numeric(cases_total),
           deaths_total = as.numeric(deaths_total),
           recovered_total = as.numeric(recovered_total),
-          region_name = stringr::str_replace_all(region_name, "TP HCM", "Hochiminh"),
+          level_1_region = str_replace_all(level_1_region,
+                                        "TP HCM", "Hochiminh"),
         ) %>%
-        tidyr::drop_na(date, region_name) %>%
-        rename(level_1_region = region_name) %>%
+        # 
+        #tidyr::drop_na(date, region_name) %>%
         mutate(
-          level_1_region = stringi::stri_trans_general(level_1_region, "latin-ascii"),
-          level_1_region = stringi::stri_trim_both(level_1_region),
-          level_1_region = stringr::str_replace_all(level_1_region, "\\(.*\\)|-| ", ""),
-          level_1_region = stringr::str_to_title(level_1_region),
-          level_1_region = tidyr::replace_na(level_1_region, "Unknown")
+          level_1_region = stri_trans_general(level_1_region, "latin-ascii"),
+          level_1_region = stri_trim_both(level_1_region),
+          level_1_region = str_replace_all(level_1_region,
+                                           "\\(.*\\)|-| ", ""),
+          level_1_region = str_to_title(level_1_region),
+          level_1_region = replace_na(level_1_region, "Unknown")
         ) %>%
         left_join(
           self$codes_lookup$`1`,

diff --git a/R/test-DataClass.R b/R/test-DataClass.R
@@ -102,7 +102,8 @@ test_download <- function(DataClass_obj, download, snapshot_path) {
         walk(DataClass_obj$data$raw, function(data) {
           testthat::expect_s3_class(data, "data.frame")
           testthat::expect_true(nrow(data) > 0)
-          testthat::expect_true(ncol(data) >= 2)
+          testthat::expect_true(ncol(data) >= 2
+                                || typeof(data[[1]])=="list")
         })
       }
     )

diff --git a/data/all_country_data.rda b/data/all_country_data.rda
diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -32,6 +32,7 @@ Estados
 etc
 filepath
 fns
+fromJSON
 geocode
 geocoding
 github
@@ -50,6 +51,7 @@ iso
 JHU
 jrc
 JRC
+jsonlite
 jure
 Kreise
 Landkreis

diff --git a/man/Vietnam.Rd b/man/Vietnam.Rd
diff --git a/tests/testthat/custom_data/Vietnam_level_1.rds b/tests/testthat/custom_data/Vietnam_level_1.rds
-Original file line number
+Diff line change
@@ Expand Up / @@ -95,6 +95,7 @@ Imports: @@
         R6,
         readxl,
         rlang,
+        stringi,
         stringr,
         tibble,
         tidyr (>= 1.0.0),
@@ Expand Down @@