Merge pull request epiforecasts#413 from biocyberman/vietnam

Add subnational data for Vietnam
RichardMN · Sep 27, 2021 · 73784b7 · 73784b7
2 parents c59154d + d1232f1
commit 73784b7
Show file tree

Hide file tree

Showing 49 changed files with 647 additions and 49 deletions.
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
@@ -38,7 +38,8 @@ RUN install2.r --error --skipinstalled --repos ${CRAN} --ncpus -1 \
 RUN apt-get update \
      && export DEBIAN_FRONTEND=noninteractive \
      && apt-get -y install --no-install-recommends libgdal-dev \
-            libudunits2-dev  libharfbuzz-dev libfribidi-dev
+            libudunits2-dev  libharfbuzz-dev libfribidi-dev \
+            libjq-dev libprotobuf-dev
 
 # install dependencies
 COPY DESCRIPTION /tmp/package/DESCRIPTION

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -23,7 +23,9 @@
 	"extensions": [
 		"ikuyadeu.r",
 		"reditorsupport.r-lsp",
-		"shan.code-settings-sync"
+		"shan.code-settings-sync",
+		"searking.preview-vscode",
+		"tht13.html-preview-vscode"
 	],
 
 	// Use 'forwardPorts' to make a list of ports inside the container available locally.

diff --git a/.github/workflows/Vietnam.yaml b/.github/workflows/Vietnam.yaml
@@ -0,0 +1,48 @@
+# Workflow that runs the regional dataset tests against the Vietnam source,
+# either on a schedule or when started manually.
+on:
+  schedule:
+    # Every day at 12:36 (GitHub Actions evaluates cron schedules in UTC)
+    - cron: '36 12 * * *'
+  # Allows the workflow to be triggered by hand
+  workflow_dispatch:
+
+name: Vietnam
+
+jobs:
+  Vietnam:
+    runs-on: macOS-latest
+    env:
+      # Token exposed to the R session as GITHUB_PAT
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: r-lib/actions/setup-r@v1
+
+      # Write the dependency list and R version to files whose hashes are
+      # used to build the cache key in the next step
+      - name: Query dependencies
+        run: |
+          install.packages('remotes')
+          saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
+          writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
+        shell: Rscript {0}
+
+      # Cache the R library, keyed on OS, R version and dependency list
+      - name: Cache R packages
+        uses: actions/cache@v2
+        with:
+          path: ${{ env.R_LIBS_USER }}
+          key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
+          restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-
+
+      - name: Install dependencies
+        run: |
+          install.packages(c("remotes"))
+          remotes::install_deps(dependencies = TRUE)
+          install.packages("devtools")
+        shell: Rscript {0}
+
+      - name: Install package
+        run: R CMD INSTALL .
+
+      # Sets the testDownload and testSource options, then runs only the
+      # regional dataset test file
+      - name: Test dataset
+        run: |
+          options("testDownload" = TRUE)
+          options("testSource" = "Vietnam")
+          devtools::load_all()
+          testthat::test_file("tests/testthat/test-regional-datasets.R", reporter = c("summary", "fail"))
+        shell: Rscript {0}
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: covidregionaldata
 Title: Subnational Data for COVID-19 Epidemiology
-Version: 0.9.2.2000
+Version: 0.9.2.3000
 Authors@R:
     c(person(given = "Joseph",
              family = "Palmer",
@@ -55,6 +55,10 @@ Authors@R:
              family = "Gruson",
              role = "ctb",
              comment = c(ORCID = "0000-0002-4094-1476")),
+      person(given = "Vang",
+             family = "Le",
+             role = "ctb",
+             comment = c(URL = "https://github.com/biocyberman")),
       person(given = "Sebastian",
              family = "Funk",
              role = "aut",
@@ -86,6 +90,7 @@ Imports:
     countrycode (>= 1.2.0),
     dplyr,
     httr,
+    jsonlite,
     lifecycle,
     lubridate,
     magrittr,
@@ -94,6 +99,7 @@ Imports:
     R6,
     readxl,
     rlang,
+    stringi,
     stringr,
     tibble,
     tidyr (>= 1.0.0),

diff --git a/NAMESPACE b/NAMESPACE
@@ -25,6 +25,7 @@ export(SouthAfrica)
 export(Switzerland)
 export(UK)
 export(USA)
+export(Vietnam)
 export(WHO)
 export(expect_clean_cols)
 export(expect_columns_contain_data)
@@ -48,6 +49,7 @@ importFrom(countrycode,countryname)
 importFrom(dplyr,"%>%")
 importFrom(dplyr,across)
 importFrom(dplyr,arrange)
+importFrom(dplyr,as_tibble)
 importFrom(dplyr,bind_rows)
 importFrom(dplyr,count)
 importFrom(dplyr,distinct)
@@ -74,13 +76,15 @@ importFrom(dplyr,slice_tail)
 importFrom(dplyr,starts_with)
 importFrom(dplyr,summarise)
 importFrom(dplyr,tally)
+importFrom(dplyr,tibble)
 importFrom(dplyr,transmute)
 importFrom(dplyr,ungroup)
 importFrom(dplyr,vars)
 importFrom(httr,GET)
 importFrom(httr,POST)
 importFrom(httr,content)
 importFrom(httr,status_code)
+importFrom(jsonlite,fromJSON)
 importFrom(lifecycle,deprecate_warn)
 importFrom(lifecycle,deprecated)
 importFrom(lifecycle,is_present)
@@ -109,6 +113,9 @@ importFrom(rlang,"!!")
 importFrom(rlang,":=")
 importFrom(rlang,.data)
 importFrom(rlang,syms)
+importFrom(stringi,stri_replace_all)
+importFrom(stringi,stri_trans_general)
+importFrom(stringi,stri_trim_both)
 importFrom(stringr,str_detect)
 importFrom(stringr,str_replace_all)
 importFrom(stringr,str_to_sentence)
@@ -125,6 +132,7 @@ importFrom(tidyr,nesting)
 importFrom(tidyr,pivot_longer)
 importFrom(tidyr,pivot_wider)
 importFrom(tidyr,replace_na)
+importFrom(tidyr,separate)
 importFrom(tidyselect,all_of)
 importFrom(tidyselect,ends_with)
 importFrom(tidyselect,starts_with)

diff --git a/NEWS.md b/NEWS.md
@@ -4,11 +4,12 @@ This release is currrently under development
 
 ## New data sets
 
-- Support for level 1 region data in Estonia (thanks to @RichardMN). See `?Estonia` for details. 
+- Support for level 1 region data in Estonia (thanks to @RichardMN). See `?Estonia` for details.
+- Support for level 1 region data in Vietnam (thanks to @biocyberman). See `?Vietnam` for details.
 
 # covidregionaldata 0.9.2
 
-This release adds support for the Covid19 Data Hub which includes Google and Apple mobility data amongst a large range of other data sets, data from the European Commission's Joint Research Centre which is at both the regional and national level, and individual sources for regional data from several countries. Package updates have been made in line with a software review at the [Journal of Open Source Software](https://github.com/openjournals/joss-reviews/issues/3290). Finally, this release exposes more of the testing infrastructure to users and adds a package hexsticker. 
+This release adds support for the Covid19 Data Hub which includes Google and Apple mobility data amongst a large range of other data sets, data from the European Commission's Joint Research Centre which is at both the regional and national level, and individual sources for regional data from several countries. Package updates have been made in line with a software review at the [Journal of Open Source Software](https://github.com/openjournals/joss-reviews/issues/3290). Finally, this release exposes more of the testing infrastructure to users and adds a package hexsticker.
 
 Thanks to @joseph-palmer, @RichardMN, and @kathsherratt for contributions towards this release.
 

diff --git a/R/Vietnam.R b/R/Vietnam.R
@@ -0,0 +1,137 @@
+#' Vietnam Class for downloading, cleaning and processing
+#' notification data
+#'
+#' @description Information for downloading, cleaning
+#'  and processing covid-19 region data for Vietnam.
+#'
+#' @source \url{https://covid.ncsc.gov.vn}
+#' @export
+#' @concept dataset
+#' @family subnational
+#' @examples
+#' \dontrun{
+#' region <- Vietnam$new(verbose = TRUE, steps = TRUE, get = TRUE)
+#' region$return()
+#' }
+Vietnam <- R6::R6Class("Vietnam",
+  inherit = DataClass,
+  public = list(
+
+    # Core Attributes (amend each parameter for country specific information)
+    #' @field origin name of country to fetch data for
+    origin = "Vietnam",
+    #' @field supported_levels List of supported levels.
+    supported_levels = list("1"),
+    #' @field supported_region_names List of region names in order of level.
+    supported_region_names = list("1" = "region"),
+    #' @field supported_region_codes List of region codes in order of level.
+    supported_region_codes = list("1" = "iso_3166_2"),
+    #' @field common_data_urls List of named links to raw data.
+    common_data_urls = list(
+      # nolint start
+      "case_by_time" = "https://covid.ncsc.gov.vn/api/v3/covid/provinces?filter_type=case_by_time",
+      "death_by_time" = "https://covid.ncsc.gov.vn/api/v3/covid/provinces?filter_type=death_by_time",
+      "recovered_by_time" = "https://covid.ncsc.gov.vn/api/v3/covid/provinces?filter_type=recovered_by_time",
+      "provinces" = "https://covid.ncsc.gov.vn/api/v3/covid/provinces"
+      # nolint end
+    ),
+    #' @field source_data_cols existing columns within the raw data
+    source_data_cols = c(
+      "cases_total", "deaths_total", "recovered_total"
+    ),
+    #' @field source_text Plain text description of the source of the data
+    source_text =
+      "Public COVID-19 for Vietnam, curated by NCSC's COVID-19 team",
+    #' @field source_url Website address for explanation/introduction of the
+    #' data
+    source_url = "https://covid.ncsc.gov.vn",
+
+    #' @description Set up a table of region codes for clean data
+    #' @importFrom tibble tibble
+    set_region_codes = function() {
+      self$codes_lookup$`1` <- covidregionaldata::vietnam_codes
+    },
+
+    #' @description Download function to get raw data. Uses the
+    #' parent class JSON-specific method for downloads.
+    download = function() {
+      super$download_JSON()
+    },
+
+    #' @description Provincial Level Data
+    #' cleaning. Flattens the three downloaded time series (cases, deaths,
+    #' recovered), joins them on province and date, attaches province names
+    #' from the `provinces` table, normalises those names and finally joins
+    #' the region codes lookup. The result is stored in `self$data$clean`.
+    #' @param ... pass additional arguments
+    #'
+    #' @importFrom dplyr filter select mutate rename tibble as_tibble
+    #' @importFrom dplyr full_join left_join
+    #' @importFrom tidyr replace_na drop_na separate
+    #' @importFrom purrr map
+    #' @importFrom stringi stri_trans_general stri_trim_both stri_replace_all
+    #' @importFrom stringr str_to_title str_replace_all
+    #' @importFrom lubridate dmy
+    clean_common = function() {
+      # Pick the three time series by name rather than by position so that
+      # reordering or extending the data urls cannot silently feed a
+      # different table (e.g. `provinces`) into the flattening step.
+      series_names <- c("case_by_time", "death_by_time", "recovered_by_time")
+      flat_all <- map(
+        self$data$raw[series_names],
+        function(x) {
+          # unlist() yields a named vector; its names are split on "." with
+          # the first piece discarded and the next two read as the province
+          # id and the date.
+          as_tibble(unlist(x), rownames = "date") %>%
+            separate(date, sep = "[.]+", into = c(NA, "province", "date"))
+        }
+      )
+      self$data$clean <- full_join(
+        full_join(
+          flat_all$case_by_time, flat_all$death_by_time,
+          by = c("province", "date"),
+          suffix = c(".cases", ".deaths"),
+          copy = TRUE
+        ),
+        flat_all$recovered_by_time,
+        by = c("province", "date"),
+        suffix = c("", ".recovered"),
+        copy = TRUE
+      ) %>%
+        # The api uses integer codes for provinces which do not
+        # line up with ISO 3166-2 (some of which are not numbers)
+        # so we use this as a temporary code to line names up
+        # with data.
+        # After the first join the value columns are already suffixed, so
+        # the recovered series arrives in the second join as plain `value`.
+        select(
+          ncsc_region_code = province,
+          date,
+          cases_total = value.cases,
+          deaths_total = value.deaths,
+          recovered_total = value
+        ) %>%
+        mutate(ncsc_region_code = as.numeric(ncsc_region_code)) %>%
+        left_join(
+          self$data$raw$provinces %>%
+            select(ncsc_region_code = id, level_1_region = name),
+          by = c("ncsc_region_code")
+        ) %>%
+        select(-ncsc_region_code) %>%
+        mutate(
+          date = dmy(date),
+          cases_total = as.numeric(cases_total),
+          deaths_total = as.numeric(deaths_total),
+          recovered_total = as.numeric(recovered_total),
+          level_1_region = str_replace_all(
+            level_1_region,
+            "TP HCM", "Hochiminh"
+          )
+        ) %>%
+        # Normalise province names (strip accents, whitespace, hyphens and
+        # parenthesised text, then title case) before joining them to the
+        # region codes lookup on `level_1_region`.
+        mutate(
+          level_1_region = stri_trans_general(level_1_region, "latin-ascii"),
+          level_1_region = stri_trim_both(level_1_region),
+          level_1_region = str_replace_all(
+            level_1_region,
+            "\\(.*\\)|-| ", ""
+          ),
+          level_1_region = str_to_title(level_1_region),
+          level_1_region = replace_na(level_1_region, "Unknown")
+        ) %>%
+        left_join(
+          self$codes_lookup$`1`,
+          by = c("level_1_region" = "level_1_region")
+        )
+    }
+  )
+)
diff --git a/R/datasets.R b/R/datasets.R
@@ -33,6 +33,12 @@
 #' @return A tibble of region codes and related information.
 "france_codes"
 
+#' Region Codes for Vietnam Dataset.
+#'
+#' @description The region codes for Viet Nam
+#' @return A tibble of region codes and related information.
+"vietnam_codes"
+
 #' Region Codes for JHU Dataset. Taken from the region codes provided as
 #' part of the WHO dataset.
 #'

diff --git a/R/shared-methods.R b/R/shared-methods.R
@@ -302,6 +302,20 @@ DataClass <- R6::R6Class(
       )
     },
 
+    #' @description JSON counterpart of `download`: reads every entry of
+    #' `data_urls` with `json_reader` and stores the results, keeping the
+    #' names of `data_urls`, in `data$raw`. Intended as a drop-in
+    #' replacement for `download` in sub-classes.
+    #' @importFrom purrr map
+    download_JSON = function() {
+      urls <- self$data_urls
+      if (length(urls) == 0) {
+        stop("No data to download as data_urls is empty")
+      }
+      self$data$raw <- map(urls, json_reader, verbose = self$verbose)
+    },
+
     #' @description Cleans raw data (corrects format, converts column types,
     #' etc). Works on raw data and so should be called after
     #' \href{#method-download}{\code{download()}}

diff --git a/R/test-DataClass.R b/R/test-DataClass.R
@@ -102,7 +102,8 @@ test_download <- function(DataClass_obj, download, snapshot_path) {
         walk(DataClass_obj$data$raw, function(data) {
           testthat::expect_s3_class(data, "data.frame")
           testthat::expect_true(nrow(data) > 0)
-          testthat::expect_true(ncol(data) >= 2)
+          testthat::expect_true(ncol(data) >= 2
+                                || typeof(data[[1]]) == "list")
         })
       }
     )

diff --git a/R/utils.R b/R/utils.R
@@ -53,6 +53,30 @@ csv_reader <- function(file, verbose = FALSE, guess_max = 1000, ...) {
   return(tibble(data))
 }
 
+#' Custom JSON reading function
+#'
+#' @description Reads JSON with `jsonlite::fromJSON` and wraps the parsed
+#' result in a tibble. When `verbose` is `TRUE` the source is announced
+#' with a message; otherwise messages and warnings raised while reading
+#' are suppressed.
+#' @param file A URL or filepath to a JSON
+#' @param ... extra parameters to be passed to jsonlite::fromJSON
+#' @inheritParams message_verbose
+#' @return A tibble wrapping the parsed JSON, i.e.
+#' `tibble(fromJSON(file, ...))`.
+#' @importFrom tibble tibble
+#' @importFrom jsonlite fromJSON
+#' @concept utility
+json_reader <- function(file, verbose = FALSE, ...) {
+  if (verbose) {
+    message("Downloading data from ", file)
+    data <- fromJSON(file, ...)
+  } else {
+    # Quiet mode: hide anything fromJSON reports while reading
+    data <- suppressWarnings(
+      suppressMessages(
+        fromJSON(file, ...)
+      )
+    )
+  }
+  tibble(data)
+}
+
 #' Wrapper for message
 #'
 #' @description A wrapper for `message` that only prints output when