Add preliminary support for Vietnam subnational data

RichardMN · Sep 15, 2021 · abcd78a · abcd78a
1 parent aa85f36
commit abcd78a
Show file tree

Hide file tree

Showing 32 changed files with 388 additions and 42 deletions.
diff --git a/.github/workflows/Vietnam.yaml b/.github/workflows/Vietnam.yaml
@@ -0,0 +1,48 @@
+on:  # workflow triggers
+  schedule:
+    - cron: '36 12 * * *'  # every day at 12:36 (GitHub cron schedules use UTC)
+  workflow_dispatch:  # also allow manual runs
+
+name: Vietnam
+
+jobs:
+  Vietnam:
+    runs-on: macOS-latest
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: r-lib/actions/setup-r@v1
+
+      - name: Query dependencies  # writes .github/depends.Rds and .github/R-version, both hashed into the cache key
+        run: |
+          install.packages('remotes')
+          saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
+          writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
+        shell: Rscript {0}
+
+      - name: Cache R packages  # keyed on runner OS, R version and the dependency snapshot
+        uses: actions/cache@v2
+        with:
+          path: ${{ env.R_LIBS_USER }}
+          key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
+          restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-
+
+      - name: Install dependencies
+        run: |
+          install.packages(c("remotes"))
+          remotes::install_deps(dependencies = TRUE)
+          install.packages("devtools")
+        shell: Rscript {0}
+
+      - name: Install package
+        run: R CMD INSTALL .
+
+      - name: Test dataset  # sets the testDownload and testSource options, then runs test-regional-datasets.R
+        run: |
+          options("testDownload" = TRUE)
+          options("testSource" = "Vietnam")
+          devtools::load_all()
+          testthat::test_file("tests/testthat/test-regional-datasets.R", reporter = c("summary", "fail"))
+        shell: Rscript {0}
diff --git a/NAMESPACE b/NAMESPACE
@@ -24,6 +24,7 @@ export(SouthAfrica)
 export(Switzerland)
 export(UK)
 export(USA)
+export(Vietnam)
 export(WHO)
 export(expect_clean_cols)
 export(expect_columns_contain_data)

diff --git a/R/Vietnam.R b/R/Vietnam.R
@@ -0,0 +1,85 @@
+#' Vietnam Class for downloading, cleaning and processing
+#' notification data
+#'
+#' @description Information for downloading, cleaning
+#'  and processing covid-19 region data for Vietnam.
+#'
+# nolint start
+#' @source \url{https://github.com/biocyberman/covidregionaldata/}
+# nolint end
+#' @export
+#' @concept dataset
+#' @family subnational
+#' @examples
+#' \dontrun{
+#' region <- Vietnam$new(verbose = TRUE, steps = TRUE, get = TRUE)
+#' region$return()
+#' }
+Vietnam <- R6::R6Class("Vietnam",
+                       inherit = DataClass,
+                       public = list(
+
+                         # Core Attributes (amend each parameter for country specific information)
+                         #' @field origin name of country to fetch data for
+                         origin = "Vietnam",
+                         #' @field supported_levels List of supported levels.
+                         supported_levels = list("1"),
+                         #' @field supported_region_names List of region names in order of level.
+                         supported_region_names = list("1" = "region"),
+                         #' @field supported_region_codes List of region codes in order of level.
+                         supported_region_codes = list("1" = "iso_3166_2"),
+                         #' @field common_data_urls List of named links to raw data.
+                         # nolint start
+                         common_data_urls = list(
+                           "main" = "https://docs.google.com/spreadsheets/d/1_d7oK-SKj-7KrWAW7DbGYEad2JO4TyR7ApsUAuoiH5g/export?format=csv&gid=0"
+                         ),
+                         # nolint end
+                         #' @field source_data_cols existing columns within the raw data
+                         source_data_cols = c(
+                           "cases_new"
+                         ),
+                         #' @field source_text Plain text description of the source of the data
+                         source_text = "Public COVID-19 data curated by 5F team",
+                         #' @field source_url Website address for explanation/introduction of the
+                         #' data
+                         source_url = "https://datastudio.google.com/u/0/reporting/1cc8d45e-2c74-4084-af70-cbbe60f1660e/page/bLUVC", # nolint
+
+                         #' @description Set up a table of region codes for clean data
+                         #' @importFrom tibble tibble
+                         set_region_codes = function() {
+                           self$codes_lookup$`1` <- covidregionaldata::vietnam_codes
+                         },
+
+                         #' @description Provincial level data cleaning. Names the raw
+                         #' columns by position, parses dates as day-month-year and
+                         #' joins ISO 3166-2 codes onto normalised province names.
+                         #'
+                         #' @importFrom dplyr filter select mutate rename left_join
+                         #' @importFrom tidyr replace_na
+                         #' @importFrom lubridate dmy
+                         clean_common = function() {
+                           self$data$clean <- self$data$raw[["main"]] %>%
+                             # the sheet's own headers are replaced by position
+                             `colnames<-`(c("date", "region", "cases_new", "case_group")) %>%
+                             select(
+                               date,
+                               level_1_region = region,
+                               cases_new
+                             ) %>%
+                             mutate(
+                               date = dmy(date),
+                               cases_new = as.numeric(cases_new),
+                               # same normalisation as data-raw/vietnam_codes.R so the
+                               # names match the keys stored in the lookup table
+                               level_1_region = stringi::stri_trans_general(level_1_region, "latin-ascii"),
+                               level_1_region = stringi::stri_trim_both(level_1_region),
+                               level_1_region = stringr::str_replace_all(level_1_region, "\\(.*\\)|-| ", ""),
+                               level_1_region = stringr::str_to_title(level_1_region)
+                             ) %>%
+                             left_join(
+                               self$codes_lookup$`1`,
+                               by = "level_1_region"
+                             )
+                         }
+                       )
+)
diff --git a/R/datasets.R b/R/datasets.R
@@ -33,6 +33,12 @@
 #' @return A tibble of region codes and related information.
 "france_codes"
 
+#' Region Codes for Vietnam Dataset.
+#'
+#' @description ISO 3166-2 codes and ASCII province names for Vietnam.
+#' @return A data frame of region codes and related information.
+"vietnam_codes"
+
 #' Region Codes for JHU Dataset. Taken from the region codes provided as
 #' part of the WHO dataset.
 #'

diff --git a/README.md b/README.md
@@ -80,7 +80,7 @@ the temporary directory by default),
 
 ``` r
 start_using_memoise()
-#> Using a cache at: /tmp/RtmprTOAdV
+#> Using a cache at: /tmp/RtmpPgZXiv
 ```
 
 To stop using `memoise` use,
@@ -105,7 +105,7 @@ the Google COVID-19 open data project), use:
 ``` r
 nots <- get_national_data()
 #> Downloading data from https://covid19.who.int/WHO-COVID-19-global-data.csv
-#> Rows: 132483 Columns: 8
+#> Rows: 142911 Columns: 8
 #> ── Column specification ────────────────────────────────────────────────────────
 #> Delimiter: ","
 #> chr  (3): Country_code, Country, WHO_region
@@ -117,20 +117,20 @@ nots <- get_national_data()
 #> Cleaning data
 #> Processing data
 nots
-#> # A tibble: 132,483 x 15
-#>    date       un_region who_region country        iso_code cases_new cases_total
-#>    <date>     <chr>     <chr>      <chr>          <chr>        <dbl>       <dbl>
-#>  1 2020-01-03 Asia      EMRO       Afghanistan    AF               0           0
-#>  2 2020-01-03 Europe    EURO       Albania        AL               0           0
-#>  3 2020-01-03 Africa    AFRO       Algeria        DZ               0           0
-#>  4 2020-01-03 Oceania   WPRO       American Samoa AS               0           0
-#>  5 2020-01-03 Europe    EURO       Andorra        AD               0           0
-#>  6 2020-01-03 Africa    AFRO       Angola         AO               0           0
-#>  7 2020-01-03 Americas  AMRO       Anguilla       AI               0           0
-#>  8 2020-01-03 Americas  AMRO       Antigua & Bar… AG               0           0
-#>  9 2020-01-03 Americas  AMRO       Argentina      AR               0           0
-#> 10 2020-01-03 Asia      EURO       Armenia        AM               0           0
-#> # … with 132,473 more rows, and 8 more variables: deaths_new <dbl>,
+#> # A tibble: 142,911 × 15
+#>    date       un_region who_region country           iso_code cases_new cases_total
+#>    <date>     <chr>     <chr>      <chr>             <chr>        <dbl>       <dbl>
+#>  1 2020-01-03 Asia      EMRO       Afghanistan       AF               0           0
+#>  2 2020-01-03 Europe    EURO       Albania           AL               0           0
+#>  3 2020-01-03 Africa    AFRO       Algeria           DZ               0           0
+#>  4 2020-01-03 Oceania   WPRO       American Samoa    AS               0           0
+#>  5 2020-01-03 Europe    EURO       Andorra           AD               0           0
+#>  6 2020-01-03 Africa    AFRO       Angola            AO               0           0
+#>  7 2020-01-03 Americas  AMRO       Anguilla          AI               0           0
+#>  8 2020-01-03 Americas  AMRO       Antigua & Barbuda AG               0           0
+#>  9 2020-01-03 Americas  AMRO       Argentina         AR               0           0
+#> 10 2020-01-03 Asia      EURO       Armenia           AM               0           0
+#> # … with 142,901 more rows, and 8 more variables: deaths_new <dbl>,
 #> #   deaths_total <dbl>, recovered_new <dbl>, recovered_total <dbl>,
 #> #   hosp_new <dbl>, hosp_total <dbl>, tested_new <dbl>, tested_total <dbl>
 ```
@@ -171,7 +171,7 @@ for example by level 1 region in the UK, use:
 ``` r
 uk_nots <- get_regional_data(country = "UK", verbose = FALSE)
 uk_nots
-#> # A tibble: 6,916 x 26
+#> # A tibble: 7,501 × 26
 #>    date       region   region_code cases_new cases_total deaths_new deaths_total
 #>    <date>     <chr>    <chr>           <dbl>       <dbl>      <dbl>        <dbl>
 #>  1 2020-01-30 East Mi… E12000004          NA          NA         NA           NA
@@ -184,16 +184,13 @@ uk_nots
 #>  8 2020-01-30 Scotland S92000003          NA          NA         NA           NA
 #>  9 2020-01-30 South E… E12000008          NA          NA         NA           NA
 #> 10 2020-01-30 South W… E12000009          NA          NA         NA           NA
-#> # … with 6,906 more rows, and 19 more variables: recovered_new <dbl>,
+#> # … with 7,491 more rows, and 19 more variables: recovered_new <dbl>,
 #> #   recovered_total <dbl>, hosp_new <dbl>, hosp_total <dbl>, tested_new <dbl>,
 #> #   tested_total <dbl>, areaType <chr>, cumCasesByPublishDate <dbl>,
 #> #   cumCasesBySpecimenDate <dbl>, newCasesByPublishDate <dbl>,
 #> #   newCasesBySpecimenDate <dbl>, cumDeaths28DaysByDeathDate <dbl>,
 #> #   cumDeaths28DaysByPublishDate <dbl>, newDeaths28DaysByDeathDate <dbl>,
-#> #   newDeaths28DaysByPublishDate <dbl>, newPillarFourTestsByPublishDate <lgl>,
-#> #   newPillarOneTestsByPublishDate <dbl>,
-#> #   newPillarThreeTestsByPublishDate <dbl>,
-#> #   newPillarTwoTestsByPublishDate <dbl>
+#> #   newDeaths28DaysByPublishDate <dbl>, …
 ```
 
 Now we have the data we can create plots, for example the time-series of

diff --git a/data-raw/vietnam_codes.R b/data-raw/vietnam_codes.R
@@ -0,0 +1,33 @@
+# Build the region code lookup for Vietnam
+#
+# Level 1 codes: ISO-3166-2
+# Source: https://en.wikipedia.org/wiki/ISO_3166-2:VN
+#
+library(rvest)
+library(stringi)
+library(stringr)
+library(dplyr)
+library(tibble)
+
+# Level 1 -----------------------------------------------------------------
+# Scrape the table of subdivisions from the ISO 3166-2:VN page
+iso_page <- "https://en.wikipedia.org/wiki/ISO_3166-2:VN"
+iso_table <- html_table(
+  html_element(read_html(iso_page), css = "table.wikitable:nth-child(11)")
+)
+
+# ASCII key per province: no diacritics, brackets, hyphens or spaces
+region_key <- iso_table$`Subdivision name (vi)` %>%
+  stringi::stri_trans_general("latin-ascii") %>%
+  stringi::stri_trim_both() %>%
+  stringr::str_replace_all("\\(.*\\)|-| ", "") %>%
+  stringr::str_to_title()
+
+vietnam_codes <- data.frame(
+  level_1_region_code = iso_table$Code,
+  level_1_region = region_key,
+  stringsAsFactors = FALSE
+)
+
+# Save the lookup as package data
+usethis::use_data(vietnam_codes, overwrite = TRUE)
diff --git a/data/all_country_data.rda b/data/all_country_data.rda
diff --git a/data/vietnam_codes.rda b/data/vietnam_codes.rda
diff --git a/man/Belgium.Rd b/man/Belgium.Rd
diff --git a/man/Brazil.Rd b/man/Brazil.Rd
diff --git a/man/Canada.Rd b/man/Canada.Rd
diff --git a/man/Colombia.Rd b/man/Colombia.Rd
diff --git a/man/Covid19DataHub.Rd b/man/Covid19DataHub.Rd
diff --git a/man/Cuba.Rd b/man/Cuba.Rd
diff --git a/man/France.Rd b/man/France.Rd
diff --git a/man/Germany.Rd b/man/Germany.Rd
diff --git a/man/Google.Rd b/man/Google.Rd
diff --git a/man/India.Rd b/man/India.Rd
diff --git a/man/Italy.Rd b/man/Italy.Rd
diff --git a/man/JHU.Rd b/man/JHU.Rd