epiforecasts · seabbs · Sep 27, 2021 · Aug 28, 2021 · Sep 2, 2021 · Sep 12, 2021
diff --git a/.github/workflows/Vietnam.yaml b/.github/workflows/Vietnam.yaml
@@ -0,0 +1,48 @@
+on:
+  schedule:
+    - cron: '36 12 * * *'  # daily at 12:36 (GitHub cron schedules are in UTC)
+  workflow_dispatch:  # also allows manually triggered runs
+
+name: Vietnam  # runs the regional dataset tests with testSource set to Vietnam
+
+jobs:
+  Vietnam:
+    runs-on: macOS-latest
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: r-lib/actions/setup-r@v1
+
+      - name: Query dependencies  # writes the two files hashed into the cache key
+        run: |
+          install.packages('remotes')
+          saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
+          writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
+        shell: Rscript {0}
+
+      - name: Cache R packages  # caches the R_LIBS_USER library between runs
+        uses: actions/cache@v2
+        with:
+          path: ${{ env.R_LIBS_USER }}
+          key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
+          restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-  # fall back to a cache for the same OS and R version
+
+      - name: Install dependencies
+        run: |
+          install.packages(c("remotes"))
+          remotes::install_deps(dependencies = TRUE)
+          install.packages("devtools")
+        shell: Rscript {0}
+
+      - name: Install package
+        run: R CMD INSTALL .  # installs the checked-out package
+
+      - name: Test dataset  # sets the testDownload/testSource options, then runs the regional dataset tests
+        run: |
+          options("testDownload" = TRUE)
+          options("testSource" = "Vietnam")
+          devtools::load_all()
+          testthat::test_file("tests/testthat/test-regional-datasets.R", reporter = c("summary", "fail"))
+        shell: Rscript {0}
diff --git a/NAMESPACE b/NAMESPACE
@@ -24,6 +24,7 @@ export(SouthAfrica)
 export(Switzerland)
 export(UK)
 export(USA)
+export(Vietnam)
 export(WHO)
 export(expect_clean_cols)
 export(expect_columns_contain_data)

diff --git a/R/Vietnam.R b/R/Vietnam.R
@@ -0,0 +1,124 @@
+#' Vietnam Class for downloading, cleaning and processing
+#' notification data
+#'
+#' @description Information for downloading, cleaning
+#'  and processing covid-19 region data for Vietnam.
+#'
+# nolint start
+#' @source \url{https://covid.ncsc.gov.vn}
+# nolint end
+#' @export
+#' @concept dataset
+#' @family subnational
+#' @examples
+#' \dontrun{
+#' region <- Vietnam$new(verbose = TRUE, steps = TRUE, get = TRUE)
+#' region$return()
+#' }
+Vietnam <- R6::R6Class("Vietnam",
+  inherit = DataClass,
+  public = list(
+
+    # Core Attributes (amend each parameter for country specific information)
+    #' @field origin name of country to fetch data for
+    origin = "Vietnam",
+    #' @field supported_levels List of supported levels.
+    supported_levels = list("1"),
+    #' @field supported_region_names List of region names in order of level.
+    supported_region_names = list("1" = "region"),
+    #' @field supported_region_codes List of region codes in order of level.
+    supported_region_codes = list("1" = "iso_3166_2"),
+    #' @field common_data_urls List of named links to raw data.
+    # nolint start
+    common_data_urls = list(
+      "case_by_time" = "https://covid.ncsc.gov.vn/api/v3/covid/provinces?filter_type=case_by_time",
+      "death_by_time" = "https://covid.ncsc.gov.vn/api/v3/covid/provinces?filter_type=death_by_time",
+      "recovered_by_time" = "https://covid.ncsc.gov.vn/api/v3/covid/provinces?filter_type=recovered_by_time"
+    ),
+    # nolint end
+    #' @field source_data_cols existing columns within the raw data
+    source_data_cols = c(
+      "cases_total", "deaths_total", "recovered_total"
+    ),
+    #' @field source_text Plain text description of the source of the data
+    source_text = "Public COVID-19 data curated by 5F team",
+    #' @field source_url Website address for explanation/introduction of the
+    #' data
+    source_url = "https://covid.ncsc.gov.vn", # nolint
+
+    #' @description Set up a table of region codes for clean data
+    #' @importFrom tibble tibble
+    set_region_codes = function() {
+      self$codes_lookup$`1` <- covidregionaldata::vietnam_codes
+    },
+
+    #' @description Provincial level data cleaning. Fetches the province
+    #' list and one JSON series per bundle, builds one row per province and
+    #' date, normalises province names and joins the region codes. The
+    #' result is stored in `self$data$clean`.
+    #'
+    #' @importFrom dplyr filter select mutate rename left_join bind_rows
+    #' @importFrom tidyr replace_na drop_na
+    #' @importFrom lubridate dmy
+    #' @importFrom jsonlite fromJSON
+    clean_common = function() {
+      # Raise VROOM_CONNECTION_SIZE to work around a vroom error.
+      # NOTE(review): this env var stays set for the rest of the session.
+      Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 4)
+      api_url <- "https://covid.ncsc.gov.vn/api/v3/covid/provinces"
+      provinces <- jsonlite::fromJSON(api_url)
+
+      # One parsed JSON payload per bundle, named by bundle.
+      bundles <- names(self$data$raw)
+      bundles_data <- lapply(bundles, function(bundle) {
+        jsonlite::fromJSON(paste0(api_url, "?filter_type=", bundle))
+      })
+      names(bundles_data) <- bundles
+
+      # Series of one bundle for one province; element names are the dates.
+      # NOTE(review): `[[id]]` looks up by name for character ids and by
+      # position for numeric ids -- confirm how the API keys each bundle.
+      get_series <- function(bundle, id) {
+        unlist(bundles_data[[bundle]][[id]])
+      }
+
+      # Iterate by row so id and name always come from the same province.
+      get_province <- function(i) {
+        id <- provinces$id[[i]]
+        cases <- get_series("case_by_time", id)
+        deaths <- get_series("death_by_time", id)
+        recovered <- get_series("recovered_by_time", id)
+        # Series are combined by position, so their dates must line up.
+        if (!identical(names(cases), names(deaths))) {
+          stop("Dates on case_by_time and death_by_time do not match!")
+        }
+        if (!identical(names(cases), names(recovered))) {
+          stop("Dates on case_by_time and recovered_by_time do not match!")
+        }
+        dplyr::tibble(
+          date = lubridate::dmy(names(cases)),
+          region_name = provinces$name[[i]],
+          cases_total = as.numeric(cases),
+          deaths_total = as.numeric(deaths),
+          recovered_total = as.numeric(recovered)
+        )
+      }
+      df <- dplyr::bind_rows(lapply(seq_along(provinces$id), get_province))
+
+      self$data$clean <- df %>%
+        mutate(
+          region_name = stringr::str_replace_all(region_name, "TP HCM", "Hochiminh")
+        ) %>%
+        tidyr::drop_na(date, region_name) %>%
+        rename(level_1_region = region_name) %>%
+        mutate(
+          level_1_region = stringi::stri_trans_general(level_1_region, "latin-ascii"),
+          level_1_region = stringi::stri_trim_both(level_1_region),
+          level_1_region = stringr::str_replace_all(level_1_region, "\\(.*\\)|-| ", ""),
+          level_1_region = stringr::str_to_title(level_1_region),
+          level_1_region = tidyr::replace_na(level_1_region, "Unknown")
+        ) %>%
+        left_join(self$codes_lookup$`1`, by = "level_1_region")
+    }
+  )
+)
diff --git a/R/datasets.R b/R/datasets.R
@@ -33,6 +33,12 @@
 #' @return A tibble of region codes and related information.
 "france_codes"
 
+#' Region Codes for Vietnam Dataset.
+#'
+#' @description The ISO 3166-2 level 1 region codes for Vietnam.
+#' @return A tibble of region codes and related information.
+"vietnam_codes"
+
 #' Region Codes for JHU Dataset. Taken from the region codes provided as
 #' part of the WHO dataset.
 #'

diff --git a/README.md b/README.md
@@ -80,7 +80,7 @@ the temporary directory by default),
 
 ``` r
 start_using_memoise()
-#> Using a cache at: /tmp/RtmprTOAdV
+#> Using a cache at: /tmp/RtmpPgZXiv
 ```
 
 To stop using `memoise` use,
@@ -105,7 +105,7 @@ the Google COVID-19 open data project), use:
 ``` r
 nots <- get_national_data()
 #> Downloading data from https://covid19.who.int/WHO-COVID-19-global-data.csv
-#> Rows: 132483 Columns: 8
+#> Rows: 142911 Columns: 8
 #> ── Column specification ────────────────────────────────────────────────────────
 #> Delimiter: ","
 #> chr  (3): Country_code, Country, WHO_region
@@ -117,20 +117,20 @@ nots <- get_national_data()
 #> Cleaning data
 #> Processing data
 nots
-#> # A tibble: 132,483 x 15
-#>    date       un_region who_region country        iso_code cases_new cases_total
-#>    <date>     <chr>     <chr>      <chr>          <chr>        <dbl>       <dbl>
-#>  1 2020-01-03 Asia      EMRO       Afghanistan    AF               0           0
-#>  2 2020-01-03 Europe    EURO       Albania        AL               0           0
-#>  3 2020-01-03 Africa    AFRO       Algeria        DZ               0           0
-#>  4 2020-01-03 Oceania   WPRO       American Samoa AS               0           0
-#>  5 2020-01-03 Europe    EURO       Andorra        AD               0           0
-#>  6 2020-01-03 Africa    AFRO       Angola         AO               0           0
-#>  7 2020-01-03 Americas  AMRO       Anguilla       AI               0           0
-#>  8 2020-01-03 Americas  AMRO       Antigua & Bar… AG               0           0
-#>  9 2020-01-03 Americas  AMRO       Argentina      AR               0           0
-#> 10 2020-01-03 Asia      EURO       Armenia        AM               0           0
-#> # … with 132,473 more rows, and 8 more variables: deaths_new <dbl>,
+#> # A tibble: 142,911 × 15
+#>    date       un_region who_region country           iso_code cases_new cases_total
+#>    <date>     <chr>     <chr>      <chr>             <chr>        <dbl>       <dbl>
+#>  1 2020-01-03 Asia      EMRO       Afghanistan       AF               0           0
+#>  2 2020-01-03 Europe    EURO       Albania           AL               0           0
+#>  3 2020-01-03 Africa    AFRO       Algeria           DZ               0           0
+#>  4 2020-01-03 Oceania   WPRO       American Samoa    AS               0           0
+#>  5 2020-01-03 Europe    EURO       Andorra           AD               0           0
+#>  6 2020-01-03 Africa    AFRO       Angola            AO               0           0
+#>  7 2020-01-03 Americas  AMRO       Anguilla          AI               0           0
+#>  8 2020-01-03 Americas  AMRO       Antigua & Barbuda AG               0           0
+#>  9 2020-01-03 Americas  AMRO       Argentina         AR               0           0
+#> 10 2020-01-03 Asia      EURO       Armenia           AM               0           0
+#> # … with 142,901 more rows, and 8 more variables: deaths_new <dbl>,
 #> #   deaths_total <dbl>, recovered_new <dbl>, recovered_total <dbl>,
 #> #   hosp_new <dbl>, hosp_total <dbl>, tested_new <dbl>, tested_total <dbl>
 ```
@@ -171,7 +171,7 @@ for example by level 1 region in the UK, use:
 ``` r
 uk_nots <- get_regional_data(country = "UK", verbose = FALSE)
 uk_nots
-#> # A tibble: 6,916 x 26
+#> # A tibble: 7,501 × 26
 #>    date       region   region_code cases_new cases_total deaths_new deaths_total
 #>    <date>     <chr>    <chr>           <dbl>       <dbl>      <dbl>        <dbl>
 #>  1 2020-01-30 East Mi… E12000004          NA          NA         NA           NA
@@ -184,16 +184,13 @@ uk_nots
 #>  8 2020-01-30 Scotland S92000003          NA          NA         NA           NA
 #>  9 2020-01-30 South E… E12000008          NA          NA         NA           NA
 #> 10 2020-01-30 South W… E12000009          NA          NA         NA           NA
-#> # … with 6,906 more rows, and 19 more variables: recovered_new <dbl>,
+#> # … with 7,491 more rows, and 19 more variables: recovered_new <dbl>,
 #> #   recovered_total <dbl>, hosp_new <dbl>, hosp_total <dbl>, tested_new <dbl>,
 #> #   tested_total <dbl>, areaType <chr>, cumCasesByPublishDate <dbl>,
 #> #   cumCasesBySpecimenDate <dbl>, newCasesByPublishDate <dbl>,
 #> #   newCasesBySpecimenDate <dbl>, cumDeaths28DaysByDeathDate <dbl>,
 #> #   cumDeaths28DaysByPublishDate <dbl>, newDeaths28DaysByDeathDate <dbl>,
-#> #   newDeaths28DaysByPublishDate <dbl>, newPillarFourTestsByPublishDate <lgl>,
-#> #   newPillarOneTestsByPublishDate <dbl>,
-#> #   newPillarThreeTestsByPublishDate <dbl>,
-#> #   newPillarTwoTestsByPublishDate <dbl>
+#> #   newDeaths28DaysByPublishDate <dbl>, …
 ```
 
 Now we have the data we can create plots, for example the time-series of

diff --git a/data-raw/vietnam_codes.R b/data-raw/vietnam_codes.R
@@ -0,0 +1,33 @@
+# Set vietnam region codes
+#
+# Level 1 codes: ISO-3166-2
+# Source: https://en.wikipedia.org/wiki/ISO_3166-2:VN
+#
+library(rvest)
+library(stringi)
+library(stringr)
+library(dplyr)
+library(tibble)
+
+# Level 1 -----------------------------------------------------------------
+# Get ISO codes. NOTE(review): positional selector; breaks if the page changes.
+vn_iso <- "https://en.wikipedia.org/wiki/ISO_3166-2:VN"
+
+level_1_region_df <- read_html(vn_iso) %>%
+  html_element(css = "table.wikitable:nth-child(11)") %>%
+  html_table()
+
+# Tibble, as the dataset docs state; same name clean-up as Vietnam$clean_common().
+vietnam_codes <- tibble(
+  level_1_region_code = level_1_region_df$Code,
+  level_1_region = level_1_region_df$`Subdivision name (vi)`
+) %>%
+  mutate(
+    level_1_region = stringi::stri_trans_general(level_1_region, "latin-ascii"),
+    level_1_region = stringi::stri_trim_both(level_1_region),
+    level_1_region = stringr::str_replace_all(level_1_region, "\\(.*\\)|-| ", ""),
+    level_1_region = stringr::str_to_title(level_1_region)
+  )
+
+# update package region_codes
+usethis::use_data(vietnam_codes, overwrite = TRUE)
diff --git a/data/all_country_data.rda b/data/all_country_data.rda
diff --git a/data/vietnam_codes.rda b/data/vietnam_codes.rda
diff --git a/man/Belgium.Rd b/man/Belgium.Rd
diff --git a/man/Brazil.Rd b/man/Brazil.Rd
diff --git a/man/Canada.Rd b/man/Canada.Rd
diff --git a/man/Colombia.Rd b/man/Colombia.Rd
diff --git a/man/Covid19DataHub.Rd b/man/Covid19DataHub.Rd
diff --git a/man/Cuba.Rd b/man/Cuba.Rd
diff --git a/man/France.Rd b/man/France.Rd
diff --git a/man/Germany.Rd b/man/Germany.Rd
diff --git a/man/Google.Rd b/man/Google.Rd