diff --git a/DESCRIPTION b/DESCRIPTION index eb118233..5eeb3c6a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -101,6 +101,7 @@ Imports: stringi, stringr, tidyr (>= 1.0.0), + tidyselect, vroom, xml2 Suggests: @@ -109,6 +110,7 @@ Suggests: knitr, mockery, rmarkdown, + RSocrata, rvest, rworldmap, sf, diff --git a/NAMESPACE b/NAMESPACE index 68efa036..de3a4b03 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -71,6 +71,7 @@ importFrom(dplyr,left_join) importFrom(dplyr,mutate) importFrom(dplyr,mutate_at) importFrom(dplyr,mutate_if) +importFrom(dplyr,n) importFrom(dplyr,na_if) importFrom(dplyr,pull) importFrom(dplyr,recode) @@ -94,6 +95,7 @@ importFrom(jsonlite,fromJSON) importFrom(lifecycle,deprecate_stop) importFrom(lubridate,as_date) importFrom(lubridate,dmy) +importFrom(lubridate,dmy_hms) importFrom(lubridate,mdy) importFrom(lubridate,month) importFrom(lubridate,year) @@ -132,6 +134,7 @@ importFrom(tidyr,pivot_longer) importFrom(tidyr,pivot_wider) importFrom(tidyr,replace_na) importFrom(tidyr,separate) +importFrom(tidyselect,vars_select_helpers) importFrom(utils,download.file) importFrom(utils,untar) importFrom(vroom,vroom) diff --git a/NEWS.md b/NEWS.md index 942c164e..ad8f7d6c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -20,6 +20,7 @@ This release is currently under development - Fixed a bug in the data sourced from Germany so that instead of treating it as a line list of individuals it is treated as a relatively finely resolved count data which needs to be summed up (by @sbfnk). - Fixed a bug in the Vietnam class due to `stringr` ([#448](https://github.com/epiforecasts/covidregionaldata/pull/448) by @RichardMN). - Fixed a bug with the Netherlands class were the lack of Hospitalisation data in the source was causing the class to fail ([#446](https://github.com/epiforecasts/covidregionaldata/pull/446) by @RichardMN). +- Fixed an issue with the Colombia data and reduced dependencies by making `RSocrata` be a suggested package ([#433](https://github.com/epiforecasts/covidregionaldata/pull/433) by @RichardMN). ## Depreciations diff --git a/R/Colombia.R b/R/Colombia.R index b920f1d9..f72e7880 100644 --- a/R/Colombia.R +++ b/R/Colombia.R @@ -3,7 +3,7 @@ #' and processing COVID-19 region data for Colombia #' # nolint start -#' @source \url{https://github.com/danielcs88/colombia_covid-19/} +#' @source \url{https://www.datos.gov.co/Salud-y-Protecci-n-Social/Casos-positivos-de-COVID-19-en-Colombia/gt2j-8ykr} # nolint end #' @export #' @concept dataset @@ -21,63 +21,99 @@ Colombia <- R6::R6Class("Colombia", #' @field origin name of origin to fetch data for origin = "Colombia", #' @field supported_levels A list of supported levels. - supported_levels = list("1"), + supported_levels = list("1", "2"), #' @field supported_region_names A list of region names in order of level. - supported_region_names = list("1" = "departamento"), + supported_region_names = list( + "1" = "departamento", + "2" = "municipio" + ), #' @field supported_region_codes A list of region codes in order of level. - supported_region_codes = list("1" = "iso_3166_2"), + supported_region_codes = list( + "1" = "iso_3166_2", + "2" = "codigo_municipio" + ), #' @field common_data_urls List of named links to raw data. # nolint start common_data_urls = list( - "main" = "https://raw.githubusercontent.com/danielcs88/colombia_covid-19/master/datos/cronologia.csv" + "main" = "https://www.datos.gov.co/resource/gt2j-8ykr.csv?$select=fecha_diagnostico,ciudad_municipio" ), # nolint end #' @field source_data_cols existing columns within the raw data - source_data_cols = c("cases_total"), + source_data_cols = c("cases_new"), #' @field source_text Plain text description of the source of the data - source_text = "Daniel C\u00e1rdenas", + source_text = "Datos abiertos Colombia (Colombia open data)", #' @field source_url Website address for explanation/introduction of the #' data - source_url = "https://github.com/danielcs88/colombia_covid-19/", + source_url = "https://www.datos.gov.co/Salud-y-Protecci-n-Social/Casos-positivos-de-COVID-19-en-Colombia/gt2j-8ykr", # nolint #' @description Set up a table of region codes for clean data #' @importFrom dplyr mutate set_region_codes = function() { - self$codes_lookup$`1` <- covidregionaldata::colombia_codes + self$codes_lookup$`1` <- covidregionaldata::colombia_codes %>% + select(level_1_region, level_1_region_code) %>% + unique() + self$codes_lookup$`2` <- covidregionaldata::colombia_codes + }, + + #' @description Colombia specific download using Socrata API + #' This uses the `RSocrata` package if it is installed or downloads + #' a much larger csv file if that package is not available. + #' @importFrom dplyr select + download = function() { + message_verbose(self$verbose, + "Downloading Colombia data. This may take a while.") + # RSocrata package is recommended but not required + if (requireNamespace("RSocrata", quietly = self$verbose)) { + self$data$raw$main <- RSocrata::read.socrata(self$data_urls[["main"]]) + } else { + stop("covidregionaldata::Colombia$download - requires RSocrata package.\n", + "Please run install.packages(\"RSocrata\")\n", call.=TRUE) + } }, - #' @description Colombia specific state level data cleaning - #' @importFrom dplyr select mutate - #' @importFrom lubridate ymd + #' @description Colombia specific data cleaning + #' @importFrom dplyr select mutate rename summarise group_by n + #' @importFrom lubridate dmy_hms as_date #' @importFrom stringr str_replace_all str_to_sentence str_to_title #' @importFrom rlang .data #' clean_common = function() { self$data$clean <- self$data$raw[["main"]] %>% - select( - date = .data$fecha, - level_1_region = .data$departamento, - cases_total = .data$casos - ) %>% - mutate( - date = ymd(.data$date), - level_1_region = iconv(.data$level_1_region, - from = "UTF-8", - to = "ASCII//TRANSLIT" - ), - level_1_region = str_replace_all(.data$level_1_region, " D.C.", ""), - level_1_region = str_replace_all( - .data$level_1_region, - "San Andres y Providencia", - "San Andres, Providencia y Santa Catalina" - ), - level_1_region = str_to_sentence(.data$level_1_region), - level_1_region = str_to_title(.data$level_1_region) + rename( + date = .data$fecha_diagnostico, + level_2_region_code = .data$ciudad_municipio ) %>% + group_by(date, level_2_region_code) %>% + summarise(cases_new = n(), .groups = "drop") %>% + mutate(date = as_date(dmy_hms(date)), + level_2_region_code = sprintf("%05d", level_2_region_code)) %>% left_join( - self$codes_lookup$`1`, - by = c("level_1_region" = "level_1_region") + self$codes_lookup$`2`, + by = c("level_2_region_code" = "level_2_region_code") ) + }, + + #' @description Colombia Specific Department Level Data Cleaning + #' + #' Aggregates data to the level 1 (department) regional level. Data is + #' provided by the source at the level 2 (municipality) regional level. + #' + #' @importFrom dplyr group_by summarise ungroup across select + #' @importFrom tidyselect vars_select_helpers + clean_level_1 = function() { + self$data$clean <- self$data$clean %>% + select(-level_2_region_code, -level_2_region) %>% + group_by( + .data$date, + .data$level_1_region, .data$level_1_region_code + ) %>% + summarise( + across( + tidyselect::vars_select_helpers$where(is.numeric), + sum + ) + ) %>% + ungroup() } ) ) diff --git a/R/Vietnam.R b/R/Vietnam.R index 258ced9c..63dfc455 100644 --- a/R/Vietnam.R +++ b/R/Vietnam.R @@ -104,7 +104,7 @@ Vietnam <- R6::R6Class("Vietnam", cases_total, deaths_total, recovered_total - ) %>% + ) %>% mutate(ncsc_region_code = as.numeric(ncsc_region_code)) %>% left_join( self$data$raw$provinces %>% @@ -119,8 +119,6 @@ Vietnam <- R6::R6Class("Vietnam", level_1_region = str_replace_all(level_1_region, "TP HCM", "Hochiminh"), ) %>% - # - #tidyr::drop_na(date, region_name) %>% mutate( level_1_region = stri_trans_general(level_1_region, "ASCII"), level_1_region = stri_trim_both(level_1_region), diff --git a/data-raw/colombia_codes.R b/data-raw/colombia_codes.R index 8cb8792b..8c6202a5 100644 --- a/data-raw/colombia_codes.R +++ b/data-raw/colombia_codes.R @@ -22,7 +22,7 @@ level_1_region <- read_html(co_iso) %>% html_text() level_1_region <- level_1_region[1:33] -colombia_codes <- data.frame( +colombia_departments <- data.frame( level_1_region_code, level_1_region, stringsAsFactors = FALSE @@ -37,7 +37,7 @@ colombia_codes <- data.frame( replacements <- list( "Distrito Capital De Bogota" = "Bogota" ) -colombia_codes <- colombia_codes %>% +colombia_departments <- colombia_departments %>% mutate( level_1_region = ifelse(level_1_region %in% names(replacements), replacements[level_1_region], @@ -46,5 +46,47 @@ colombia_codes <- colombia_codes %>% level_1_region = as.character(level_1_region) ) +# Download list of municipalities and codes +# + +colombia_municipalities_sheet <- download_excel( + "https://www.dane.gov.co/files/censo2005/provincias/subregiones.xls", + "colombia_municipalities.xls", + verbose = TRUE, + transpose = FALSE, + sheet = "Hoja1" +) + colombia_municipalities <- colombia_municipalities_sheet %>% + select(level_2_region = NOM_MPIO, + level_2_region_code = DPTOC_MPIO, + level_1_region = NOM_DEPTO) %>% + mutate( + level_1_region = stri_trans_general(level_1_region, "latin-ascii"), + level_1_region = stri_trim_both(level_1_region), + level_1_region = stringr::str_to_title(level_1_region), + level_1_region = + str_replace_all(.data$level_1_region, + c(" D.c." = "", + "Archipielago De San Andres" + = "San Andres, Providencia Y Santa Catalina", + "Norte Santander" = "Norte De Santander" + ) + ), + level_2_region = stri_trans_general(level_2_region, "latin-ascii"), + level_2_region = stri_trim_both(level_2_region), + level_2_region = + str_replace_all(.data$level_2_region, + c(" D.C." = "" + ) + ), + level_2_region = stringr::str_to_title(level_2_region), + ) + + +# anti_join(colombia_municipalities, colombia_departments, by=c("level_1_region")) +colombia_codes <- left_join(colombia_municipalities, + colombia_departments, + by=c("level_1_region")) + # update package region_codes usethis::use_data(colombia_codes, overwrite = TRUE) diff --git a/data/all_country_data.rda b/data/all_country_data.rda index 5a05afe2..961dfecf 100644 Binary files a/data/all_country_data.rda and b/data/all_country_data.rda differ diff --git a/data/colombia_codes.rda b/data/colombia_codes.rda index efc435d6..81666ce6 100644 Binary files a/data/colombia_codes.rda and b/data/colombia_codes.rda differ diff --git a/inst/WORDLIST b/inst/WORDLIST index 4423c8e6..e6a1fa72 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -30,6 +30,7 @@ COVID covidregionaldata cre CSSE +csv ctb currrently DataClass @@ -132,6 +133,7 @@ rlang rmarkdown Roxygen RoxygenNote +RSocrata rvest RVIM rworldmap @@ -144,6 +146,7 @@ seperate shapefiles sherratt Sherratt +Socrata sophie SouthAfrica spi diff --git a/man/Colombia.Rd b/man/Colombia.Rd index d1f88abb..f8228ff5 100644 --- a/man/Colombia.Rd +++ b/man/Colombia.Rd @@ -4,7 +4,7 @@ \alias{Colombia} \title{Colombia Class for downloading, cleaning and processing notification data} \source{ -\url{https://github.com/danielcs88/colombia_covid-19/} +\url{https://www.datos.gov.co/Salud-y-Protecci-n-Social/Casos-positivos-de-COVID-19-en-Colombia/gt2j-8ykr} } \description{ Information for downloading, cleaning @@ -70,7 +70,9 @@ data} \subsection{Public methods}{ \itemize{ \item \href{#method-set_region_codes}{\code{Colombia$set_region_codes()}} +\item \href{#method-download}{\code{Colombia$download()}} \item \href{#method-clean_common}{\code{Colombia$clean_common()}} +\item \href{#method-clean_level_1}{\code{Colombia$clean_level_1()}} \item \href{#method-clone}{\code{Colombia$clone()}} } } @@ -79,7 +81,6 @@ data} \itemize{ \item \out{}\href{../../covidregionaldata/html/DataClass.html#method-available_regions}{\code{covidregionaldata::DataClass$available_regions()}}\out{} \item \out{}\href{../../covidregionaldata/html/DataClass.html#method-clean}{\code{covidregionaldata::DataClass$clean()}}\out{} -\item \out{}\href{../../covidregionaldata/html/DataClass.html#method-download}{\code{covidregionaldata::DataClass$download()}}\out{} \item \out{}\href{../../covidregionaldata/html/DataClass.html#method-download_JSON}{\code{covidregionaldata::DataClass$download_JSON()}}\out{} \item \out{}\href{../../covidregionaldata/html/DataClass.html#method-filter}{\code{covidregionaldata::DataClass$filter()}}\out{} \item \out{}\href{../../covidregionaldata/html/DataClass.html#method-get}{\code{covidregionaldata::DataClass$get()}}\out{} @@ -100,16 +101,41 @@ Set up a table of region codes for clean data \if{html}{\out{