diff --git a/DESCRIPTION b/DESCRIPTION index 4fe406f..2dee7ef 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: NVIdb Title: Tools to facilitate the use of NVI's databases -Version: 0.13.0 -Date: 2024-12-13 +Version: 0.13.1 +Date: 2024-12-19 Authors@R: c(person(given = "Petter", family = "Hopp", diff --git a/NEWS.md b/NEWS.md index fc2eddd..cf64a30 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# NVIdb 0.13.1 - (2024-12-19) + +## New features: + +- `standardize_columns` now accepts a list or a file name with path as input to `standards`. + + # NVIdb 0.13.0 - (2024-12-13) ## Bug fixes: diff --git a/R/standardize_columns.R b/R/standardize_columns.R index 2c35bf3..69e1b6a 100644 --- a/R/standardize_columns.R +++ b/R/standardize_columns.R @@ -2,7 +2,7 @@ #' @description Standardizes column names, labels, column width #' for variables in external databases. #' -#' @details The standardization table is under development. This +#' @details The standardisation table is under development. This #' function only works when being connected to the NVI network. #' #' Variables in internal and external data sources uses @@ -16,6 +16,41 @@ #' \ifelse{html}{\code{\link[data.table:fread]{data.table::fread}}}{\code{data.table::fread}} #' can be generated. #' +#' \code{standards} gives the source file or the data.frame with the standards +#' for formatting the columns. The default is the general source csv +#' file. It can also be a data.frame, for example +#' \ifelse{html}{\code{\link[OKplan:OK_column_standards]{OKplan::OK_column_standards}}}{\code{OKplan::OK_column_standards}} +#' which gives the standards used when generating +#' selection files for the Norwegian surveillance programmes. As this data +#' frame is embedded in the OKplan package, it may be convenient to update the +#' source file and load it as a csv file instead. On some occasions, it may be +#' easiest to input the column standards directly using a \code{list}. +#' +#' The list input to \code{standards} must follow a specific format. +#' It is a \code{list} with at least three named vectors: +#' \itemize{ +#' \item \code{colname}: A vector of the columns in the source file that +#' should be included in the Excel report with the selection list. +#' \item \code{collabel}: A vector with the labels that should be used in the +#' Excel report. +#' \item \code{colwidth}: A vector with the column widths that should be used +#' in the Excel report. +#' } +#' +#' In addition, one may input: +#' +#' \itemize{ +#' \item \code{colorder}: The order of the columns to be used in the Excel report. +#' The default is to use the same order as they are entered in the vectors. +#' \item \code{colname_db}: Included so that the list can keep the same format +#' as \code{column_standards}. Not necessary to input. +#' \item \code{table_db}: Included so that the list can keep the same format +#' as \code{column_standards}. Must be the same as \code{dbsource}. +#' Not necessary to input. +#' } +#' +#' All vectors must have the same length and list the columns in the same order. +#' #' \code{property = "colnames"} will replace the column names #' in a data frame with standardized column names. All #' standard column names is snake_case. If no standard name @@ -78,8 +113,11 @@ #' The database that is the source of data. Should be the name of #' the data source as registered in column_standards table. Defaults #' to \code{deparse(substitute(data))}. -#' @param standards [\code{character(1)}]\cr -#' For giving alternative standard tables to column_standards. 
+#' @param standards [\code{data.frame} | \code{list} | \code{character(1)}]\cr +#' For giving an alternative to the standard column_standards table in one of +#' several formats; see details. Defaults to +#' file.path(NVIdb::set_dir_NVI("ProgrammeringR", slash = FALSE), +#' "standardization", "colnames", "column_standards.csv"). #' @param property [\code{character(1)}]\cr #' Property of the column that should be standardized. Must be one #' of c("colnames", "colclasses", "collabels", "colwidths_Excel", @@ -135,13 +173,20 @@ standardize_columns <- function(data, dbsource = deparse(substitute(data)), # csvfile = NULL, - standards = NULL, + standards = file.path(NVIdb::set_dir_NVI("ProgrammeringR", slash = FALSE), + "standardization", "colnames", "column_standards.csv"), property, language = "no", exclude = FALSE, ...) { # TO DO: replace reading column standards with including column standards in sysdata for the package. + # PREPARE ARGUMENTS BEFORE CHECKING ---- + if (is.null(standards)) { + standards <- file.path(NVIdb::set_dir_NVI("ProgrammeringR", slash = FALSE), + "standardization", "colnames", "column_standards.csv") + } + # ARGUMENT CHECKING ---- # Object to store check-results checks <- checkmate::makeAssertCollection() @@ -158,14 +203,34 @@ standardize_columns <- function(data, } checkmate::assert_character(dbsource, len = 1, min.chars = 1, add = checks) - checkmate::assert_data_frame(standards, null.ok = TRUE, add = checks) - + # standards + # checkmate::assert_data_frame(standards, null.ok = TRUE, add = checks) + checkmate::assert(checkmate::check_class(standards, classes = c("data.frame")), + checkmate::check_class(standards, classes = c("list")), + checkmate::check_class(standards, classes = c("character")), + add = checks) + if (inherits(standards, what = "character")) { + checkmate::assert_file_exists(standards, add = checks) + } + if (inherits(standards, what = "list")) { + lengths_standard <- lengths(standards) + NVIcheckmate::assert_integer(lengths_standard, lower = lengths_standard[1], upper = lengths_standard[1], + min.len = 3, max.len = 6, + comment = "When input as a list, all elements must have the same length", + add = checks) + checkmate::assert_subset(names(standards), choices = c("table_db", "colname_db", "colname", "collabel", "colwidth", "colorder"), + add = checks) + } + if (inherits(standards, what = "data.frame")) { + checkmate::assert_data_frame(standards, min.rows = 1, min.cols = 6, add = checks) + } + # property checkmate::assert_subset(tolower(property), choices = c("colnames", "colclasses", "collabels", "colwidths_excel", "colwidths_DT", "colorder"), add = checks) - + # language checkmate::assert_subset(language, choices = c("no", "en"), add = checks) checkmate::assert_logical(exclude, add = checks) @@ -177,12 +242,36 @@ standardize_columns <- function(data, dbsource <- tolower(dbsource) # Reading column standards from a csv-file based on in an Excel file - if (is.null(standards)) { - column_standards <- utils::read.csv2( - file = paste0(NVIdb::set_dir_NVI("ProgrammeringR"), "standardization/column_standards.csv"), - fileEncoding = "UTF-8" - ) - } else { + # if (is.null(standards)) { + # column_standards <- utils::read.csv2( + # file = file.path(NVIdb::set_dir_NVI("ProgrammeringR", slash = FALSE), + # "standardization", "column_standards.csv"), + # fileEncoding = "UTF-8" + # ) + # } else { + # column_standards <- standards + # } + if (inherits(standards, what = "character")) { + column_standards <- utils::read.csv2(file = standards, fileEncoding = "UTF-8") 
+ } + if (inherits(standards, what = "list")) { + column_standards <- as.data.frame((standards)) + + if (!"table_db" %in% colnames(column_standards)) { + column_standards$table_db <- dbsource + } + + if (!"colname_db" %in% colnames(column_standards)) { + column_standards$colname_db <- column_standards$colname + } + + if (!"colorder" %in% colnames(column_standards)) { + column_standards$colorder <- c(1:dim(column_standards)[1]) + } + colnames(column_standards)[which(colnames(column_standards) == "collabel")] <- "label_1_no" + colnames(column_standards)[which(colnames(column_standards) == "colwidth")] <- "colwidth_Excel" + } + if (inherits(standards, what = "data.frame")) { column_standards <- standards } diff --git a/man/standardize_columns.Rd b/man/standardize_columns.Rd index ed3c63b..8261ba5 100644 --- a/man/standardize_columns.Rd +++ b/man/standardize_columns.Rd @@ -7,7 +7,8 @@ standardize_columns( data, dbsource = deparse(substitute(data)), - standards = NULL, + standards = file.path(NVIdb::set_dir_NVI("ProgrammeringR", slash = FALSE), + "standardization", "colnames", "column_standards.csv"), property, language = "no", exclude = FALSE, @@ -24,8 +25,11 @@ The database that is the source of data. Should be the name of the data source as registered in column_standards table. Defaults to \code{deparse(substitute(data))}.} -\item{standards}{[\code{character(1)}]\cr -For giving alternative standard tables to column_standards.} +\item{standards}{[\code{data.frame} | \code{list} | \code{character(1)}]\cr +For giving an alternative to the standard column_standards table in one of + several formats; see details. Defaults to + file.path(NVIdb::set_dir_NVI("ProgrammeringR", slash = FALSE), + "standardization", "colnames", "column_standards.csv").} \item{property}{[\code{character(1)}]\cr Property of the column that should be standardized. Must be one @@ -65,7 +69,7 @@ Standardizes column names, labels, column width for variables in external databases. } \details{ -The standardization table is under development. This +The standardisation table is under development. This function only works when being connected to the NVI network. Variables in internal and external data sources uses @@ -79,6 +83,41 @@ Variables in internal and external data sources uses \ifelse{html}{\code{\link[data.table:fread]{data.table::fread}}}{\code{data.table::fread}} can be generated. +\code{standards} gives the source file or the data.frame with the standards + for formatting the columns. The default is the general source csv + file. It can also be a data.frame, for example + \ifelse{html}{\code{\link[OKplan:OK_column_standards]{OKplan::OK_column_standards}}}{\code{OKplan::OK_column_standards}} + which gives the standards used when generating + selection files for the Norwegian surveillance programmes. As this data + frame is embedded in the OKplan package, it may be convenient to update the + source file and load it as a csv file instead. On some occasions, it may be + easiest to input the column standards directly using a \code{list}. + + The list input to \code{standards} must follow a specific format. + It is a \code{list} with at least three named vectors: +\itemize{ +\item \code{colname}: A vector of the columns in the source file that + should be included in the Excel report with the selection list. +\item \code{collabel}: A vector with the labels that should be used in the + Excel report. +\item \code{colwidth}: A vector with the column widths that should be used + in the Excel report. 
+} + + In addition, one may input: + +\itemize{ +\item \code{colorder}: The order of the columns to be used in the Excel report. + The default is to use the same order as they are entered in the vectors. +\item \code{colname_db}: Included so that the list can keep the same format + as \code{column_standards}. Not necessary to input. +\item \code{table_db}: Included so that the list can keep the same format + as \code{column_standards}. Must be the same as \code{dbsource}. + Not necessary to input. +} + +All vectors must have the same length and list the columns in the same order. + \code{property = "colnames"} will replace the column names in a data frame with standardized column names. All standard column names is snake_case. If no standard name diff --git a/tests/testthat/test_login.R b/tests/testthat/test_login.R index 6bc5f04..d9e9aca 100644 --- a/tests/testthat/test_login.R +++ b/tests/testthat/test_login.R @@ -7,7 +7,7 @@ test_that("Log in to db services", { linewidth <- options("width") options(width = 80) - + odbc_connected <- login("PJS") expect_true(as.vector(odbc_connected) >= 1) RODBC::odbcClose(odbc_connected) @@ -21,7 +21,7 @@ test_that("Log in to db services", { regexp = "'login_by_credentials_PJS' is replaced by 'login_by_credentials") expect_true(as.vector(odbc_connected) >= 1) RODBC::odbcClose(odbc_connected) - + options(width = unlist(linewidth)) }) @@ -49,4 +49,3 @@ test_that("Errors or warnings for login", { options(width = unlist(linewidth)) }) - diff --git a/tests/testthat/test_standardize_columns.R b/tests/testthat/test_standardize_columns.R index 501c7f0..9822199 100644 --- a/tests/testthat/test_standardize_columns.R +++ b/tests/testthat/test_standardize_columns.R @@ -32,6 +32,13 @@ test_that("Standardize colnames from PJS", { property = "colnames")), correct_result) + # Compare colnames with correct result when standards is given as a file path + expect_identical(colnames(standardize_columns(data = df, + standards = file.path(NVIdb::set_dir_NVI("ProgrammeringR", slash = FALSE), + "standardization", "colnames", "column_standards.csv"), + property = "colnames")), + correct_result) + }) @@ -61,6 +68,14 @@ test_that("Standardize colnames from EOS scrapie", { property = "colnames")), correct_result) + # Compare colnames with correct result when standards is given as a file path + expect_identical(colnames(standardize_columns(data = df, + dbsource = "proveresultat_scrapie", + standards = file.path(NVIdb::set_dir_NVI("ProgrammeringR", slash = FALSE), + "standardization", "colnames", "column_standards.csv"), + property = "colnames")), + correct_result) + }) @@ -108,26 +123,26 @@ test_that("colClasses for csv-files", { test_that("Standardize colwidths for Excel", { -# skip if no connection to 'FAG' have been established -skip_if_not(dir.exists(set_dir_NVI("FAG"))) - -PJStest <- readRDS(file.path(".", "PJS_testdata.rds")) -# PJStest <- readRDS("./tests/testthat/PJS_testdata.rds") - -# # Make a vector with correct column names after translation -correct_result <- c(5.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.00, 10.71, 10.71, - 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, - 10.71, 10.71, 10.71, 20.00, 20.00, 10.71, 10.71, 10.71, 10.71, 10.71, - 10.71, 11.00, 11.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, - 10.71, 10.71, 10.71, 11.00, 10.71, 8.00, 10.71, 10.71, 10.71, 10.71, - 30.00, 10.71, 10.71, 10.71, 11.00, 10.71, 10.71, 10.71, 10.71, 10.71, - 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, - 10.71, 10.71, 10.71, 10.71, 10.71, 
10.71, 10.71, 10.71, 10.71, 10.71, - 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, - 10.71, 10.71, 10.71, 10.71, 8.00) + # skip if no connection to 'FAG' have been established + skip_if_not(dir.exists(set_dir_NVI("FAG"))) + + PJStest <- readRDS(file.path(".", "PJS_testdata.rds")) + # PJStest <- readRDS("./tests/testthat/PJS_testdata.rds") + + # # Make a vector with correct column names after translation + correct_result <- c(5.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.00, 10.71, 10.71, + 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, + 10.71, 10.71, 10.71, 20.00, 20.00, 10.71, 10.71, 10.71, 10.71, 10.71, + 10.71, 11.00, 11.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, + 10.71, 10.71, 10.71, 11.00, 10.71, 8.00, 10.71, 10.71, 10.71, 10.71, + 30.00, 10.71, 10.71, 10.71, 11.00, 10.71, 10.71, 10.71, 10.71, 10.71, + 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, + 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, + 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, + 10.71, 10.71, 10.71, 10.71, 8.00) expect_equal(standardize_columns(data = PJStest, property = "colwidths_Excel"), - correct_result) + correct_result) # Standardisere kolonnenavn @@ -137,12 +152,12 @@ correct_result <- c(5.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.00, 10.71 correct_result <- c(5.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.00, 10.00, 10.00, 10.00, 10.00, 10.71, 10.71, 5.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 20.00, 20.00, 10.71, 10.71, 10.71, 10.71, 10.71, - 5.00, 11.00, 11.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, + 5.00, 11.00, 11.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.00, 10.00, 11.00, 10.71, 8.00, 10.71, 10.71, 10.71, 10.71, 30.00, 10.71, 10.71, 10.71, 11.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, - 8.00, 10.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, + 8.00, 10.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 30.00, 8.00) expect_equal(standardize_columns(data = PJStest, property = "colwidths_Excel"), @@ -155,12 +170,12 @@ correct_result <- c(5.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.00, 10.71 correct_result <- c(5.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.00, 10.00, 10.00, 10.00, 10.00, 10.71, 10.71, 5.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 20.00, 20.00, 10.71, 10.71, 10.71, 10.71, 10.71, - 5.00, 11.00, 11.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, + 5.00, 11.00, 11.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.00, 10.00, 11.00, 10.71, 8.00, 10.71, 10.71, 10.71, 10.71, 30.00, 10.71, 10.71, 10.71, 11.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, - 8.00, 10.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, + 8.00, 10.00, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 10.71, 30.00, 8.00) expect_equal(standardize_columns(data = PJStest, property = "colwidths_Excel"), @@ -190,6 +205,13 @@ test_that("Standardize English collabels", { language = "en"), correct_result) + expect_identical(standardize_columns(data = df, + standards = file.path(NVIdb::set_dir_NVI("ProgrammeringR", slash = FALSE), + "standardization", "colnames", 
"column_standards.csv"), + property = "collabels", + language = "en"), + correct_result) + }) test_that("Column order", { @@ -228,7 +250,54 @@ test_that("Column order", { }) +test_that("List input to standards", { + # Make example data + df <- as.data.frame(cbind("ok_aar" = 2021, "rapport" = "Brucellose hos geit, utvalgsliste", + "mt_avdelingnr" = "M21150", "mt_avdeling" = "Romerike", + "mt_regionnr" = "M21000", "mt_region" = "Region Stor-Oslo", + "eier_lokalitetnr" = "30303030", "eier_lokalitet" = "XXX XXXXX", "orgnr" = 989989989, + "postnr" = "0468", "poststed" = "OSLO", "ant_prover" = 26)) + # Probably first makes a matrix, therefore "Antall prøver" is character. + df$ant_prover <- as.numeric(df$ant_prover) + + df <- standardize_columns(data = df, + standards = + list("colname" = c("ok_aar", "rapport", "mt_regionnr", "mt_region", "mt_avdelingnr", + "mt_avdeling", "eier_lokalitetnr", "orgnr", "eier_lokalitet", "postnr", "poststed", + "ant_prover"), + "collabel" = c("År", "Rapport", "MT regionnr", "MT region", "MT avdelingsnr", + "MT avdeling", "Produsentnr", "Virksomhetnr", "Virksomhet", "Postnr", "Poststed", + "Antall prøver"), + "colwidth" = c(5, 9, 12.5, 16, 13, 30, 12, 12, 30, 8, 15, 8.5)), + dbsource = "brucella_geit", + property = "colorder", + exclude = TRUE) + + expect_equal(colnames(df), + c("ok_aar", "rapport", "mt_regionnr", "mt_region", "mt_avdelingnr", "mt_avdeling", "eier_lokalitetnr", "orgnr", + "eier_lokalitet", "postnr", "poststed", "ant_prover")) + + expect_equal(standardize_columns(data = df, + standards = + list("colname" = c("ok_aar", "rapport", "mt_regionnr", "mt_region", "mt_avdelingnr", + "mt_avdeling", "eier_lokalitetnr", "orgnr", "eier_lokalitet", "postnr", "poststed", + "ant_prover"), + "collabel" = c("År", "Rapport", "MT regionnr", "MT region", "MT avdelingsnr", + "MT avdeling", "Produsentnr", "Virksomhetnr", "Virksomhet", "Postnr", "Poststed", + "Antall prøver"), + "colwidth" = c(5, 9, 12.5, 16, 13, 30, 12, 12, 30, 8, 15, 8.5)), + dbsource = "brucella_geit", + property = "collabels", + exclude = TRUE), + c("År", "Rapport", + "MT regionnr", "MT region", "MT avdelingsnr", "MT avdeling", + "Produsentnr", "Virksomhetnr", "Virksomhet", + "Postnr", "Poststed", "Antall prøver")) +}) + test_that("standardize_columns argument checking", { + linewidth <- options("width") + options(width = 80) PJStest <- readRDS(file.path(".", "PJS_testdata.rds")) # PJStest <- readRDS("./tests/testthat/PJS_testdata.rds") @@ -239,6 +308,9 @@ test_that("standardize_columns argument checking", { expect_error(standardize_columns(data = PJStest, property = "columnNames", language = "no", exclude = FALSE), regexp = "property") + expect_error(standardize_columns(data = PJStest, standards = 1, property = "colNames", language = "no", exclude = FALSE), + regexp = "but has class 'numeric'") + expect_error(standardize_columns(data = PJStest, property = "colClasses", language = "no", exclude = FALSE), regexp = "No file provided.") @@ -248,4 +320,5 @@ test_that("standardize_columns argument checking", { expect_error(standardize_columns(data = PJStest, property = "colNames", language = "no", exclude = "FALSE"), regexp = "Variable 'exclude': Must be of type 'logical', not 'character'.") + options(width = unlist(linewidth)) }) diff --git a/vignettes/NVIdb.pdf b/vignettes/NVIdb.pdf index 98ad96a..9078dee 100644 Binary files a/vignettes/NVIdb.pdf and b/vignettes/NVIdb.pdf differ