From fbe6e6f31473a5939b491df37f01128f7a9698cb Mon Sep 17 00:00:00 2001 From: teofiln Date: Wed, 15 Sep 2021 21:30:58 -0600 Subject: [PATCH] add option to omit colname prefix when only one column is recoded (includes a couple of tests) --- R/dummy_cols.R | 23 +++++++- man/dummy_cols.Rd | 6 +- man/dummy_columns.Rd | 6 +- tests/testthat/test-omit-colname-prefix.R | 70 +++++++++++++++++++++++ 4 files changed, 101 insertions(+), 4 deletions(-) create mode 100644 tests/testthat/test-omit-colname-prefix.R diff --git a/R/dummy_cols.R b/R/dummy_cols.R index b8d017f..c038e9d 100644 --- a/R/dummy_cols.R +++ b/R/dummy_cols.R @@ -32,6 +32,9 @@ #' and dog dummy columns. #' @param remove_selected_columns #' If TRUE (not default), removes the columns used to generate the dummy columns. +#' @param omit_colname_prefix +#' If TRUE (not default) and `length(select_columns) == 1`, omit prepending the +#' name of `select_columns` to the names of the newly generated dummy columns. #' #' @return #' A data.frame (or tibble or data.table, depending on input data type) with @@ -54,7 +57,8 @@ dummy_cols <- function(.data, remove_most_frequent_dummy = FALSE, ignore_na = FALSE, split = NULL, - remove_selected_columns = FALSE) { + remove_selected_columns = FALSE, + omit_colname_prefix = FALSE) { stopifnot(is.null(select_columns) || is.character(select_columns), select_columns != "", @@ -177,6 +181,7 @@ dummy_cols <- function(.data, } data.table::alloc.col(.data, ncol(.data) + length(unique_vals)) + # data.table::set(.data, j = paste0(col_name, "_", unique_vals), value = 0L) .data[, paste0(col_name, "_", unique_vals)] <- 0L for (unique_value in unique_vals) { @@ -219,8 +224,22 @@ dummy_cols <- function(.data, } .data <- fix_data_type(.data, data_type) - return(.data) + if (omit_colname_prefix) { + if (length(select_columns) == 1) { + + new_col_index <- + as.logical(rowSums(sapply(unique_vals, function(x) + grepl(paste0(select_columns, "_", x), names(.data))))) + names(.data)[new_col_index] <- + 
gsub(paste0(select_columns, "_"), "", names(.data)[new_col_index]) + + } else { + message("Can't omit the colname prefix when recoding more than one column.") + message("Returning prefixed dummy columns.") + } + } + return(.data) } diff --git a/man/dummy_cols.Rd b/man/dummy_cols.Rd index ca6f916..337d2cb 100644 --- a/man/dummy_cols.Rd +++ b/man/dummy_cols.Rd @@ -11,7 +11,8 @@ dummy_cols( remove_most_frequent_dummy = FALSE, ignore_na = FALSE, split = NULL, - remove_selected_columns = FALSE + remove_selected_columns = FALSE, + omit_colname_prefix = FALSE ) } \arguments{ @@ -38,6 +39,9 @@ then a split value of "," this row would have a value of 1 for both the cat and dog dummy columns.} \item{remove_selected_columns}{If TRUE (not default), removes the columns used to generate the dummy columns.} + +\item{omit_colname_prefix}{If TRUE (not default) and `length(select_columns) == 1`, omit prepending the +name of `select_columns` to the names of the newly generated dummy columns.} } \value{ A data.frame (or tibble or data.table, depending on input data type) with diff --git a/man/dummy_columns.Rd b/man/dummy_columns.Rd index 1236b0a..19946bd 100644 --- a/man/dummy_columns.Rd +++ b/man/dummy_columns.Rd @@ -11,7 +11,8 @@ dummy_columns( remove_most_frequent_dummy = FALSE, ignore_na = FALSE, split = NULL, - remove_selected_columns = FALSE + remove_selected_columns = FALSE, + omit_colname_prefix = FALSE ) } \arguments{ @@ -38,6 +39,9 @@ then a split value of "," this row would have a value of 1 for both the cat and dog dummy columns.} \item{remove_selected_columns}{If TRUE (not default), removes the columns used to generate the dummy columns.} + +\item{omit_colname_prefix}{If TRUE (not default) and `length(select_columns) == 1`, omit prepending the +name of `select_columns` to the names of the newly generated dummy columns.} } \description{ dummy_columns() quickly creates dummy (binary) columns from character and diff --git a/tests/testthat/test-omit-colname-prefix.R 
b/tests/testthat/test-omit-colname-prefix.R new file mode 100644 index 0000000..0825ad9 --- /dev/null +++ b/tests/testthat/test-omit-colname-prefix.R @@ -0,0 +1,70 @@ +sample_data <- + structure( + list( + colA = c("a", "a", "a", "b", "b", "c", "c", "c", + "c", "c"), + colB = c(1, 1, 1, 2, 2, 3, 3, 3, 3, 3), + colC = c( + "val1", + "val2", + "val3", + "val1", + "val2", + "val7", + "val2", + "val4", + "val6", + "val8" + ) + ), + row.names = c(NA, -10L), + class = c("tbl_df", "tbl", + "data.frame") + ) + +test_that("omit_colname_prefix works", { + expect_named( + dummy_cols( + sample_data, + c("colC"), + remove_selected_columns = TRUE, + omit_colname_prefix = TRUE + ), + c( + "colA", + "colB", + "val1", + "val2", + "val3", + "val4", + "val6", + "val7", + "val8" + ) + ) +}) + +test_that("omit_colname_prefix does not remove prefix when >1 select_columns", + { + expect_named( + dummy_cols( + sample_data, + c("colB", "colC"), + remove_selected_columns = TRUE, + omit_colname_prefix = TRUE + ), + c( + "colA", + "colB_1", + "colB_2", + "colB_3", + "colC_val1", + "colC_val2", + "colC_val3", + "colC_val4", + "colC_val6", + "colC_val7", + "colC_val8" + ) + ) + })