From cd472024af182df370754e01cda057ef7a55c2d3 Mon Sep 17 00:00:00 2001 From: olivroy <52606734+olivroy@users.noreply.github.com> Date: Thu, 27 Jun 2024 14:25:27 -0400 Subject: [PATCH] tabyl uses labels (#575) * Import `%||%` from rlang. * Add support for displaying the label attribute for column name. * Reduce space * Style code (GHA) * style * Handle 2-3 ways tabyl as well. * Clarify explanation comments Co-authored-by: Sam Firke * Update NEWS.md * Update NEWS.md * reword breaking change --------- Co-authored-by: olivroy Co-authored-by: Sam Firke --- NAMESPACE | 1 + NEWS.md | 3 +++ R/get_dupes.R | 2 +- R/print_tabyl.R | 1 - R/tabyl.R | 34 ++++++++++++++++++++++++++-------- tests/testthat/test-tabyl.R | 11 +++++++++++ 6 files changed, 42 insertions(+), 10 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 27073345..1461f757 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -69,6 +69,7 @@ importFrom(lubridate,second) importFrom(lubridate,ymd) importFrom(lubridate,ymd_hms) importFrom(magrittr,"%>%") +importFrom(rlang,"%||%") importFrom(rlang,dots_n) importFrom(rlang,expr) importFrom(rlang,syms) diff --git a/NEWS.md b/NEWS.md index 5d715ce2..c4356e67 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,9 @@ These are all minor breaking changes resulting from enhancements and are not exp * When using `row_to_names()`, when all input values in `row_number` for a column are `NA`, `row_to_names()` creates a column name of `"NA"`, a character, rather than `NA`. If code previously used relied on a column name of `NA`, it will now error. To fix this, rely on a column name of `"NA"`. +* When `tabyl()` is called on a data.frame containing labels, it now displays the label attribute as the name of the first column in the the resulting `tabyl` object (@olivroy, #394). This may break subsequent code that refers to the output of such a `tabyl` by column name. To maintain the previous behavior of ignoring variable labels, you can remove the labels with a function like `haven::zap_labels()` or `labelled::remove_labels()` before calling `tabyl()`. + + ## New features * A new function `paste_skip_na()` pastes without including NA values (#537). diff --git a/R/get_dupes.R b/R/get_dupes.R index b86e409c..0a37f077 100644 --- a/R/get_dupes.R +++ b/R/get_dupes.R @@ -22,7 +22,7 @@ #' mtcars %>% get_dupes(-c(wt, qsec)) #' mtcars %>% get_dupes(starts_with("cy")) #' @importFrom tidyselect eval_select -#' @importFrom rlang expr dots_n syms +#' @importFrom rlang expr dots_n syms %||% get_dupes <- function(dat, ...) { expr <- rlang::expr(c(...)) pos <- tidyselect::eval_select(expr, data = dat) diff --git a/R/print_tabyl.R b/R/print_tabyl.R index 62832419..b971bc40 100644 --- a/R/print_tabyl.R +++ b/R/print_tabyl.R @@ -1,5 +1,4 @@ #' @export - print.tabyl <- function(x, ...) { print.data.frame(x, row.names = FALSE) } diff --git a/R/tabyl.R b/R/tabyl.R index 6fddebdd..fca82750 100644 --- a/R/tabyl.R +++ b/R/tabyl.R @@ -66,7 +66,6 @@ tabyl.default <- function(dat, show_na = TRUE, show_missing_levels = TRUE, ...) var_name <- names(dat) } - # useful error message if input vector doesn't exist if (is.null(dat)) { stop(paste0("object ", var_name, " not found")) @@ -76,6 +75,13 @@ tabyl.default <- function(dat, show_na = TRUE, show_missing_levels = TRUE, ...) var_name <- paste(var_name, collapse = "") } + # Try to retrieve label + if (is.data.frame(dat)) { + var_label <- attr(dat[, var_name], "label", exact = TRUE) %||% var_name + } else { + var_label <- attr(dat, "label", exact = TRUE) %||% var_name + } + # if show_na is not length-1 logical, error helpfully (#377) if (length(show_na) > 1 || !inherits(show_na, "logical")) { stop("The value supplied to the \"show_na\" argument must be TRUE or FALSE.\n\nDid you try to call tabyl on two vectors, like tabyl(data$var1, data$var2) ? To create a two-way tabyl, the two vectors must be in the same data.frame, and the function should be called like this: \n @@ -133,8 +139,8 @@ tabyl.default <- function(dat, show_na = TRUE, show_missing_levels = TRUE, ...) dplyr::mutate(percent = n / sum(n, na.rm = TRUE)) # recalculate % without NAs } - # reassign correct variable name - names(result)[1] <- var_name + # reassign correct variable name (or label if it exists) + names(result)[1] <- var_label # in case input var name was "n" or "percent", call helper function to set unique names result <- handle_if_special_names_used(result) @@ -238,10 +244,11 @@ tabyl_2way <- function(dat, var1, var2, show_na = TRUE, show_missing_levels = TR result <- result[c(setdiff(names(result), "NA_"), "NA_")] } - - result %>% - data.frame(., check.names = FALSE) %>% - as_tabyl(axes = 2, row_var_name = names(dat)[1], col_var_name = names(dat)[2]) + row_var_name <- names(dat)[1] + col_var_name <- names(dat)[2] + names(result)[1] <- attr(dat[, 1], "label", exact = TRUE) %||% names(result)[1] + data.frame(result, check.names = FALSE) %>% + as_tabyl(axes = 2, row_var_name = row_var_name, col_var_name = col_var_name) } @@ -250,6 +257,10 @@ tabyl_3way <- function(dat, var1, var2, var3, show_na = TRUE, show_missing_level dat <- dplyr::select(dat, !!var1, !!var2, !!var3) var3_numeric <- is.numeric(dat[[3]]) + # Preserve labels, as attributes are sometimes dropped during transformations. + var1_label <- attr(dat[, 1], "label", exact = TRUE) + var2_label <- attr(dat[, 2], "label", exact = TRUE) + # Keep factor levels for ordering the list at the end if (is.factor(dat[[3]])) { third_levels_for_sorting <- levels(dat[[3]]) @@ -277,7 +288,14 @@ tabyl_3way <- function(dat, var1, var2, var3, show_na = TRUE, show_missing_level dat[[2]] <- as.factor(dat[[2]]) } - result <- split(dat, dat[[rlang::quo_name(var3)]]) %>% + result <- split(dat, dat[[rlang::quo_name(var3)]]) + # split() drops attributes, so we manually add back the label attributes. + result <- lapply(result, function(x) { + attr(x[[1]], "label") <- var1_label + attr(x[[2]], "label") <- var2_label + x + }) + result <- result %>% purrr::map(tabyl_2way, var1, var2, show_na = show_na, show_missing_levels = show_missing_levels) %>% purrr::map(reset_1st_col_status, col1_class, col1_levels) # reset class of var in 1st col to its input class, #168 diff --git a/tests/testthat/test-tabyl.R b/tests/testthat/test-tabyl.R index 7667e15f..4226e987 100644 --- a/tests/testthat/test-tabyl.R +++ b/tests/testthat/test-tabyl.R @@ -395,6 +395,17 @@ test_that("3-way tabyl with 3rd var factor is listed in right order, #250", { expect_equal(names(tabyl(z, am, gear, cyl)), c("8", "6", "NA_")) }) +test_that("tabyl works with label attributes (#394)", { + mt_label <- mtcars + attr(mt_label$cyl, "label") <- "Number of cyl" + tab <- tabyl(mt_label, cyl) + expect_named(tab, c("Number of cyl", "n", "percent")) + tab2 <- tabyl(mt_label, cyl, am) + expect_named(tab2, c("Number of cyl", "0", "1")) + tab3 <- tabyl(mt_label, cyl, am, vs) + expect_equal(names(tab3[[1]])[1], "Number of cyl") +}) + test_that("tabyl works with ordered 1st variable, #386", { mt_ordered <- mtcars mt_ordered$cyl <- ordered(mt_ordered$cyl, levels = c("4", "8", "6"))