From f3bcb066a29d23efb319e85905fe63f6605ea173 Mon Sep 17 00:00:00 2001 From: chilampoon Date: Sat, 16 Sep 2023 15:19:07 -0400 Subject: [PATCH 1/7] resolve tidyomics conflicts and docs clean up --- DESCRIPTION | 3 +- NAMESPACE | 18 +- R/attach.R | 22 + R/dplyr_methods.R | 738 +++--------------- R/functions.R | 9 +- R/functions_SE.R | 11 +- R/methods.R | 134 ++-- R/methods_SE.R | 30 +- R/tidyr_methods.R | 98 +-- R/zzz.R | 37 +- man/adjust_abundance-methods.Rd | 11 +- man/arrange-methods.Rd | 66 -- man/arrange.Rd | 91 +++ man/bind_rows.Rd | 28 +- man/distinct-methods.Rd | 24 - man/distinct.Rd | 55 ++ man/dplyr-methods.Rd | 31 - man/filter-methods.Rd | 85 -- man/filter.Rd | 116 +++ man/full_join.Rd | 172 ++++ man/group_by-methods.Rd | 47 -- man/group_by.Rd | 162 ++++ man/inner_join.Rd | 172 ++++ man/join-methods.Rd | 48 -- man/left_join.Rd | 172 ++++ man/mutate-methods.Rd | 103 --- man/mutate.Rd | 173 ++++ man/mutate.nested_tidybulk.Rd | 175 +++++ man/nest-methods.Rd | 61 -- man/nest.Rd | 84 ++ man/reexports.Rd | 2 +- man/rename-methods.Rd | 51 -- man/rename.Rd | 87 +++ man/right_join.Rd | 172 ++++ man/rotate_dimensions-methods.Rd | 8 +- man/rowwise-methods.Rd | 36 - man/rowwise.Rd | 78 ++ man/summarise-methods.Rd | 91 --- man/summarise.Rd | 139 ++++ man/symbol_to_entrez.Rd | 5 +- man/test_gene_enrichment-methods.Rd | 23 +- man/tidybulk-methods.Rd | 5 +- man/ungroup.Rd | 148 ++++ man/unnest.Rd | 111 +++ tests/testthat/test-bulk_methods.R | 2 +- .../test-bulk_methods_SummarizedExperiment.R | 2 +- 46 files changed, 2449 insertions(+), 1487 deletions(-) create mode 100644 R/attach.R delete mode 100644 man/arrange-methods.Rd create mode 100644 man/arrange.Rd delete mode 100644 man/distinct-methods.Rd create mode 100644 man/distinct.Rd delete mode 100644 man/dplyr-methods.Rd delete mode 100644 man/filter-methods.Rd create mode 100644 man/filter.Rd create mode 100644 man/full_join.Rd delete mode 100644 man/group_by-methods.Rd create mode 100644 man/group_by.Rd create mode 
100644 man/inner_join.Rd delete mode 100644 man/join-methods.Rd create mode 100644 man/left_join.Rd delete mode 100644 man/mutate-methods.Rd create mode 100644 man/mutate.Rd create mode 100644 man/mutate.nested_tidybulk.Rd delete mode 100644 man/nest-methods.Rd create mode 100644 man/nest.Rd delete mode 100644 man/rename-methods.Rd create mode 100644 man/rename.Rd create mode 100644 man/right_join.Rd delete mode 100644 man/rowwise-methods.Rd create mode 100644 man/rowwise.Rd delete mode 100644 man/summarise-methods.Rd create mode 100644 man/summarise.Rd create mode 100644 man/ungroup.Rd create mode 100644 man/unnest.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 1c97a1ae..cf67279f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -35,7 +35,8 @@ Imports: GenomicRanges, methods, S4Vectors, - crayon + crayon, + pkgconfig Suggests: BiocStyle, testthat, diff --git a/NAMESPACE b/NAMESPACE index 58ce8456..f77a86ca 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,49 +16,36 @@ S3method(rename,tidybulk) S3method(right_join,tidybulk) S3method(rowwise,tidybulk) S3method(summarise,tidybulk) +S3method(summarize,tidybulk) S3method(ungroup,tidybulk) S3method(unnest,nested_tidybulk) export("%>%") export(adjust_abundance) export(aggregate_duplicates) -export(arrange) export(as_SummarizedExperiment) export(as_matrix) export(as_tibble) -export(bind_cols) export(cluster_elements) export(deconvolve_cellularity) export(describe_transcript) -export(distinct) export(do) export(ensembl_to_symbol) export(fill_missing_abundance) -export(filter) -export(full_join) export(get_bibliography) -export(group_by) export(identify_abundant) export(impute_missing_abundance) -export(inner_join) export(keep_abundant) export(keep_variable) -export(left_join) export(log10_reverse_trans) export(logit_trans) -export(mutate) -export(nest) export(pivot_sample) export(pivot_transcript) export(quantile_normalise_abundance) export(reduce_dimensions) export(remove_redundancy) -export(rename) -export(right_join) 
export(rotate_dimensions) -export(rowwise) export(scale_abundance) export(select) -export(summarise) export(symbol_to_entrez) export(test_differential_abundance) export(test_differential_cellularity) @@ -69,7 +56,6 @@ export(test_stratification_cellularity) export(tibble) export(tidybulk) export(tidybulk_SAM_BAM) -export(unnest) exportMethods(as_SummarizedExperiment) exportMethods(quantile_normalise_abundance) exportMethods(scale_abundance) @@ -121,6 +107,7 @@ importFrom(dplyr,slice) importFrom(dplyr,starts_with) importFrom(dplyr,summarise) importFrom(dplyr,summarise_all) +importFrom(dplyr,summarize) importFrom(dplyr,ungroup) importFrom(lifecycle,deprecate_soft) importFrom(lifecycle,deprecate_warn) @@ -147,7 +134,6 @@ importFrom(rlang,":=") importFrom(rlang,dots_list) importFrom(rlang,dots_values) importFrom(rlang,enquo) -importFrom(rlang,enquos) importFrom(rlang,flatten_if) importFrom(rlang,inform) importFrom(rlang,is_spliced) diff --git a/R/attach.R b/R/attach.R new file mode 100644 index 00000000..0543a45c --- /dev/null +++ b/R/attach.R @@ -0,0 +1,22 @@ +core <- c("dplyr", "tidyr", "ttservice", "ggplot2") + +core_unloaded <- function() { + search <- paste0("package:", core) + core[!search %in% search()] +} + + +same_library <- function(pkg) { + loc <- if (pkg %in% loadedNamespaces()) + dirname(getNamespaceInfo(pkg, "path")) + library(pkg, lib.loc=loc, character.only=TRUE, warn.conflicts=FALSE) +} + +tidyverse_attach <- function() { + to_load <- core_unloaded() + + suppressPackageStartupMessages( + lapply(to_load, same_library)) + + invisible(to_load) +} diff --git a/R/dplyr_methods.R b/R/dplyr_methods.R index 0f43685b..d84ca57c 100755 --- a/R/dplyr_methods.R +++ b/R/dplyr_methods.R @@ -1,70 +1,13 @@ - -#' @export -dplyr::select - -#' Arrange rows by column values -#' -#' -#' @description -#' `arrange()` order the rows of a data frame rows by the values of selected -#' columns. 
-#' -#' Unlike other dplyr verbs, `arrange()` largely ignores grouping; you -#' need to explicit mention grouping variables (or use `by_group = TRUE`) -#' in order to group by them, and functions of variables are evaluated -#' once per data frame, not once per group. -#' -#' @details -#' ## Locales -#' The sort order for character vectors will depend on the collating sequence -#' of the locale in use: see [locales()]. -#' -#' ## Missing values -#' Unlike base sorting with `sort()`, `NA` are: -#' * always sorted to the end for local data, even when wrapped with `desc()`. -#' * treated differently for remote data, depending on the backend. -#' -#' @return -#' An object of the same type as `.data`. -#' -#' * All rows appear in the output, but (usually) in a different place. -#' * Columns are not modified. -#' * Groups are not modified. -#' * Data frame attributes are preserved. -#' @section Methods: -#' This function is a **generic**, which means that packages can provide -#' implementations (methods) for other classes. See the documentation of -#' individual methods for extra arguments and differences in behaviour. -#' -#' The following methods are currently available in loaded packages: -#' -#' @param .data A data frame, data frame extension (e.g. a tibble), or a -#' lazy data frame (e.g. from dbplyr or dtplyr). See *Methods*, below, for -#' more details. -#' @param ... <[`tidy-eval`][dplyr_tidy_eval]> Variables, or functions or -#' variables. Use [desc()] to sort a variable in descending order. -#' @param .by_group If TRUE, will sort first by grouping variable. Applies to grouped data frames only. 
-#' -#' @return A tibble -#' @family single table verbs -#' -#' @rdname arrange-methods #' @name arrange +#' @rdname arrange +#' @inherit dplyr::arrange +#' @family single table verbs #' @importFrom dplyr arrange -#' -#' @examples -#' -#' arrange(mtcars, cyl, disp) -#' -#' @export -NULL - #' @export arrange.tidybulk <- function(.data, ..., .by_group = FALSE) { - - .data |> - drop_class(c("tidybulk", "tt")) |> - dplyr::arrange( ..., .by_group = .by_group) |> + .data |> + drop_class(c("tidybulk", "tt")) |> + dplyr::arrange( ..., .by_group = .by_group) |> # Attach attributes reattach_internals(.data) |> @@ -72,130 +15,68 @@ arrange.tidybulk <- function(.data, ..., .by_group = FALSE) { # Add class add_class("tt") |> add_class("tidybulk") - } -#' Efficiently bind multiple data frames by row and column -#' -#' This is an efficient implementation of the common pattern of -#' `do.call(rbind, dfs)` or `do.call(cbind, dfs)` for binding many -#' data frames into one. -#' -#' The output of `bind_rows()` will contain a column if that column -#' appears in any of the inputs. -#' -#' @param ... Data frames to combine. -#' -#' Each argument can either be a data frame, a list that could be a data -#' frame, or a list of data frames. -#' -#' When row-binding, columns are matched by name, and any missing -#' columns will be filled with NA. -#' -#' When column-binding, rows are matched by position, so all data -#' frames must have the same number of rows. To match by value, not -#' position, see mutate-joins. -#' @param .id Data frame identifier. -#' -#' When `.id` is supplied, a new column of identifiers is -#' created to link each row to its original data frame. The labels -#' are taken from the named arguments to `bind_rows()`. When a -#' list of data frames is supplied, the labels are taken from the -#' names of the list. If no names are found a numeric sequence is -#' used instead. -#' @param add.cell.ids from Seurat 3.0 A character vector of length(x = c(x, y)). 
Appends the corresponding values to the start of each objects' cell names. -#' -#' @importFrom ttservice bind_rows +#' @name bind_rows +#' @rdname bind_rows +#' @inherit ttservice::bind_rows #' -#' @return `bind_rows()` and `bind_cols()` return the same type as -#' the first input, either a data frame, `tbl_df`, or `grouped_df`. #' @examples #' data(se_mini) #' -#' se_mini_tidybulk = se_mini |> tidybulk() -#' bind_rows( se_mini_tidybulk, se_mini_tidybulk ) +#' se_mini_tidybulk <- se_mini |> tidybulk() +#' bind_rows(se_mini_tidybulk, se_mini_tidybulk) #' -#' tt_bind = se_mini_tidybulk |> select(time, condition) +#' tt_bind <- se_mini_tidybulk |> select(time, condition) #' se_mini_tidybulk |> bind_cols(tt_bind) #' -#' @name bind_rows -NULL - #' @importFrom rlang dots_values #' @importFrom rlang flatten_if #' @importFrom rlang is_spliced -#' +#' @importFrom ttservice bind_rows #' @export -#' -bind_rows.tidybulk <- function(..., .id = NULL) -{ +bind_rows.tidybulk <- function(..., .id = NULL) { + tts <- flatten_if(dots_values(...), is_spliced) - tts = flatten_if(dots_values(...), is_spliced) # Original that fails Bioconductor dplyr:::flatten_bindable(rlang::dots_values(...)) - - par1 = tts[[1]] |> get_tt_columns() |> unlist() - par2 = tts[[2]] |> get_tt_columns() |> unlist() + par1 <- tts[[1]] |> get_tt_columns() |> unlist() + par2 <- tts[[2]] |> get_tt_columns() |> unlist() # # tt_columns of the two objects must match # error_if_parameters_not_match(par1, par2) ttservice:::bind_rows.data.frame(..., .id = .id) |> - # Attach attributes reattach_internals(tts[[1]]) - } -#' @export -#' -#' @importFrom ttservice bind_cols -#' @inheritParams bind_cols -#' -#' @rdname dplyr-methods -#' @name bind_cols -NULL - #' @importFrom rlang dots_values #' @importFrom rlang flatten_if #' @importFrom rlang is_spliced -#' -#' @export -#' -bind_cols.tidybulk <- function(..., .id = NULL) -{ - - tts = tts = flatten_if(dots_values(...), is_spliced) # Original that fails Bioconductor 
dplyr:::flatten_bindable(rlang::dots_values(...)) - - dplyr::bind_cols(..., .id = .id) |> - - # Attach attributes - reattach_internals(tts[[1]]) - +#' @importFrom ttservice bind_cols +bind_cols_ <- function(..., .id = NULL) { + tts <- tts <- flatten_if(dots_values(...), is_spliced) + + ttservice::bind_cols(..., .id = .id) |> + # Attach attributes + reattach_internals(tts[[1]]) } -#' distinct -#' @param .data A tbl. (See dplyr) -#' @param ... Data frames to combine (See dplyr) -#' @param .keep_all If TRUE, keep all variables in .data. If a combination of ... is not distinct, this keeps the first row of values. (See dplyr) -#' -#' @return A tt object -#' -#' @rdname distinct-methods +#' @rdname bind_rows +#' @aliases bind_cols +#' @export +bind_cols.tidybulk <- bind_cols_ + #' @name distinct -#' @importFrom dplyr distinct +#' @rdname distinct +#' @inherit dplyr::distinct #' #' @examples -#' -#' tidybulk::se_mini |> tidybulk() |> distinct() -#' -#' -#' @export -NULL - - -#' @inheritParams distinct +#' data(se_mini) +#' se_mini |> tidybulk() |> distinct() +#' +#' @importFrom dplyr distinct #' @export -distinct.tidybulk <- function (.data, ..., .keep_all = FALSE) -{ +distinct.tidybulk <- function (.data, ..., .keep_all = FALSE) { .data |> drop_class(c("tidybulk", "tt")) |> dplyr::distinct(..., .keep_all = .keep_all) |> @@ -206,84 +87,23 @@ distinct.tidybulk <- function (.data, ..., .keep_all = FALSE) # Add class add_class("tt") |> add_class("tidybulk") - } -#' Subset rows using column values -#' -#' `filter()` retains the rows where the conditions you provide a `TRUE`. Note -#' that, unlike base subsetting with `[`, rows where the condition evaluates -#' to `NA` are dropped. -#' -#' dplyr is not yet smart enough to optimise filtering optimisation -#' on grouped datasets that don't need grouped calculations. For this reason, -#' filtering is often considerably faster on [ungroup()]ed data. 
-#' -#' @section Useful filter functions: -#' -#' * [`==`], [`>`], [`>=`] etc -#' * [`&`], [`|`], [`!`], [xor()] -#' * [is.na()] -#' * [between()], [near()] -#' -#' @section Grouped tibbles: -#' -#' Because filtering expressions are computed within groups, they may -#' yield different results on grouped tibbles. This will be the case -#' as soon as an aggregating, lagging, or ranking function is -#' involved. Compare this ungrouped filtering: -#' -#' -#' The former keeps rows with `mass` greater than the global average -#' whereas the latter keeps rows with `mass` greater than the gender -#' -#' average. -#' @family single table verbs -#' @param .data A tbl. (See dplyr) -#' @param ... <[`tidy-eval`][dplyr_tidy_eval]> Logical predicates defined in -#' terms of the variables in `.data`. -#' Multiple conditions are combined with `&`. Only rows where the -#' condition evaluates to `TRUE` are kept. -#' @param .preserve when `FALSE` (the default), the grouping structure -#' is recalculated based on the resulting data, otherwise it is kept as is. -#' @return -#' An object of the same type as `.data`. -#' -#' * Rows are a subset of the input, but appear in the same order. -#' * Columns are not modified. -#' * The number of groups may be reduced (if `.preserve` is not `TRUE`). -#' * Data frame attributes are preserved. -#' @section Methods: -#' This function is a **generic**, which means that packages can provide -#' implementations (methods) for other classes. See the documentation of -#' individual methods for extra arguments and differences in behaviour. -#' -#' The following methods are currently available in loaded packages: -#' @seealso [filter_all()], [filter_if()] and [filter_at()]. 
-#' -#' @rdname filter-methods #' @name filter -#' -#' @importFrom dplyr filter -#' -#' @export +#' @rdname filter +#' @inherit dplyr::filter #' #' @examples -#' #' data(se) -#' #' se |> tidybulk() |> filter(dex=="untrt") -#' #' # Learn more in ?dplyr_tidy_eval -NULL - -#' @inheritParams filter +#' +#' @importFrom dplyr filter #' @export -filter.tidybulk <- function (.data, ..., .preserve = FALSE) -{ +filter.tidybulk <- function (.data, ..., .preserve = FALSE) { .data |> drop_class(c("tidybulk", "tt")) |> - dplyr::filter( ..., .preserve = .preserve) |> + dplyr::filter(..., .preserve = .preserve) |> # Attach attributes reattach_internals(.data) |> @@ -291,55 +111,16 @@ filter.tidybulk <- function (.data, ..., .preserve = FALSE) # Add class add_class("tt") |> add_class("tidybulk") - } -#' Group by one or more variables -#' -#' @description -#' Most data operations are done on groups defined by variables. -#' `group_by()` takes an existing tbl and converts it into a grouped tbl -#' where operations are performed "by group". `ungroup()` removes grouping. -#' -#' @family grouping functions -#' @param .data A tbl. (See dplyr) -#' @param ... In `group_by()`, variables or computations to group by. -#' In `ungroup()`, variables to remove from the grouping. -#' @param .add When `FALSE`, the default, `group_by()` will -#' override existing groups. To add to the existing groups, use -#' `.add = TRUE`. -#' -#' This argument was previously called `add`, but that prevented -#' creating a new grouping variable called `add`, and conflicts with -#' our naming conventions. -#' @param .drop When `.drop = TRUE`, empty groups are dropped. See [group_by_drop_default()] for -#' what the default value is for this argument. -#' @return A [grouped data frame][grouped_df()], unless the combination of `...` and `add` -#' yields a non empty set of grouping columns, a regular (ungrouped) data frame -#' otherwise. 
-#' @section Methods: -#' These function are **generic**s, which means that packages can provide -#' implementations (methods) for other classes. See the documentation of -#' individual methods for extra arguments and differences in behaviour. -#' -#' Methods available in currently loaded packages: -#' -#' @rdname group_by-methods #' @name group_by -#' @importFrom dplyr group_by -#' -#' @export -#' -#' @examples -#' -#' by_cyl <- mtcars |> group_by(cyl) -#' -NULL - +#' @rdname group_by +#' @inherit dplyr::group_by #' @importFrom dplyr group_by_drop_default +#' @importFrom dplyr group_by #' @export -group_by.tidybulk <- function (.data, ..., .add = FALSE, .drop = group_by_drop_default(.data)) -{ +group_by.tidybulk <- function (.data, ..., .add = FALSE, + .drop = group_by_drop_default(.data)) { .data |> drop_class(c("tidybulk", "tt")) |> dplyr::group_by( ..., .drop = .drop) |> @@ -350,20 +131,14 @@ group_by.tidybulk <- function (.data, ..., .add = FALSE, .drop = group_by_drop_d # Add class add_class("tt") |> add_class("tidybulk") - } - -#' @rdname ungroup-methods #' @name ungroup +#' @rdname ungroup +#' @inherit dplyr::ungroup #' @importFrom dplyr ungroup -#' -#' @param x A [tbl()] -#' @param ... See dplyr -#' #' @export -ungroup.tidybulk <- function (x, ...) -{ +ungroup.tidybulk <- function (x, ...) { x |> drop_class(c("tidybulk", "tt")) |> dplyr::ungroup( ...) |> @@ -374,89 +149,15 @@ ungroup.tidybulk <- function (x, ...) # Add class add_class("tt") |> add_class("tidybulk") - } -#' Summarise each group to fewer rows -#' -#' @description -#' `summarise()` creates a new data frame. It will have one (or more) rows for -#' each combination of grouping variables; if there are no grouping variables, -#' the output will have a single row summarising all observations in the input. -#' It will contain one column for each grouping variable and one column -#' for each of the summary statistics that you have specified. -#' -#' `summarise()` and `summarize()` are synonyms. 
-#' -#' @section Useful functions: -#' -#' * Center: [mean()], [median()] -#' * Spread: [sd()], [IQR()], [mad()] -#' * Range: [min()], [max()], [quantile()] -#' * Position: [first()], [last()], [nth()], -#' * Count: [n()], [n_distinct()] -#' * Logical: [any()], [all()] -#' -#' @section Backend variations: -#' -#' The data frame backend supports creating a variable and using it in the -#' same summary. This means that previously created summary variables can be -#' further transformed or combined within the summary, as in [mutate()]. -#' However, it also means that summary variables with the same names as previous -#' variables overwrite them, making those variables unavailable to later summary -#' variables. -#' -#' This behaviour may not be supported in other backends. To avoid unexpected -#' results, consider using new names for your summary variables, especially when -#' creating multiple summaries. -#' -#' @export -#' @param .data A tbl. (See dplyr) -#' @param ... <[`tidy-eval`][dplyr_tidy_eval]> Name-value pairs of summary -#' functions. The name will be the name of the variable in the result. -#' -#' The value can be: -#' -#' * A vector of length 1, e.g. `min(x)`, `n()`, or `sum(is.na(y))`. -#' * A vector of length `n`, e.g. `quantile()`. -#' * A data frame, to add multiple columns from a single expression. -#' @family single table verbs -#' @return -#' An object _usually_ of the same type as `.data`. -#' -#' * The rows come from the underlying `group_keys()`. -#' * The columns are a combination of the grouping keys and the summary -#' expressions that you provide. -#' * If `x` is grouped by more than one variable, the output will be another -#' [grouped_df] with the right-most group removed. -#' * If `x` is grouped by one variable, or is not grouped, the output will -#' be a [tibble]. -#' * Data frame attributes are **not** preserved, because `summarise()` -#' fundamentally creates a new data frame. 
-#' @section Methods: -#' This function is a **generic**, which means that packages can provide -#' implementations (methods) for other classes. See the documentation of -#' individual methods for extra arguments and differences in behaviour. -#' -#' The following methods are currently available in loaded packages: -#' @examples -#' -#' # A summary applied to ungrouped tbl returns a single row -#' -#' mtcars |> -#' summarise(mean = mean(disp)) -#' -#' -#' @rdname summarise-methods #' @name summarise +#' @aliases summarize +#' @inherit dplyr::summarise +#' @family single table verbs #' @importFrom dplyr summarise #' @export -NULL - -#' @inheritParams summarise -#' @export -summarise.tidybulk <- function (.data, ...) -{ +summarise.tidybulk <- function (.data, ...) { .data |> drop_class(c("tidybulk", "tt")) |> dplyr::summarise( ...) |> @@ -467,102 +168,21 @@ summarise.tidybulk <- function (.data, ...) # Add class add_class("tt") |> add_class("tidybulk") - } -#' Create, modify, and delete columns -#' -#' `mutate()` adds new variables and preserves existing ones; -#' `transmute()` adds new variables and drops existing ones. -#' New variables overwrite existing variables of the same name. -#' Variables can be removed by setting their value to `NULL`. -#' -#' @section Useful mutate functions: -#' -#' * [`+`], [`-`], [log()], etc., for their usual mathematical meanings -#' -#' * [lead()], [lag()] -#' -#' * [dense_rank()], [min_rank()], [percent_rank()], [row_number()], -#' [cume_dist()], [ntile()] -#' -#' * [cumsum()], [cummean()], [cummin()], [cummax()], [cumany()], [cumall()] -#' -#' * [na_if()], [coalesce()] -#' -#' * [if_else()], [recode()], [case_when()] -#' -#' @section Grouped tibbles: -#' -#' Because mutating expressions are computed within groups, they may -#' yield different results on grouped tibbles. This will be the case -#' as soon as an aggregating, lagging, or ranking function is -#' involved. 
Compare this ungrouped mutate: -#' -#' With the grouped equivalent: -#' -#' The former normalises `mass` by the global average whereas the -#' latter normalises by the averages within gender levels. -#' +#' @name summarise +#' @rdname summarise +#' @importFrom dplyr summarize #' @export -#' @param .data A tbl. (See dplyr) -#' @param ... <[`tidy-eval`][dplyr_tidy_eval]> Name-value pairs. -#' The name gives the name of the column in the output. -#' -#' The value can be: -#' -#' * A vector of length 1, which will be recycled to the correct length. -#' * A vector the same length as the current group (or the whole data frame -#' if ungrouped). -#' * `NULL`, to remove the column. -#' * A data frame or tibble, to create multiple columns in the output. -#' -#' @family single table verbs -#' @return -#' An object of the same type as `.data`. -#' -#' For `mutate()`: -#' -#' * Rows are not affected. -#' * Existing columns will be preserved unless explicitly modified. -#' * New columns will be added to the right of existing columns. -#' * Columns given value `NULL` will be removed -#' * Groups will be recomputed if a grouping variable is mutated. -#' * Data frame attributes are preserved. -#' -#' For `transmute()`: -#' -#' * Rows are not affected. -#' * Apart from grouping variables, existing columns will be remove unless -#' explicitly kept. -#' * Column order matches order of expressions. -#' * Groups will be recomputed if a grouping variable is mutated. -#' * Data frame attributes are preserved. -#' @section Methods: -#' These function are **generic**s, which means that packages can provide -#' implementations (methods) for other classes. See the documentation of -#' individual methods for extra arguments and differences in behaviour. 
-#' -#' Methods available in currently loaded packages: -#' -#' @examples -#' -#' # Newly created variables are available immediately -#' mtcars |> as_tibble() |> mutate( -#' cyl2 = cyl * 2, -#' cyl4 = cyl2 * 2 -#' ) -#' -#' @rdname mutate-methods +summarize.tidybulk <- summarise.tidybulk + #' @name mutate +#' @rdname mutate +#' @inherit dplyr::mutate +#' @family single table verbs #' @importFrom dplyr mutate #' @export -NULL - -#' @inheritParams mutate -#' @export -mutate.tidybulk <- function(.data, ...) -{ +mutate.tidybulk <- function(.data, ...) { .data |> drop_class(c("tidybulk", "tt")) |> dplyr::mutate(...) |> @@ -573,14 +193,12 @@ mutate.tidybulk <- function(.data, ...) # Add class add_class("tt") |> add_class("tidybulk") - - } -#' @inheritParams mutate +#' @inherit dplyr::mutate +#' @importFrom dplyr mutate #' @export -mutate.nested_tidybulk <- function(.data, ...) -{ +mutate.nested_tidybulk <- function(.data, ...) { .data |> drop_class(c("nested_tidybulk", "tt")) |> dplyr::mutate(...) |> @@ -591,52 +209,15 @@ mutate.nested_tidybulk <- function(.data, ...) # Add class add_class("tt") |> add_class("nested_tidybulk") - - } -#' Rename columns -#' -#' Rename individual variables using `new_name = old_name` syntax. -#' -#' @section Scoped selection and renaming: -#' -#' Use the three scoped variants ([rename_all()], [rename_if()], [rename_at()]) -#' to renaming a set of variables with a function. -#' -#' @param .data A tbl. (See dplyr) -#' @param ... <[`tidy-select`][dplyr_tidy_select]> Use `new_name = old_name` -#' to rename selected variables. -#' @return -#' An object of the same type as `.data`. -#' * Rows are not affected. -#' * Column names are changed; column order is preserved -#' * Data frame attributes are preserved. -#' * Groups are updated to reflect new names. -#' @section Methods: -#' This function is a **generic**, which means that packages can provide -#' implementations (methods) for other classes. 
See the documentation of -#' individual methods for extra arguments and differences in behaviour. -#' -#' The following methods are currently available in loaded packages: -#' @family single table verbs -#' @export -#' -#' @examples -#' -#' iris <- as_tibble(iris) # so it prints a little nicer -#' rename(iris, petal_length = Petal.Length) -#' -#' @rdname rename-methods #' @name rename +#' @rdname rename +#' @inherit dplyr::rename +#' @family single table verbs #' @importFrom dplyr rename #' @export -NULL - -#' @inheritParams rename -#' @export -rename.tidybulk <- function(.data, ...) -{ +rename.tidybulk <- function(.data, ...) { .data |> drop_class(c("tidybulk", "tt")) |> dplyr::rename(...) |> @@ -647,49 +228,14 @@ rename.tidybulk <- function(.data, ...) # Add class add_class("tt") |> add_class("tidybulk") - - } -#' Group input by rows -#' -#' -#' See [this repository](https://github.com/jennybc/row-oriented-workflows) -#' for alternative ways to perform row-wise operations. -#' -#' `rowwise()` is used for the results of [do()] when you -#' create list-variables. It is also useful to support arbitrary -#' complex operations that need to be applied to each row. -#' -#' Currently, rowwise grouping only works with data frames. Its -#' main impact is to allow you to work with list-variables in -#' [summarise()] and [mutate()] without having to -#' use \code{[[1]]}. This makes `summarise()` on a rowwise tbl -#' effectively equivalent to [plyr::ldply()]. -#' -#' @param data Input data frame. -#' @param ... Variables to be preserved when calling summarise(). This is typically a set of variables whose combination uniquely identify each row. NB: unlike group_by() you can not create new variables here but instead you can select multiple variables with (e.g.) everything(). 
-#' -#' @return A consistent object (to the input) -#' -#' A `tbl` -#' -#' @export -#' @examples -#' -#' df <- expand.grid(x = 1:3, y = 3:1) -#' df_done <- df |> rowwise() -#' -#' @rdname rowwise-methods #' @name rowwise +#' @rdname rowwise +#' @inherit dplyr::rowwise #' @importFrom dplyr rowwise #' @export -NULL - -#' @inheritParams rowwise -#' @export -rowwise.tidybulk <- function(data, ...) -{ +rowwise.tidybulk <- function(data, ...) { data |> drop_class(c("tidybulk", "tt")) |> dplyr::rowwise() |> @@ -700,37 +246,22 @@ rowwise.tidybulk <- function(data, ...) # Add class add_class("tt") |> add_class("tidybulk") - - } -#' Left join datasets -#' -#' @param x tbls to join. (See dplyr) -#' @param y tbls to join. (See dplyr) -#' @param by A character vector of variables to join by. (See dplyr) -#' @param copy If x and y are not from the same data source, and copy is TRUE, then y will be copied into the same src as x. (See dplyr) -#' @param suffix If there are non-joined duplicate variables in x and y, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2. (See dplyr) -#' @param ... Data frames to combine (See dplyr) -#' -#' @return A tt object -#' +#' @name left_join +#' @rdname left_join +#' @inherit dplyr::left_join +#' #' @examples +#' data(se_mini) +#' annotation <- se_mini |> tidybulk() |> as_tibble() |> +#' distinct(.sample) |> mutate(source = "AU") +#' se_mini |> tidybulk() |> as_tibble() |> left_join(annotation) #' -#' annotation = tidybulk::se_mini |> tidybulk() |> as_tibble() |> distinct(.sample) |> mutate(source = "AU") -#' tidybulk::se_mini |> tidybulk() |> as_tibble() |> left_join(annotation) -#' -#' @rdname dplyr-methods -#' @name left_join #' @importFrom dplyr left_join #' @export -NULL - -#' @inheritParams left_join -#' @export -left_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), - ...) 
-{ +left_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, + suffix = c(".x", ".y"), ...) { x |> drop_class(c("tidybulk", "tt")) |> dplyr::left_join(y, by = by, copy = copy, suffix = suffix, ...) |> @@ -741,35 +272,22 @@ left_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", # Add class add_class("tt") |> add_class("tidybulk") - } -#' Inner join datasets -#' -#' @param x tbls to join. (See dplyr) -#' @param y tbls to join. (See dplyr) -#' @param by A character vector of variables to join by. (See dplyr) -#' @param copy If x and y are not from the same data source, and copy is TRUE, then y will be copied into the same src as x. (See dplyr) -#' @param suffix If there are non-joined duplicate variables in x and y, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2. (See dplyr) -#' @param ... Data frames to combine (See dplyr) -#' -#' @return A tt object +#' @name inner_join +#' @rdname inner_join +#' @inherit dplyr::inner_join #' #' @examples -#' -#' annotation = tidybulk::se_mini |> tidybulk() |> as_tibble() |> distinct(.sample) |> mutate(source = "AU") -#' tidybulk::se_mini |> tidybulk() |> as_tibble() |> inner_join(annotation) +#' data(se_mini) +#' annotation <- tidybulk::se_mini |> tidybulk() |> as_tibble() |> +#' distinct(.sample) |> mutate(source = "AU") +#' se_mini |> tidybulk() |> as_tibble() |> inner_join(annotation) #' -#' @rdname join-methods -#' @name inner_join #' @importFrom dplyr inner_join #' @export -NULL - -#' @inheritParams inner_join -#' @export -inner_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...) -{ +inner_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, + suffix = c(".x", ".y"), ...) { x |> drop_class(c("tidybulk", "tt")) |> dplyr::inner_join(y, by = by, copy = copy, suffix = suffix, ...) 
|> @@ -780,36 +298,22 @@ inner_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", # Add class add_class("tt") |> add_class("tidybulk") - } -#' Right join datasets -#' -#' @param x tbls to join. (See dplyr) -#' @param y tbls to join. (See dplyr) -#' @param by A character vector of variables to join by. (See dplyr) -#' @param copy If x and y are not from the same data source, and copy is TRUE, then y will be copied into the same src as x. (See dplyr) -#' @param suffix If there are non-joined duplicate variables in x and y, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2. (See dplyr) -#' @param ... Data frames to combine (See dplyr) -#' -#' @return A tt object -#' -#' @examples +#' @name right_join +#' @rdname right_join +#' @inherit dplyr::right_join #' -#' annotation = tidybulk::se_mini |> tidybulk() |> as_tibble() |> distinct(.sample) |> mutate(source = "AU") -#' tidybulk::se_mini |> tidybulk() |> as_tibble() |> right_join(annotation) +#' @examples +#' data(se_mini) +#' annotation <- se_mini |> tidybulk() |> as_tibble() |> +#' distinct(.sample) |> mutate(source = "AU") +#' se_mini |> tidybulk() |> as_tibble() |> right_join(annotation) #' -#' @rdname join-methods -#' @name right_join #' @importFrom dplyr right_join #' @export -NULL - -#' @inheritParams right_join -#' @export -right_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), - ...) -{ +right_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, + suffix = c(".x", ".y"), ...) { x |> drop_class(c("tidybulk", "tt")) |> dplyr::right_join(y, by = by, copy = copy, suffix = suffix, ...) |> @@ -820,37 +324,22 @@ right_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", # Add class add_class("tt") |> add_class("tidybulk") - } - -#' Full join datasets -#' -#' @param x tbls to join. (See dplyr) -#' @param y tbls to join. 
(See dplyr) -#' @param by A character vector of variables to join by. (See dplyr) -#' @param copy If x and y are not from the same data source, and copy is TRUE, then y will be copied into the same src as x. (See dplyr) -#' @param suffix If there are non-joined duplicate variables in x and y, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2. (See dplyr) -#' @param ... Data frames to combine (See dplyr) -#' -#' @return A tt object +#' @name full_join +#' @rdname full_join +#' @inherit dplyr::full_join #' #' @examples -#' -#' annotation = tidybulk::se_mini |> tidybulk() |> as_tibble() |> distinct(.sample) |> mutate(source = "AU") -#' tidybulk::se_mini |> tidybulk() |> as_tibble() |> full_join(annotation) +#' data(se_mini) +#' annotation <- se_mini |> tidybulk() |> as_tibble() |> +#' distinct(.sample) |> mutate(source = "AU") +#' se_mini |> tidybulk() |> as_tibble() |> full_join(annotation) #' -#' @rdname join-methods -#' @name full_join #' @importFrom dplyr full_join #' @export -NULL - -#' @inheritParams full_join -#' @export -full_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), - ...) -{ +full_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, + suffix = c(".x", ".y"), ...) { x |> drop_class(c("tidybulk", "tt")) |> dplyr::full_join(y, by = by, copy = copy, suffix = suffix, ...) 
|> @@ -861,9 +350,12 @@ full_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", # Add class add_class("tt") |> add_class("tidybulk") - } #' @importFrom dplyr do #' @export dplyr::do + +#' @importFrom dplyr select +#' @export +dplyr::select diff --git a/R/functions.R b/R/functions.R index 7129b2df..25d127cc 100755 --- a/R/functions.R +++ b/R/functions.R @@ -2170,10 +2170,11 @@ get_reduced_dimensions_TSNE_bulk <- } # Set perprexity to not be too high - if (!"perplexity" %in% names(arguments)) - arguments = arguments %>% c(perplexity = (( - .data %>% distinct(!!.element) %>% nrow() %>% sum(-1) - ) / 3 / 2) %>% floor() %>% min(30)) + if (!"perplexity" %in% names(arguments)) { + perplexity_value <- ((nrow(distinct(.data, !!.element)) - 1) / 3 / 2) + perplexity_value <- pmin(floor(perplexity_value), 30) + arguments$perplexity <- perplexity_value + } # If not enough samples stop if (arguments$perplexity <= 2) diff --git a/R/functions_SE.R b/R/functions_SE.R index fcee85d0..e275e5e0 100755 --- a/R/functions_SE.R +++ b/R/functions_SE.R @@ -360,10 +360,11 @@ get_reduced_dimensions_TSNE_bulk_SE <- } # Set perprexity to not be too high - if (!"perplexity" %in% names(arguments)) - arguments = arguments %>% c(perplexity = (( - .data %>% ncol() %>% sum(-1) - ) / 3 / 2) %>% floor() %>% min(30)) + if (!"perplexity" %in% names(arguments)) { + perplexity_value <- ((ncol(.data) - 1) / 3 / 2) + perplexity_value <- pmin(floor(perplexity_value), 30) + arguments$perplexity <- perplexity_value + } # If not enough samples stop if (arguments$perplexity <= 2) @@ -372,8 +373,6 @@ get_reduced_dimensions_TSNE_bulk_SE <- # Calculate the most variable genes, from plotMDS Limma tsne_obj = do.call(Rtsne::Rtsne, c(list(t(.data)), arguments)) - - list( raw_result = tsne_obj, result = tsne_obj %$% diff --git a/R/methods.R b/R/methods.R index e06b5ad4..d82fec6a 100755 --- a/R/methods.R +++ b/R/methods.R @@ -4,7 +4,10 @@ setOldClass("tidybulk") #' #' `r lifecycle::badge("maturing")` #' -#' @description
tidybulk() creates an annotated `tidybulk` tibble from a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @description tidybulk() creates an annotated `tidybulk` tibble from a `tbl` +#' (with at least three columns for sample, feature and transcript abundance) +#' or `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) #' #' @importFrom rlang enquo #' @importFrom rlang quo_is_missing @@ -1271,7 +1274,9 @@ setMethod("reduce_dimensions", "tidybulk", .reduce_dimensions) #' #' `r lifecycle::badge("maturing")` #' -#' @description rotate_dimensions() takes as input a `tbl` formatted as | | | <...> | and calculates the rotated dimensional space of the transcript abundance. +#' @description rotate_dimensions() takes as input a `tbl` formatted as +#' | | | <...> | and calculates the rotated +#' dimensional space of the transcript abundance. #' #' @importFrom rlang enquo #' @importFrom magrittr "%>%" @@ -1313,7 +1318,9 @@ setMethod("reduce_dimensions", "tidybulk", .reduce_dimensions) #' identify_abundant() |> #' reduce_dimensions( method="MDS", .dims = 3) #' -#' counts.MDS.rotated = rotate_dimensions(counts.MDS, `Dim1`, `Dim2`, rotation_degrees = 45, .element = sample) +#' counts.MDS.rotated = rotate_dimensions(counts.MDS, `Dim1`, `Dim2`, +#' rotation_degrees = 45, +#' .element = sample) #' #' #' @docType methods @@ -1667,7 +1674,12 @@ setMethod("remove_redundancy", "tidybulk", .remove_redundancy) #' #' `r lifecycle::badge("maturing")` #' -#' @description adjust_abundance() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with an additional adjusted abundance column. 
This method uses scaled counts if present. +#' @description adjust_abundance() takes as input A `tbl` +#' (with at least three columns for sample, feature and transcript abundance) +#' or `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a consistent object +#' (to the input) with an additional adjusted abundance column. +#' This method uses scaled counts if present. #' #' @importFrom rlang enquo #' @importFrom magrittr "%>%" @@ -1711,7 +1723,9 @@ setMethod("remove_redundancy", "tidybulk", .remove_redundancy) #' #' cm |> #' identify_abundant() |> -#' adjust_abundance( .factor_unwanted = batch, .factor_of_interest = condition, method="combat" ) +#' adjust_abundance(.factor_unwanted = batch, +#' .factor_of_interest = condition, +#' method="combat") #' #' #' @docType methods @@ -1776,7 +1790,8 @@ setGeneric("adjust_abundance", function(.data, if (is_present(log_transform) & !is.null(log_transform)) { # Signal the deprecation to the user - deprecate_warn("1.7.4", "tidybulk::test_differential_abundance(log_transform = )", details = "The argument log_transform is now deprecated, please use transform.") + deprecate_warn("1.7.4", "tidybulk::test_differential_abundance(log_transform = )", + details = "The argument log_transform is now deprecated, please use transform.") if(log_transform){ transform = log1p @@ -2257,8 +2272,9 @@ setMethod("deconvolve_cellularity", #' #' # This function was designed for data.frame #' # Convert from SummarizedExperiment for this example. It is NOT reccomended. 
-#' -#' tidybulk::se_mini |> tidybulk() |> as_tibble() |> symbol_to_entrez(.transcript = .feature, .sample = .sample) +#' data(se_mini) +#' se_mini |> tidybulk() |> as_tibble() |> +#' symbol_to_entrez(.transcript = .feature, .sample = .sample) #' #' @export #' @@ -3429,7 +3445,10 @@ setMethod("keep_abundant", "tidybulk", .keep_abundant) #' #' `r lifecycle::badge("maturing")` #' -#' @description test_gene_enrichment() takes as input a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a `tbl` of gene set information +#' @description test_gene_enrichment() takes as input a `tbl` +#' (with at least three columns for sample, feature and transcript abundance) +#' or `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a `tbl` of gene set information #' #' @importFrom rlang enquo #' @importFrom magrittr "%>%" @@ -3453,7 +3472,7 @@ setMethod("keep_abundant", "tidybulk", .keep_abundant) #' @details This wrapper executes ensemble gene enrichment analyses of the dataset using EGSEA (DOI:0.12688/f1000research.12544.1) #' #' -#' dge = +#' dge <- #' data |> #' keep_abundant( #' factor_of_interest = !!as.symbol(parse_formula(.formula)[[1]]), @@ -3465,11 +3484,11 @@ setMethod("keep_abundant", "tidybulk", .keep_abundant) #' as_matrix(rownames = !!.entrez) %>% #' edgeR::DGEList(counts = .) 
#' -#' idx = buildIdx(entrezIDs = rownames(dge), species = species, msigdb.gsets = msigdb.gsets, +#' idx <- buildIdx(entrezIDs = rownames(dge), species = species, +#' msigdb.gsets = msigdb.gsets, #' kegg.exclude = kegg.exclude) #' #' dge |> -#' #' # Calculate weights #' limma::voom(design, plot = FALSE) |> #' @@ -3484,17 +3503,15 @@ setMethod("keep_abundant", "tidybulk", .keep_abundant) #' ) #' #' @return A consistent object (to the input) -#' -#' -#' + #' #' @examples #' \dontrun{ #' #' library(SummarizedExperiment) -#' se = tidybulk::se_mini -#' rowData( se)$entrez = rownames(se ) -#' df_entrez = aggregate_duplicates(se,.transcript = entrez ) +#' se <- tidybulk::se_mini +#' rowData(se)$entrez <- rownames(se) +#' df_entrez <- aggregate_duplicates(se, .transcript = entrez) #' #' library("EGSEA") #' @@ -3504,8 +3521,10 @@ setMethod("keep_abundant", "tidybulk", .keep_abundant) #' .sample = sample, #' .entrez = entrez, #' .abundance = count, -#' methods = c("roast" , "safe", "gage" , "padog" , "globaltest", "ora" ), -#' gene_sets = c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", "kegg_signaling"), +#' methods = c("roast", "safe", "gage", +#' "padog", "globaltest", "ora"), +#' gene_sets = c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", +#' "kegg_disease", "kegg_metabolism", "kegg_signaling"), #' species="human", #' cores = 2 #' ) @@ -3517,40 +3536,47 @@ setMethod("keep_abundant", "tidybulk", .keep_abundant) #' @export #' #' -setGeneric("test_gene_enrichment", function(.data, - .formula, - .sample = NULL, - .entrez, - .abundance = NULL, - contrasts = NULL, - methods = c("camera" , "roast" , "safe", "gage" , "padog" , "globaltest", "ora" ), - gene_sets = c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", "kegg_signaling"), - species, - cores = 10, - - # DEPRECATED - method = NULL, - .contrasts = NULL - ) - standardGeneric("test_gene_enrichment")) +setGeneric("test_gene_enrichment", function( + .data, + 
.formula, + .sample = NULL, + .entrez, + .abundance = NULL, + contrasts = NULL, + methods = c("camera", "roast", "safe", "gage", + "padog", "globaltest", "ora"), + gene_sets = c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", + "kegg_disease", "kegg_metabolism", "kegg_signaling"), + species, + cores = 10, + + # DEPRECATED + method = NULL, + .contrasts = NULL) + standardGeneric("test_gene_enrichment") +) + # Set internal #' @importFrom lifecycle deprecate_warn -.test_gene_enrichment = function(.data, - .formula, - .sample = NULL, - .entrez, - .abundance = NULL, - contrasts = NULL, - methods = c("camera" , "roast" , "safe", "gage" , "padog" , "globaltest", "ora" ), - gene_sets = c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", "kegg_signaling"), - species, - cores = 10, - - # DEPRECATED - method = NULL, - .contrasts = NULL - ) { +.test_gene_enrichment <- function( + .data, + .formula, + .sample = NULL, + .entrez, + .abundance = NULL, + contrasts = NULL, + methods = c("camera" , "roast" , "safe", "gage", + "padog", "globaltest", "ora" ), + gene_sets = c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", + "kegg_disease", "kegg_metabolism", "kegg_signaling"), + species, + cores = 10, + + # DEPRECATED + method = NULL, + .contrasts = NULL +) { # Fix NOTEs . 
= NULL @@ -3559,7 +3585,8 @@ setGeneric("test_gene_enrichment", function(.data, if (is_present(method) & !is.null(method)) { # Signal the deprecation to the user - deprecate_warn("1.3.2", "tidybulk::test_gene_enrichment(method = )", details = "The argument method is now deprecated please use methods") + deprecate_warn("1.3.2", "tidybulk::test_gene_enrichment(method = )", + details = "The argument method is now deprecated please use methods") methods = method } @@ -3567,7 +3594,8 @@ setGeneric("test_gene_enrichment", function(.data, if (is_present(.contrasts) & !is.null(.contrasts)) { # Signal the deprecation to the user - deprecate_warn("1.7.4", "tidybulk::test_differential_abundance(.contrasts = )", details = "The argument .contrasts is now deprecated please use contrasts (without the dot).") + deprecate_warn("1.7.4", "tidybulk::test_differential_abundance(.contrasts = )", + details = "The argument .contrasts is now deprecated please use contrasts (without the dot).") contrasts = .contrasts } diff --git a/R/methods_SE.R b/R/methods_SE.R index 7c86345d..cff12cc3 100755 --- a/R/methods_SE.R +++ b/R/methods_SE.R @@ -871,8 +871,7 @@ setMethod("remove_redundancy", get_assay_scaled_if_exists_SE(.data) ) - if(tolower(method) == "combat"){ - + if(tolower(method) == "combat") { my_assay_adjusted = .data |> assay(my_assay) |> # Check if log transform is needed @@ -881,8 +880,8 @@ setMethod("remove_redundancy", `+` (rnorm(length(.), 0, 0.000001)) - for(i in colnames(my_batch)){ - my_assay_adjusted = + for (i in colnames(my_batch)) { + my_assay_adjusted <- my_assay_adjusted %>% # Run combat @@ -895,11 +894,9 @@ setMethod("remove_redundancy", } # Tranfrom back - my_assay_adjusted = + my_assay_adjusted <- my_assay_adjusted %>% - expm1() |> - apply(2, pmax, 0) - + apply(MARGIN=2, FUN=function(col) pmax(expm1(col), 0)) } else if(tolower(method) == "combat_seq"){ @@ -919,16 +916,15 @@ setMethod("remove_redundancy", } } - else if(tolower(method) == "limma_remove_batch_effect") { - 
- unwanted_covariate_matrix = + else if (tolower(method) == "limma_remove_batch_effect") { + unwanted_covariate_matrix <- model.matrix( object = as.formula(sprintf("~ 0 + %s", colData(.data) |> as_tibble() |> select(!!.factor_unwanted) |> colnames() |> str_c(collapse = '+'))), # get first argument of the .formula data = colData(.data) ) - my_assay_adjusted = + my_assay_adjusted <- .data |> assay(my_assay) |> edgeR::cpm(log = T) |> @@ -937,25 +933,21 @@ setMethod("remove_redundancy", covariates = unwanted_covariate_matrix, ... ) |> - expm1() |> - apply(2, pmax, 0) - + apply(MARGIN=2, FUN=function(col) pmax(expm1(col), 0)) } else { stop("tidybulk says: the argument \"method\" must be combat_seq, combat, or limma_remove_batch_effect") } # Add the assay - my_assay_scaled = list(my_assay_adjusted) %>% setNames(value_adjusted) + my_assay_scaled <- list(my_assay_adjusted) %>% setNames(value_adjusted) - assays(.data) = assays(.data) %>% c(my_assay_scaled) + assays(.data) <- assays(.data) %>% c(my_assay_scaled) # Return .data %>% - # Add methods memorise_methods_used("sva") %>% - # Attach column internals add_tt_columns(.abundance_adjusted = !!(function(x, v) enquo(v))(x, !!as.symbol(value_adjusted))) diff --git a/R/tidyr_methods.R b/R/tidyr_methods.R index 0065c08d..7a6dfa8b 100755 --- a/R/tidyr_methods.R +++ b/R/tidyr_methods.R @@ -1,62 +1,20 @@ -#' unnest -#' -#' @importFrom tidyr unnest -#' -#' @param data A tbl. (See tidyr) -#' @param cols <[`tidy-select`][tidyr_tidy_select]> Columns to unnest. -#' If you `unnest()` multiple columns, parallel entries must be of -#' compatibble sizes, i.e. they're either equal or length 1 (following the -#' standard tidyverse recycling rules). -#' @param ... <[`tidy-select`][tidyr_tidy_select]> Columns to nest, specified -#' using name-variable pairs of the form `new_col=c(col1, col2, col3)`. -#' The right hand side can be any valid tidy select expression. 
-#' -#' \Sexpr[results=rd, stage=render]{lifecycle::badge("deprecated")}: -#' previously you could write `df %>% nest(x, y, z)` and `df %>% -#' unnest(x, y, z)`. Convert to `df %>% nest(data=c(x, y, z))`. -#' and `df %>% unnest(c(x, y, z))`. -#' -#' If you previously created new variable in `unnest()` you'll now need to -#' do it explicitly with `mutate()`. Convert `df %>% unnest(y=fun(x, y, z))` -#' to `df %>% mutate(y=fun(x, y, z)) %>% unnest(y)`. -#' @param names_sep If `NULL`, the default, the names will be left -#' as is. In `nest()`, inner names will come from the former outer names; -#' in `unnest()`, the new outer names will come from the inner names. -#' -#' If a string, the inner and outer names will be used together. In `nest()`, -#' the names of the new outer columns will be formed by pasting together the -#' outer and the inner column names, separated by `names_sep`. In `unnest()`, -#' the new inner names will have the outer names (+ `names_sep`) automatically -#' stripped. This makes `names_sep` roughly symmetric between nesting and unnesting. 
-#' @param keep_empty See tidyr::unnest -#' @param names_repair See tidyr::unnest -#' @param ptype See tidyr::unnest -#' @param .drop See tidyr::unnest -#' @param .id tidyr::unnest -#' @param .sep tidyr::unnest -#' @param .preserve See tidyr::unnest -#' -#' -#' @return A tidySummarizedExperiment objector a tibble depending on input +#' @name unnest +#' @rdname unnest +#' @inherit tidyr::unnest +#' @return A `tidybulk` tibble #' #' @examples +#' data(se_mini) +#' se_mini |> tidybulk() |> nest( data = -.feature) |> unnest(data) #' -#' -#' tidybulk::se_mini |> tidybulk() |> nest( data = -.feature) |> unnest(data) -#' -#' @rdname nest-methods -#' @name unnest -#' -#' @export -NULL - +#' @importFrom tidyr unnest +#' @importFrom rlang enquo #' @export -unnest.nested_tidybulk <- function (data, cols, ..., keep_empty=FALSE, ptype=NULL, names_sep=NULL, names_repair="check_unique", .drop, .id, .sep, .preserve) -{ +unnest.nested_tidybulk <- function (data, cols, ..., keep_empty=FALSE, + ptype=NULL, names_sep=NULL, names_repair="check_unique", + .drop, .id, .sep, .preserve) { cols <- enquo(cols) - - data %>% drop_class(c("nested_tidybulk", "tt")) %>% tidyr::unnest(!!cols, ..., keep_empty = keep_empty, ptype = ptype, @@ -68,41 +26,24 @@ unnest.nested_tidybulk <- function (data, cols, ..., keep_empty=FALSE, ptype=NUL # Add class add_class("tt") %>% add_class("tidybulk") - } -#' nest -#' -#' @importFrom tidyr nest -#' -#' @param .data A tbl. (See tidyr) -#' @param ...
Name-variable pairs of the form new_col = c(col1, col2, col3) (See tidyr) -#' -#' @return A tt object -#' -#' @examples -#' -#' tidybulk::se_mini %>% tidybulk() %>% nest( data = -.feature) -#' -#' @rdname nest-methods #' @name nest +#' @rdname nest +#' @inherit tidyr::nest #' -#' @export -NULL - -#' @importFrom rlang enquos +#' @examples +#' data(se_mini) +#' se_mini %>% tidybulk() %>% nest(data = -.feature) #' +#' @importFrom tidyr nest +#' @importFrom rlang enquos #' @export -#' -#' -#' -nest.tidybulk <- function (.data, ..., .names_sep = NULL) -{ +nest.tidybulk <- function (.data, ..., .names_sep = NULL) { cols <- enquos(...) col_name_data = names(cols) .data %>% - # This is needed otherwise nest goes into loop and fails drop_class(c("tidybulk", "tt")) %>% tidyr::nest(...) %>% @@ -121,5 +62,4 @@ nest.tidybulk <- function (.data, ..., .names_sep = NULL) # Add class add_class("tt") %>% add_class("nested_tidybulk") - } diff --git a/R/zzz.R b/R/zzz.R index cfa9fcb7..170b9430 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -3,24 +3,25 @@ version = packageDescription(pkgname, fields = "Version") msg = paste0("======================================== -", pkgname, " version ", version, " -If you use TIDYBULK in published research, please cite: - -Mangiola et al. tidybulk: an R tidy framework for modular -transcriptomic data analysis. Genome Biology 2021.
+ + This message can be suppressed by: + suppressPackageStartupMessages(library(tidybulk)) + ======================================== + ") - packageStartupMessage(msg) + # Attach tidyverse + attached <- tidyverse_attach() } -rv = R.Version() - -if(getRversion() >= "4.0.0" && as.numeric(rv$`svn rev`) >= 77889) { - unitType = get("unitType", envir = asNamespace("grid")) -} else { - unitType = function(x, recurse = TRUE) attr(x, "unit") -} \ No newline at end of file +# rv = R.Version() +# +# if(getRversion() >= "4.0.0" && as.numeric(rv$`svn rev`) >= 77889) { +# unitType = get("unitType", envir = asNamespace("grid")) +# } else { +# unitType = function(x, recurse = TRUE) attr(x, "unit") +# } \ No newline at end of file diff --git a/man/adjust_abundance-methods.Rd b/man/adjust_abundance-methods.Rd index 8ee70a84..7eac35aa 100644 --- a/man/adjust_abundance-methods.Rd +++ b/man/adjust_abundance-methods.Rd @@ -147,7 +147,12 @@ A `SummarizedExperiment` object A `SummarizedExperiment` object } \description{ -adjust_abundance() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with an additional adjusted abundance column. This method uses scaled counts if present. +adjust_abundance() takes as input A `tbl` +(with at least three columns for sample, feature and transcript abundance) +or `SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a consistent object +(to the input) with an additional adjusted abundance column. +This method uses scaled counts if present. 
} \details{ `r lifecycle::badge("maturing")` @@ -168,7 +173,9 @@ cm$batch[colnames(cm) \%in\% c("SRR1740035", "SRR1740043")] = 1 cm |> identify_abundant() |> -adjust_abundance( .factor_unwanted = batch, .factor_of_interest = condition, method="combat" ) +adjust_abundance(.factor_unwanted = batch, + .factor_of_interest = condition, + method="combat") } diff --git a/man/arrange-methods.Rd b/man/arrange-methods.Rd deleted file mode 100644 index 477fb465..00000000 --- a/man/arrange-methods.Rd +++ /dev/null @@ -1,66 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{arrange} -\alias{arrange} -\title{Arrange rows by column values} -\arguments{ -\item{.data}{A data frame, data frame extension (e.g. a tibble), or a -lazy data frame (e.g. from dbplyr or dtplyr). See *Methods*, below, for -more details.} - -\item{...}{<[`tidy-eval`][dplyr_tidy_eval]> Variables, or functions or -variables. Use [desc()] to sort a variable in descending order.} - -\item{.by_group}{If TRUE, will sort first by grouping variable. Applies to grouped data frames only.} -} -\value{ -An object of the same type as `.data`. - -* All rows appear in the output, but (usually) in a different place. -* Columns are not modified. -* Groups are not modified. -* Data frame attributes are preserved. - -A tibble -} -\description{ -`arrange()` order the rows of a data frame rows by the values of selected -columns. - -Unlike other dplyr verbs, `arrange()` largely ignores grouping; you -need to explicit mention grouping variables (or use `by_group = TRUE`) -in order to group by them, and functions of variables are evaluated -once per data frame, not once per group. -} -\details{ -## Locales -The sort order for character vectors will depend on the collating sequence -of the locale in use: see [locales()]. - -## Missing values -Unlike base sorting with `sort()`, `NA` are: -* always sorted to the end for local data, even when wrapped with `desc()`. 
-* treated differently for remote data, depending on the backend. -} -\section{Methods}{ - -This function is a **generic**, which means that packages can provide -implementations (methods) for other classes. See the documentation of -individual methods for extra arguments and differences in behaviour. - -The following methods are currently available in loaded packages: -} - -\examples{ - -arrange(mtcars, cyl, disp) - -} -\seealso{ -Other single table verbs: -\code{\link{filter}()}, -\code{\link{mutate}()}, -\code{\link{rename}()}, -\code{\link{summarise}()} -} -\concept{single table verbs} diff --git a/man/arrange.Rd b/man/arrange.Rd new file mode 100644 index 00000000..58180ef4 --- /dev/null +++ b/man/arrange.Rd @@ -0,0 +1,91 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{arrange} +\alias{arrange} +\alias{arrange.tidybulk} +\title{Order rows using column values} +\usage{ +\method{arrange}{tidybulk}(.data, ..., .by_group = FALSE) +} +\arguments{ +\item{.data}{A data frame, data frame extension (e.g. a tibble), or a +lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Variables, or +functions of variables. Use \code{\link[dplyr:desc]{desc()}} to sort a variable in descending +order.} + +\item{.by_group}{If \code{TRUE}, will sort first by grouping variable. Applies to +grouped data frames only.} +} +\value{ +An object of the same type as \code{.data}. The output has the following +properties: +\itemize{ +\item All rows appear in the output, but (usually) in a different place. +\item Columns are not modified. +\item Groups are not modified. +\item Data frame attributes are preserved. +} +} +\description{ +\code{arrange()} orders the rows of a data frame by the values of selected +columns. 
+ +Unlike other dplyr verbs, \code{arrange()} largely ignores grouping; you +need to explicitly mention grouping variables (or use \code{.by_group = TRUE}) +in order to group by them, and functions of variables are evaluated +once per data frame, not once per group. +} +\details{ +\subsection{Missing values}{ + +Unlike base sorting with \code{sort()}, \code{NA} are: +\itemize{ +\item always sorted to the end for local data, even when wrapped with \code{desc()}. +\item treated differently for remote data, depending on the backend. +} +} +} +\section{Methods}{ + + +This function is a \strong{generic}, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +The following methods are currently available in loaded packages: +\Sexpr[stage=render,results=rd]{dplyr:::methods_rd("arrange")}. + +} + +\examples{ +arrange(mtcars, cyl, disp) +arrange(mtcars, desc(disp)) + +# grouped arrange ignores groups +by_cyl <- mtcars \%>\% group_by(cyl) +by_cyl \%>\% arrange(desc(wt)) +# Unless you specifically ask: +by_cyl \%>\% arrange(desc(wt), .by_group = TRUE) + +# use embracing when wrapping in a function; +# see ?rlang::args_data_masking for more details +tidy_eval_arrange <- function(.data, var) { + .data \%>\% + arrange({{ var }}) +} +tidy_eval_arrange(mtcars, mpg) + +# Use `across()` or `pick()` to select columns with tidy-select +iris \%>\% arrange(pick(starts_with("Sepal"))) +iris \%>\% arrange(across(starts_with("Sepal"), desc)) +} +\seealso{ +Other single table verbs: +\code{\link{mutate}()}, +\code{\link{rename}()}, +\code{\link{summarise}()} +} +\concept{single table verbs} diff --git a/man/bind_rows.Rd b/man/bind_rows.Rd index b98c804e..a2c3110d 100644 --- a/man/bind_rows.Rd +++ b/man/bind_rows.Rd @@ -2,7 +2,15 @@ % Please edit documentation in R/dplyr_methods.R \name{bind_rows} \alias{bind_rows} -\title{Efficiently bind multiple data frames by row 
and column} +\alias{bind_rows.tidybulk} +\alias{bind_cols.tidybulk} +\alias{bind_cols} +\title{Efficiently bind multiple data frames by row and column} +\usage{ +\method{bind_rows}{tidybulk}(..., .id = NULL) + +\method{bind_cols}{tidybulk}(..., .id = NULL) +} \arguments{ \item{...}{Data frames to combine. @@ -24,29 +32,37 @@ list of data frames is supplied, the labels are taken from the names of the list. If no names are found a numeric sequence is used instead.} - -\item{add.cell.ids}{from Seurat 3.0 A character vector of length(x = c(x, y)). Appends the corresponding values to the start of each objects' cell names.} } \value{ +`bind_rows()` and `bind_cols()` return the same type as + the first input, either a data frame, `tbl_df`, or `grouped_df`. + `bind_rows()` and `bind_cols()` return the same type as the first input, either a data frame, `tbl_df`, or `grouped_df`. } \description{ +This is an efficient implementation of the common pattern of +`do.call(rbind, dfs)` or `do.call(cbind, dfs)` for binding many +data frames into one. + This is an efficient implementation of the common pattern of `do.call(rbind, dfs)` or `do.call(cbind, dfs)` for binding many data frames into one. } \details{ +The output of `bind_rows()` will contain a column if that column +appears in any of the inputs. + The output of `bind_rows()` will contain a column if that column appears in any of the inputs.
} \examples{ data(se_mini) -se_mini_tidybulk = se_mini |> tidybulk() -bind_rows( se_mini_tidybulk, se_mini_tidybulk ) +se_mini_tidybulk <- se_mini |> tidybulk() +bind_rows(se_mini_tidybulk, se_mini_tidybulk) -tt_bind = se_mini_tidybulk |> select(time, condition) +tt_bind <- se_mini_tidybulk |> select(time, condition) se_mini_tidybulk |> bind_cols(tt_bind) } diff --git a/man/distinct-methods.Rd b/man/distinct-methods.Rd deleted file mode 100644 index 6bf62635..00000000 --- a/man/distinct-methods.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{distinct} -\alias{distinct} -\title{distinct} -\arguments{ -\item{.data}{A tbl. (See dplyr)} - -\item{...}{Data frames to combine (See dplyr)} - -\item{.keep_all}{If TRUE, keep all variables in .data. If a combination of ... is not distinct, this keeps the first row of values. (See dplyr)} -} -\value{ -A tt object -} -\description{ -distinct -} -\examples{ - -tidybulk::se_mini |> tidybulk() |> distinct() - - -} diff --git a/man/distinct.Rd b/man/distinct.Rd new file mode 100644 index 00000000..b362a370 --- /dev/null +++ b/man/distinct.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{distinct} +\alias{distinct} +\alias{distinct.tidybulk} +\title{Keep distinct/unique rows} +\usage{ +\method{distinct}{tidybulk}(.data, ..., .keep_all = FALSE) +} +\arguments{ +\item{.data}{A data frame, data frame extension (e.g. a tibble), or a +lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Optional variables to +use when determining uniqueness. If there are multiple rows for a given +combination of inputs, only the first row will be preserved. If omitted, +will use all variables in the data frame.} + +\item{.keep_all}{If \code{TRUE}, keep all variables in \code{.data}. 
+If a combination of \code{...} is not distinct, this keeps the +first row of values.} +} +\value{ +An object of the same type as \code{.data}. The output has the following +properties: +\itemize{ +\item Rows are a subset of the input but appear in the same order. +\item Columns are not modified if \code{...} is empty or \code{.keep_all} is \code{TRUE}. +Otherwise, \code{distinct()} first calls \code{mutate()} to create new columns. +\item Groups are not modified. +\item Data frame attributes are preserved. +} +} +\description{ +Keep only unique/distinct rows from a data frame. This is similar +to \code{\link[=unique.data.frame]{unique.data.frame()}} but considerably faster. +} +\section{Methods}{ + + +This function is a \strong{generic}, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +The following methods are currently available in loaded packages: +\Sexpr[stage=render,results=rd]{dplyr:::methods_rd("distinct")}. + +} + +\examples{ +data(se_mini) +se_mini |> tidybulk() |> distinct() + +} diff --git a/man/dplyr-methods.Rd b/man/dplyr-methods.Rd deleted file mode 100644 index fbdf8de4..00000000 --- a/man/dplyr-methods.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{bind_cols} -\alias{bind_cols} -\alias{left_join} -\title{Left join datasets} -\arguments{ -\item{x}{tbls to join. (See dplyr)} - -\item{y}{tbls to join. (See dplyr)} - -\item{by}{A character vector of variables to join by. (See dplyr)} - -\item{copy}{If x and y are not from the same data source, and copy is TRUE, then y will be copied into the same src as x. (See dplyr)} - -\item{suffix}{If there are non-joined duplicate variables in x and y, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2. 
(See dplyr)} - -\item{...}{Data frames to combine (See dplyr)} -} -\value{ -A tt object -} -\description{ -Left join datasets -} -\examples{ - -annotation = tidybulk::se_mini |> tidybulk() |> as_tibble() |> distinct(.sample) |> mutate(source = "AU") -tidybulk::se_mini |> tidybulk() |> as_tibble() |> left_join(annotation) - -} diff --git a/man/filter-methods.Rd b/man/filter-methods.Rd deleted file mode 100644 index c9a6c68b..00000000 --- a/man/filter-methods.Rd +++ /dev/null @@ -1,85 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{filter} -\alias{filter} -\title{Subset rows using column values} -\arguments{ -\item{.data}{A tbl. (See dplyr)} - -\item{...}{<[`tidy-eval`][dplyr_tidy_eval]> Logical predicates defined in -terms of the variables in `.data`. -Multiple conditions are combined with `&`. Only rows where the -condition evaluates to `TRUE` are kept.} - -\item{.preserve}{when `FALSE` (the default), the grouping structure -is recalculated based on the resulting data, otherwise it is kept as is.} -} -\value{ -An object of the same type as `.data`. - -* Rows are a subset of the input, but appear in the same order. -* Columns are not modified. -* The number of groups may be reduced (if `.preserve` is not `TRUE`). -* Data frame attributes are preserved. -} -\description{ -`filter()` retains the rows where the conditions you provide a `TRUE`. Note -that, unlike base subsetting with `[`, rows where the condition evaluates -to `NA` are dropped. -} -\details{ -dplyr is not yet smart enough to optimise filtering optimisation -on grouped datasets that don't need grouped calculations. For this reason, -filtering is often considerably faster on [ungroup()]ed data. 
-} -\section{Useful filter functions}{ - - -* [`==`], [`>`], [`>=`] etc -* [`&`], [`|`], [`!`], [xor()] -* [is.na()] -* [between()], [near()] -} - -\section{Grouped tibbles}{ - - -Because filtering expressions are computed within groups, they may -yield different results on grouped tibbles. This will be the case -as soon as an aggregating, lagging, or ranking function is -involved. Compare this ungrouped filtering: - - -The former keeps rows with `mass` greater than the global average -whereas the latter keeps rows with `mass` greater than the gender - -average. -} - -\section{Methods}{ - -This function is a **generic**, which means that packages can provide -implementations (methods) for other classes. See the documentation of -individual methods for extra arguments and differences in behaviour. - -The following methods are currently available in loaded packages: -} - -\examples{ - -data(se) - -se |> tidybulk() |> filter(dex=="untrt") - -# Learn more in ?dplyr_tidy_eval -} -\seealso{ -[filter_all()], [filter_if()] and [filter_at()]. - -Other single table verbs: -\code{\link{arrange}()}, -\code{\link{mutate}()}, -\code{\link{rename}()}, -\code{\link{summarise}()} -} -\concept{single table verbs} diff --git a/man/filter.Rd b/man/filter.Rd new file mode 100644 index 00000000..7f568a20 --- /dev/null +++ b/man/filter.Rd @@ -0,0 +1,116 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{filter} +\alias{filter} +\alias{filter.tidybulk} +\title{Keep rows that match a condition} +\usage{ +\method{filter}{tidybulk}(.data, ..., .preserve = FALSE) +} +\arguments{ +\item{.data}{A data frame, data frame extension (e.g. a tibble), or a +lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Expressions that +return a logical value, and are defined in terms of the variables in +\code{.data}. 
If multiple expressions are included, they are combined with the +\code{&} operator. Only rows for which all conditions evaluate to \code{TRUE} are +kept.} + +\item{.preserve}{Relevant when the \code{.data} input is grouped. +If \code{.preserve = FALSE} (the default), the grouping structure +is recalculated based on the resulting data, otherwise the grouping is kept as is.} +} +\value{ +An object of the same type as \code{.data}. The output has the following properties: +\itemize{ +\item Rows are a subset of the input, but appear in the same order. +\item Columns are not modified. +\item The number of groups may be reduced (if \code{.preserve} is not \code{TRUE}). +\item Data frame attributes are preserved. +} +} +\description{ +The \code{filter()} function is used to subset a data frame, +retaining all rows that satisfy your conditions. +To be retained, the row must produce a value of \code{TRUE} for all conditions. +Note that when a condition evaluates to \code{NA} +the row will be dropped, unlike base subsetting with \code{[}. +} +\details{ +The \code{filter()} function is used to subset the rows of +\code{.data}, applying the expressions in \code{...} to the column values to determine which +rows should be retained. It can be applied to both grouped and ungrouped data (see \code{\link[dplyr:group_by]{group_by()}} and +\code{\link[dplyr:ungroup]{ungroup()}}). However, dplyr is not yet smart enough to optimise the filtering +operation on grouped datasets that do not need grouped calculations. For this +reason, filtering is often considerably faster on ungrouped data. 
+} +\section{Useful filter functions}{ + + + +There are many functions and operators that are useful when constructing the +expressions used to filter the data: +\itemize{ +\item \code{\link{==}}, \code{\link{>}}, \code{\link{>=}} etc +\item \code{\link{&}}, \code{\link{|}}, \code{\link{!}}, \code{\link[=xor]{xor()}} +\item \code{\link[=is.na]{is.na()}} +\item \code{\link[dplyr:between]{between()}}, \code{\link[dplyr:near]{near()}} +} + +} + +\section{Grouped tibbles}{ + + + +Because filtering expressions are computed within groups, they may +yield different results on grouped tibbles. This will be the case +as soon as an aggregating, lagging, or ranking function is +involved. Compare this ungrouped filtering: + +\if{html}{\out{
}}\preformatted{starwars \%>\% filter(mass > mean(mass, na.rm = TRUE)) +}\if{html}{\out{
}} + +With the grouped equivalent: + +\if{html}{\out{
}}\preformatted{starwars \%>\% group_by(gender) \%>\% filter(mass > mean(mass, na.rm = TRUE)) +}\if{html}{\out{
}} + +In the ungrouped version, \code{filter()} compares the value of \code{mass} in each row to +the global average (taken over the whole data set), keeping only the rows with +\code{mass} greater than this global average. In contrast, the grouped version calculates +the average mass separately for each \code{gender} group, and keeps rows with \code{mass} greater +than the relevant within-gender average. + +} + +\section{Methods}{ + + +This function is a \strong{generic}, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +The following methods are currently available in loaded packages: +\Sexpr[stage=render,results=rd]{dplyr:::methods_rd("filter")}. + +} + +\examples{ +data(se) +se |> tidybulk() |> filter(dex=="untrt") +# Learn more in ?dplyr_tidy_eval + +} +\seealso{ +Other single table verbs: +\code{\link[dplyr]{arrange}()}, +\code{\link[dplyr]{mutate}()}, +\code{\link[dplyr]{reframe}()}, +\code{\link[dplyr]{rename}()}, +\code{\link[dplyr]{select}()}, +\code{\link[dplyr]{slice}()}, +\code{\link[dplyr]{summarise}()} +} diff --git a/man/full_join.Rd b/man/full_join.Rd new file mode 100644 index 00000000..1a60ee82 --- /dev/null +++ b/man/full_join.Rd @@ -0,0 +1,172 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{full_join} +\alias{full_join} +\alias{full_join.tidybulk} +\title{Mutating joins} +\usage{ +\method{full_join}{tidybulk}(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...) +} +\arguments{ +\item{x, y}{A pair of data frames, data frame extensions (e.g. a tibble), or +lazy data frames (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{by}{A join specification created with \code{\link[dplyr:join_by]{join_by()}}, or a character +vector of variables to join by. 
+ +If \code{NULL}, the default, \verb{*_join()} will perform a natural join, using all +variables in common across \code{x} and \code{y}. A message lists the variables so +that you can check they're correct; suppress the message by supplying \code{by} +explicitly. + +To join on different variables between \code{x} and \code{y}, use a \code{\link[dplyr:join_by]{join_by()}} +specification. For example, \code{join_by(a == b)} will match \code{x$a} to \code{y$b}. + +To join by multiple variables, use a \code{\link[dplyr:join_by]{join_by()}} specification with +multiple expressions. For example, \code{join_by(a == b, c == d)} will match +\code{x$a} to \code{y$b} and \code{x$c} to \code{y$d}. If the column names are the same between +\code{x} and \code{y}, you can shorten this by listing only the variable names, like +\code{join_by(a, c)}. + +\code{\link[dplyr:join_by]{join_by()}} can also be used to perform inequality, rolling, and overlap +joins. See the documentation at \link[dplyr:join_by]{?join_by} for details on +these types of joins. + +For simple equality joins, you can alternatively specify a character vector +of variable names to join by. For example, \code{by = c("a", "b")} joins \code{x$a} +to \code{y$a} and \code{x$b} to \code{y$b}. If variable names differ between \code{x} and \code{y}, +use a named character vector like \code{by = c("x_a" = "y_a", "x_b" = "y_b")}. + +To perform a cross-join, generating all combinations of \code{x} and \code{y}, see +\code{\link[dplyr:cross_join]{cross_join()}}.} + +\item{copy}{If \code{x} and \code{y} are not from the same data source, +and \code{copy} is \code{TRUE}, then \code{y} will be copied into the +same src as \code{x}. This allows you to join tables across srcs, but +it is a potentially expensive operation so you must opt into it.} + +\item{suffix}{If there are non-joined duplicate variables in \code{x} and +\code{y}, these suffixes will be added to the output to disambiguate them. 
+Should be a character vector of length 2.} + +\item{...}{Other parameters passed onto methods.} +} +\value{ +An object of the same type as \code{x} (including the same groups). The order of +the rows and columns of \code{x} is preserved as much as possible. The output has +the following properties: +\itemize{ +\item The rows are affected by the join type. +\itemize{ +\item \code{inner_join()} returns matched \code{x} rows. +\item \code{left_join()} returns all \code{x} rows. +\item \code{right_join()} returns matched \code{x} rows, followed by unmatched \code{y} rows. +\item \code{full_join()} returns all \code{x} rows, followed by unmatched \code{y} rows. +} +\item Output columns include all columns from \code{x} and all non-key columns from +\code{y}. If \code{keep = TRUE}, the key columns from \code{y} are included as well. +\item If non-key columns in \code{x} and \code{y} have the same name, \code{suffix}es are added +to disambiguate. If \code{keep = TRUE} and key columns in \code{x} and \code{y} have +the same name, \code{suffix}es are added to disambiguate these as well. +\item If \code{keep = FALSE}, output columns included in \code{by} are coerced to their +common type between \code{x} and \code{y}. +} +} +\description{ +Mutating joins add columns from \code{y} to \code{x}, matching observations based on +the keys. There are four mutating joins: the inner join, and the three outer +joins. +\subsection{Inner join}{ + +An \code{inner_join()} only keeps observations from \code{x} that have a matching key +in \code{y}. + +The most important property of an inner join is that unmatched rows in either +input are not included in the result. This means that generally inner joins +are not appropriate in most analyses, because it is too easy to lose +observations. +} + +\subsection{Outer joins}{ + +The three outer joins keep observations that appear in at least one of the +data frames: +\itemize{ +\item A \code{left_join()} keeps all observations in \code{x}.
+\item A \code{right_join()} keeps all observations in \code{y}. +\item A \code{full_join()} keeps all observations in \code{x} and \code{y}. +} +} +} +\section{Many-to-many relationships}{ + + + +By default, dplyr guards against many-to-many relationships in equality joins +by throwing a warning. These occur when both of the following are true: +\itemize{ +\item A row in \code{x} matches multiple rows in \code{y}. +\item A row in \code{y} matches multiple rows in \code{x}. +} + +This is typically surprising, as most joins involve a relationship of +one-to-one, one-to-many, or many-to-one, and is often the result of an +improperly specified join. Many-to-many relationships are particularly +problematic because they can result in a Cartesian explosion of the number of +rows returned from the join. + +If a many-to-many relationship is expected, silence this warning by +explicitly setting \code{relationship = "many-to-many"}. + +In production code, it is best to preemptively set \code{relationship} to whatever +relationship you expect to exist between the keys of \code{x} and \code{y}, as this +forces an error to occur immediately if the data doesn't align with your +expectations. + +Inequality joins typically result in many-to-many relationships by nature, so +they don't warn on them by default, but you should still take extra care when +specifying an inequality join, because they also have the capability to +return a large number of rows. + +Rolling joins don't warn on many-to-many relationships either, but many +rolling joins follow a many-to-one relationship, so it is often useful to +set \code{relationship = "many-to-one"} to enforce this. + +Note that in SQL, most database providers won't let you specify a +many-to-many relationship between two tables, instead requiring that you +create a third \emph{junction table} that results in two one-to-many relationships +instead. 
+ +} + +\section{Methods}{ + + +These functions are \strong{generic}s, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\itemize{ +\item \code{inner_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("inner_join")}. +\item \code{left_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("left_join")}. +\item \code{right_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("right_join")}. +\item \code{full_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("full_join")}. +} + +} + +\examples{ +data(se_mini) +annotation <- se_mini |> tidybulk() |> as_tibble() |> + distinct(.sample) |> mutate(source = "AU") +se_mini |> tidybulk() |> as_tibble() |> full_join(annotation) + +} +\seealso{ +Other joins: +\code{\link[dplyr]{cross_join}()}, +\code{\link[dplyr]{filter-joins}}, +\code{\link[dplyr]{nest_join}()} +} diff --git a/man/group_by-methods.Rd b/man/group_by-methods.Rd deleted file mode 100644 index 3fcd8adc..00000000 --- a/man/group_by-methods.Rd +++ /dev/null @@ -1,47 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{group_by} -\alias{group_by} -\title{Group by one or more variables} -\arguments{ -\item{.data}{A tbl. (See dplyr)} - -\item{...}{In `group_by()`, variables or computations to group by. -In `ungroup()`, variables to remove from the grouping.} - -\item{.add}{When `FALSE`, the default, `group_by()` will - override existing groups. To add to the existing groups, use - `.add = TRUE`. - - This argument was previously called `add`, but that prevented - creating a new grouping variable called `add`, and conflicts with - our naming conventions.} - -\item{.drop}{When `.drop = TRUE`, empty groups are dropped. 
See [group_by_drop_default()] for -what the default value is for this argument.} -} -\value{ -A [grouped data frame][grouped_df()], unless the combination of `...` and `add` - yields a non empty set of grouping columns, a regular (ungrouped) data frame - otherwise. -} -\description{ -Most data operations are done on groups defined by variables. -`group_by()` takes an existing tbl and converts it into a grouped tbl -where operations are performed "by group". `ungroup()` removes grouping. -} -\section{Methods}{ - -These function are **generic**s, which means that packages can provide -implementations (methods) for other classes. See the documentation of -individual methods for extra arguments and differences in behaviour. - -Methods available in currently loaded packages: -} - -\examples{ - -by_cyl <- mtcars |> group_by(cyl) - -} -\concept{grouping functions} diff --git a/man/group_by.Rd b/man/group_by.Rd new file mode 100644 index 00000000..5ff1f037 --- /dev/null +++ b/man/group_by.Rd @@ -0,0 +1,162 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{group_by} +\alias{group_by} +\alias{group_by.tidybulk} +\title{Group by one or more variables} +\usage{ +\method{group_by}{tidybulk}(.data, ..., .add = FALSE, .drop = group_by_drop_default(.data)) +} +\arguments{ +\item{.data}{A data frame, data frame extension (e.g. a tibble), or a +lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{...}{In \code{group_by()}, variables or computations to group by. +Computations are always done on the ungrouped data frame. +To perform computations on the grouped data, you need to use +a separate \code{mutate()} step before the \code{group_by()}. +Computations are not allowed in \code{nest_by()}. +In \code{ungroup()}, variables to remove from the grouping.} + +\item{.add}{When \code{FALSE}, the default, \code{group_by()} will +override existing groups. 
To add to the existing groups, use +\code{.add = TRUE}. + +This argument was previously called \code{add}, but that prevented +creating a new grouping variable called \code{add}, and conflicts with +our naming conventions.} + +\item{.drop}{Drop groups formed by factor levels that don't appear in the +data? The default is \code{TRUE} except when \code{.data} has been previously +grouped with \code{.drop = FALSE}. See \code{\link[dplyr:group_by_drop_default]{group_by_drop_default()}} for details.} +} +\value{ +A grouped data frame with class \code{\link[dplyr]{grouped_df}}, +unless the combination of \code{...} and \code{.add} yields an empty set of +grouping columns, in which case a tibble will be returned. +} +\description{ +Most data operations are done on groups defined by variables. +\code{group_by()} takes an existing tbl and converts it into a grouped tbl +where operations are performed "by group". \code{ungroup()} removes grouping. +} +\section{Methods}{ + + +These functions are \strong{generic}s, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\itemize{ +\item \code{group_by()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("group_by")}. +\item \code{ungroup()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("ungroup")}. +} + +} + +\section{Ordering}{ + + +Currently, \code{group_by()} internally orders the groups in ascending order. This +results in ordered output from functions that aggregate groups, such as +\code{\link[dplyr:summarise]{summarise()}}. + +When used as grouping columns, character vectors are ordered in the C locale +for performance and reproducibility across R sessions.
If the resulting +ordering of your grouped operation matters and is dependent on the locale, +you should follow up the grouped operation with an explicit call to +\code{\link[dplyr:arrange]{arrange()}} and set the \code{.locale} argument. For example: + +\if{html}{\out{
}}\preformatted{data \%>\% + group_by(chr) \%>\% + summarise(avg = mean(x)) \%>\% + arrange(chr, .locale = "en") +}\if{html}{\out{
}} + +This is often useful as a preliminary step before generating content intended +for humans, such as an HTML table. +\subsection{Legacy behavior}{ + +Prior to dplyr 1.1.0, character vector grouping columns were ordered in the +system locale. If you need to temporarily revert to this behavior, you can +set the global option \code{dplyr.legacy_locale} to \code{TRUE}, but this should be +used sparingly and you should expect this option to be removed in a future +version of dplyr. It is better to update existing code to explicitly call +\code{arrange(.locale = )} instead. Note that setting \code{dplyr.legacy_locale} will +also force calls to \code{\link[dplyr:arrange]{arrange()}} to use the system locale. +} + +} + +\examples{ +by_cyl <- mtcars \%>\% group_by(cyl) + +# grouping doesn't change how the data looks (apart from listing +# how it's grouped): +by_cyl + +# It changes how it acts with the other dplyr verbs: +by_cyl \%>\% summarise( + disp = mean(disp), + hp = mean(hp) +) +by_cyl \%>\% filter(disp == max(disp)) + +# Each call to summarise() removes a layer of grouping +by_vs_am <- mtcars \%>\% group_by(vs, am) +by_vs <- by_vs_am \%>\% summarise(n = n()) +by_vs +by_vs \%>\% summarise(n = sum(n)) + +# To remove grouping, use ungroup() +by_vs \%>\% + ungroup() \%>\% + summarise(n = sum(n)) + +# By default, group_by() overrides existing grouping +by_cyl \%>\% + group_by(vs, am) \%>\% + group_vars() + +# Use .add = TRUE to instead append +by_cyl \%>\% + group_by(vs, am, .add = TRUE) \%>\% + group_vars() + +# You can group by expressions: this is a short-hand +# for a mutate() followed by a group_by() +mtcars \%>\% + group_by(vsam = vs + am) + +# The implicit mutate() step is always performed on the +# ungrouped data. Here we get 3 groups: +mtcars \%>\% + group_by(vs) \%>\% + group_by(hp_cut = cut(hp, 3)) + +# If you want it to be performed by groups, +# you have to use an explicit mutate() call.
+# Here we get 3 groups per value of vs +mtcars \%>\% + group_by(vs) \%>\% + mutate(hp_cut = cut(hp, 3)) \%>\% + group_by(hp_cut) + +# when factors are involved and .drop = FALSE, groups can be empty +tbl <- tibble( + x = 1:10, + y = factor(rep(c("a", "c"), each = 5), levels = c("a", "b", "c")) +) +tbl \%>\% + group_by(y, .drop = FALSE) \%>\% + group_rows() +} +\seealso{ +Other grouping functions: +\code{\link[dplyr]{group_map}()}, +\code{\link[dplyr]{group_nest}()}, +\code{\link[dplyr]{group_split}()}, +\code{\link[dplyr]{group_trim}()} +} diff --git a/man/inner_join.Rd b/man/inner_join.Rd new file mode 100644 index 00000000..1556c9f4 --- /dev/null +++ b/man/inner_join.Rd @@ -0,0 +1,172 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{inner_join} +\alias{inner_join} +\alias{inner_join.tidybulk} +\title{Mutating joins} +\usage{ +\method{inner_join}{tidybulk}(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...) +} +\arguments{ +\item{x, y}{A pair of data frames, data frame extensions (e.g. a tibble), or +lazy data frames (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{by}{A join specification created with \code{\link[dplyr:join_by]{join_by()}}, or a character +vector of variables to join by. + +If \code{NULL}, the default, \verb{*_join()} will perform a natural join, using all +variables in common across \code{x} and \code{y}. A message lists the variables so +that you can check they're correct; suppress the message by supplying \code{by} +explicitly. + +To join on different variables between \code{x} and \code{y}, use a \code{\link[dplyr:join_by]{join_by()}} +specification. For example, \code{join_by(a == b)} will match \code{x$a} to \code{y$b}. + +To join by multiple variables, use a \code{\link[dplyr:join_by]{join_by()}} specification with +multiple expressions. 
For example, \code{join_by(a == b, c == d)} will match +\code{x$a} to \code{y$b} and \code{x$c} to \code{y$d}. If the column names are the same between +\code{x} and \code{y}, you can shorten this by listing only the variable names, like +\code{join_by(a, c)}. + +\code{\link[dplyr:join_by]{join_by()}} can also be used to perform inequality, rolling, and overlap +joins. See the documentation at \link[dplyr:join_by]{?join_by} for details on +these types of joins. + +For simple equality joins, you can alternatively specify a character vector +of variable names to join by. For example, \code{by = c("a", "b")} joins \code{x$a} +to \code{y$a} and \code{x$b} to \code{y$b}. If variable names differ between \code{x} and \code{y}, +use a named character vector like \code{by = c("x_a" = "y_a", "x_b" = "y_b")}. + +To perform a cross-join, generating all combinations of \code{x} and \code{y}, see +\code{\link[dplyr:cross_join]{cross_join()}}.} + +\item{copy}{If \code{x} and \code{y} are not from the same data source, +and \code{copy} is \code{TRUE}, then \code{y} will be copied into the +same src as \code{x}. This allows you to join tables across srcs, but +it is a potentially expensive operation so you must opt into it.} + +\item{suffix}{If there are non-joined duplicate variables in \code{x} and +\code{y}, these suffixes will be added to the output to disambiguate them. +Should be a character vector of length 2.} + +\item{...}{Other parameters passed onto methods.} +} +\value{ +An object of the same type as \code{x} (including the same groups). The order of +the rows and columns of \code{x} is preserved as much as possible. The output has +the following properties: +\itemize{ +\item The rows are affected by the join type. +\itemize{ +\item \code{inner_join()} returns matched \code{x} rows. +\item \code{left_join()} returns all \code{x} rows. +\item \code{right_join()} returns matched \code{x} rows, followed by unmatched \code{y} rows.
+\item \code{full_join()} returns all \code{x} rows, followed by unmatched \code{y} rows. +} +\item Output columns include all columns from \code{x} and all non-key columns from +\code{y}. If \code{keep = TRUE}, the key columns from \code{y} are included as well. +\item If non-key columns in \code{x} and \code{y} have the same name, \code{suffix}es are added +to disambiguate. If \code{keep = TRUE} and key columns in \code{x} and \code{y} have +the same name, \code{suffix}es are added to disambiguate these as well. +\item If \code{keep = FALSE}, output columns included in \code{by} are coerced to their +common type between \code{x} and \code{y}. +} +} +\description{ +Mutating joins add columns from \code{y} to \code{x}, matching observations based on +the keys. There are four mutating joins: the inner join, and the three outer +joins. +\subsection{Inner join}{ + +An \code{inner_join()} only keeps observations from \code{x} that have a matching key +in \code{y}. + +The most important property of an inner join is that unmatched rows in either +input are not included in the result. This means that generally inner joins +are not appropriate in most analyses, because it is too easy to lose +observations. +} + +\subsection{Outer joins}{ + +The three outer joins keep observations that appear in at least one of the +data frames: +\itemize{ +\item A \code{left_join()} keeps all observations in \code{x}. +\item A \code{right_join()} keeps all observations in \code{y}. +\item A \code{full_join()} keeps all observations in \code{x} and \code{y}. +} +} +} +\section{Many-to-many relationships}{ + + + +By default, dplyr guards against many-to-many relationships in equality joins +by throwing a warning. These occur when both of the following are true: +\itemize{ +\item A row in \code{x} matches multiple rows in \code{y}. +\item A row in \code{y} matches multiple rows in \code{x}. 
+} + +This is typically surprising, as most joins involve a relationship of +one-to-one, one-to-many, or many-to-one, and is often the result of an +improperly specified join. Many-to-many relationships are particularly +problematic because they can result in a Cartesian explosion of the number of +rows returned from the join. + +If a many-to-many relationship is expected, silence this warning by +explicitly setting \code{relationship = "many-to-many"}. + +In production code, it is best to preemptively set \code{relationship} to whatever +relationship you expect to exist between the keys of \code{x} and \code{y}, as this +forces an error to occur immediately if the data doesn't align with your +expectations. + +Inequality joins typically result in many-to-many relationships by nature, so +they don't warn on them by default, but you should still take extra care when +specifying an inequality join, because they also have the capability to +return a large number of rows. + +Rolling joins don't warn on many-to-many relationships either, but many +rolling joins follow a many-to-one relationship, so it is often useful to +set \code{relationship = "many-to-one"} to enforce this. + +Note that in SQL, most database providers won't let you specify a +many-to-many relationship between two tables, instead requiring that you +create a third \emph{junction table} that results in two one-to-many relationships +instead. + +} + +\section{Methods}{ + + +These functions are \strong{generic}s, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\itemize{ +\item \code{inner_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("inner_join")}. +\item \code{left_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("left_join")}. 
+\item \code{right_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("right_join")}. +\item \code{full_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("full_join")}. +} + +} + +\examples{ +data(se_mini) +annotation <- tidybulk::se_mini |> tidybulk() |> as_tibble() |> + distinct(.sample) |> mutate(source = "AU") +se_mini |> tidybulk() |> as_tibble() |> inner_join(annotation) + +} +\seealso{ +Other joins: +\code{\link[dplyr]{cross_join}()}, +\code{\link[dplyr]{filter-joins}}, +\code{\link[dplyr]{nest_join}()} +} diff --git a/man/join-methods.Rd b/man/join-methods.Rd deleted file mode 100644 index 0e4d8ac6..00000000 --- a/man/join-methods.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{inner_join} -\alias{inner_join} -\alias{right_join} -\alias{full_join} -\title{Inner join datasets} -\arguments{ -\item{x}{tbls to join. (See dplyr)} - -\item{y}{tbls to join. (See dplyr)} - -\item{by}{A character vector of variables to join by. (See dplyr)} - -\item{copy}{If x and y are not from the same data source, and copy is TRUE, then y will be copied into the same src as x. (See dplyr)} - -\item{suffix}{If there are non-joined duplicate variables in x and y, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2. 
(See dplyr)} - -\item{...}{Data frames to combine (See dplyr)} -} -\value{ -A tt object - -A tt object - -A tt object -} -\description{ -Inner join datasets - -Right join datasets - -Full join datasets -} -\examples{ - -annotation = tidybulk::se_mini |> tidybulk() |> as_tibble() |> distinct(.sample) |> mutate(source = "AU") -tidybulk::se_mini |> tidybulk() |> as_tibble() |> inner_join(annotation) - - -annotation = tidybulk::se_mini |> tidybulk() |> as_tibble() |> distinct(.sample) |> mutate(source = "AU") -tidybulk::se_mini |> tidybulk() |> as_tibble() |> right_join(annotation) - - -annotation = tidybulk::se_mini |> tidybulk() |> as_tibble() |> distinct(.sample) |> mutate(source = "AU") -tidybulk::se_mini |> tidybulk() |> as_tibble() |> full_join(annotation) - -} diff --git a/man/left_join.Rd b/man/left_join.Rd new file mode 100644 index 00000000..58657133 --- /dev/null +++ b/man/left_join.Rd @@ -0,0 +1,172 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{left_join} +\alias{left_join} +\alias{left_join.tidybulk} +\title{Mutating joins} +\usage{ +\method{left_join}{tidybulk}(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...) +} +\arguments{ +\item{x, y}{A pair of data frames, data frame extensions (e.g. a tibble), or +lazy data frames (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{by}{A join specification created with \code{\link[dplyr:join_by]{join_by()}}, or a character +vector of variables to join by. + +If \code{NULL}, the default, \verb{*_join()} will perform a natural join, using all +variables in common across \code{x} and \code{y}. A message lists the variables so +that you can check they're correct; suppress the message by supplying \code{by} +explicitly. + +To join on different variables between \code{x} and \code{y}, use a \code{\link[dplyr:join_by]{join_by()}} +specification. For example, \code{join_by(a == b)} will match \code{x$a} to \code{y$b}. 
+ +To join by multiple variables, use a \code{\link[dplyr:join_by]{join_by()}} specification with +multiple expressions. For example, \code{join_by(a == b, c == d)} will match +\code{x$a} to \code{y$b} and \code{x$c} to \code{y$d}. If the column names are the same between +\code{x} and \code{y}, you can shorten this by listing only the variable names, like +\code{join_by(a, c)}. + +\code{\link[dplyr:join_by]{join_by()}} can also be used to perform inequality, rolling, and overlap +joins. See the documentation at \link[dplyr:join_by]{?join_by} for details on +these types of joins. + +For simple equality joins, you can alternatively specify a character vector +of variable names to join by. For example, \code{by = c("a", "b")} joins \code{x$a} +to \code{y$a} and \code{x$b} to \code{y$b}. If variable names differ between \code{x} and \code{y}, +use a named character vector like \code{by = c("x_a" = "y_a", "x_b" = "y_b")}. + +To perform a cross-join, generating all combinations of \code{x} and \code{y}, see +\code{\link[dplyr:cross_join]{cross_join()}}.} + +\item{copy}{If \code{x} and \code{y} are not from the same data source, +and \code{copy} is \code{TRUE}, then \code{y} will be copied into the +same src as \code{x}. This allows you to join tables across srcs, but +it is a potentially expensive operation so you must opt into it.} + +\item{suffix}{If there are non-joined duplicate variables in \code{x} and +\code{y}, these suffixes will be added to the output to disambiguate them. +Should be a character vector of length 2.} + +\item{...}{Other parameters passed onto methods.} +} +\value{ +An object of the same type as \code{x} (including the same groups). The order of +the rows and columns of \code{x} is preserved as much as possible. The output has +the following properties: +\itemize{ +\item The rows are affected by the join type. +\itemize{ +\item \code{inner_join()} returns matched \code{x} rows. +\item \code{left_join()} returns all \code{x} rows.
+\item \code{right_join()} returns matched \code{x} rows, followed by unmatched \code{y} rows. +\item \code{full_join()} returns all \code{x} rows, followed by unmatched \code{y} rows. +} +\item Output columns include all columns from \code{x} and all non-key columns from +\code{y}. If \code{keep = TRUE}, the key columns from \code{y} are included as well. +\item If non-key columns in \code{x} and \code{y} have the same name, \code{suffix}es are added +to disambiguate. If \code{keep = TRUE} and key columns in \code{x} and \code{y} have +the same name, \code{suffix}es are added to disambiguate these as well. +\item If \code{keep = FALSE}, output columns included in \code{by} are coerced to their +common type between \code{x} and \code{y}. +} +} +\description{ +Mutating joins add columns from \code{y} to \code{x}, matching observations based on +the keys. There are four mutating joins: the inner join, and the three outer +joins. +\subsection{Inner join}{ + +An \code{inner_join()} only keeps observations from \code{x} that have a matching key +in \code{y}. + +The most important property of an inner join is that unmatched rows in either +input are not included in the result. This means that generally inner joins +are not appropriate in most analyses, because it is too easy to lose +observations. +} + +\subsection{Outer joins}{ + +The three outer joins keep observations that appear in at least one of the +data frames: +\itemize{ +\item A \code{left_join()} keeps all observations in \code{x}. +\item A \code{right_join()} keeps all observations in \code{y}. +\item A \code{full_join()} keeps all observations in \code{x} and \code{y}. +} +} +} +\section{Many-to-many relationships}{ + + + +By default, dplyr guards against many-to-many relationships in equality joins +by throwing a warning. These occur when both of the following are true: +\itemize{ +\item A row in \code{x} matches multiple rows in \code{y}. +\item A row in \code{y} matches multiple rows in \code{x}.
+} + +This is typically surprising, as most joins involve a relationship of +one-to-one, one-to-many, or many-to-one, and is often the result of an +improperly specified join. Many-to-many relationships are particularly +problematic because they can result in a Cartesian explosion of the number of +rows returned from the join. + +If a many-to-many relationship is expected, silence this warning by +explicitly setting \code{relationship = "many-to-many"}. + +In production code, it is best to preemptively set \code{relationship} to whatever +relationship you expect to exist between the keys of \code{x} and \code{y}, as this +forces an error to occur immediately if the data doesn't align with your +expectations. + +Inequality joins typically result in many-to-many relationships by nature, so +they don't warn on them by default, but you should still take extra care when +specifying an inequality join, because they also have the capability to +return a large number of rows. + +Rolling joins don't warn on many-to-many relationships either, but many +rolling joins follow a many-to-one relationship, so it is often useful to +set \code{relationship = "many-to-one"} to enforce this. + +Note that in SQL, most database providers won't let you specify a +many-to-many relationship between two tables, instead requiring that you +create a third \emph{junction table} that results in two one-to-many relationships +instead. + +} + +\section{Methods}{ + + +These functions are \strong{generic}s, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\itemize{ +\item \code{inner_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("inner_join")}. +\item \code{left_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("left_join")}. 
+\item \code{right_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("right_join")}. +\item \code{full_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("full_join")}. +} + +} + +\examples{ +data(se_mini) +annotation <- se_mini |> tidybulk() |> as_tibble() |> + distinct(.sample) |> mutate(source = "AU") +se_mini |> tidybulk() |> as_tibble() |> left_join(annotation) + +} +\seealso{ +Other joins: +\code{\link[dplyr]{cross_join}()}, +\code{\link[dplyr]{filter-joins}}, +\code{\link[dplyr]{nest_join}()} +} diff --git a/man/mutate-methods.Rd b/man/mutate-methods.Rd deleted file mode 100644 index cc79af02..00000000 --- a/man/mutate-methods.Rd +++ /dev/null @@ -1,103 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{mutate} -\alias{mutate} -\title{Create, modify, and delete columns} -\arguments{ -\item{.data}{A tbl. (See dplyr)} - -\item{...}{<[`tidy-eval`][dplyr_tidy_eval]> Name-value pairs. - The name gives the name of the column in the output. - - The value can be: - - * A vector of length 1, which will be recycled to the correct length. - * A vector the same length as the current group (or the whole data frame - if ungrouped). - * `NULL`, to remove the column. - * A data frame or tibble, to create multiple columns in the output.} -} -\value{ -An object of the same type as `.data`. - -For `mutate()`: - -* Rows are not affected. -* Existing columns will be preserved unless explicitly modified. -* New columns will be added to the right of existing columns. -* Columns given value `NULL` will be removed -* Groups will be recomputed if a grouping variable is mutated. -* Data frame attributes are preserved. - -For `transmute()`: - -* Rows are not affected. -* Apart from grouping variables, existing columns will be remove unless - explicitly kept. -* Column order matches order of expressions. -* Groups will be recomputed if a grouping variable is mutated. -* Data frame attributes are preserved. 
-} -\description{ -`mutate()` adds new variables and preserves existing ones; -`transmute()` adds new variables and drops existing ones. -New variables overwrite existing variables of the same name. -Variables can be removed by setting their value to `NULL`. -} -\section{Useful mutate functions}{ - - -* [`+`], [`-`], [log()], etc., for their usual mathematical meanings - -* [lead()], [lag()] - -* [dense_rank()], [min_rank()], [percent_rank()], [row_number()], - [cume_dist()], [ntile()] - -* [cumsum()], [cummean()], [cummin()], [cummax()], [cumany()], [cumall()] - -* [na_if()], [coalesce()] - -* [if_else()], [recode()], [case_when()] -} - -\section{Grouped tibbles}{ - - -Because mutating expressions are computed within groups, they may -yield different results on grouped tibbles. This will be the case -as soon as an aggregating, lagging, or ranking function is -involved. Compare this ungrouped mutate: - -With the grouped equivalent: - -The former normalises `mass` by the global average whereas the -latter normalises by the averages within gender levels. -} - -\section{Methods}{ - -These function are **generic**s, which means that packages can provide -implementations (methods) for other classes. See the documentation of -individual methods for extra arguments and differences in behaviour. 
- -Methods available in currently loaded packages: -} - -\examples{ - -# Newly created variables are available immediately -mtcars |> as_tibble() |> mutate( - cyl2 = cyl * 2, - cyl4 = cyl2 * 2 -) - -} -\seealso{ -Other single table verbs: -\code{\link{arrange}()}, -\code{\link{filter}()}, -\code{\link{rename}()}, -\code{\link{summarise}()} -} -\concept{single table verbs} diff --git a/man/mutate.Rd b/man/mutate.Rd new file mode 100644 index 00000000..b2b48431 --- /dev/null +++ b/man/mutate.Rd @@ -0,0 +1,173 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{mutate} +\alias{mutate} +\alias{mutate.tidybulk} +\title{Create, modify, and delete columns} +\usage{ +\method{mutate}{tidybulk}(.data, ...) +} +\arguments{ +\item{.data}{A data frame, data frame extension (e.g. a tibble), or a +lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Name-value pairs. +The name gives the name of the column in the output. + +The value can be: +\itemize{ +\item A vector of length 1, which will be recycled to the correct length. +\item A vector the same length as the current group (or the whole data frame +if ungrouped). +\item \code{NULL}, to remove the column. +\item A data frame or tibble, to create multiple columns in the output. +}} +} +\value{ +An object of the same type as \code{.data}. The output has the following +properties: +\itemize{ +\item Columns from \code{.data} will be preserved according to the \code{.keep} argument. +\item Existing columns that are modified by \code{...} will always be returned in +their original location. +\item New columns created through \code{...} will be placed according to the +\code{.before} and \code{.after} arguments. +\item The number of rows is not affected. +\item Columns given the value \code{NULL} will be removed. 
+\item Groups will be recomputed if a grouping variable is mutated. +\item Data frame attributes are preserved. +} +} +\description{ +\code{mutate()} creates new columns that are functions of existing variables. +It can also modify (if the name is the same as an existing +column) and delete columns (by setting their value to \code{NULL}). +} +\section{Useful mutate functions}{ + + +\itemize{ +\item \code{\link{+}}, \code{\link{-}}, \code{\link[=log]{log()}}, etc., for their usual mathematical meanings +\item \code{\link[dplyr:lead]{lead()}}, \code{\link[dplyr:lag]{lag()}} +\item \code{\link[dplyr:dense_rank]{dense_rank()}}, \code{\link[dplyr:min_rank]{min_rank()}}, \code{\link[dplyr:percent_rank]{percent_rank()}}, \code{\link[dplyr:row_number]{row_number()}}, +\code{\link[dplyr:cume_dist]{cume_dist()}}, \code{\link[dplyr:ntile]{ntile()}} +\item \code{\link[=cumsum]{cumsum()}}, \code{\link[dplyr:cummean]{cummean()}}, \code{\link[=cummin]{cummin()}}, \code{\link[=cummax]{cummax()}}, \code{\link[dplyr:cumany]{cumany()}}, \code{\link[dplyr:cumall]{cumall()}} +\item \code{\link[dplyr:na_if]{na_if()}}, \code{\link[dplyr:coalesce]{coalesce()}} +\item \code{\link[dplyr:if_else]{if_else()}}, \code{\link[dplyr:recode]{recode()}}, \code{\link[dplyr:case_when]{case_when()}} +} + +} + +\section{Grouped tibbles}{ + + + +Because mutating expressions are computed within groups, they may +yield different results on grouped tibbles. This will be the case +as soon as an aggregating, lagging, or ranking function is +involved. Compare this ungrouped mutate: + +\if{html}{\out{
<div class="sourceCode">}}\preformatted{starwars \%>\% + select(name, mass, species) \%>\% + mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) +}\if{html}{\out{</div>
}} + +With the grouped equivalent: + +\if{html}{\out{
<div class="sourceCode">}}\preformatted{starwars \%>\% + select(name, mass, species) \%>\% + group_by(species) \%>\% + mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) +}\if{html}{\out{</div>
}} + +The former normalises \code{mass} by the global average whereas the +latter normalises by the averages within species levels. + +} + +\section{Methods}{ + + +This function is a \strong{generic}, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\Sexpr[stage=render,results=rd]{dplyr:::methods_rd("mutate")}. + +} + +\examples{ +# Newly created variables are available immediately +starwars \%>\% + select(name, mass) \%>\% + mutate( + mass2 = mass * 2, + mass2_squared = mass2 * mass2 + ) + +# As well as adding new variables, you can use mutate() to +# remove variables and modify existing variables. +starwars \%>\% + select(name, height, mass, homeworld) \%>\% + mutate( + mass = NULL, + height = height * 0.0328084 # convert to feet + ) + +# Use across() with mutate() to apply a transformation +# to multiple columns in a tibble. +starwars \%>\% + select(name, homeworld, species) \%>\% + mutate(across(!name, as.factor)) +# see more in ?across + +# Window functions are useful for grouped mutates: +starwars \%>\% + select(name, mass, homeworld) \%>\% + group_by(homeworld) \%>\% + mutate(rank = min_rank(desc(mass))) +# see `vignette("window-functions")` for more details + +# By default, new columns are placed on the far right. +df <- tibble(x = 1, y = 2) +df \%>\% mutate(z = x + y) +df \%>\% mutate(z = x + y, .before = 1) +df \%>\% mutate(z = x + y, .after = x) + +# By default, mutate() keeps all columns from the input data. 
+df <- tibble(x = 1, y = 2, a = "a", b = "b") +df \%>\% mutate(z = x + y, .keep = "all") # the default +df \%>\% mutate(z = x + y, .keep = "used") +df \%>\% mutate(z = x + y, .keep = "unused") +df \%>\% mutate(z = x + y, .keep = "none") + +# Grouping ---------------------------------------- +# The mutate operation may yield different results on grouped +# tibbles because the expressions are computed within groups. +# The following normalises `mass` by the global average: +starwars \%>\% + select(name, mass, species) \%>\% + mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) + +# Whereas this normalises `mass` by the averages within species +# levels: +starwars \%>\% + select(name, mass, species) \%>\% + group_by(species) \%>\% + mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) + +# Indirection ---------------------------------------- +# Refer to column names stored as strings with the `.data` pronoun: +vars <- c("mass", "height") +mutate(starwars, prod = .data[[vars[[1]]]] * .data[[vars[[2]]]]) +# Learn more in ?rlang::args_data_masking +} +\seealso{ +Other single table verbs: +\code{\link{arrange}()}, +\code{\link{rename}()}, +\code{\link{summarise}()} +} +\concept{single table verbs} diff --git a/man/mutate.nested_tidybulk.Rd b/man/mutate.nested_tidybulk.Rd new file mode 100644 index 00000000..f388c83b --- /dev/null +++ b/man/mutate.nested_tidybulk.Rd @@ -0,0 +1,175 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{mutate.nested_tidybulk} +\alias{mutate.nested_tidybulk} +\title{Create, modify, and delete columns} +\usage{ +\method{mutate}{nested_tidybulk}(.data, ...) +} +\arguments{ +\item{.data}{A data frame, data frame extension (e.g. a tibble), or a +lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Name-value pairs. +The name gives the name of the column in the output. 
+ +The value can be: +\itemize{ +\item A vector of length 1, which will be recycled to the correct length. +\item A vector the same length as the current group (or the whole data frame +if ungrouped). +\item \code{NULL}, to remove the column. +\item A data frame or tibble, to create multiple columns in the output. +}} +} +\value{ +An object of the same type as \code{.data}. The output has the following +properties: +\itemize{ +\item Columns from \code{.data} will be preserved according to the \code{.keep} argument. +\item Existing columns that are modified by \code{...} will always be returned in +their original location. +\item New columns created through \code{...} will be placed according to the +\code{.before} and \code{.after} arguments. +\item The number of rows is not affected. +\item Columns given the value \code{NULL} will be removed. +\item Groups will be recomputed if a grouping variable is mutated. +\item Data frame attributes are preserved. +} +} +\description{ +\code{mutate()} creates new columns that are functions of existing variables. +It can also modify (if the name is the same as an existing +column) and delete columns (by setting their value to \code{NULL}). 
+} +\section{Useful mutate functions}{ + + +\itemize{ +\item \code{\link{+}}, \code{\link{-}}, \code{\link[=log]{log()}}, etc., for their usual mathematical meanings +\item \code{\link[dplyr:lead]{lead()}}, \code{\link[dplyr:lag]{lag()}} +\item \code{\link[dplyr:dense_rank]{dense_rank()}}, \code{\link[dplyr:min_rank]{min_rank()}}, \code{\link[dplyr:percent_rank]{percent_rank()}}, \code{\link[dplyr:row_number]{row_number()}}, +\code{\link[dplyr:cume_dist]{cume_dist()}}, \code{\link[dplyr:ntile]{ntile()}} +\item \code{\link[=cumsum]{cumsum()}}, \code{\link[dplyr:cummean]{cummean()}}, \code{\link[=cummin]{cummin()}}, \code{\link[=cummax]{cummax()}}, \code{\link[dplyr:cumany]{cumany()}}, \code{\link[dplyr:cumall]{cumall()}} +\item \code{\link[dplyr:na_if]{na_if()}}, \code{\link[dplyr:coalesce]{coalesce()}} +\item \code{\link[dplyr:if_else]{if_else()}}, \code{\link[dplyr:recode]{recode()}}, \code{\link[dplyr:case_when]{case_when()}} +} + +} + +\section{Grouped tibbles}{ + + + +Because mutating expressions are computed within groups, they may +yield different results on grouped tibbles. This will be the case +as soon as an aggregating, lagging, or ranking function is +involved. Compare this ungrouped mutate: + +\if{html}{\out{
<div class="sourceCode">}}\preformatted{starwars \%>\% + select(name, mass, species) \%>\% + mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) +}\if{html}{\out{</div>
}} + +With the grouped equivalent: + +\if{html}{\out{
<div class="sourceCode">}}\preformatted{starwars \%>\% + select(name, mass, species) \%>\% + group_by(species) \%>\% + mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) +}\if{html}{\out{</div>
}} + +The former normalises \code{mass} by the global average whereas the +latter normalises by the averages within species levels. + +} + +\section{Methods}{ + + +This function is a \strong{generic}, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\Sexpr[stage=render,results=rd]{dplyr:::methods_rd("mutate")}. + +} + +\examples{ +# Newly created variables are available immediately +starwars \%>\% + select(name, mass) \%>\% + mutate( + mass2 = mass * 2, + mass2_squared = mass2 * mass2 + ) + +# As well as adding new variables, you can use mutate() to +# remove variables and modify existing variables. +starwars \%>\% + select(name, height, mass, homeworld) \%>\% + mutate( + mass = NULL, + height = height * 0.0328084 # convert to feet + ) + +# Use across() with mutate() to apply a transformation +# to multiple columns in a tibble. +starwars \%>\% + select(name, homeworld, species) \%>\% + mutate(across(!name, as.factor)) +# see more in ?across + +# Window functions are useful for grouped mutates: +starwars \%>\% + select(name, mass, homeworld) \%>\% + group_by(homeworld) \%>\% + mutate(rank = min_rank(desc(mass))) +# see `vignette("window-functions")` for more details + +# By default, new columns are placed on the far right. +df <- tibble(x = 1, y = 2) +df \%>\% mutate(z = x + y) +df \%>\% mutate(z = x + y, .before = 1) +df \%>\% mutate(z = x + y, .after = x) + +# By default, mutate() keeps all columns from the input data. 
+df <- tibble(x = 1, y = 2, a = "a", b = "b") +df \%>\% mutate(z = x + y, .keep = "all") # the default +df \%>\% mutate(z = x + y, .keep = "used") +df \%>\% mutate(z = x + y, .keep = "unused") +df \%>\% mutate(z = x + y, .keep = "none") + +# Grouping ---------------------------------------- +# The mutate operation may yield different results on grouped +# tibbles because the expressions are computed within groups. +# The following normalises `mass` by the global average: +starwars \%>\% + select(name, mass, species) \%>\% + mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) + +# Whereas this normalises `mass` by the averages within species +# levels: +starwars \%>\% + select(name, mass, species) \%>\% + group_by(species) \%>\% + mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) + +# Indirection ---------------------------------------- +# Refer to column names stored as strings with the `.data` pronoun: +vars <- c("mass", "height") +mutate(starwars, prod = .data[[vars[[1]]]] * .data[[vars[[2]]]]) +# Learn more in ?rlang::args_data_masking +} +\seealso{ +Other single table verbs: +\code{\link[dplyr]{arrange}()}, +\code{\link[dplyr]{filter}()}, +\code{\link[dplyr]{reframe}()}, +\code{\link[dplyr]{rename}()}, +\code{\link[dplyr]{select}()}, +\code{\link[dplyr]{slice}()}, +\code{\link[dplyr]{summarise}()} +} diff --git a/man/nest-methods.Rd b/man/nest-methods.Rd deleted file mode 100644 index 3e86e90d..00000000 --- a/man/nest-methods.Rd +++ /dev/null @@ -1,61 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tidyr_methods.R -\name{unnest} -\alias{unnest} -\alias{nest} -\title{unnest} -\arguments{ -\item{data}{A tbl. (See tidyr)} - -\item{cols}{<[`tidy-select`][tidyr_tidy_select]> Columns to unnest. -If you `unnest()` multiple columns, parallel entries must be of -compatibble sizes, i.e. 
they're either equal or length 1 (following the -standard tidyverse recycling rules).} - -\item{names_sep}{If `NULL`, the default, the names will be left - as is. In `nest()`, inner names will come from the former outer names; - in `unnest()`, the new outer names will come from the inner names. - - If a string, the inner and outer names will be used together. In `nest()`, - the names of the new outer columns will be formed by pasting together the - outer and the inner column names, separated by `names_sep`. In `unnest()`, - the new inner names will have the outer names (+ `names_sep`) automatically - stripped. This makes `names_sep` roughly symmetric between nesting and unnesting.} - -\item{keep_empty}{See tidyr::unnest} - -\item{names_repair}{See tidyr::unnest} - -\item{ptype}{See tidyr::unnest} - -\item{.drop}{See tidyr::unnest} - -\item{.id}{tidyr::unnest} - -\item{.sep}{tidyr::unnest} - -\item{.preserve}{See tidyr::unnest} - -\item{.data}{A tbl. (See tidyr)} - -\item{...}{Name-variable pairs of the form new_col = c(col1, col2, col3) (See tidyr)} -} -\value{ -A tidySummarizedExperiment objector a tibble depending on input - -A tt object -} -\description{ -unnest - -nest -} -\examples{ - - -tidybulk::se_mini |> tidybulk() |> nest( data = -.feature) |> unnest(data) - - -tidybulk::se_mini \%>\% tidybulk() \%>\% nest( data = -.feature) - -} diff --git a/man/nest.Rd b/man/nest.Rd new file mode 100644 index 00000000..d244d59a --- /dev/null +++ b/man/nest.Rd @@ -0,0 +1,84 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tidyr_methods.R +\name{nest} +\alias{nest} +\alias{nest.tidybulk} +\title{Nest rows into a list-column of data frames} +\usage{ +\method{nest}{tidybulk}(.data, ..., .names_sep = NULL) +} +\arguments{ +\item{.data}{A data frame.} + +\item{...}{<\code{\link[tidyr:tidyr_tidy_select]{tidy-select}}> Columns to nest; these will +appear in the inner data frames. 
+ +Specified using name-variable pairs of the form +\code{new_col = c(col1, col2, col3)}. The right hand side can be any valid +tidyselect expression. + +If not supplied, then \code{...} is derived as all columns \emph{not} selected by +\code{.by}, and will use the column name from \code{.key}. + +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}: +previously you could write \code{df \%>\% nest(x, y, z)}. +Convert to \code{df \%>\% nest(data = c(x, y, z))}.} + +\item{.names_sep}{If \code{NULL}, the default, the inner names will come from +the former outer names. If a string, the new inner names will use the +outer names with \code{names_sep} automatically stripped. This makes +\code{names_sep} roughly symmetric between nesting and unnesting.} +} +\description{ +Nesting creates a list-column of data frames; unnesting flattens it back out +into regular columns. Nesting is implicitly a summarising operation: you +get one row for each group defined by the non-nested columns. This is useful +in conjunction with other summaries that work with whole datasets, most +notably models. + +Learn more in \code{vignette("nest")}. +} +\details{ +If neither \code{...} nor \code{.by} are supplied, \code{nest()} will nest all variables, +and will use the column name supplied through \code{.key}. +} +\section{New syntax}{ + + +tidyr 1.0.0 introduced a new syntax for \code{nest()} and \code{unnest()} that's +designed to be more similar to other functions. Converting to the new syntax +should be straightforward (guided by the message you'll receive) but if +you just need to run an old analysis, you can easily revert to the previous +behaviour using \code{\link[tidyr:nest_legacy]{nest_legacy()}} and \code{\link[tidyr:unnest_legacy]{unnest_legacy()}} as follows: + +\if{html}{\out{
<div class="sourceCode">}}\preformatted{library(tidyr) +nest <- nest_legacy +unnest <- unnest_legacy +}\if{html}{\out{</div>
}} + +} + +\section{Grouped data frames}{ + + +\code{df \%>\% nest(data = c(x, y))} specifies the columns to be nested; i.e. the +columns that will appear in the inner data frame. \code{df \%>\% nest(.by = c(x, y))} specifies the columns to nest \emph{by}; i.e. the columns that will remain in +the outer data frame. An alternative way to achieve the latter is to \code{nest()} +a grouped data frame created by \code{\link[dplyr:group_by]{dplyr::group_by()}}. The grouping variables +remain in the outer data frame and the others are nested. The result +preserves the grouping of the input. + +Variables supplied to \code{nest()} will override grouping variables so that +\code{df \%>\% group_by(x, y) \%>\% nest(data = !z)} will be equivalent to +\code{df \%>\% nest(data = !z)}. + +You can't supply \code{.by} with a grouped data frame, as the groups already +represent what you are nesting by. + +} + +\examples{ +data(se_mini) +se_mini \%>\% tidybulk() \%>\% nest(data = -.feature) + +} diff --git a/man/reexports.Rd b/man/reexports.Rd index 8bc39e1e..10ab6d8b 100644 --- a/man/reexports.Rd +++ b/man/reexports.Rd @@ -3,8 +3,8 @@ \docType{import} \name{reexports} \alias{reexports} -\alias{select} \alias{do} +\alias{select} \alias{tibble} \alias{as_tibble} \title{Objects exported from other packages} diff --git a/man/rename-methods.Rd b/man/rename-methods.Rd deleted file mode 100644 index db4defc0..00000000 --- a/man/rename-methods.Rd +++ /dev/null @@ -1,51 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{rename} -\alias{rename} -\title{Rename columns} -\arguments{ -\item{.data}{A tbl. (See dplyr)} - -\item{...}{<[`tidy-select`][dplyr_tidy_select]> Use `new_name = old_name` -to rename selected variables.} -} -\value{ -An object of the same type as `.data`. -* Rows are not affected. -* Column names are changed; column order is preserved -* Data frame attributes are preserved. -* Groups are updated to reflect new names. 
-} -\description{ -Rename individual variables using `new_name = old_name` syntax. -} -\section{Scoped selection and renaming}{ - - -Use the three scoped variants ([rename_all()], [rename_if()], [rename_at()]) -to renaming a set of variables with a function. -} - -\section{Methods}{ - -This function is a **generic**, which means that packages can provide -implementations (methods) for other classes. See the documentation of -individual methods for extra arguments and differences in behaviour. - -The following methods are currently available in loaded packages: -} - -\examples{ - -iris <- as_tibble(iris) # so it prints a little nicer -rename(iris, petal_length = Petal.Length) - -} -\seealso{ -Other single table verbs: -\code{\link{arrange}()}, -\code{\link{filter}()}, -\code{\link{mutate}()}, -\code{\link{summarise}()} -} -\concept{single table verbs} diff --git a/man/rename.Rd b/man/rename.Rd new file mode 100644 index 00000000..3d593bda --- /dev/null +++ b/man/rename.Rd @@ -0,0 +1,87 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{rename} +\alias{rename} +\alias{rename.tidybulk} +\title{Rename columns} +\usage{ +\method{rename}{tidybulk}(.data, ...) +} +\arguments{ +\item{.data}{A data frame, data frame extension (e.g. a tibble), or a +lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{...}{For \code{rename()}: <\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Use +\code{new_name = old_name} to rename selected variables. + +For \code{rename_with()}: additional arguments passed onto \code{.fn}.} +} +\value{ +An object of the same type as \code{.data}. The output has the following +properties: +\itemize{ +\item Rows are not affected. +\item Column names are changed; column order is preserved. +\item Data frame attributes are preserved. +\item Groups are updated to reflect new names. 
+} +} +\description{ +\code{rename()} changes the names of individual variables using +\code{new_name = old_name} syntax; \code{rename_with()} renames columns using a +function. +} +\section{Methods}{ + + +This function is a \strong{generic}, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +The following methods are currently available in loaded packages: +\Sexpr[stage=render,results=rd]{dplyr:::methods_rd("rename")}. + +} + +\examples{ +iris <- as_tibble(iris) # so it prints a little nicer +rename(iris, petal_length = Petal.Length) + +# Rename using a named vector and `all_of()` +lookup <- c(pl = "Petal.Length", sl = "Sepal.Length") +rename(iris, all_of(lookup)) + +# If your named vector might contain names that don't exist in the data, +# use `any_of()` instead +lookup <- c(lookup, new = "unknown") +try(rename(iris, all_of(lookup))) +rename(iris, any_of(lookup)) + +rename_with(iris, toupper) +rename_with(iris, toupper, starts_with("Petal")) +rename_with(iris, ~ tolower(gsub(".", "_", .x, fixed = TRUE))) + +\dontshow{if (getRversion() > "4.0.1") (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# If your renaming function uses `paste0()`, make sure to set +# `recycle0 = TRUE` to ensure that empty selections are recycled correctly +try(rename_with( + iris, + ~ paste0("prefix_", .x), + starts_with("nonexistent") +)) + +rename_with( + iris, + ~ paste0("prefix_", .x, recycle0 = TRUE), + starts_with("nonexistent") +) +\dontshow{\}) # examplesIf} +} +\seealso{ +Other single table verbs: +\code{\link{arrange}()}, +\code{\link{mutate}()}, +\code{\link{summarise}()} +} +\concept{single table verbs} diff --git a/man/right_join.Rd b/man/right_join.Rd new file mode 100644 index 00000000..88594f4f --- /dev/null +++ b/man/right_join.Rd @@ -0,0 +1,172 @@ +% Generated by roxygen2: do not edit by hand +% Please edit 
documentation in R/dplyr_methods.R +\name{right_join} +\alias{right_join} +\alias{right_join.tidybulk} +\title{Mutating joins} +\usage{ +\method{right_join}{tidybulk}(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...) +} +\arguments{ +\item{x, y}{A pair of data frames, data frame extensions (e.g. a tibble), or +lazy data frames (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{by}{A join specification created with \code{\link[dplyr:join_by]{join_by()}}, or a character +vector of variables to join by. + +If \code{NULL}, the default, \verb{*_join()} will perform a natural join, using all +variables in common across \code{x} and \code{y}. A message lists the variables so +that you can check they're correct; suppress the message by supplying \code{by} +explicitly. + +To join on different variables between \code{x} and \code{y}, use a \code{\link[dplyr:join_by]{join_by()}} +specification. For example, \code{join_by(a == b)} will match \code{x$a} to \code{y$b}. + +To join by multiple variables, use a \code{\link[dplyr:join_by]{join_by()}} specification with +multiple expressions. For example, \code{join_by(a == b, c == d)} will match +\code{x$a} to \code{y$b} and \code{x$c} to \code{y$d}. If the column names are the same between +\code{x} and \code{y}, you can shorten this by listing only the variable names, like +\code{join_by(a, c)}. + +\code{\link[dplyr:join_by]{join_by()}} can also be used to perform inequality, rolling, and overlap +joins. See the documentation at \link[dplyr:join_by]{?join_by} for details on +these types of joins. + +For simple equality joins, you can alternatively specify a character vector +of variable names to join by. For example, \code{by = c("a", "b")} joins \code{x$a} +to \code{y$a} and \code{x$b} to \code{y$b}. If variable names differ between \code{x} and \code{y}, +use a named character vector like \code{by = c("x_a" = "y_a", "x_b" = "y_b")}. 
+ +To perform a cross-join, generating all combinations of \code{x} and \code{y}, see +\code{\link[dplyr:cross_join]{cross_join()}}.} + +\item{copy}{If \code{x} and \code{y} are not from the same data source, +and \code{copy} is \code{TRUE}, then \code{y} will be copied into the +same src as \code{x}. This allows you to join tables across srcs, but +it is a potentially expensive operation so you must opt into it.} + +\item{suffix}{If there are non-joined duplicate variables in \code{x} and +\code{y}, these suffixes will be added to the output to disambiguate them. +Should be a character vector of length 2.} + +\item{...}{Other parameters passed onto methods.} +} +\value{ +An object of the same type as \code{x} (including the same groups). The order of +the rows and columns of \code{x} is preserved as much as possible. The output has +the following properties: +\itemize{ +\item The rows are affected by the join type. +\itemize{ +\item \code{inner_join()} returns matched \code{x} rows. +\item \code{left_join()} returns all \code{x} rows. +\item \code{right_join()} returns matched \code{x} rows, followed by unmatched \code{y} rows. +\item \code{full_join()} returns all \code{x} rows, followed by unmatched \code{y} rows. +} +\item Output columns include all columns from \code{x} and all non-key columns from +\code{y}. If \code{keep = TRUE}, the key columns from \code{y} are included as well. +\item If non-key columns in \code{x} and \code{y} have the same name, \code{suffix}es are added +to disambiguate. If \code{keep = TRUE} and key columns in \code{x} and \code{y} have +the same name, \code{suffix}es are added to disambiguate these as well. +\item If \code{keep = FALSE}, output columns included in \code{by} are coerced to their +common type between \code{x} and \code{y}. +} +} +\description{ +Mutating joins add columns from \code{y} to \code{x}, matching observations based on +the keys. There are four mutating joins: the inner join, and the three outer +joins.
+\subsection{Inner join}{ + +An \code{inner_join()} only keeps observations from \code{x} that have a matching key +in \code{y}. + +The most important property of an inner join is that unmatched rows in either +input are not included in the result. This means that generally inner joins +are not appropriate in most analyses, because it is too easy to lose +observations. +} + +\subsection{Outer joins}{ + +The three outer joins keep observations that appear in at least one of the +data frames: +\itemize{ +\item A \code{left_join()} keeps all observations in \code{x}. +\item A \code{right_join()} keeps all observations in \code{y}. +\item A \code{full_join()} keeps all observations in \code{x} and \code{y}. +} +} +} +\section{Many-to-many relationships}{ + + + +By default, dplyr guards against many-to-many relationships in equality joins +by throwing a warning. These occur when both of the following are true: +\itemize{ +\item A row in \code{x} matches multiple rows in \code{y}. +\item A row in \code{y} matches multiple rows in \code{x}. +} + +This is typically surprising, as most joins involve a relationship of +one-to-one, one-to-many, or many-to-one, and is often the result of an +improperly specified join. Many-to-many relationships are particularly +problematic because they can result in a Cartesian explosion of the number of +rows returned from the join. + +If a many-to-many relationship is expected, silence this warning by +explicitly setting \code{relationship = "many-to-many"}. + +In production code, it is best to preemptively set \code{relationship} to whatever +relationship you expect to exist between the keys of \code{x} and \code{y}, as this +forces an error to occur immediately if the data doesn't align with your +expectations. 
+ +Inequality joins typically result in many-to-many relationships by nature, so +they don't warn on them by default, but you should still take extra care when +specifying an inequality join, because they also have the capability to +return a large number of rows. + +Rolling joins don't warn on many-to-many relationships either, but many +rolling joins follow a many-to-one relationship, so it is often useful to +set \code{relationship = "many-to-one"} to enforce this. + +Note that in SQL, most database providers won't let you specify a +many-to-many relationship between two tables, instead requiring that you +create a third \emph{junction table} that results in two one-to-many relationships +instead. + +} + +\section{Methods}{ + + +These functions are \strong{generic}s, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\itemize{ +\item \code{inner_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("inner_join")}. +\item \code{left_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("left_join")}. +\item \code{right_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("right_join")}. +\item \code{full_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("full_join")}. 
+} + +} + +\examples{ +data(se_mini) +annotation <- se_mini |> tidybulk() |> as_tibble() |> + distinct(.sample) |> mutate(source = "AU") +se_mini |> tidybulk() |> as_tibble() |> right_join(annotation) + +} +\seealso{ +Other joins: +\code{\link[dplyr]{cross_join}()}, +\code{\link[dplyr]{filter-joins}}, +\code{\link[dplyr]{nest_join}()} +} diff --git a/man/rotate_dimensions-methods.Rd b/man/rotate_dimensions-methods.Rd index c87c1cca..8825daa9 100644 --- a/man/rotate_dimensions-methods.Rd +++ b/man/rotate_dimensions-methods.Rd @@ -115,7 +115,9 @@ A `SummarizedExperiment` object A `SummarizedExperiment` object } \description{ -rotate_dimensions() takes as input a `tbl` formatted as | | | <...> | and calculates the rotated dimensional space of the transcript abundance. +rotate_dimensions() takes as input a `tbl` formatted as + | | | <...> | and calculates the rotated + dimensional space of the transcript abundance. } \details{ `r lifecycle::badge("maturing")` @@ -140,7 +142,9 @@ counts.MDS = identify_abundant() |> reduce_dimensions( method="MDS", .dims = 3) -counts.MDS.rotated = rotate_dimensions(counts.MDS, `Dim1`, `Dim2`, rotation_degrees = 45, .element = sample) +counts.MDS.rotated = rotate_dimensions(counts.MDS, `Dim1`, `Dim2`, + rotation_degrees = 45, + .element = sample) } diff --git a/man/rowwise-methods.Rd b/man/rowwise-methods.Rd deleted file mode 100644 index 3ff5fecf..00000000 --- a/man/rowwise-methods.Rd +++ /dev/null @@ -1,36 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{rowwise} -\alias{rowwise} -\title{Group input by rows} -\arguments{ -\item{data}{Input data frame.} - -\item{...}{Variables to be preserved when calling summarise(). This is typically a set of variables whose combination uniquely identify each row. NB: unlike group_by() you can not create new variables here but instead you can select multiple variables with (e.g.) 
everything().} -} -\value{ -A consistent object (to the input) - - A `tbl` -} -\description{ -See [this repository](https://github.com/jennybc/row-oriented-workflows) -for alternative ways to perform row-wise operations. -} -\details{ -`rowwise()` is used for the results of [do()] when you -create list-variables. It is also useful to support arbitrary -complex operations that need to be applied to each row. - -Currently, rowwise grouping only works with data frames. Its -main impact is to allow you to work with list-variables in -[summarise()] and [mutate()] without having to -use \code{[[1]]}. This makes `summarise()` on a rowwise tbl -effectively equivalent to [plyr::ldply()]. -} -\examples{ - -df <- expand.grid(x = 1:3, y = 3:1) -df_done <- df |> rowwise() - -} diff --git a/man/rowwise.Rd b/man/rowwise.Rd new file mode 100644 index 00000000..28d150da --- /dev/null +++ b/man/rowwise.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{rowwise} +\alias{rowwise} +\alias{rowwise.tidybulk} +\title{Group input by rows} +\usage{ +\method{rowwise}{tidybulk}(data, ...) +} +\arguments{ +\item{data}{Input data frame.} + +\item{...}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Variables to be preserved +when calling \code{\link[dplyr:summarise]{summarise()}}. This is typically a set of variables whose +combination uniquely identify each row. + +\strong{NB}: unlike \code{group_by()} you can not create new variables here but +instead you can select multiple variables with (e.g.) \code{everything()}.} +} +\value{ +A row-wise data frame with class \code{rowwise_df}. Note that a +\code{rowwise_df} is implicitly grouped by row, but is not a \code{grouped_df}. +} +\description{ +\code{rowwise()} allows you to compute on a data frame a row-at-a-time. +This is most useful when a vectorised function doesn't exist. + +Most dplyr verbs preserve row-wise grouping. 
The exception is \code{\link[dplyr:summarise]{summarise()}}, +which returns a \link[dplyr]{grouped_df}. You can explicitly ungroup with \code{\link[dplyr:ungroup]{ungroup()}} +or \code{\link[dplyr:as_tibble]{as_tibble()}}, or convert to a \link[dplyr]{grouped_df} with \code{\link[dplyr:group_by]{group_by()}}. +} +\section{List-columns}{ + + +Because a rowwise has exactly one row per group it offers a small +convenience for working with list-columns. Normally, \code{summarise()} and +\code{mutate()} extract a group's worth of data with \code{[}. But when you index +a list in this way, you get back another list. When you're working with +a \code{rowwise} tibble, then dplyr will use \code{[[} instead of \code{[} to make your +life a little easier. + +} + +\examples{ +df <- tibble(x = runif(6), y = runif(6), z = runif(6)) +# Compute the mean of x, y, z in each row +df \%>\% rowwise() \%>\% mutate(m = mean(c(x, y, z))) +# use c_across() to more easily select many variables +df \%>\% rowwise() \%>\% mutate(m = mean(c_across(x:z))) + +# Compute the minimum of x, y, z in each row +df \%>\% rowwise() \%>\% mutate(m = min(c(x, y, z))) +# In this case you can use an existing vectorised function: +df \%>\% mutate(m = pmin(x, y, z)) +# Where these functions exist they'll be much faster than rowwise +# so be on the lookout for them. + +# rowwise() is also useful when doing simulations +params <- tribble( + ~sim, ~n, ~mean, ~sd, + 1, 1, 1, 1, + 2, 2, 2, 4, + 3, 3, -1, 2 +) +# Here I supply variables to preserve after the computation +params \%>\% + rowwise(sim) \%>\% + reframe(z = rnorm(n, mean, sd)) + +# If you want one row per simulation, put the results in a list() +params \%>\% + rowwise(sim) \%>\% + summarise(z = list(rnorm(n, mean, sd)), .groups = "keep") +} +\seealso{ +\code{\link[dplyr:nest_by]{nest_by()}} for a convenient way of creating rowwise data frames +with nested data.
+} diff --git a/man/summarise-methods.Rd b/man/summarise-methods.Rd deleted file mode 100644 index 56907289..00000000 --- a/man/summarise-methods.Rd +++ /dev/null @@ -1,91 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{summarise} -\alias{summarise} -\title{Summarise each group to fewer rows} -\arguments{ -\item{.data}{A tbl. (See dplyr)} - -\item{...}{<[`tidy-eval`][dplyr_tidy_eval]> Name-value pairs of summary - functions. The name will be the name of the variable in the result. - - The value can be: - - * A vector of length 1, e.g. `min(x)`, `n()`, or `sum(is.na(y))`. - * A vector of length `n`, e.g. `quantile()`. - * A data frame, to add multiple columns from a single expression.} -} -\value{ -An object _usually_ of the same type as `.data`. - -* The rows come from the underlying `group_keys()`. -* The columns are a combination of the grouping keys and the summary - expressions that you provide. -* If `x` is grouped by more than one variable, the output will be another - [grouped_df] with the right-most group removed. -* If `x` is grouped by one variable, or is not grouped, the output will - be a [tibble]. -* Data frame attributes are **not** preserved, because `summarise()` - fundamentally creates a new data frame. -} -\description{ -`summarise()` creates a new data frame. It will have one (or more) rows for -each combination of grouping variables; if there are no grouping variables, -the output will have a single row summarising all observations in the input. -It will contain one column for each grouping variable and one column -for each of the summary statistics that you have specified. - -`summarise()` and `summarize()` are synonyms. 
-} -\section{Useful functions}{ - - -* Center: [mean()], [median()] -* Spread: [sd()], [IQR()], [mad()] -* Range: [min()], [max()], [quantile()] -* Position: [first()], [last()], [nth()], -* Count: [n()], [n_distinct()] -* Logical: [any()], [all()] -} - -\section{Backend variations}{ - - -The data frame backend supports creating a variable and using it in the -same summary. This means that previously created summary variables can be -further transformed or combined within the summary, as in [mutate()]. -However, it also means that summary variables with the same names as previous -variables overwrite them, making those variables unavailable to later summary -variables. - -This behaviour may not be supported in other backends. To avoid unexpected -results, consider using new names for your summary variables, especially when -creating multiple summaries. -} - -\section{Methods}{ - -This function is a **generic**, which means that packages can provide -implementations (methods) for other classes. See the documentation of -individual methods for extra arguments and differences in behaviour. - -The following methods are currently available in loaded packages: -} - -\examples{ - -# A summary applied to ungrouped tbl returns a single row - -mtcars |> - summarise(mean = mean(disp)) - - -} -\seealso{ -Other single table verbs: -\code{\link{arrange}()}, -\code{\link{filter}()}, -\code{\link{mutate}()}, -\code{\link{rename}()} -} -\concept{single table verbs} diff --git a/man/summarise.Rd b/man/summarise.Rd new file mode 100644 index 00000000..cf23fdc1 --- /dev/null +++ b/man/summarise.Rd @@ -0,0 +1,139 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{summarise} +\alias{summarise} +\alias{summarise.tidybulk} +\alias{summarize} +\alias{summarize.tidybulk} +\title{Summarise each group down to one row} +\usage{ +\method{summarise}{tidybulk}(.data, ...) + +\method{summarize}{tidybulk}(.data, ...) 
+} +\arguments{ +\item{.data}{A data frame, data frame extension (e.g. a tibble), or a +lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Name-value pairs of +summary functions. The name will be the name of the variable in the result. + +The value can be: +\itemize{ +\item A vector of length 1, e.g. \code{min(x)}, \code{n()}, or \code{sum(is.na(y))}. +\item A data frame, to add multiple columns from a single expression. +} + +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Returning values with size 0 or >1 was +deprecated as of 1.1.0. Please use \code{\link[dplyr:reframe]{reframe()}} for this instead.} +} +\value{ +An object \emph{usually} of the same type as \code{.data}. +\itemize{ +\item The rows come from the underlying \code{\link[dplyr:group_keys]{group_keys()}}. +\item The columns are a combination of the grouping keys and the summary +expressions that you provide. +\item The grouping structure is controlled by the \verb{.groups=} argument, the +output may be another \link[dplyr]{grouped_df}, a \link[dplyr]{tibble} or a \link[dplyr]{rowwise} data frame. +\item Data frame attributes are \strong{not} preserved, because \code{summarise()} +fundamentally creates a new data frame. +} +} +\description{ +\code{summarise()} creates a new data frame. It returns one row for each +combination of grouping variables; if there are no grouping variables, the +output will have a single row summarising all observations in the input. It +will contain one column for each grouping variable and one column for each of +the summary statistics that you have specified. + +\code{summarise()} and \code{summarize()} are synonyms. 
+} +\section{Useful functions}{ + + +\itemize{ +\item Center: \code{\link[=mean]{mean()}}, \code{\link[=median]{median()}} +\item Spread: \code{\link[=sd]{sd()}}, \code{\link[=IQR]{IQR()}}, \code{\link[=mad]{mad()}} +\item Range: \code{\link[=min]{min()}}, \code{\link[=max]{max()}}, +\item Position: \code{\link[dplyr:first]{first()}}, \code{\link[dplyr:last]{last()}}, \code{\link[dplyr:nth]{nth()}}, +\item Count: \code{\link[dplyr:n]{n()}}, \code{\link[dplyr:n_distinct]{n_distinct()}} +\item Logical: \code{\link[=any]{any()}}, \code{\link[=all]{all()}} +} + +} + +\section{Backend variations}{ + + + +The data frame backend supports creating a variable and using it in the +same summary. This means that previously created summary variables can be +further transformed or combined within the summary, as in \code{\link[dplyr:mutate]{mutate()}}. +However, it also means that summary variables with the same names as previous +variables overwrite them, making those variables unavailable to later summary +variables. + +This behaviour may not be supported in other backends. To avoid unexpected +results, consider using new names for your summary variables, especially when +creating multiple summaries. + +} + +\section{Methods}{ + + +This function is a \strong{generic}, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +The following methods are currently available in loaded packages: +\Sexpr[stage=render,results=rd]{dplyr:::methods_rd("summarise")}. 
+ +} + +\examples{ +# A summary applied to ungrouped tbl returns a single row +mtcars \%>\% + summarise(mean = mean(disp), n = n()) + +# Usually, you'll want to group first +mtcars \%>\% + group_by(cyl) \%>\% + summarise(mean = mean(disp), n = n()) + +# Each summary call removes one grouping level (since that group +# is now just a single row) +mtcars \%>\% + group_by(cyl, vs) \%>\% + summarise(cyl_n = n()) \%>\% + group_vars() + +# BEWARE: reusing variables may lead to unexpected results +mtcars \%>\% + group_by(cyl) \%>\% + summarise(disp = mean(disp), sd = sd(disp)) + +# Refer to column names stored as strings with the `.data` pronoun: +var <- "mass" +summarise(starwars, avg = mean(.data[[var]], na.rm = TRUE)) +# Learn more in ?rlang::args_data_masking + +# In dplyr 1.1.0, returning multiple rows per group was deprecated in favor +# of `reframe()`, which never messages and always returns an ungrouped +# result: +mtcars \%>\% + group_by(cyl) \%>\% + summarise(qs = quantile(disp, c(0.25, 0.75)), prob = c(0.25, 0.75)) +# -> +mtcars \%>\% + group_by(cyl) \%>\% + reframe(qs = quantile(disp, c(0.25, 0.75)), prob = c(0.25, 0.75)) +} +\seealso{ +Other single table verbs: +\code{\link{arrange}()}, +\code{\link{mutate}()}, +\code{\link{rename}()} +} +\concept{single table verbs} diff --git a/man/symbol_to_entrez.Rd b/man/symbol_to_entrez.Rd index f02ca98d..d61a335b 100644 --- a/man/symbol_to_entrez.Rd +++ b/man/symbol_to_entrez.Rd @@ -23,7 +23,8 @@ Get ENTREZ id from gene SYMBOL # This function was designed for data.frame # Convert from SummarizedExperiment for this example. It is NOT reccomended. 
- -tidybulk::se_mini |> tidybulk() |> as_tibble() |> symbol_to_entrez(.transcript = .feature, .sample = .sample) +data(se_mini) +se_mini |> tidybulk() |> as_tibble() |> + symbol_to_entrez(.transcript = .feature, .sample = .sample) } diff --git a/man/test_gene_enrichment-methods.Rd b/man/test_gene_enrichment-methods.Rd index b21525d8..43180fc0 100644 --- a/man/test_gene_enrichment-methods.Rd +++ b/man/test_gene_enrichment-methods.Rd @@ -145,7 +145,10 @@ A consistent object (to the input) A consistent object (to the input) } \description{ -test_gene_enrichment() takes as input a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a `tbl` of gene set information +test_gene_enrichment() takes as input a `tbl` +(with at least three columns for sample, feature and transcript abundance) +or `SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a `tbl` of gene set information } \details{ `r lifecycle::badge("maturing")` @@ -153,7 +156,7 @@ test_gene_enrichment() takes as input a `tbl` (with at least three columns for s This wrapper executes ensemble gene enrichment analyses of the dataset using EGSEA (DOI:0.12688/f1000research.12544.1) -dge = +dge <- data |> keep_abundant( factor_of_interest = !!as.symbol(parse_formula(.formula)[[1]]), @@ -165,11 +168,11 @@ dge = as_matrix(rownames = !!.entrez) %>% edgeR::DGEList(counts = .) 
-idx = buildIdx(entrezIDs = rownames(dge), species = species, msigdb.gsets = msigdb.gsets, +idx <- buildIdx(entrezIDs = rownames(dge), species = species, + msigdb.gsets = msigdb.gsets, kegg.exclude = kegg.exclude) dge |> - # Calculate weights limma::voom(design, plot = FALSE) |> @@ -187,9 +190,9 @@ dge |> \dontrun{ library(SummarizedExperiment) -se = tidybulk::se_mini -rowData( se)$entrez = rownames(se ) -df_entrez = aggregate_duplicates(se,.transcript = entrez ) +se <- tidybulk::se_mini +rowData(se)$entrez <- rownames(se) +df_entrez <- aggregate_duplicates(se, .transcript = entrez) library("EGSEA") @@ -199,8 +202,10 @@ library("EGSEA") .sample = sample, .entrez = entrez, .abundance = count, - methods = c("roast" , "safe", "gage" , "padog" , "globaltest", "ora" ), - gene_sets = c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", "kegg_signaling"), + methods = c("roast", "safe", "gage", + "padog", "globaltest", "ora"), + gene_sets = c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", + "kegg_disease", "kegg_metabolism", "kegg_signaling"), species="human", cores = 2 ) diff --git a/man/tidybulk-methods.Rd b/man/tidybulk-methods.Rd index 0ef4e838..b7439c81 100644 --- a/man/tidybulk-methods.Rd +++ b/man/tidybulk-methods.Rd @@ -42,7 +42,10 @@ A `tidybulk` object A `tidybulk` object } \description{ -tidybulk() creates an annotated `tidybulk` tibble from a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +tidybulk() creates an annotated `tidybulk` tibble from a `tbl` +(with at least three columns for sample, feature and transcript abundance) +or `SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) } \details{ `r lifecycle::badge("maturing")` diff --git a/man/ungroup.Rd b/man/ungroup.Rd new file mode 100644 index 00000000..d634c6df --- /dev/null +++ 
b/man/ungroup.Rd @@ -0,0 +1,148 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{ungroup} +\alias{ungroup} +\alias{ungroup.tidybulk} +\title{Group by one or more variables} +\usage{ +\method{ungroup}{tidybulk}(x, ...) +} +\arguments{ +\item{x}{A \code{\link[dplyr:tbl]{tbl()}}} + +\item{...}{In \code{group_by()}, variables or computations to group by. +Computations are always done on the ungrouped data frame. +To perform computations on the grouped data, you need to use +a separate \code{mutate()} step before the \code{group_by()}. +Computations are not allowed in \code{nest_by()}. +In \code{ungroup()}, variables to remove from the grouping.} +} +\value{ +A grouped data frame with class \code{\link[dplyr]{grouped_df}}, +unless the combination of \code{...} and \code{add} yields an empty set of +grouping columns, in which case a tibble will be returned. +} +\description{ +Most data operations are done on groups defined by variables. +\code{group_by()} takes an existing tbl and converts it into a grouped tbl +where operations are performed "by group". \code{ungroup()} removes grouping. +} +\section{Methods}{ + + +These functions are \strong{generic}s, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\itemize{ +\item \code{group_by()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("group_by")}. +\item \code{ungroup()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("ungroup")}. +} + +} + +\section{Ordering}{ + + +Currently, \code{group_by()} internally orders the groups in ascending order. This +results in ordered output from functions that aggregate groups, such as +\code{\link[dplyr:summarise]{summarise()}}.
+ +When used as grouping columns, character vectors are ordered in the C locale +for performance and reproducibility across R sessions. If the resulting +ordering of your grouped operation matters and is dependent on the locale, +you should follow up the grouped operation with an explicit call to +\code{\link[dplyr:arrange]{arrange()}} and set the \code{.locale} argument. For example: + +\if{html}{\out{<div class="sourceCode">
}}\preformatted{data \%>\% + group_by(chr) \%>\% + summarise(avg = mean(x)) \%>\% + arrange(chr, .locale = "en") +}\if{html}{\out{</div>
}} + +This is often useful as a preliminary step before generating content intended +for humans, such as an HTML table. +\subsection{Legacy behavior}{ + +Prior to dplyr 1.1.0, character vector grouping columns were ordered in the +system locale. If you need to temporarily revert to this behavior, you can +set the global option \code{dplyr.legacy_locale} to \code{TRUE}, but this should be +used sparingly and you should expect this option to be removed in a future +version of dplyr. It is better to update existing code to explicitly call +\code{arrange(.locale = )} instead. Note that setting \code{dplyr.legacy_locale} will +also force calls to \code{\link[dplyr:arrange]{arrange()}} to use the system locale. +} + +} + +\examples{ +by_cyl <- mtcars \%>\% group_by(cyl) + +# grouping doesn't change how the data looks (apart from listing +# how it's grouped): +by_cyl + +# It changes how it acts with the other dplyr verbs: +by_cyl \%>\% summarise( + disp = mean(disp), + hp = mean(hp) +) +by_cyl \%>\% filter(disp == max(disp)) + +# Each call to summarise() removes a layer of grouping +by_vs_am <- mtcars \%>\% group_by(vs, am) +by_vs <- by_vs_am \%>\% summarise(n = n()) +by_vs +by_vs \%>\% summarise(n = sum(n)) + +# To remove grouping, use ungroup() +by_vs \%>\% + ungroup() \%>\% + summarise(n = sum(n)) + +# By default, group_by() overrides existing grouping +by_cyl \%>\% + group_by(vs, am) \%>\% + group_vars() + +# Use .add = TRUE to instead append +by_cyl \%>\% + group_by(vs, am, .add = TRUE) \%>\% + group_vars() + +# You can group by expressions: this is a short-hand +# for a mutate() followed by a group_by() +mtcars \%>\% + group_by(vsam = vs + am) + +# The implicit mutate() step is always performed on the +# ungrouped data. Here we get 3 groups: +mtcars \%>\% + group_by(vs) \%>\% + group_by(hp_cut = cut(hp, 3)) + +# If you want it to be performed by groups, +# you have to use an explicit mutate() call.
+# Here we get 3 groups per value of vs +mtcars \%>\% + group_by(vs) \%>\% + mutate(hp_cut = cut(hp, 3)) \%>\% + group_by(hp_cut) + +# when factors are involved and .drop = FALSE, groups can be empty +tbl <- tibble( + x = 1:10, + y = factor(rep(c("a", "c"), each = 5), levels = c("a", "b", "c")) +) +tbl \%>\% + group_by(y, .drop = FALSE) \%>\% + group_rows() +} +\seealso{ +Other grouping functions: +\code{\link[dplyr]{group_map}()}, +\code{\link[dplyr]{group_nest}()}, +\code{\link[dplyr]{group_split}()}, +\code{\link[dplyr]{group_trim}()} +} diff --git a/man/unnest.Rd b/man/unnest.Rd new file mode 100644 index 00000000..68da1d78 --- /dev/null +++ b/man/unnest.Rd @@ -0,0 +1,111 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tidyr_methods.R +\name{unnest} +\alias{unnest} +\alias{unnest.nested_tidybulk} +\title{Unnest a list-column of data frames into rows and columns} +\usage{ +\method{unnest}{nested_tidybulk}( + data, + cols, + ..., + keep_empty = FALSE, + ptype = NULL, + names_sep = NULL, + names_repair = "check_unique", + .drop, + .id, + .sep, + .preserve +) +} +\arguments{ +\item{data}{A data frame.} + +\item{cols}{<\code{\link[tidyr:tidyr_tidy_select]{tidy-select}}> List-columns to unnest. + +When selecting multiple columns, values from the same row will be recycled +to their common size.} + +\item{...}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}: +previously you could write \code{df \%>\% unnest(x, y, z)}. +Convert to \code{df \%>\% unnest(c(x, y, z))}. If you previously created a new +variable in \code{unnest()} you'll now need to do it explicitly with \code{mutate()}. +Convert \code{df \%>\% unnest(y = fun(x, y, z))} +to \code{df \%>\% mutate(y = fun(x, y, z)) \%>\% unnest(y)}.} + +\item{keep_empty}{By default, you get one row of output for each element +of the list that you are unchopping/unnesting. 
This means that if there's a +size-0 element (like \code{NULL} or an empty data frame or vector), then that +entire row will be dropped from the output. If you want to preserve all +rows, use \code{keep_empty = TRUE} to replace size-0 elements with a single row +of missing values.} + +\item{ptype}{Optionally, a named list of column name-prototype pairs to +coerce \code{cols} to, overriding the default that will be guessed from +combining the individual values. Alternatively, a single empty ptype +can be supplied, which will be applied to all \code{cols}.} + +\item{names_sep}{If \code{NULL}, the default, the outer names will come from the +inner names. If a string, the outer names will be formed by pasting +together the outer and the inner column names, separated by \code{names_sep}.} + +\item{names_repair}{Used to check that output data frame has valid +names. Must be one of the following options: +\itemize{ +\item \verb{"minimal}": no name repair or checks, beyond basic existence, +\item \verb{"unique}": make sure names are unique and not empty, +\item \verb{"check_unique}": (the default), no name repair, but check they are unique, +\item \verb{"universal}": make the names unique and syntactic +\item a function: apply custom name repair. +\item \link[tidyr]{tidyr_legacy}: use the name repair from tidyr 0.8. 
+\item a formula: a purrr-style anonymous function (see \code{\link[rlang:as_function]{rlang::as_function()}}) +} + +See \code{\link[vctrs:vec_as_names]{vctrs::vec_as_names()}} for more details on these terms and the +strategies used to enforce them.} + +\item{.drop, .preserve}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}: +all list-columns are now preserved; if there are any that you +don't want in the output use \code{select()} to remove them prior to +unnesting.} + +\item{.id}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}: +convert \code{df \%>\% unnest(x, .id = "id")} to \verb{df \%>\% mutate(id = names(x)) \%>\% unnest(x)}.} + +\item{.sep}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}: +use \code{names_sep} instead.} +} +\value{ +A \code{tidybulk} object +} +\description{ +Unnest expands a list-column containing data frames into rows and columns. +} +\section{New syntax}{ + + +tidyr 1.0.0 introduced a new syntax for \code{nest()} and \code{unnest()} that's +designed to be more similar to other functions. Converting to the new syntax +should be straightforward (guided by the message you'll receive) but if +you just need to run an old analysis, you can easily revert to the previous +behaviour using \code{\link[tidyr:nest_legacy]{nest_legacy()}} and \code{\link[tidyr:unnest_legacy]{unnest_legacy()}} as follows: + +\if{html}{\out{
}}\preformatted{library(tidyr) +nest <- nest_legacy +unnest <- unnest_legacy +}\if{html}{\out{
}} + +} + +\examples{ +data(se_mini) +se_mini |> tidybulk() |> nest( data = -.feature) |> unnest(data) + +} +\seealso{ +Other rectangling: +\code{\link[tidyr]{hoist}()}, +\code{\link[tidyr]{unnest_longer}()}, +\code{\link[tidyr]{unnest_wider}()} +} diff --git a/tests/testthat/test-bulk_methods.R b/tests/testthat/test-bulk_methods.R index 417c5827..e8023418 100755 --- a/tests/testthat/test-bulk_methods.R +++ b/tests/testthat/test-bulk_methods.R @@ -883,7 +883,7 @@ test_that("differential trancript abundance - random effects",{ head(4) |> expect_equal( c(0.1081176, 0.1303558, 0.1303558, 0.1693276), - tolerance=1e-3 + tolerance=1e-2 ) }) diff --git a/tests/testthat/test-bulk_methods_SummarizedExperiment.R b/tests/testthat/test-bulk_methods_SummarizedExperiment.R index 067bd6bc..49366aed 100755 --- a/tests/testthat/test-bulk_methods_SummarizedExperiment.R +++ b/tests/testthat/test-bulk_methods_SummarizedExperiment.R @@ -518,7 +518,7 @@ test_that("differential trancript abundance - random effects SE",{ head(4) |> expect_equal( c(0.1153254, 0.1668555, 0.1668555 , NA), - tolerance=1e-3 + tolerance=1e-2 ) }) From bd82a3c86cf10539fa8737089dbf8cc9f08c56eb Mon Sep 17 00:00:00 2001 From: chilampoon Date: Sat, 16 Sep 2023 22:48:54 -0400 Subject: [PATCH 2/7] rm ggplot2 dep and add .Rinstignore --- .Rinstignore | 1 + R/attach.R | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 .Rinstignore diff --git a/.Rinstignore b/.Rinstignore new file mode 100644 index 00000000..caf01ec9 --- /dev/null +++ b/.Rinstignore @@ -0,0 +1 @@ +dev/ \ No newline at end of file diff --git a/R/attach.R b/R/attach.R index 0543a45c..8e58b773 100644 --- a/R/attach.R +++ b/R/attach.R @@ -1,4 +1,4 @@ -core <- c("dplyr", "tidyr", "ttservice", "ggplot2") +core <- c("dplyr", "tidyr", "ttservice") core_unloaded <- function() { search <- paste0("package:", core) From 89b106f2df5e9f8453cc1c8361ea39683d14d192 Mon Sep 17 00:00:00 2001 From: chilampoon Date: Sun, 17 Sep 2023 11:06:07 -0400 
Subject: [PATCH 3/7] try to find the warning source --- dev/dplyr-master-methods.R | 1177 ------------------------------------ 1 file changed, 1177 deletions(-) delete mode 100755 dev/dplyr-master-methods.R diff --git a/dev/dplyr-master-methods.R b/dev/dplyr-master-methods.R deleted file mode 100755 index e24ef031..00000000 --- a/dev/dplyr-master-methods.R +++ /dev/null @@ -1,1177 +0,0 @@ - -#' Arrange rows by column values -#' -#' See \code{dpyr::\link[dpyr:arrange]{arrange}} for details. -#' -#' @description -#' `arrange()` order the rows of a data frame rows by the values of selected -#' columns. -#' -#' Unlike other dplyr verbs, `arrange()` largely ignores grouping; you -#' need to explicit mention grouping variables (or use `by_group = TRUE`) -#' in order to group by them, and functions of variables are evaluated -#' once per data frame, not once per group. -#' -#' @details -#' ## Locales -#' The sort order for character vectors will depend on the collating sequence -#' of the locale in use: see [locales()]. -#' -#' ## Missing values -#' Unlike base sorting with `sort()`, `NA` are: -#' * always sorted to the end for local data, even when wrapped with `desc()`. -#' * treated differently for remote data, depending on the backend. -#' -#' @return -#' An object of the same type as `.data`. -#' -#' * All rows appear in the output, but (usually) in a different place. -#' * Columns are not modified. -#' * Groups are not modified. -#' * Data frame attributes are preserved. -#' @section Methods: -#' This function is a **generic**, which means that packages can provide -#' implementations (methods) for other classes. See the documentation of -#' individual methods for extra arguments and differences in behaviour. -#' -#' The following methods are currently available in loaded packages: -#' \Sexpr[stage=render,results=Rd]{dplyr:::methods_rd("arrange")}. -#' @export -#' @param .data A data frame, data frame extension (e.g. a tibble), or a -#' lazy data frame (e.g. 
from dbplyr or dtplyr). See *Methods*, below, for -#' more details. -#' @param ... <[`tidy-eval`][dplyr_tidy_eval]> Variables, or functions or -#' variables. Use [desc()] to sort a variable in descending order. -#' @family single table verbs -#' @examples -#' arrange(mtcars, cyl, disp) -#' arrange(mtcars, desc(disp)) -#' -#' # grouped arrange ignores groups -#' by_cyl <- mtcars %>% group_by(cyl) -#' by_cyl %>% arrange(desc(wt)) -#' # Unless you specifically ask: -#' by_cyl %>% arrange(desc(wt), .by_group = TRUE) -arrange <- function(.data, ..., .by_group = FALSE) { - UseMethod("arrange") -} - -#' @param .by_group If `TRUE`, will sort first by grouping variable. Applies to -#' grouped data frames only. -#' @rdname arrange -#' @export -#' -############# START ADDED tidybulk ################################### - -arrange.default <- function(.data, ..., .by_group = FALSE) { - - dplyr::arrange(.data, ..., .by_group = .by_group) - -} - -#' @export -arrange.tidybulk <- function(.data, ..., .by_group = FALSE) { - - .data %>% - drop_class(c("tidybulk", "tt")) %>% - dplyr::arrange( ..., .by_group = .by_group) %>% - - # Attach attributes - reattach_internals() %>% - - # Add class - add_class("tt") %>% - add_class("tidybulk") - -} - -############# END ADDED tidybulk ##################################### - -#' Efficiently bind multiple data frames by row and column -#' -#' This is an efficient implementation of the common pattern of -#' `do.call(rbind, dfs)` or `do.call(cbind, dfs)` for binding many -#' data frames into one. -#' -#' The output of `bind_rows()` will contain a column if that column -#' appears in any of the inputs. -#' -#' @param ... Data frames to combine. -#' -#' Each argument can either be a data frame, a list that could be a data -#' frame, or a list of data frames. -#' -#' When row-binding, columns are matched by name, and any missing -#' columns will be filled with NA. 
-#' -#' When column-binding, rows are matched by position, so all data -#' frames must have the same number of rows. To match by value, not -#' position, see [mutate-joins]. -#' @param .id Data frame identifier. -#' -#' When `.id` is supplied, a new column of identifiers is -#' created to link each row to its original data frame. The labels -#' are taken from the named arguments to `bind_rows()`. When a -#' list of data frames is supplied, the labels are taken from the -#' names of the list. If no names are found a numeric sequence is -#' used instead. -#' @return `bind_rows()` and `bind_cols()` return the same type as -#' the first input, either a data frame, `tbl_df`, or `grouped_df`. -#' @examples -#' one <- mtcars[1:4, ] -#' two <- mtcars[11:14, ] -#' -#' # You can supply data frames as arguments: -#' bind_rows(one, two) -#' -#' # The contents of lists are spliced automatically: -#' bind_rows(list(one, two)) -#' bind_rows(split(mtcars, mtcars$cyl)) -#' bind_rows(list(one, two), list(two, one)) -#' -#' -#' # In addition to data frames, you can supply vectors. In the rows -#' # direction, the vectors represent rows and should have inner -#' # names: -#' bind_rows( -#' c(a = 1, b = 2), -#' c(a = 3, b = 4) -#' ) -#' -#' # You can mix vectors and data frames: -#' bind_rows( -#' c(a = 1, b = 2), -#' tibble(a = 3:4, b = 5:6), -#' c(a = 7, b = 8) -#' ) -#' -#' -#' # Note that for historical reasons, lists containing vectors are -#' # always treated as data frames. 
Thus their vectors are treated as -#' # columns rather than rows, and their inner names are ignored: -#' ll <- list( -#' a = c(A = 1, B = 2), -#' b = c(A = 3, B = 4) -#' ) -#' bind_rows(ll) -#' -#' # You can circumvent that behaviour with explicit splicing: -#' bind_rows(!!!ll) -#' -#' -#' # When you supply a column name with the `.id` argument, a new -#' # column is created to link each row to its original data frame -#' bind_rows(list(one, two), .id = "id") -#' bind_rows(list(a = one, b = two), .id = "id") -#' bind_rows("group 1" = one, "group 2" = two, .id = "groups") -#' -#' # Columns don't need to match when row-binding -#' bind_rows(data.frame(x = 1:3), data.frame(y = 1:4)) -#' \dontrun{ -#' # Rows do need to match when column-binding -#' bind_cols(data.frame(x = 1), data.frame(y = 1:2)) -#' } -#' -#' bind_cols(one, two) -#' bind_cols(list(one, two)) -#' @name bind -NULL - -#' @export -#' @rdname bind -#' @export -#' -############# START ADDED tidybulk ##################################### - -bind_rows <- function(..., .id = NULL) { - UseMethod("bind_rows") -} - -#' @export -bind_rows.default <- function(..., .id = NULL) -{ - bind_rows(..., .id = .id) -} - -#' @export -bind_rows.tidybulk <- function(..., .id = NULL) -{ - - tts = flatten_if(dots_values(...), is_spliced) # Original that fails Bioconductor dplyr:::flatten_bindable(rlang::dots_values(...)) - - par1 = tts[[1]] %>% get_tt_columns() %>% unlist - par2 = tts[[2]] %>% get_tt_columns() %>% unlist - - # tt_columns of the two objects must match - error_if_parameters_not_match(par1, par2) - - bind_rows(..., .id = .id) %>% - - # Attach attributes from the first object - add_attr(tts[[1]] %>% attr("internals"), "internals") - -} - -############# END ADDED tidybulk ##################################### - -#' @export -#' @rdname bind -############# START ADDED tidybulk ##################################### - -bind_cols <- function(..., .id = NULL) { - UseMethod("bind_cols") -} - -#' @export -bind_cols.default 
<- function(..., .id = NULL) -{ - bind_cols(..., .id = .id) -} - -#' @export -bind_cols.tidybulk <- function(..., .id = NULL) -{ - - tts = flatten_if(dots_values(...), is_spliced) # Original that fails Bioconductor dplyr:::flatten_bindable(rlang::dots_values(...)) - - bind_cols(..., .id = .id) %>% - - # Attach attributes - add_attr(tts[[1]] %>% attr("internals"), "internals") - -} - -############# END ADDED tidybulk ##################################### -############# START ADDED tidybulk ##################################### - -#' @importFrom dplyr arrange_all -#' @export -dplyr::arrange_all - -#' @importFrom dplyr arrange_at -#' @export -dplyr::arrange_at - -#' @importFrom dplyr arrange_if -#' @export -dplyr::arrange_if - -############# END ADDED tidybulk ##################################### -############# START ADDED tidybulk ##################################### - -#' distinct -#' @param .data A tbl. (See dplyr) -#' @param ... Data frames to combine (See dplyr) -#' @param .keep_all If TRUE, keep all variables in .data. If a combination of ... is not distinct, this keeps the first row of values. 
(See dplyr) -#' -#' @return A tt object -#' -#' @examples -#' -#' tidybulk::se_mini %>% tidybulk() %>% distinct() -#' -#' -#' @export -distinct <- function (.data, ..., .keep_all = FALSE) { - UseMethod("distinct") -} - -#' @export -distinct.default <- function (.data, ..., .keep_all = FALSE) -{ - dplyr::distinct(.data, ..., .keep_all = FALSE) -} - -#' @export -distinct.tidybulk <- function (.data, ..., .keep_all = FALSE) -{ - .data %>% - drop_class(c("tidybulk", "tt")) %>% - dplyr::distinct(..., .keep_all = .keep_all) %>% - - # Attach attributes - reattach_internals() %>% - - # Add class - add_class("tt") %>% - add_class("tidybulk") - -} -############# END ADDED tidybulk ##################################### - -############# START ADDED tidybulk ##################################### - -#' @importFrom dplyr distinct_all -#' @export -dplyr::distinct_all - -#' @importFrom dplyr distinct_at -#' @export -dplyr::distinct_at - -#' @importFrom dplyr distinct_if -#' @export -dplyr::distinct_if - -############# END ADDED tidybulk ##################################### - -#' Subset rows using column values -#' -#' `filter()` retains the rows where the conditions you provide a `TRUE`. Note -#' that, unlike base subsetting with `[`, rows where the condition evaluates -#' to `NA` are dropped. -#' -#' dplyr is not yet smart enough to optimise filtering optimisation -#' on grouped datasets that don't need grouped calculations. For this reason, -#' filtering is often considerably faster on [ungroup()]ed data. -#' -#' @section Useful filter functions: -#' -#' * [`==`], [`>`], [`>=`] etc -#' * [`&`], [`|`], [`!`], [xor()] -#' * [is.na()] -#' * [between()], [near()] -#' -#' @section Grouped tibbles: -#' -#' Because filtering expressions are computed within groups, they may -#' yield different results on grouped tibbles. This will be the case -#' as soon as an aggregating, lagging, or ranking function is -#' involved. 
Compare this ungrouped filtering: -#' -#' ``` -#' starwars %>% filter(mass > mean(mass, na.rm = TRUE)) -#' ``` -#' -#' With the grouped equivalent: -#' -#' ``` -#' starwars %>% group_by(gender) %>% filter(mass > mean(mass, na.rm = TRUE)) -#' ``` -#' -#' The former keeps rows with `mass` greater than the global average -#' whereas the latter keeps rows with `mass` greater than the gender -#' -#' average. -#' @family single table verbs -#' @inheritParams arrange -#' @param ... <[`tidy-eval`][dplyr_tidy_eval]> Logical predicates defined in -#' terms of the variables in `.data`. -#' Multiple conditions are combined with `&`. Only rows where the -#' condition evaluates to `TRUE` are kept. -#' @param .preserve when `FALSE` (the default), the grouping structure -#' is recalculated based on the resulting data, otherwise it is kept as is. -#' @return -#' An object of the same type as `.data`. -#' -#' * Rows are a subset of the input, but appear in the same order. -#' * Columns are not modified. -#' * The number of groups may be reduced (if `.preserve` is not `TRUE`). -#' * Data frame attributes are preserved. -#' @section Methods: -#' This function is a **generic**, which means that packages can provide -#' implementations (methods) for other classes. See the documentation of -#' individual methods for extra arguments and differences in behaviour. -#' -#' The following methods are currently available in loaded packages: -#' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("filter")}. -#' @seealso [filter_all()], [filter_if()] and [filter_at()]. 
-#' @export -#' @examples -#' filter(starwars, species == "Human") -#' filter(starwars, mass > 1000) -#' -#' # Multiple criteria -#' filter(starwars, hair_color == "none" & eye_color == "black") -#' filter(starwars, hair_color == "none" | eye_color == "black") -#' -#' # Multiple arguments are equivalent to and -#' filter(starwars, hair_color == "none", eye_color == "black") -#' -#' -#' # The filtering operation may yield different results on grouped -#' # tibbles because the expressions are computed within groups. -#' # -#' # The following filters rows where `mass` is greater than the -#' # global average: -#' starwars %>% filter(mass > mean(mass, na.rm = TRUE)) -#' -#' # Whereas this keeps rows with `mass` greater than the gender -#' # average: -#' starwars %>% group_by(gender) %>% filter(mass > mean(mass, na.rm = TRUE)) -#' -#' -#' # Refer to column names stored as strings with the `.data` pronoun: -#' vars <- c("mass", "height") -#' cond <- c(80, 150) -#' starwars %>% -#' filter( -#' .data[[vars[[1]]]] > cond[[1]], -#' .data[[vars[[2]]]] > cond[[2]] -#' ) -#' # Learn more in ?dplyr_tidy_eval -############# START ADDED tidybulk ##################################### -#' @export -filter <- function (.data, ..., .preserve = FALSE) { - UseMethod("filter") -} - -#' @export -filter.default <- function (.data, ..., .preserve = FALSE) -{ - dplyr::filter(.data, ..., .preserve = .preserve) -} - -#' @export -filter.tidybulk <- function (.data, ..., .preserve = FALSE) -{ - .data %>% - drop_class(c("tidybulk", "tt")) %>% - dplyr::filter( ..., .preserve = .preserve) %>% - - # Attach attributes - reattach_internals() %>% - - # Add class - add_class("tt") %>% - add_class("tidybulk") - -} -############# END ADDED tidybulk ##################################### - - -############# START ADDED tidybulk ##################################### - -#' @importFrom dplyr filter_all -#' @export -dplyr::filter_all - -#' @importFrom dplyr filter_at -#' @export -dplyr::filter_at - -#' 
@importFrom dplyr filter_if -#' @export -dplyr::filter_if - -############# END ADDED tidybulk ##################################### - -#' Group by one or more variables -#' -#' @description -#' Most data operations are done on groups defined by variables. -#' `group_by()` takes an existing tbl and converts it into a grouped tbl -#' where operations are performed "by group". `ungroup()` removes grouping. -#' -#' @family grouping functions -#' @inheritParams arrange -#' @param ... In `group_by()`, variables or computations to group by. -#' In `ungroup()`, variables to remove from the grouping. -#' @param .add When `FALSE`, the default, `group_by()` will -#' override existing groups. To add to the existing groups, use -#' `.add = TRUE`. -#' -#' This argument was previously called `add`, but that prevented -#' creating a new grouping variable called `add`, and conflicts with -#' our naming conventions. -#' @param .drop When `.drop = TRUE`, empty groups are dropped. See [group_by_drop_default()] for -#' what the default value is for this argument. -#' @return A [grouped data frame][grouped_df()], unless the combination of `...` and `add` -#' yields a non empty set of grouping columns, a regular (ungrouped) data frame -#' otherwise. -#' @section Methods: -#' These function are **generic**s, which means that packages can provide -#' implementations (methods) for other classes. See the documentation of -#' individual methods for extra arguments and differences in behaviour. -#' -#' Methods available in currently loaded packages: -#' -#' * `group_by()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("group_by")}. -#' * `ungroup()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("ungroup")}. 
-#' @export -#' @examples -#' by_cyl <- mtcars %>% group_by(cyl) -#' -#' # grouping doesn't change how the data looks (apart from listing -#' # how it's grouped): -#' by_cyl -#' -#' # It changes how it acts with the other dplyr verbs: -#' by_cyl %>% summarise( -#' disp = mean(disp), -#' hp = mean(hp) -#' ) -#' by_cyl %>% filter(disp == max(disp)) -#' -#' # Each call to summarise() removes a layer of grouping -#' by_vs_am <- mtcars %>% group_by(vs, am) -#' by_vs <- by_vs_am %>% summarise(n = n()) -#' by_vs -#' by_vs %>% summarise(n = sum(n)) -#' -#' # To removing grouping, use ungroup -#' by_vs %>% -#' ungroup() %>% -#' summarise(n = sum(n)) -#' -#' # You can group by expressions: this is just short-hand for -#' # a mutate() followed by a group_by() -#' mtcars %>% group_by(vsam = vs + am) -#' -#' # By default, group_by() overrides existing grouping -#' by_cyl %>% -#' group_by(vs, am) %>% -#' group_vars() -#' -#' # Use add = TRUE to instead append -#' by_cyl %>% -#' group_by(vs, am, .add = TRUE) %>% -#' group_vars() -#' -#' -#' # when factors are involved, groups can be empty -#' tbl <- tibble( -#' x = 1:10, -#' y = factor(rep(c("a", "c"), each = 5), levels = c("a", "b", "c")) -#' ) -#' tbl %>% -#' group_by(y) %>% -#' group_rows() -#' -############# START ADDED tidybulk ##################################### -#' @export -filter <- function (.data, ..., .preserve = FALSE) { - UseMethod("filter") -} - -#' @export -filter.default <- function (.data, ..., .preserve = FALSE) -{ - dplyr::filter(.data, ..., .preserve = .preserve) -} - -#' @export -filter.tidybulk <- function (.data, ..., .preserve = FALSE) -{ - .data %>% - drop_class(c("tidybulk", "tt")) %>% - dplyr::filter(..., .preserve = .preserve) %>% - - # Attach attributes - reattach_internals() %>% - - # Add class - add_class("tt") %>% - add_class("tidybulk") - -} -############# END ADDED tidybulk ##################################### - -#' Group by one or more variables -#' -#' @description -#' Most data operations 
are done on groups defined by variables. -#' `group_by()` takes an existing tbl and converts it into a grouped tbl -#' where operations are performed "by group". `ungroup()` removes grouping. -#' -#' @family grouping functions -#' @inheritParams arrange -#' @param ... In `group_by()`, variables or computations to group by. -#' In `ungroup()`, variables to remove from the grouping. -#' @param .add When `FALSE`, the default, `group_by()` will -#' override existing groups. To add to the existing groups, use -#' `.add = TRUE`. -#' -#' This argument was previously called `add`, but that prevented -#' creating a new grouping variable called `add`, and conflicts with -#' our naming conventions. -#' @param .drop When `.drop = TRUE`, empty groups are dropped. See [group_by_drop_default()] for -#' what the default value is for this argument. -#' @return A [grouped data frame][grouped_df()], unless the combination of `...` and `add` -#' yields a non empty set of grouping columns, a regular (ungrouped) data frame -#' otherwise. -#' @section Methods: -#' These function are **generic**s, which means that packages can provide -#' implementations (methods) for other classes. See the documentation of -#' individual methods for extra arguments and differences in behaviour. -#' -#' Methods available in currently loaded packages: -#' -#' * `group_by()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("group_by")}. -#' * `ungroup()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("ungroup")}. 
-#' @export -#' @examples -#' by_cyl <- mtcars %>% group_by(cyl) -#' -#' # grouping doesn't change how the data looks (apart from listing -#' # how it's grouped): -#' by_cyl -#' -#' # It changes how it acts with the other dplyr verbs: -#' by_cyl %>% summarise( -#' disp = mean(disp), -#' hp = mean(hp) -#' ) -#' by_cyl %>% filter(disp == max(disp)) -#' -#' # Each call to summarise() removes a layer of grouping -#' by_vs_am <- mtcars %>% group_by(vs, am) -#' by_vs <- by_vs_am %>% summarise(n = n()) -#' by_vs -#' by_vs %>% summarise(n = sum(n)) -#' -#' # To removing grouping, use ungroup -#' by_vs %>% -#' ungroup() %>% -#' summarise(n = sum(n)) -#' -#' # You can group by expressions: this is just short-hand for -#' # a mutate() followed by a group_by() -#' mtcars %>% group_by(vsam = vs + am) -#' -#' # By default, group_by() overrides existing grouping -#' by_cyl %>% -#' group_by(vs, am) %>% -#' group_vars() -#' -#' # Use add = TRUE to instead append -#' by_cyl %>% -#' group_by(vs, am, .add = TRUE) %>% -#' group_vars() -#' -#' -#' # when factors are involved, groups can be empty -#' tbl <- tibble( -#' x = 1:10, -#' y = factor(rep(c("a", "c"), each = 5), levels = c("a", "b", "c")) -#' ) -#' tbl %>% -#' group_by(y) %>% -#' group_rows() -############# START ADDED tidybulk ##################################### -#' @export -group_by <- function (.data, ..., .add = FALSE, .drop = group_by_drop_default(.data)) { - UseMethod("group_by") -} - -#' @export -group_by.default <- function (.data, ..., .add = FALSE, .drop = group_by_drop_default(.data)) -{ - dplyr::group_by(.data, ..., .add = .add, .drop = .drop) -} - -#' @export -group_by.tidybulk <- function (.data, ..., .add = FALSE, .drop = group_by_drop_default(.data)) -{ - .data %>% - drop_class(c("tidybulk", "tt")) %>% - dplyr::group_by( ..., .add = .add, .drop = .drop) %>% - - # Attach attributes - reattach_internals() %>% - - # Add class - add_class("tt") %>% - add_class("tidybulk") - -} -############# END ADDED tidybulk 
##################################### - - -#' @rdname group_by -#' @export -#' @param x A [tbl()] -ungroup <- function(x, ...) { - UseMethod("ungroup") -} -############# START ADDED tidybulk ##################################### - -#' @export -ungroup.tidybulk <- function (.data, ...) -{ - .data %>% - drop_class(c("tidybulk", "tt")) %>% - dplyr::ungroup( ...) %>% - - # Attach attributes - reattach_internals() %>% - - # Add class - add_class("tt") %>% - add_class("tidybulk") - -} -############# END ADDED tidybulk ##################################### - -############# START ADDED tidybulk ##################################### - -#' @importFrom dplyr group_by_all -#' @export -dplyr::group_by_all - -#' @importFrom dplyr group_by_at -#' @export -dplyr::group_by_at - -#' @importFrom dplyr group_by_if -#' @export -dplyr::group_by_if - -############# END ADDED tidybulk ##################################### - -#' Summarise each group to fewer rows -#' -#' @description -#' `summarise()` creates a new data frame. It will have one (or more) rows for -#' each combination of grouping variables; if there are no grouping variables, -#' the output will have a single row summarising all observations in the input. -#' It will contain one column for each grouping variable and one column -#' for each of the summary statistics that you have specified. -#' -#' `summarise()` and `summarize()` are synonyms. -#' -#' @section Useful functions: -#' -#' * Center: [mean()], [median()] -#' * Spread: [sd()], [IQR()], [mad()] -#' * Range: [min()], [max()], [quantile()] -#' * Position: [first()], [last()], [nth()], -#' * Count: [n()], [n_distinct()] -#' * Logical: [any()], [all()] -#' -#' @section Backend variations: -#' -#' The data frame backend supports creating a variable and using it in the -#' same summary. This means that previously created summary variables can be -#' further transformed or combined within the summary, as in [mutate()]. 
-#' However, it also means that summary variables with the same names as previous -#' variables overwrite them, making those variables unavailable to later summary -#' variables. -#' -#' This behaviour may not be supported in other backends. To avoid unexpected -#' results, consider using new names for your summary variables, especially when -#' creating multiple summaries. -#' -#' @export -#' @inheritParams arrange -#' @param ... <[`tidy-eval`][dplyr_tidy_eval]> Name-value pairs of summary -#' functions. The name will be the name of the variable in the result. -#' -#' The value can be: -#' -#' * A vector of length 1, e.g. `min(x)`, `n()`, or `sum(is.na(y))`. -#' * A vector of length `n`, e.g. `quantile()`. -#' * A data frame, to add multiple columns from a single expression. -#' @family single table verbs -#' @return -#' An object _usually_ of the same type as `.data`. -#' -#' * The rows come from the underlying `group_keys()`. -#' * The columns are a combination of the grouping keys and the summary -#' expressions that you provide. -#' * If `x` is grouped by more than one variable, the output will be another -#' [grouped_df] with the right-most group removed. -#' * If `x` is grouped by one variable, or is not grouped, the output will -#' be a [tibble]. -#' * Data frame attributes are **not** preserved, because `summarise()` -#' fundamentally creates a new data frame. -#' @section Methods: -#' This function is a **generic**, which means that packages can provide -#' implementations (methods) for other classes. See the documentation of -#' individual methods for extra arguments and differences in behaviour. -#' -#' The following methods are currently available in loaded packages: -#' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("summarise")}. 
-#' @examples -#' # A summary applied to ungrouped tbl returns a single row -#' mtcars %>% -#' summarise(mean = mean(disp), n = n()) -#' -#' # Usually, you'll want to group first -#' mtcars %>% -#' group_by(cyl) %>% -#' summarise(mean = mean(disp), n = n()) -#' -#' # dplyr 1.0.0 allows to summarise to more than one value: -#' mtcars %>% -#' group_by(cyl) %>% -#' summarise(qs = quantile(disp, c(0.25, 0.75)), prob = c(0.25, 0.75)) -#' -#' # You use a data frame to create multiple columns so you can wrap -#' # this up into a function: -#' my_quantile <- function(x, probs) { -#' tibble(x = quantile(x, probs), probs = probs) -#' } -#' mtcars %>% -#' group_by(cyl) %>% -#' summarise(my_quantile(disp, c(0.25, 0.75))) -#' -#' # Each summary call removes one grouping level (since that group -#' # is now just a single row) -#' mtcars %>% -#' group_by(cyl, vs) %>% -#' summarise(cyl_n = n()) %>% -#' group_vars() -#' -#' # BEWARE: reusing variables may lead to unexpected results -#' mtcars %>% -#' group_by(cyl) %>% -#' summarise(disp = mean(disp), sd = sd(disp)) -#' -#' # Refer to column names stored as strings with the `.data` pronoun: -#' var <- "mass" -#' summarise(starwars, avg = mean(.data[[var]], na.rm = TRUE)) -#' # Learn more in ?dplyr_tidy_eval -############# START ADDED tidybulk ##################################### -#' @export -summarise <- function (.data, ...) { - UseMethod("summarise") -} - -#' @export -summarise.default <- function (.data, ...) -{ - dplyr::summarise(.data, ...) -} - -#' @export -summarise.tidybulk <- function (.data, ...) -{ - .data %>% - drop_class(c("tidybulk", "tt")) %>% - dplyr::summarise( ...) 
%>% - - # Attach attributes - reattach_internals() %>% - - # Add class - add_class("tt") %>% - add_class("tidybulk") - -} -############# END ADDED tidybulk ##################################### - -############# START ADDED tidybulk ##################################### - -#' @importFrom dplyr summarize_all -#' @export -dplyr::summarize_all - -#' @importFrom dplyr summarize_at -#' @export -dplyr::summarize_at - -#' @importFrom dplyr summarize_if -#' @export -dplyr::summarize_if - -############# END ADDED tidybulk ##################################### - -#' @rdname summarise_all -#' @export -summarize_all <- summarise_all -#' @rdname summarise_all -#' @export -summarize_if <- summarise_if -#' @rdname summarise_all -#' @export -summarize_at <- summarise_at - -#' Create, modify, and delete columns -#' -#' `mutate()` adds new variables and preserves existing ones; -#' `transmute()` adds new variables and drops existing ones. -#' New variables overwrite existing variables of the same name. -#' Variables can be removed by setting their value to `NULL`. -#' -#' @section Useful mutate functions: -#' -#' * [`+`], [`-`], [log()], etc., for their usual mathematical meanings -#' -#' * [lead()], [lag()] -#' -#' * [dense_rank()], [min_rank()], [percent_rank()], [row_number()], -#' [cume_dist()], [ntile()] -#' -#' * [cumsum()], [cummean()], [cummin()], [cummax()], [cumany()], [cumall()] -#' -#' * [na_if()], [coalesce()] -#' -#' * [if_else()], [recode()], [case_when()] -#' -#' @section Grouped tibbles: -#' -#' Because mutating expressions are computed within groups, they may -#' yield different results on grouped tibbles. This will be the case -#' as soon as an aggregating, lagging, or ranking function is -#' involved. 
Compare this ungrouped mutate: -#' -#' ``` -#' starwars %>% -#' mutate(mass / mean(mass, na.rm = TRUE)) %>% -#' pull() -#' ``` -#' -#' With the grouped equivalent: -#' -#' ``` -#' starwars %>% -#' group_by(gender) %>% -#' mutate(mass / mean(mass, na.rm = TRUE)) %>% -#' pull() -#' ``` -#' -#' The former normalises `mass` by the global average whereas the -#' latter normalises by the averages within gender levels. -#' -#' @export -#' @inheritParams arrange -#' @param ... <[`tidy-eval`][dplyr_tidy_eval]> Name-value pairs. -#' The name gives the name of the column in the output. -#' -#' The value can be: -#' -#' * A vector of length 1, which will be recycled to the correct length. -#' * A vector the same length as the current group (or the whole data frame -#' if ungrouped). -#' * `NULL`, to remove the column. -#' * A data frame or tibble, to create multiple columns in the output. -#' -#' @family single table verbs -#' @return -#' An object of the same type as `.data`. -#' -#' For `mutate()`: -#' -#' * Rows are not affected. -#' * Existing columns will be preserved unless explicitly modified. -#' * New columns will be added to the right of existing columns. -#' * Columns given value `NULL` will be removed -#' * Groups will be recomputed if a grouping variable is mutated. -#' * Data frame attributes are preserved. -#' -#' For `transmute()`: -#' -#' * Rows are not affected. -#' * Apart from grouping variables, existing columns will be remove unless -#' explicitly kept. -#' * Column order matches order of expressions. -#' * Groups will be recomputed if a grouping variable is mutated. -#' * Data frame attributes are preserved. -#' @section Methods: -#' These function are **generic**s, which means that packages can provide -#' implementations (methods) for other classes. See the documentation of -#' individual methods for extra arguments and differences in behaviour. 
-#' -#' Methods available in currently loaded packages: -#' -#' * `mutate()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("mutate")}. -#' * `transmute()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("transmute")}. -#' @examples -#' # Newly created variables are available immediately -#' mtcars %>% as_tibble() %>% mutate( -#' cyl2 = cyl * 2, -#' cyl4 = cyl2 * 2 -#' ) -#' -#' # As well as adding new variables, you can use mutate() to -#' # remove variables and modify existing variables. -#' mtcars %>% as_tibble() %>% mutate( -#' mpg = NULL, -#' disp = disp * 0.0163871 # convert to litres -#' ) -#' -#' # window functions are useful for grouped mutates -#' mtcars %>% -#' group_by(cyl) %>% -#' mutate(rank = min_rank(desc(mpg))) -#' # see `vignette("window-functions")` for more details -#' -#' # mutate() vs transmute -------------------------- -#' # mutate() keeps all existing variables -#' mtcars %>% -#' mutate(displ_l = disp / 61.0237) -#' -#' # transmute keeps only the variables you create -#' mtcars %>% -#' transmute(displ_l = disp / 61.0237) -#' -#' # Grouping ---------------------------------------- -#' # The mutate operation may yield different results on grouped -#' # tibbles because the expressions are computed within groups. -#' # The following normalises `mass` by the global average: -#' starwars %>% -#' mutate(mass / mean(mass, na.rm = TRUE)) %>% -#' pull() -#' -#' # Whereas this normalises `mass` by the averages within gender -#' # levels: -#' starwars %>% -#' group_by(gender) %>% -#' mutate(mass / mean(mass, na.rm = TRUE)) %>% -#' pull() -#' -#' # Indirection ---------------------------------------- -#' # Refer to column names stored as strings with the `.data` pronoun: -#' vars <- c("mass", "height") -#' mutate(starwars, prod = .data[[vars[[1]]]] * .data[[vars[[2]]]]) -#' # Learn more in ?dplyr_tidy_eval -############# START ADDED tidybulk ##################################### -#' @export -mutate <- function(.data, ...) 
{ - UseMethod("mutate") -} - -#' @export -mutate.default <- function(.data, ...) -{ - dplyr::mutate(.data, ...) -} - -#' @export -mutate.tidybulk <- function(.data, ...) -{ - .data %>% - drop_class(c("tidybulk", "tt")) %>% - dplyr::mutate(...) %>% - - # Attach attributes - reattach_internals() %>% - - # Add class - add_class("tt") %>% - add_class("tidybulk") - - -} -############# END ADDED tidybulk ##################################### - -############# START ADDED tidybulk ##################################### - -#' @importFrom dplyr mutate_all -#' @export -dplyr::mutate_all - -#' @importFrom dplyr mutate_at -#' @export -dplyr::mutate_at - -#' @importFrom dplyr mutate_if -#' @export -dplyr::mutate_if - -############# END ADDED tidybulk ##################################### - -#' Rename columns -#' -#' Rename individual variables using `new_name = old_name` syntax. -#' -#' @section Scoped selection and renaming: -#' -#' Use the three scoped variants ([rename_all()], [rename_if()], [rename_at()]) -#' to renaming a set of variables with a function. -#' -#' @inheritParams arrange -#' @param ... <[`tidy-select`][dplyr_tidy_select]> Use `new_name = old_name` -#' to rename selected variables. -#' @return -#' An object of the same type as `.data`. -#' * Rows are not affected. -#' * Column names are changed; column order is preserved -#' * Data frame attributes are preserved. -#' * Groups are updated to reflect new names. -#' @section Methods: -#' This function is a **generic**, which means that packages can provide -#' implementations (methods) for other classes. See the documentation of -#' individual methods for extra arguments and differences in behaviour. -#' -#' The following methods are currently available in loaded packages: -#' \Sexpr[stage=render,results=Rd]{dplyr:::methods_rd("rename")}. 
-#' @family single table verbs -#' @export -#' @examples -#' iris <- as_tibble(iris) # so it prints a little nicer -#' rename(iris, petal_length = Petal.Length) -############# START ADDED tidybulk ##################################### -#' @export -rename <- function(.data, ...) { - UseMethod("rename") -} - -#' @export -rename.default <- function(.data, ...) -{ - dplyr::rename(.data, ...) -} - -#' @export -rename.tidybulk <- function(.data, ...) -{ - .data %>% - drop_class(c("tidybulk", "tt")) %>% - dplyr::rename(...) %>% - - # Attach attributes - reattach_internals() %>% - - # Add class - add_class("tt") %>% - add_class("tidybulk") - - -} -############# END ADDED tidybulk ##################################### - -#' Group input by rows -#' -#' \Sexpr[results=rd, stage=render]{lifecycle::badge("questioning")} -#' -#' See [this repository](https://github.com/jennybc/row-oriented-workflows) -#' for alternative ways to perform row-wise operations. -#' -#' `rowwise()` is used for the results of [do()] when you -#' create list-variables. It is also useful to support arbitrary -#' complex operations that need to be applied to each row. -#' -#' Currently, rowwise grouping only works with data frames. Its -#' main impact is to allow you to work with list-variables in -#' [summarise()] and [mutate()] without having to -#' use \code{[[1]]}. This makes `summarise()` on a rowwise tbl -#' effectively equivalent to [plyr::ldply()]. -#' -#' @param data Input data frame. 
-#' @export -#' @examples -#' df <- expand.grid(x = 1:3, y = 3:1) -#' df_done <- df %>% rowwise() %>% do(i = seq(.$x, .$y)) -#' df_done -#' df_done %>% summarise(n = length(i)) -############# START ADDED tidybulk ##################################### -#' @export -rowwise <- function(.data) { - UseMethod("rowwise") -} - -#' @export -rowwise.default <- function(.data) -{ - dplyr::rowwise(.data) -} - -#' @export -rowwise.tidybulk <- function(.data) -{ - .data %>% - drop_class(c("tidybulk", "tt")) %>% - dplyr::rowwise() %>% - - # Attach attributes - reattach_internals() %>% - - # Add class - add_class("tt") %>% - add_class("tidybulk") - - -} -############# END ADDED tidybulk ##################################### From d7690dd212b7a1d6b68813f9be3b9445138b72a6 Mon Sep 17 00:00:00 2001 From: chilampoon Date: Mon, 18 Sep 2023 23:11:59 -0400 Subject: [PATCH 4/7] update R ver and check-bioc --- .github/workflows/check-bioc.yml | 158 ++-- DESCRIPTION | 4 +- dev/dplyr-master-methods.R | 1177 ++++++++++++++++++++++++++++++ 3 files changed, 1288 insertions(+), 51 deletions(-) create mode 100644 dev/dplyr-master-methods.R diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index 7c7151e4..e432bb90 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -39,6 +39,7 @@ env: run_pkgdown: 'true' has_RUnit: 'false' cache-version: 'cache-v1' + run_docker: 'false' jobs: build-check: @@ -51,9 +52,11 @@ jobs: fail-fast: false matrix: config: - - { os: ubuntu-latest, r: '4.2', bioc: '3.16', cont: "bioconductor/bioconductor_docker:RELEASE_3_16", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } - - { os: macOS-latest, r: '4.2', bioc: '3.16'} - - { os: windows-latest, r: '4.2', bioc: '3.16'} + - { os: ubuntu-latest, r: '4.3', bioc: '3.18', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/jammy/latest" } + - { os: macOS-latest, r: '4.3', bioc: '3.18'} + - 
{ os: windows-latest, r: '4.3', bioc: '3.18'} + ## Check https://github.com/r-lib/actions/tree/master/examples + ## for examples using the http-user-agent env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true RSPM: ${{ matrix.config.rspm }} @@ -76,7 +79,7 @@ jobs: ## https://github.com/r-lib/actions/blob/master/examples/check-standard.yaml ## If they update their steps, we will also need to update ours. - name: Checkout Repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 ## R is already included in the Bioconductor docker images - name: Setup R from r-lib @@ -84,6 +87,7 @@ jobs: uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.r }} + http-user-agent: ${{ matrix.config.http-user-agent }} ## pandoc is already included in the Bioconductor docker images - name: Setup pandoc from r-lib @@ -96,28 +100,28 @@ jobs: saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) shell: Rscript {0} - - name: Cache R packages + - name: Restore R package cache if: "!contains(github.event.head_commit.message, '/nocache') && runner.os != 'Linux'" uses: actions/cache@v3 with: path: ${{ env.R_LIBS_USER }} - key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2- + key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.3-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.3- - name: Cache R packages on Linux if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' " uses: actions/cache@v3 with: path: /home/runner/work/_temp/Library - key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2- + key: ${{ env.cache-version }}-${{ 
runner.os }}-biocversion-devel-r-4.3-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.3- - - name: Install Linux system dependencies - if: runner.os == 'Linux' - run: | - sysreqs=$(Rscript -e 'cat("apt-get update -y && apt-get install -y", paste(gsub("apt-get install -y ", "", remotes::system_requirements("ubuntu", "20.04")), collapse = " "))') - echo $sysreqs - sudo -s eval "$sysreqs" + # - name: Install Linux system dependencies + # if: runner.os == 'Linux' + # run: | + # sysreqs=$(Rscript -e 'cat("apt-get update -y && apt-get install -y", paste(gsub("apt-get install -y ", "", remotes::system_requirements("ubuntu", "20.04")), collapse = " "))') + # echo $sysreqs + # sudo -s eval "$sysreqs" - name: Install macOS system dependencies if: matrix.config.os == 'macOS-latest' @@ -135,10 +139,9 @@ jobs: ## For installing usethis's dependency gert brew install libgit2 - - ## To fix x11/cairo error with tidyHeatmap/Complexheatmap here https://github.com/stemangiola/tidybulk/runs/1388237421?check_suite_focus=true#step:14:2134 - ## Suggested here https://stackoverflow.com/questions/63648591/how-to-install-x11-before-testing-with-github-actions-for-macos - brew install --cask xquartz + + ## Required for tcltk + brew install xquartz --cask - name: Install Windows system dependencies if: runner.os == 'Windows' @@ -154,7 +157,7 @@ jobs: - name: Set BiocVersion run: | - BiocManager::install(version = "${{ matrix.config.bioc }}", ask = FALSE) + BiocManager::install(version = "${{ matrix.config.bioc }}", ask = FALSE, force = TRUE) shell: Rscript {0} - name: Install dependencies pass 1 @@ -166,9 +169,13 @@ jobs: ## https://github.com/r-lib/remotes/issues/296 ## Ideally, all dependencies should get installed in the first pass. 
+ ## For running the checks + message(paste('****', Sys.time(), 'installing rcmdcheck and BiocCheck ****')) + install.packages(c("rcmdcheck", "BiocCheck"), repos = BiocManager::repositories()) + ## Pass #1 at installing dependencies message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****')) - remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE) + remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE) continue-on-error: true shell: Rscript {0} @@ -176,12 +183,7 @@ jobs: run: | ## Pass #2 at installing dependencies message(paste('****', Sys.time(), 'pass number 2 at installing dependencies: any remaining dependencies ****')) - remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE) - - ## For running the checks - message(paste('****', Sys.time(), 'installing rcmdcheck and BiocCheck ****')) - remotes::install_cran("rcmdcheck") - BiocManager::install("BiocCheck") + remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE, force = TRUE) shell: Rscript {0} - name: Install BiocGenerics @@ -192,15 +194,15 @@ jobs: shell: Rscript {0} - name: Install covr - if: github.ref == 'refs/heads/master' && env.run_covr == 'true' && runner.os == 'Linux' + if: github.ref == 'refs/heads/devel' && env.run_covr == 'true' && runner.os == 'Linux' run: | remotes::install_cran("covr") shell: Rscript {0} - name: Install pkgdown - if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux' + if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux' run: | - remotes::install_cran("pkgdown") + remotes::install_github("r-lib/pkgdown") shell: Rscript {0} - name: Session info @@ -213,10 +215,12 @@ jobs: - name: Run CMD check env: 
_R_CHECK_CRAN_INCOMING_: false + DISPLAY: 99.0 run: | + options(crayon.enabled = TRUE) rcmdcheck::rcmdcheck( - args = c("--no-build-vignettes", "--no-manual", "--timings"), - build_args = c("--no-manual", "--no-resave-data"), + args = c("--no-manual", "--no-vignettes", "--timings"), + build_args = c("--no-manual", "--keep-empty-dirs", "--no-resave-data"), error_on = "warning", check_dir = "check" ) @@ -234,45 +238,101 @@ jobs: shell: Rscript {0} - name: Run BiocCheck + env: + DISPLAY: 99.0 run: | BiocCheck::BiocCheck( dir('check', 'tar.gz$', full.names = TRUE), `quit-with-status` = TRUE, `no-check-R-ver` = TRUE, - `no-check-bioc-help` = TRUE, - `no-check-library-calls` = TRUE, - `no-check-coding-practices` = TRUE + `no-check-bioc-help` = TRUE ) shell: Rscript {0} - name: Test coverage - if: github.ref == 'refs/heads/master' && env.run_covr == 'true' && runner.os == 'Linux' + if: github.ref == 'refs/heads/devel' && env.run_covr == 'true' && runner.os == 'Linux' run: | covr::codecov() shell: Rscript {0} - name: Install package - if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux' + if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux' run: R CMD INSTALL . 
- - name: Deploy package - if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux' - run: | - ## Temporary workaround for https://github.com/actions/checkout/issues/766 - git config --global --add safe.directory "$GITHUB_WORKSPACE" - - git config --local user.email "actions@github.com" - git config --local user.name "GitHub Actions" - Rscript -e "pkgdown::deploy_to_branch(new_process = FALSE)" - shell: bash {0} + - name: Build pkgdown site + if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux' + run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) + shell: Rscript {0} ## Note that you need to run pkgdown::deploy_to_branch(new_process = FALSE) ## at least one locally before this will work. This creates the gh-pages ## branch (erasing anything you haven't version controlled!) and ## makes the git history recognizable by pkgdown. + - name: Install deploy dependencies + if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux' + run: | + apt-get update && apt-get -y install rsync + + - name: Deploy pkgdown site to GitHub pages 🚀 + if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux' + uses: JamesIves/github-pages-deploy-action@releases/v4 + with: + clean: false + branch: gh-pages + folder: docs + - name: Upload check results if: failure() - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@master with: - name: ${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2-results + name: ${{ runner.os }}-biocversion-devel-r-4.3-results path: check + + + ## Code adapted from + ## https://github.com/waldronlab/cBioPortalData/blob/e0440a4445f0cc731e426363a76faa22ee5e0f9d/.github/workflows/devel_check_dock.yml#L65-L92 + docker-build-and-push: + runs-on: ubuntu-latest + needs: build-check + steps: + - name: Checkout Repository + if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 
'true' && github.ref == 'refs/heads/devel'" + uses: actions/checkout@v3 + + - name: Register repo name + if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'" + id: reg_repo_name + run: | + echo CONT_IMG_NAME=$(echo ${{ github.event.repository.name }} | tr '[:upper:]' '[:lower:]') >> $GITHUB_ENV + + - name: Set up QEMU + if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'" + uses: docker/setup-qemu-action@v2 + + - name: Set up Docker Buildx + if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'" + uses: docker/setup-buildx-action@v2 + + - name: Login to Docker Hub + if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'" + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + ## Note that DOCKERHUB_TOKEN is really a token for your dockerhub + ## account, not your actual dockerhub account password. You can get it + ## from https://hub.docker.com/settings/security. + ## Check https://github.com/docker/build-push-action/tree/v4.0.0 + ## for more details. + ## Alternatively, try checking + ## https://seandavi.github.io/BuildABiocWorkshop/articles/HOWTO_BUILD_WORKSHOP.html. + + - name: Build and Push Docker + if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel' && success()" + uses: docker/build-push-action@v4 + with: + context: . 
+ push: true + tags: > + ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.CONT_IMG_NAME }}:latest, + ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.CONT_IMG_NAME }}:devel \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index ecbd460e..b5604bdb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,8 +12,8 @@ Description: This is a collection of utility functions that allow a modular, pipe-friendly and tidy fashion. License: GPL-3 Depends: - R (>= 4.1.0), - ttservice (>= 0.3.6) + R (>= 4.3.0), + ttservice Imports: tibble, readr, diff --git a/dev/dplyr-master-methods.R b/dev/dplyr-master-methods.R new file mode 100644 index 00000000..e24ef031 --- /dev/null +++ b/dev/dplyr-master-methods.R @@ -0,0 +1,1177 @@ + +#' Arrange rows by column values +#' +#' See \code{dplyr::\link[dplyr:arrange]{arrange}} for details. +#' +#' @description +#' `arrange()` orders the rows of a data frame by the values of selected +#' columns. +#' +#' Unlike other dplyr verbs, `arrange()` largely ignores grouping; you +#' need to explicitly mention grouping variables (or use `.by_group = TRUE`) +#' in order to group by them, and functions of variables are evaluated +#' once per data frame, not once per group. +#' +#' @details +#' ## Locales +#' The sort order for character vectors will depend on the collating sequence +#' of the locale in use: see [locales()]. +#' +#' ## Missing values +#' Unlike base sorting with `sort()`, `NA` are: +#' * always sorted to the end for local data, even when wrapped with `desc()`. +#' * treated differently for remote data, depending on the backend. +#' +#' @return +#' An object of the same type as `.data`. +#' +#' * All rows appear in the output, but (usually) in a different place. +#' * Columns are not modified. +#' * Groups are not modified. +#' * Data frame attributes are preserved. +#' @section Methods: +#' This function is a **generic**, which means that packages can provide +#' implementations (methods) for other classes.
See the documentation of
+#' individual methods for extra arguments and differences in behaviour.
+#'
+#' The following methods are currently available in loaded packages:
+#' \Sexpr[stage=render,results=Rd]{dplyr:::methods_rd("arrange")}.
+#' @export
+#' @param .data A data frame, data frame extension (e.g. a tibble), or a
+#'   lazy data frame (e.g. from dbplyr or dtplyr). See *Methods*, below, for
+#'   more details.
+#' @param ... <[`tidy-eval`][dplyr_tidy_eval]> Variables, or functions of
+#'   variables. Use [desc()] to sort a variable in descending order.
+#' @family single table verbs
+#' @examples
+#' arrange(mtcars, cyl, disp)
+#' arrange(mtcars, desc(disp))
+#'
+#' # grouped arrange ignores groups
+#' by_cyl <- mtcars %>% group_by(cyl)
+#' by_cyl %>% arrange(desc(wt))
+#' # Unless you specifically ask:
+#' by_cyl %>% arrange(desc(wt), .by_group = TRUE)
+arrange <- function(.data, ..., .by_group = FALSE) UseMethod("arrange")
+
+#' @param .by_group If `TRUE`, will sort first by grouping variable. Applies to
+#'   grouped data frames only.
+#' @rdname arrange
+#' @export
+#'
+############# START ADDED tidybulk ###################################
+
+arrange.default <- function(.data, ..., .by_group = FALSE)
+{
+  # Fallback method: hand every argument straight to dplyr
+  dplyr::arrange(.data, ..., .by_group = .by_group)
+}
+
+#' @export
+arrange.tidybulk <- function(.data, ..., .by_group = FALSE)
+{
+  # Drop the tidybulk/tt classes before handing the table to dplyr
+  plain_tbl <- drop_class(.data, c("tidybulk", "tt"))
+  sorted_tbl <- dplyr::arrange(plain_tbl, ..., .by_group = .by_group)
+
+  # Attach attributes
+  sorted_tbl <- reattach_internals(sorted_tbl)
+
+  # Add class ("tt" first, then "tidybulk", same order as before)
+  add_class(add_class(sorted_tbl, "tt"), "tidybulk")
+}
+
+############# END ADDED tidybulk #####################################
+
+#' Efficiently bind multiple data frames by row and column
+#'
+#' This is an efficient implementation of the common pattern of
+#' `do.call(rbind, dfs)` or `do.call(cbind, dfs)` for binding many
+#' data frames into one.
+#'
+#' The output of `bind_rows()` will contain a column if that column
+#' appears in any of the inputs.
+#' +#' @param ... Data frames to combine. +#' +#' Each argument can either be a data frame, a list that could be a data +#' frame, or a list of data frames. +#' +#' When row-binding, columns are matched by name, and any missing +#' columns will be filled with NA. +#' +#' When column-binding, rows are matched by position, so all data +#' frames must have the same number of rows. To match by value, not +#' position, see [mutate-joins]. +#' @param .id Data frame identifier. +#' +#' When `.id` is supplied, a new column of identifiers is +#' created to link each row to its original data frame. The labels +#' are taken from the named arguments to `bind_rows()`. When a +#' list of data frames is supplied, the labels are taken from the +#' names of the list. If no names are found a numeric sequence is +#' used instead. +#' @return `bind_rows()` and `bind_cols()` return the same type as +#' the first input, either a data frame, `tbl_df`, or `grouped_df`. +#' @examples +#' one <- mtcars[1:4, ] +#' two <- mtcars[11:14, ] +#' +#' # You can supply data frames as arguments: +#' bind_rows(one, two) +#' +#' # The contents of lists are spliced automatically: +#' bind_rows(list(one, two)) +#' bind_rows(split(mtcars, mtcars$cyl)) +#' bind_rows(list(one, two), list(two, one)) +#' +#' +#' # In addition to data frames, you can supply vectors. In the rows +#' # direction, the vectors represent rows and should have inner +#' # names: +#' bind_rows( +#' c(a = 1, b = 2), +#' c(a = 3, b = 4) +#' ) +#' +#' # You can mix vectors and data frames: +#' bind_rows( +#' c(a = 1, b = 2), +#' tibble(a = 3:4, b = 5:6), +#' c(a = 7, b = 8) +#' ) +#' +#' +#' # Note that for historical reasons, lists containing vectors are +#' # always treated as data frames. 
Thus their vectors are treated as
+#' # columns rather than rows, and their inner names are ignored:
+#' ll <- list(
+#'   a = c(A = 1, B = 2),
+#'   b = c(A = 3, B = 4)
+#' )
+#' bind_rows(ll)
+#'
+#' # You can circumvent that behaviour with explicit splicing:
+#' bind_rows(!!!ll)
+#'
+#'
+#' # When you supply a column name with the `.id` argument, a new
+#' # column is created to link each row to its original data frame
+#' bind_rows(list(one, two), .id = "id")
+#' bind_rows(list(a = one, b = two), .id = "id")
+#' bind_rows("group 1" = one, "group 2" = two, .id = "groups")
+#'
+#' # Columns don't need to match when row-binding
+#' bind_rows(data.frame(x = 1:3), data.frame(y = 1:4))
+#' \dontrun{
+#' # Rows do need to match when column-binding
+#' bind_cols(data.frame(x = 1), data.frame(y = 1:2))
+#' }
+#'
+#' bind_cols(one, two)
+#' bind_cols(list(one, two))
+#' @name bind
+NULL
+
+#' @export
+#' @rdname bind
+#' @export
+#'
+############# START ADDED tidybulk #####################################
+
+bind_rows <- function(..., .id = NULL) {
+  UseMethod("bind_rows")
+}
+
+#' @export
+bind_rows.default <- function(..., .id = NULL)
+{
+  # Must call the dplyr implementation explicitly: a bare bind_rows() here
+  # resolves to the generic above and dispatches back to this method forever.
+  dplyr::bind_rows(..., .id = .id)
+}
+
+#' @export
+bind_rows.tidybulk <- function(..., .id = NULL)
+{
+
+  tts = flatten_if(dots_values(...), is_spliced) # Original that fails Bioconductor dplyr:::flatten_bindable(rlang::dots_values(...))
+
+  # Only the first two inputs are compared
+  par1 = tts[[1]] %>% get_tt_columns() %>% unlist
+  par2 = tts[[2]] %>% get_tt_columns() %>% unlist
+
+  # tt_columns of the two objects must match
+  error_if_parameters_not_match(par1, par2)
+
+  # Explicit dplyr:: call; the bare generic would re-dispatch to this method
+  dplyr::bind_rows(..., .id = .id) %>%
+
+    # Attach attributes from the first object
+    add_attr(tts[[1]] %>% attr("internals"), "internals")
+
+}
+
+############# END ADDED tidybulk #####################################
+
+#' @export
+#' @rdname bind
+############# START ADDED tidybulk #####################################
+
+bind_cols <- function(..., .id = NULL) {
+  UseMethod("bind_cols")
+}
+
+#' @export
+bind_cols.default <- function(..., .id = NULL)
+{
+  # Explicit dplyr:: call; a bare bind_cols() would recurse into this method.
+  # NOTE(review): dplyr::bind_cols() documents no `.id` argument; with the
+  # default NULL it is presumably discarded like any other NULL input - confirm.
+  dplyr::bind_cols(..., .id = .id)
+}
+
+#' @export
+bind_cols.tidybulk <- function(..., .id = NULL)
+{
+
+  tts = flatten_if(dots_values(...), is_spliced) # Original that fails Bioconductor dplyr:::flatten_bindable(rlang::dots_values(...))
+
+  # Explicit dplyr:: call; the bare generic would re-dispatch to this method
+  dplyr::bind_cols(..., .id = .id) %>%
+
+    # Attach attributes
+    add_attr(tts[[1]] %>% attr("internals"), "internals")
+
+}
+
+############# END ADDED tidybulk #####################################
+############# START ADDED tidybulk #####################################
+
+#' @importFrom dplyr arrange_all
+#' @export
+dplyr::arrange_all
+
+#' @importFrom dplyr arrange_at
+#' @export
+dplyr::arrange_at
+
+#' @importFrom dplyr arrange_if
+#' @export
+dplyr::arrange_if
+
+############# END ADDED tidybulk #####################################
+############# START ADDED tidybulk #####################################
+
+#' distinct
+#' @param .data A tbl. (See dplyr)
+#' @param ... Optional variables to use when determining uniqueness (See dplyr)
+#' @param .keep_all If TRUE, keep all variables in .data. If a combination of ... is not distinct, this keeps the first row of values.
(See dplyr)
+#'
+#' @return A tt object
+#'
+#' @examples
+#'
+#' tidybulk::se_mini %>% tidybulk() %>% distinct()
+#'
+#'
+#' @export
+distinct <- function (.data, ..., .keep_all = FALSE) {
+  UseMethod("distinct")
+}
+
+#' @export
+distinct.default <- function (.data, ..., .keep_all = FALSE)
+{
+  # Forward the caller's .keep_all; it was previously hard-coded to FALSE,
+  # so distinct(x, col, .keep_all = TRUE) silently dropped the other columns.
+  dplyr::distinct(.data, ..., .keep_all = .keep_all)
+}
+
+#' @export
+distinct.tidybulk <- function (.data, ..., .keep_all = FALSE)
+{
+  .data %>%
+    drop_class(c("tidybulk", "tt")) %>%
+    dplyr::distinct(..., .keep_all = .keep_all) %>%
+
+    # Attach attributes
+    reattach_internals() %>%
+
+    # Add class
+    add_class("tt") %>%
+    add_class("tidybulk")
+
+}
+############# END ADDED tidybulk #####################################
+
+############# START ADDED tidybulk #####################################
+
+#' @importFrom dplyr distinct_all
+#' @export
+dplyr::distinct_all
+
+#' @importFrom dplyr distinct_at
+#' @export
+dplyr::distinct_at
+
+#' @importFrom dplyr distinct_if
+#' @export
+dplyr::distinct_if
+
+############# END ADDED tidybulk #####################################
+
+#' Subset rows using column values
+#'
+#' `filter()` retains the rows where the conditions you provide are `TRUE`. Note
+#' that, unlike base subsetting with `[`, rows where the condition evaluates
+#' to `NA` are dropped.
+#'
+#' dplyr is not yet smart enough to optimise filtering
+#' on grouped datasets that don't need grouped calculations. For this reason,
+#' filtering is often considerably faster on [ungroup()]ed data.
+#'
+#' @section Useful filter functions:
+#'
+#' * [`==`], [`>`], [`>=`] etc
+#' * [`&`], [`|`], [`!`], [xor()]
+#' * [is.na()]
+#' * [between()], [near()]
+#'
+#' @section Grouped tibbles:
+#'
+#' Because filtering expressions are computed within groups, they may
+#' yield different results on grouped tibbles. This will be the case
+#' as soon as an aggregating, lagging, or ranking function is
+#' involved.
Compare this ungrouped filtering: +#' +#' ``` +#' starwars %>% filter(mass > mean(mass, na.rm = TRUE)) +#' ``` +#' +#' With the grouped equivalent: +#' +#' ``` +#' starwars %>% group_by(gender) %>% filter(mass > mean(mass, na.rm = TRUE)) +#' ``` +#' +#' The former keeps rows with `mass` greater than the global average +#' whereas the latter keeps rows with `mass` greater than the gender +#' average. +#' +#' @family single table verbs +#' @inheritParams arrange +#' @param ... <[`tidy-eval`][dplyr_tidy_eval]> Logical predicates defined in +#' terms of the variables in `.data`. +#' Multiple conditions are combined with `&`. Only rows where the +#' condition evaluates to `TRUE` are kept. +#' @param .preserve when `FALSE` (the default), the grouping structure +#' is recalculated based on the resulting data, otherwise it is kept as is. +#' @return +#' An object of the same type as `.data`. +#' +#' * Rows are a subset of the input, but appear in the same order. +#' * Columns are not modified. +#' * The number of groups may be reduced (if `.preserve` is not `TRUE`). +#' * Data frame attributes are preserved. +#' @section Methods: +#' This function is a **generic**, which means that packages can provide +#' implementations (methods) for other classes. See the documentation of +#' individual methods for extra arguments and differences in behaviour. +#' +#' The following methods are currently available in loaded packages: +#' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("filter")}. +#' @seealso [filter_all()], [filter_if()] and [filter_at()].
+#' @export
+#' @examples
+#' filter(starwars, species == "Human")
+#' filter(starwars, mass > 1000)
+#'
+#' # Multiple criteria
+#' filter(starwars, hair_color == "none" & eye_color == "black")
+#' filter(starwars, hair_color == "none" | eye_color == "black")
+#'
+#' # Multiple arguments are equivalent to and
+#' filter(starwars, hair_color == "none", eye_color == "black")
+#'
+#'
+#' # The filtering operation may yield different results on grouped
+#' # tibbles because the expressions are computed within groups.
+#' #
+#' # The following filters rows where `mass` is greater than the
+#' # global average:
+#' starwars %>% filter(mass > mean(mass, na.rm = TRUE))
+#'
+#' # Whereas this keeps rows with `mass` greater than the gender
+#' # average:
+#' starwars %>% group_by(gender) %>% filter(mass > mean(mass, na.rm = TRUE))
+#'
+#'
+#' # Refer to column names stored as strings with the `.data` pronoun:
+#' vars <- c("mass", "height")
+#' cond <- c(80, 150)
+#' starwars %>%
+#'   filter(
+#'     .data[[vars[[1]]]] > cond[[1]],
+#'     .data[[vars[[2]]]] > cond[[2]]
+#'   )
+#' # Learn more in ?dplyr_tidy_eval
+############# START ADDED tidybulk #####################################
+#' @export
+filter <- function (.data, ..., .preserve = FALSE) UseMethod("filter")
+
+#' @export
+filter.default <- function (.data, ..., .preserve = FALSE) {
+  # Fallback method: hand every argument straight to dplyr
+  dplyr::filter(.data, ..., .preserve = .preserve)
+}
+
+#' @export
+filter.tidybulk <- function (.data, ..., .preserve = FALSE) {
+  # Drop the tidybulk/tt classes before handing the table to dplyr
+  unclassed_tbl <- drop_class(.data, c("tidybulk", "tt"))
+  kept_rows <- dplyr::filter(unclassed_tbl, ..., .preserve = .preserve)
+
+  # Attach attributes
+  kept_rows <- reattach_internals(kept_rows)
+
+  # Add class ("tt" first, then "tidybulk", same order as before)
+  add_class(add_class(kept_rows, "tt"), "tidybulk")
+}
+############# END ADDED tidybulk #####################################
+
+
+############# START ADDED tidybulk #####################################
+
+#' @importFrom dplyr filter_all
+#' @export
+dplyr::filter_all
+
+#' @importFrom dplyr filter_at
+#' @export
+dplyr::filter_at
+
+#'
@importFrom dplyr filter_if +#' @export +dplyr::filter_if + +############# END ADDED tidybulk ##################################### + +#' Group by one or more variables +#' +#' @description +#' Most data operations are done on groups defined by variables. +#' `group_by()` takes an existing tbl and converts it into a grouped tbl +#' where operations are performed "by group". `ungroup()` removes grouping. +#' +#' @family grouping functions +#' @inheritParams arrange +#' @param ... In `group_by()`, variables or computations to group by. +#' In `ungroup()`, variables to remove from the grouping. +#' @param .add When `FALSE`, the default, `group_by()` will +#' override existing groups. To add to the existing groups, use +#' `.add = TRUE`. +#' +#' This argument was previously called `add`, but that prevented +#' creating a new grouping variable called `add`, and conflicts with +#' our naming conventions. +#' @param .drop When `.drop = TRUE`, empty groups are dropped. See [group_by_drop_default()] for +#' what the default value is for this argument. +#' @return A [grouped data frame][grouped_df()], unless the combination of `...` and `add` +#' yields a non empty set of grouping columns, a regular (ungrouped) data frame +#' otherwise. +#' @section Methods: +#' These function are **generic**s, which means that packages can provide +#' implementations (methods) for other classes. See the documentation of +#' individual methods for extra arguments and differences in behaviour. +#' +#' Methods available in currently loaded packages: +#' +#' * `group_by()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("group_by")}. +#' * `ungroup()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("ungroup")}. 
+#' @export +#' @examples +#' by_cyl <- mtcars %>% group_by(cyl) +#' +#' # grouping doesn't change how the data looks (apart from listing +#' # how it's grouped): +#' by_cyl +#' +#' # It changes how it acts with the other dplyr verbs: +#' by_cyl %>% summarise( +#' disp = mean(disp), +#' hp = mean(hp) +#' ) +#' by_cyl %>% filter(disp == max(disp)) +#' +#' # Each call to summarise() removes a layer of grouping +#' by_vs_am <- mtcars %>% group_by(vs, am) +#' by_vs <- by_vs_am %>% summarise(n = n()) +#' by_vs +#' by_vs %>% summarise(n = sum(n)) +#' +#' # To removing grouping, use ungroup +#' by_vs %>% +#' ungroup() %>% +#' summarise(n = sum(n)) +#' +#' # You can group by expressions: this is just short-hand for +#' # a mutate() followed by a group_by() +#' mtcars %>% group_by(vsam = vs + am) +#' +#' # By default, group_by() overrides existing grouping +#' by_cyl %>% +#' group_by(vs, am) %>% +#' group_vars() +#' +#' # Use add = TRUE to instead append +#' by_cyl %>% +#' group_by(vs, am, .add = TRUE) %>% +#' group_vars() +#' +#' +#' # when factors are involved, groups can be empty +#' tbl <- tibble( +#' x = 1:10, +#' y = factor(rep(c("a", "c"), each = 5), levels = c("a", "b", "c")) +#' ) +#' tbl %>% +#' group_by(y) %>% +#' group_rows() +#' +############# START ADDED tidybulk ##################################### +#' @export +filter <- function (.data, ..., .preserve = FALSE) { + UseMethod("filter") +} + +#' @export +filter.default <- function (.data, ..., .preserve = FALSE) +{ + dplyr::filter(.data, ..., .preserve = .preserve) +} + +#' @export +filter.tidybulk <- function (.data, ..., .preserve = FALSE) +{ + .data %>% + drop_class(c("tidybulk", "tt")) %>% + dplyr::filter(..., .preserve = .preserve) %>% + + # Attach attributes + reattach_internals() %>% + + # Add class + add_class("tt") %>% + add_class("tidybulk") + +} +############# END ADDED tidybulk ##################################### + +#' Group by one or more variables +#' +#' @description +#' Most data operations 
are done on groups defined by variables. +#' `group_by()` takes an existing tbl and converts it into a grouped tbl +#' where operations are performed "by group". `ungroup()` removes grouping. +#' +#' @family grouping functions +#' @inheritParams arrange +#' @param ... In `group_by()`, variables or computations to group by. +#' In `ungroup()`, variables to remove from the grouping. +#' @param .add When `FALSE`, the default, `group_by()` will +#' override existing groups. To add to the existing groups, use +#' `.add = TRUE`. +#' +#' This argument was previously called `add`, but that prevented +#' creating a new grouping variable called `add`, and conflicts with +#' our naming conventions. +#' @param .drop When `.drop = TRUE`, empty groups are dropped. See [group_by_drop_default()] for +#' what the default value is for this argument. +#' @return A [grouped data frame][grouped_df()], unless the combination of `...` and `add` +#' yields a non empty set of grouping columns, a regular (ungrouped) data frame +#' otherwise. +#' @section Methods: +#' These function are **generic**s, which means that packages can provide +#' implementations (methods) for other classes. See the documentation of +#' individual methods for extra arguments and differences in behaviour. +#' +#' Methods available in currently loaded packages: +#' +#' * `group_by()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("group_by")}. +#' * `ungroup()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("ungroup")}. 
+#' @export +#' @examples +#' by_cyl <- mtcars %>% group_by(cyl) +#' +#' # grouping doesn't change how the data looks (apart from listing +#' # how it's grouped): +#' by_cyl +#' +#' # It changes how it acts with the other dplyr verbs: +#' by_cyl %>% summarise( +#' disp = mean(disp), +#' hp = mean(hp) +#' ) +#' by_cyl %>% filter(disp == max(disp)) +#' +#' # Each call to summarise() removes a layer of grouping +#' by_vs_am <- mtcars %>% group_by(vs, am) +#' by_vs <- by_vs_am %>% summarise(n = n()) +#' by_vs +#' by_vs %>% summarise(n = sum(n)) +#' +#' # To removing grouping, use ungroup +#' by_vs %>% +#' ungroup() %>% +#' summarise(n = sum(n)) +#' +#' # You can group by expressions: this is just short-hand for +#' # a mutate() followed by a group_by() +#' mtcars %>% group_by(vsam = vs + am) +#' +#' # By default, group_by() overrides existing grouping +#' by_cyl %>% +#' group_by(vs, am) %>% +#' group_vars() +#' +#' # Use add = TRUE to instead append +#' by_cyl %>% +#' group_by(vs, am, .add = TRUE) %>% +#' group_vars() +#' +#' +#' # when factors are involved, groups can be empty +#' tbl <- tibble( +#' x = 1:10, +#' y = factor(rep(c("a", "c"), each = 5), levels = c("a", "b", "c")) +#' ) +#' tbl %>% +#' group_by(y) %>% +#' group_rows() +############# START ADDED tidybulk ##################################### +#' @export +group_by <- function (.data, ..., .add = FALSE, .drop = group_by_drop_default(.data)) { + UseMethod("group_by") +} + +#' @export +group_by.default <- function (.data, ..., .add = FALSE, .drop = group_by_drop_default(.data)) +{ + dplyr::group_by(.data, ..., .add = .add, .drop = .drop) +} + +#' @export +group_by.tidybulk <- function (.data, ..., .add = FALSE, .drop = group_by_drop_default(.data)) +{ + .data %>% + drop_class(c("tidybulk", "tt")) %>% + dplyr::group_by( ..., .add = .add, .drop = .drop) %>% + + # Attach attributes + reattach_internals() %>% + + # Add class + add_class("tt") %>% + add_class("tidybulk") + +} +############# END ADDED tidybulk 
##################################### + + +#' @rdname group_by +#' @export +#' @param x A [tbl()] +ungroup <- function(x, ...) { + UseMethod("ungroup") +} +############# START ADDED tidybulk ##################################### + +#' @export +ungroup.tidybulk <- function (.data, ...) +{ + .data %>% + drop_class(c("tidybulk", "tt")) %>% + dplyr::ungroup( ...) %>% + + # Attach attributes + reattach_internals() %>% + + # Add class + add_class("tt") %>% + add_class("tidybulk") + +} +############# END ADDED tidybulk ##################################### + +############# START ADDED tidybulk ##################################### + +#' @importFrom dplyr group_by_all +#' @export +dplyr::group_by_all + +#' @importFrom dplyr group_by_at +#' @export +dplyr::group_by_at + +#' @importFrom dplyr group_by_if +#' @export +dplyr::group_by_if + +############# END ADDED tidybulk ##################################### + +#' Summarise each group to fewer rows +#' +#' @description +#' `summarise()` creates a new data frame. It will have one (or more) rows for +#' each combination of grouping variables; if there are no grouping variables, +#' the output will have a single row summarising all observations in the input. +#' It will contain one column for each grouping variable and one column +#' for each of the summary statistics that you have specified. +#' +#' `summarise()` and `summarize()` are synonyms. +#' +#' @section Useful functions: +#' +#' * Center: [mean()], [median()] +#' * Spread: [sd()], [IQR()], [mad()] +#' * Range: [min()], [max()], [quantile()] +#' * Position: [first()], [last()], [nth()], +#' * Count: [n()], [n_distinct()] +#' * Logical: [any()], [all()] +#' +#' @section Backend variations: +#' +#' The data frame backend supports creating a variable and using it in the +#' same summary. This means that previously created summary variables can be +#' further transformed or combined within the summary, as in [mutate()]. 
+#' However, it also means that summary variables with the same names as previous +#' variables overwrite them, making those variables unavailable to later summary +#' variables. +#' +#' This behaviour may not be supported in other backends. To avoid unexpected +#' results, consider using new names for your summary variables, especially when +#' creating multiple summaries. +#' +#' @export +#' @inheritParams arrange +#' @param ... <[`tidy-eval`][dplyr_tidy_eval]> Name-value pairs of summary +#' functions. The name will be the name of the variable in the result. +#' +#' The value can be: +#' +#' * A vector of length 1, e.g. `min(x)`, `n()`, or `sum(is.na(y))`. +#' * A vector of length `n`, e.g. `quantile()`. +#' * A data frame, to add multiple columns from a single expression. +#' @family single table verbs +#' @return +#' An object _usually_ of the same type as `.data`. +#' +#' * The rows come from the underlying `group_keys()`. +#' * The columns are a combination of the grouping keys and the summary +#' expressions that you provide. +#' * If `x` is grouped by more than one variable, the output will be another +#' [grouped_df] with the right-most group removed. +#' * If `x` is grouped by one variable, or is not grouped, the output will +#' be a [tibble]. +#' * Data frame attributes are **not** preserved, because `summarise()` +#' fundamentally creates a new data frame. +#' @section Methods: +#' This function is a **generic**, which means that packages can provide +#' implementations (methods) for other classes. See the documentation of +#' individual methods for extra arguments and differences in behaviour. +#' +#' The following methods are currently available in loaded packages: +#' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("summarise")}. 
+#' @examples +#' # A summary applied to ungrouped tbl returns a single row +#' mtcars %>% +#' summarise(mean = mean(disp), n = n()) +#' +#' # Usually, you'll want to group first +#' mtcars %>% +#' group_by(cyl) %>% +#' summarise(mean = mean(disp), n = n()) +#' +#' # dplyr 1.0.0 allows to summarise to more than one value: +#' mtcars %>% +#' group_by(cyl) %>% +#' summarise(qs = quantile(disp, c(0.25, 0.75)), prob = c(0.25, 0.75)) +#' +#' # You use a data frame to create multiple columns so you can wrap +#' # this up into a function: +#' my_quantile <- function(x, probs) { +#' tibble(x = quantile(x, probs), probs = probs) +#' } +#' mtcars %>% +#' group_by(cyl) %>% +#' summarise(my_quantile(disp, c(0.25, 0.75))) +#' +#' # Each summary call removes one grouping level (since that group +#' # is now just a single row) +#' mtcars %>% +#' group_by(cyl, vs) %>% +#' summarise(cyl_n = n()) %>% +#' group_vars() +#' +#' # BEWARE: reusing variables may lead to unexpected results +#' mtcars %>% +#' group_by(cyl) %>% +#' summarise(disp = mean(disp), sd = sd(disp)) +#' +#' # Refer to column names stored as strings with the `.data` pronoun: +#' var <- "mass" +#' summarise(starwars, avg = mean(.data[[var]], na.rm = TRUE)) +#' # Learn more in ?dplyr_tidy_eval +############# START ADDED tidybulk ##################################### +#' @export +summarise <- function (.data, ...) { + UseMethod("summarise") +} + +#' @export +summarise.default <- function (.data, ...) +{ + dplyr::summarise(.data, ...) +} + +#' @export +summarise.tidybulk <- function (.data, ...) +{ + .data %>% + drop_class(c("tidybulk", "tt")) %>% + dplyr::summarise( ...) 
%>% + + # Attach attributes + reattach_internals() %>% + + # Add class + add_class("tt") %>% + add_class("tidybulk") + +} +############# END ADDED tidybulk ##################################### + +############# START ADDED tidybulk ##################################### + +#' @importFrom dplyr summarize_all +#' @export +dplyr::summarize_all + +#' @importFrom dplyr summarize_at +#' @export +dplyr::summarize_at + +#' @importFrom dplyr summarize_if +#' @export +dplyr::summarize_if + +############# END ADDED tidybulk ##################################### + +#' @rdname summarise_all +#' @export +summarize_all <- summarise_all +#' @rdname summarise_all +#' @export +summarize_if <- summarise_if +#' @rdname summarise_all +#' @export +summarize_at <- summarise_at + +#' Create, modify, and delete columns +#' +#' `mutate()` adds new variables and preserves existing ones; +#' `transmute()` adds new variables and drops existing ones. +#' New variables overwrite existing variables of the same name. +#' Variables can be removed by setting their value to `NULL`. +#' +#' @section Useful mutate functions: +#' +#' * [`+`], [`-`], [log()], etc., for their usual mathematical meanings +#' +#' * [lead()], [lag()] +#' +#' * [dense_rank()], [min_rank()], [percent_rank()], [row_number()], +#' [cume_dist()], [ntile()] +#' +#' * [cumsum()], [cummean()], [cummin()], [cummax()], [cumany()], [cumall()] +#' +#' * [na_if()], [coalesce()] +#' +#' * [if_else()], [recode()], [case_when()] +#' +#' @section Grouped tibbles: +#' +#' Because mutating expressions are computed within groups, they may +#' yield different results on grouped tibbles. This will be the case +#' as soon as an aggregating, lagging, or ranking function is +#' involved. 
Compare this ungrouped mutate: +#' +#' ``` +#' starwars %>% +#' mutate(mass / mean(mass, na.rm = TRUE)) %>% +#' pull() +#' ``` +#' +#' With the grouped equivalent: +#' +#' ``` +#' starwars %>% +#' group_by(gender) %>% +#' mutate(mass / mean(mass, na.rm = TRUE)) %>% +#' pull() +#' ``` +#' +#' The former normalises `mass` by the global average whereas the +#' latter normalises by the averages within gender levels. +#' +#' @export +#' @inheritParams arrange +#' @param ... <[`tidy-eval`][dplyr_tidy_eval]> Name-value pairs. +#' The name gives the name of the column in the output. +#' +#' The value can be: +#' +#' * A vector of length 1, which will be recycled to the correct length. +#' * A vector the same length as the current group (or the whole data frame +#' if ungrouped). +#' * `NULL`, to remove the column. +#' * A data frame or tibble, to create multiple columns in the output. +#' +#' @family single table verbs +#' @return +#' An object of the same type as `.data`. +#' +#' For `mutate()`: +#' +#' * Rows are not affected. +#' * Existing columns will be preserved unless explicitly modified. +#' * New columns will be added to the right of existing columns. +#' * Columns given value `NULL` will be removed +#' * Groups will be recomputed if a grouping variable is mutated. +#' * Data frame attributes are preserved. +#' +#' For `transmute()`: +#' +#' * Rows are not affected. +#' * Apart from grouping variables, existing columns will be remove unless +#' explicitly kept. +#' * Column order matches order of expressions. +#' * Groups will be recomputed if a grouping variable is mutated. +#' * Data frame attributes are preserved. +#' @section Methods: +#' These function are **generic**s, which means that packages can provide +#' implementations (methods) for other classes. See the documentation of +#' individual methods for extra arguments and differences in behaviour. 
+#' +#' Methods available in currently loaded packages: +#' +#' * `mutate()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("mutate")}. +#' * `transmute()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("transmute")}. +#' @examples +#' # Newly created variables are available immediately +#' mtcars %>% as_tibble() %>% mutate( +#' cyl2 = cyl * 2, +#' cyl4 = cyl2 * 2 +#' ) +#' +#' # As well as adding new variables, you can use mutate() to +#' # remove variables and modify existing variables. +#' mtcars %>% as_tibble() %>% mutate( +#' mpg = NULL, +#' disp = disp * 0.0163871 # convert to litres +#' ) +#' +#' # window functions are useful for grouped mutates +#' mtcars %>% +#' group_by(cyl) %>% +#' mutate(rank = min_rank(desc(mpg))) +#' # see `vignette("window-functions")` for more details +#' +#' # mutate() vs transmute -------------------------- +#' # mutate() keeps all existing variables +#' mtcars %>% +#' mutate(displ_l = disp / 61.0237) +#' +#' # transmute keeps only the variables you create +#' mtcars %>% +#' transmute(displ_l = disp / 61.0237) +#' +#' # Grouping ---------------------------------------- +#' # The mutate operation may yield different results on grouped +#' # tibbles because the expressions are computed within groups. +#' # The following normalises `mass` by the global average: +#' starwars %>% +#' mutate(mass / mean(mass, na.rm = TRUE)) %>% +#' pull() +#' +#' # Whereas this normalises `mass` by the averages within gender +#' # levels: +#' starwars %>% +#' group_by(gender) %>% +#' mutate(mass / mean(mass, na.rm = TRUE)) %>% +#' pull() +#' +#' # Indirection ---------------------------------------- +#' # Refer to column names stored as strings with the `.data` pronoun: +#' vars <- c("mass", "height") +#' mutate(starwars, prod = .data[[vars[[1]]]] * .data[[vars[[2]]]]) +#' # Learn more in ?dplyr_tidy_eval +############# START ADDED tidybulk ##################################### +#' @export +mutate <- function(.data, ...) 
{ + UseMethod("mutate") +} + +#' @export +mutate.default <- function(.data, ...) +{ + dplyr::mutate(.data, ...) +} + +#' @export +mutate.tidybulk <- function(.data, ...) +{ + .data %>% + drop_class(c("tidybulk", "tt")) %>% + dplyr::mutate(...) %>% + + # Attach attributes + reattach_internals() %>% + + # Add class + add_class("tt") %>% + add_class("tidybulk") + + +} +############# END ADDED tidybulk ##################################### + +############# START ADDED tidybulk ##################################### + +#' @importFrom dplyr mutate_all +#' @export +dplyr::mutate_all + +#' @importFrom dplyr mutate_at +#' @export +dplyr::mutate_at + +#' @importFrom dplyr mutate_if +#' @export +dplyr::mutate_if + +############# END ADDED tidybulk ##################################### + +#' Rename columns +#' +#' Rename individual variables using `new_name = old_name` syntax. +#' +#' @section Scoped selection and renaming: +#' +#' Use the three scoped variants ([rename_all()], [rename_if()], [rename_at()]) +#' to renaming a set of variables with a function. +#' +#' @inheritParams arrange +#' @param ... <[`tidy-select`][dplyr_tidy_select]> Use `new_name = old_name` +#' to rename selected variables. +#' @return +#' An object of the same type as `.data`. +#' * Rows are not affected. +#' * Column names are changed; column order is preserved +#' * Data frame attributes are preserved. +#' * Groups are updated to reflect new names. +#' @section Methods: +#' This function is a **generic**, which means that packages can provide +#' implementations (methods) for other classes. See the documentation of +#' individual methods for extra arguments and differences in behaviour. +#' +#' The following methods are currently available in loaded packages: +#' \Sexpr[stage=render,results=Rd]{dplyr:::methods_rd("rename")}. 
+#' @family single table verbs +#' @export +#' @examples +#' iris <- as_tibble(iris) # so it prints a little nicer +#' rename(iris, petal_length = Petal.Length) +############# START ADDED tidybulk ##################################### +#' @export +rename <- function(.data, ...) { + UseMethod("rename") +} + +#' @export +rename.default <- function(.data, ...) +{ + dplyr::rename(.data, ...) +} + +#' @export +rename.tidybulk <- function(.data, ...) +{ + .data %>% + drop_class(c("tidybulk", "tt")) %>% + dplyr::rename(...) %>% + + # Attach attributes + reattach_internals() %>% + + # Add class + add_class("tt") %>% + add_class("tidybulk") + + +} +############# END ADDED tidybulk ##################################### + +#' Group input by rows +#' +#' \Sexpr[results=rd, stage=render]{lifecycle::badge("questioning")} +#' +#' See [this repository](https://github.com/jennybc/row-oriented-workflows) +#' for alternative ways to perform row-wise operations. +#' +#' `rowwise()` is used for the results of [do()] when you +#' create list-variables. It is also useful to support arbitrary +#' complex operations that need to be applied to each row. +#' +#' Currently, rowwise grouping only works with data frames. Its +#' main impact is to allow you to work with list-variables in +#' [summarise()] and [mutate()] without having to +#' use \code{[[1]]}. This makes `summarise()` on a rowwise tbl +#' effectively equivalent to [plyr::ldply()]. +#' +#' @param data Input data frame. 
+#' @export +#' @examples +#' df <- expand.grid(x = 1:3, y = 3:1) +#' df_done <- df %>% rowwise() %>% do(i = seq(.$x, .$y)) +#' df_done +#' df_done %>% summarise(n = length(i)) +############# START ADDED tidybulk ##################################### +#' @export +rowwise <- function(.data) { + UseMethod("rowwise") +} + +#' @export +rowwise.default <- function(.data) +{ + dplyr::rowwise(.data) +} + +#' @export +rowwise.tidybulk <- function(.data) +{ + .data %>% + drop_class(c("tidybulk", "tt")) %>% + dplyr::rowwise() %>% + + # Attach attributes + reattach_internals() %>% + + # Add class + add_class("tt") %>% + add_class("tidybulk") + + +} +############# END ADDED tidybulk ##################################### From 8709db96acfc945f87b107e2fd617a35b2c47b71 Mon Sep 17 00:00:00 2001 From: chilampoon Date: Mon, 18 Sep 2023 23:14:26 -0400 Subject: [PATCH 5/7] imports conflict --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index b5604bdb..572f0deb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -36,7 +36,7 @@ Imports: methods, S4Vectors, crayon, - pkgconfig + Matrix Suggests: BiocStyle, testthat, From b6fdbc40f79a492a4ddd5f7fe7a91a74b08cbf9e Mon Sep 17 00:00:00 2001 From: chilampoon Date: Tue, 19 Sep 2023 00:01:37 -0400 Subject: [PATCH 6/7] set _R_CHECK_FORCE_SUGGESTS_ to false --- .github/workflows/check-bioc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index e432bb90..d5ac53e6 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -59,6 +59,7 @@ jobs: ## for examples using the http-user-agent env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + _R_CHECK_FORCE_SUGGESTS_: false RSPM: ${{ matrix.config.rspm }} NOT_CRAN: true TZ: UTC From 0c1670a5ccf38110570b578008663adb6acdc977 Mon Sep 17 00:00:00 2001 From: chilampoon Date: Tue, 19 Sep 2023 00:59:32 -0400 Subject: [PATCH 7/7] add pkgconfig --- DESCRIPTION | 3 ++- 
1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 572f0deb..f38f2d3e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -36,7 +36,8 @@ Imports: methods, S4Vectors, crayon, - Matrix + Matrix, + pkgconfig Suggests: BiocStyle, testthat,