Skip to content

Commit

Permalink
Merge branch 'main' into 206_rm_coerce_to_numeric
Browse files Browse the repository at this point in the history
  • Loading branch information
IndrajeetPatil authored Aug 31, 2022
2 parents 9b5ce47 + 9018673 commit c4c88c4
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 43 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.5.1.3
Version: 0.5.1.4
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down
62 changes: 33 additions & 29 deletions R/categorize.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,38 +46,38 @@
#' @param ... not used.
#' @inheritParams find_columns
#'
#' @inheritSection center Selection of variables - the `select` argument
#'
#' @inherit data_rename seealso
#'
#' @return `x`, recoded into groups. By default `x` is numeric, unless `labels`
#' is specified. In this case, a factor is returned, where the factor levels
#' (i.e. recoded groups are labelled accordingly.
#'
#' @details
#'
#' \subsection{Splits and breaks (cut-off values)}{
#' Breaks are in general _exclusive_, this means that these values indicate
#' the lower bound of the next group or interval to begin. Take a simple
#' example, a numeric variable with values from 1 to 9. The median would be 5,
#' thus the first interval ranges from 1-4 and is recoded into 1, while 5-9
#' would turn into 2 (compare `cbind(1:9, categorize(1:9))`). The same variable,
#' using `split = "quantile"` and `n_groups = 3` would define breaks at 3.67
#' and 6.33 (see `quantile(1:9, probs = c(1/3, 2/3))`), which means that values
#' from 1 to 3 belong to the first interval and are recoded into 1 (because
#' the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3.
#' }
#' # Splits and breaks (cut-off values)
#'
#' Breaks are in general _exclusive_, this means that these values indicate
#' the lower bound of the next group or interval to begin. Take a simple
#' example, a numeric variable with values from 1 to 9. The median would be 5,
#' thus the first interval ranges from 1-4 and is recoded into 1, while 5-9
#' would turn into 2 (compare `cbind(1:9, categorize(1:9))`). The same variable,
#' using `split = "quantile"` and `n_groups = 3` would define breaks at 3.67
#' and 6.33 (see `quantile(1:9, probs = c(1/3, 2/3))`), which means that values
#' from 1 to 3 belong to the first interval and are recoded into 1 (because
#' the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3.
#'
#' # Recoding into groups with equal size or range
#'
#' \subsection{Recoding into groups with equal size or range}{
#' `split = "equal_length"` and `split = "equal_range"` try to divide the
#' range of `x` into intervals of similar (or same) length. The difference is
#' that `split = "equal_length"` will divide the range of `x` into `n_groups`
#' pieces and thereby defining the intervals used as breaks (hence, it is
#' equivalent to `cut(x, breaks = n_groups)`), while `split = "equal_range"`
#' will cut `x` into intervals that all have the length of `range`, where the
#' first interval by defaults starts at `1`. The lowest (or starting) value
#' of that interval can be defined using the `lowest` argument.
#' }
#' `split = "equal_length"` and `split = "equal_range"` try to divide the
#' range of `x` into intervals of similar (or same) length. The difference is
#' that `split = "equal_length"` will divide the range of `x` into `n_groups`
#' pieces and thereby defining the intervals used as breaks (hence, it is
#' equivalent to `cut(x, breaks = n_groups)`), while `split = "equal_range"`
#' will cut `x` into intervals that all have the length of `range`, where the
#' first interval by defaults starts at `1`. The lowest (or starting) value
#' of that interval can be defined using the `lowest` argument.
#'
#' @inheritSection center Selection of variables - the `select` argument
#'
#' @return `x`, recoded into groups. By default `x` is numeric, unless `labels`
#' is specified. In this case, a factor is returned, where the factor levels
#' (i.e. recoded groups are labelled accordingly.
#'
#' @examples
#' set.seed(123)
Expand Down Expand Up @@ -121,7 +121,9 @@ categorize <- function(x, ...) {
#' @export
categorize.default <- function(x, verbose = TRUE, ...) {
if (isTRUE(verbose)) {
message(insight::format_message(paste0("Variables of class '", class(x)[1], "' can't be recoded and remain unchanged.")))
message(insight::format_message(
paste0("Variables of class '", class(x)[1], "' can't be recoded and remain unchanged.")
))
}
return(x)
}
Expand Down Expand Up @@ -180,7 +182,9 @@ categorize.numeric <- function(x,
# stop if all NA
if (!length(x)) {
if (isTRUE(verbose)) {
warning(insight::format_message("Variable contains only missing values. No recoding carried out."), call. = FALSE)
warning(insight::format_message(
"Variable contains only missing values. No recoding carried out."
), call. = FALSE)
}
return(original_x)
}
Expand Down
57 changes: 53 additions & 4 deletions R/data_match.R
Original file line number Diff line number Diff line change
Expand Up @@ -149,16 +149,65 @@ data_match <- function(x, to, match = "and", return_indices = FALSE, drop_na = T
#' @export
data_filter <- function(x, filter, ...) {
condition <- substitute(filter)

# condition can be a numeric vector, to slice rows by indices,
# or a logical condition to filter observations. first, we check
# for numeric vector. the logical condition can also be passed
# as character vector, which allows to use data_filer() from inside
# other function w/o the need to define "globalVariables".

# numeric vector to slice data frame?
rows <- try(eval(condition, envir = parent.frame()), silent = TRUE)
if (is.numeric(rows)) {
out <- x[rows, , drop = FALSE]
} else {
# "filter" is no expression, but a string?
if (is.character(condition)) {
condition <- .str2lang(condition)
if (!is.character(condition)) {
condition <- insight::safe_deparse(condition)
}
# Check syntax of the filter. Must be done *before* calling subset() (cf
# easystats/datawizard#237)
# Give more informative message to users
# about possible misspelled comparisons / logical conditions
# check if "=" instead of "==" was used?
# NOTE: We cannot check for `=` when "filter" is not a character vector
# because the function will then fail in general. I.e.,
# "data_filter(mtcars, filter = mpg > 10 & cyl = 4)" will not start
# running this function and never reaches the first code line,
# but immediately stops...
tmp <- gsub("==", "", condition, fixed = TRUE)
tmp <- gsub("!=", "", tmp, fixed = TRUE)
if (any(grepl("=", tmp, fixed = TRUE))) {
stop(insight::format_message(
"Filtering did not work. Please check if you need `==` (instead of `=`) for comparison."
), call. = FALSE)
}
# check if "&&" etc instead of "&" was used?
logical_operator <- NULL
if (any(grepl("&&", condition, fixed = TRUE))) {
logical_operator <- "&&"
}
if (any(grepl("||", condition, fixed = TRUE))) {
logical_operator <- "||"
}
if (!is.null(logical_operator)) {
stop(insight::format_message(
paste0(
"Filtering did not work. Please check if you need `",
substr(logical_operator, 0, 1),
"` (instead of `", logical_operator, "`) as logical operator."
)
), call. = FALSE)
}
out <- tryCatch(
subset(x, subset = eval(parse(text = condition), envir = new.env())),
warning = function(e) NULL,
error = function(e) NULL
)
if (is.null(out)) {
stop(insight::format_message(
"Filtering did not work. Please check the syntax of your `filter` argument."
), call. = FALSE)
}
out <- do.call(subset, list(x, subset = condition))
}
# restore value and variable labels
for (i in colnames(out)) {
Expand Down
12 changes: 7 additions & 5 deletions R/select_helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,16 @@
.select_nse <- function(select, data, exclude, ignore_case, regex = FALSE, verbose = FALSE) {
# check if data argument is valid
if (is.null(data)) {
stop(insight::format_message("The 'data' argument must be provided."), call. = FALSE)
stop(insight::format_message("The `data` argument must be provided."), call. = FALSE)
}

# check data frame input
if (!is.null(data) && !is.data.frame(data)) {
data <- try(as.data.frame(data), silent = TRUE)
if (inherits(data, c("try-error", "simpleError"))) {
stop(insight::format_message("The 'data' argument must be a data frame, or an object that can be coerced to a data frame."), call. = FALSE)
stop(insight::format_message(
"The `data` argument must be a data frame, or an object that can be coerced to a data frame."
), call. = FALSE)
}
}

Expand Down Expand Up @@ -189,7 +191,7 @@
stop("Could not find variable '", from_to[2], "' in data.", call. = FALSE)
}
if (negate) {
pattern <- colnames(data)[setdiff(1:ncol(data), from:to)]
pattern <- colnames(data)[setdiff(seq_len(ncol(data)), from:to)]
} else {
pattern <- colnames(data)[from:to]
}
Expand Down Expand Up @@ -240,7 +242,7 @@
# select last column(s)
pattern[pattern < 0] <- sort(ncol(data) + pattern[pattern < 0] + 1)
}
pattern <- colnames(data)[intersect(pattern, 1:ncol(data))]
pattern <- colnames(data)[intersect(pattern, seq_len(ncol(data)))]
}

# special token - select all columns?
Expand Down Expand Up @@ -288,7 +290,7 @@
.attach_packages <- function(packages = NULL) {
if (!is.null(packages)) {
pkg <- packages$package
for (i in 1:length(pkg)) {
for (i in seq_along(pkg)) {
if (isTRUE(packages$namespace[i])) {
loadNamespace(pkg[i])
}
Expand Down
7 changes: 3 additions & 4 deletions man/categorize.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions tests/testthat/test-data_match.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,22 @@ test_that("data_filter works like slice", {
out <- data_filter(mtcars, 5:10)
expect_equal(out, mtcars[5:10, ], ignore_attr = TRUE)
})

test_that("data_filter gives informative message on errors", {
expect_error(
data_filter(mtcars, "mpg > 10 || cyl = 4"),
"`==`"
)
expect_error(
data_filter(mtcars, filter = mpg > 10 || cyl == 4),
"`||`"
)
expect_error(
data_filter(mtcars, filter = mpg > 10 && cyl == 4),
"`&&`"
)
expect_error(
data_filter(mtcars, filter = mpg > 10 ? cyl == 4),
"syntax"
)
})

0 comments on commit c4c88c4

Please sign in to comment.