Merge branch 'main' into 206_rm_coerce_to_numeric

easystats · Aug 31, 2022 · c4c88c4 · c4c88c4
2 parents 9b5ce47 + 9018673
commit c4c88c4
Show file tree

Hide file tree

Showing 6 changed files with 116 additions and 43 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.5.1.3
+Version: 0.5.1.4
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),

diff --git a/R/categorize.R b/R/categorize.R
@@ -46,38 +46,38 @@
 #' @param ... not used.
 #' @inheritParams find_columns
 #'
-#' @inheritSection center Selection of variables - the `select` argument
-#'
 #' @inherit data_rename seealso
 #'
-#' @return `x`, recoded into groups. By default `x` is numeric, unless `labels`
-#'   is specified. In this case, a factor is returned, where the factor levels
-#'   (i.e. recoded groups are labelled accordingly.
-#'
 #' @details
 #'
-#'   \subsection{Splits and breaks (cut-off values)}{
-#'   Breaks are in general _exclusive_, this means that these values indicate
-#'   the lower bound of the next group or interval to begin. Take a simple
-#'   example, a numeric variable with values from 1 to 9. The median would be 5,
-#'   thus the first interval ranges from 1-4 and is recoded into 1, while 5-9
-#'   would turn into 2 (compare `cbind(1:9, categorize(1:9))`). The same variable,
-#'   using `split = "quantile"` and `n_groups = 3` would define breaks at 3.67
-#'   and 6.33 (see `quantile(1:9, probs = c(1/3, 2/3))`), which means that values
-#'   from 1 to 3 belong to the first interval and are recoded into 1 (because
-#'   the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3.
-#'   }
+#' # Splits and breaks (cut-off values)
+#'
+#' Breaks are in general _exclusive_, this means that these values indicate
+#' the lower bound of the next group or interval to begin. Take a simple
+#' example, a numeric variable with values from 1 to 9. The median would be 5,
+#' thus the first interval ranges from 1-4 and is recoded into 1, while 5-9
+#' would turn into 2 (compare `cbind(1:9, categorize(1:9))`). The same variable,
+#' using `split = "quantile"` and `n_groups = 3` would define breaks at 3.67
+#' and 6.33 (see `quantile(1:9, probs = c(1/3, 2/3))`), which means that values
+#' from 1 to 3 belong to the first interval and are recoded into 1 (because
+#' the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3.
+#'
+#' # Recoding into groups with equal size or range
 #'
-#'   \subsection{Recoding into groups with equal size or range}{
-#'   `split = "equal_length"` and `split = "equal_range"` try to divide the
-#'   range of `x` into intervals of similar (or same) length. The difference is
-#'   that `split = "equal_length"` will divide the range of `x` into `n_groups`
-#'   pieces and thereby defining the intervals used as breaks (hence, it is
-#'   equivalent to `cut(x, breaks = n_groups)`), while  `split = "equal_range"`
-#'   will cut `x` into intervals that all have the length of `range`, where the
-#'   first interval by defaults starts at `1`. The lowest (or starting) value
-#'   of that interval can be defined using the `lowest` argument.
-#'   }
+#' `split = "equal_length"` and `split = "equal_range"` try to divide the
+#' range of `x` into intervals of similar (or same) length. The difference is
+#' that `split = "equal_length"` will divide the range of `x` into `n_groups`
+#' pieces and thereby defining the intervals used as breaks (hence, it is
+#' equivalent to `cut(x, breaks = n_groups)`), while  `split = "equal_range"`
+#' will cut `x` into intervals that all have the length of `range`, where the
+#' first interval by defaults starts at `1`. The lowest (or starting) value
+#' of that interval can be defined using the `lowest` argument.
+#'
+#' @inheritSection center Selection of variables - the `select` argument
+#'
+#' @return `x`, recoded into groups. By default `x` is numeric, unless `labels`
+#'   is specified. In this case, a factor is returned, where the factor levels
+#'   (i.e. recoded groups are labelled accordingly.
 #'
 #' @examples
 #' set.seed(123)
@@ -121,7 +121,9 @@ categorize <- function(x, ...) {
 #' @export
 categorize.default <- function(x, verbose = TRUE, ...) {
   if (isTRUE(verbose)) {
-    message(insight::format_message(paste0("Variables of class '", class(x)[1], "' can't be recoded and remain unchanged.")))
+    message(insight::format_message(
+      paste0("Variables of class '", class(x)[1], "' can't be recoded and remain unchanged.")
+    ))
   }
   return(x)
 }
@@ -180,7 +182,9 @@ categorize.numeric <- function(x,
   # stop if all NA
   if (!length(x)) {
     if (isTRUE(verbose)) {
-      warning(insight::format_message("Variable contains only missing values. No recoding carried out."), call. = FALSE)
+      warning(insight::format_message(
+        "Variable contains only missing values. No recoding carried out."
+      ), call. = FALSE)
     }
     return(original_x)
   }

diff --git a/R/data_match.R b/R/data_match.R
@@ -149,16 +149,65 @@ data_match <- function(x, to, match = "and", return_indices = FALSE, drop_na = T
 #' @export
 data_filter <- function(x, filter, ...) {
   condition <- substitute(filter)
+
+  # condition can be a numeric vector, to slice rows by indices,
+  # or a logical condition to filter observations. first, we check
+  # for numeric vector. the logical condition can also be passed
+  # as character vector, which allows to use data_filer() from inside
+  # other function w/o the need to define "globalVariables".
+
   # numeric vector to slice data frame?
   rows <- try(eval(condition, envir = parent.frame()), silent = TRUE)
   if (is.numeric(rows)) {
     out <- x[rows, , drop = FALSE]
   } else {
-    # "filter" is no expression, but a string?
-    if (is.character(condition)) {
-      condition <- .str2lang(condition)
+    if (!is.character(condition)) {
+      condition <- insight::safe_deparse(condition)
+    }
+    # Check syntax of the filter. Must be done *before* calling subset() (cf
+    # easystats/datawizard#237)
+    # Give more informative message to users
+    # about possible misspelled comparisons / logical conditions
+    # check if "=" instead of "==" was used?
+    # NOTE: We cannot check for `=` when "filter" is not a character vector
+    # because the function will then fail in general. I.e.,
+    # "data_filter(mtcars, filter = mpg > 10 & cyl = 4)" will not start
+    # running this function and never reaches the first code line,
+    # but immediately stops...
+    tmp <- gsub("==", "", condition, fixed = TRUE)
+    tmp <- gsub("!=", "", tmp, fixed = TRUE)
+    if (any(grepl("=", tmp, fixed = TRUE))) {
+      stop(insight::format_message(
+        "Filtering did not work. Please check if you need `==` (instead of `=`) for comparison."
+      ), call. = FALSE)
+    }
+    # check if "&&" etc instead of "&" was used?
+    logical_operator <- NULL
+    if (any(grepl("&&", condition, fixed = TRUE))) {
+      logical_operator <- "&&"
+    }
+    if (any(grepl("||", condition, fixed = TRUE))) {
+      logical_operator <- "||"
+    }
+    if (!is.null(logical_operator)) {
+      stop(insight::format_message(
+        paste0(
+          "Filtering did not work. Please check if you need `",
+          substr(logical_operator, 0, 1),
+          "` (instead of `", logical_operator, "`) as logical operator."
+        )
+      ), call. = FALSE)
+    }
+    out <- tryCatch(
+      subset(x, subset = eval(parse(text = condition), envir = new.env())),
+      warning = function(e) NULL,
+      error = function(e) NULL
+    )
+    if (is.null(out)) {
+      stop(insight::format_message(
+        "Filtering did not work. Please check the syntax of your `filter` argument."
+      ), call. = FALSE)
     }
-    out <- do.call(subset, list(x, subset = condition))
   }
   # restore value and variable labels
   for (i in colnames(out)) {

diff --git a/R/select_helpers.R b/R/select_helpers.R
@@ -3,14 +3,16 @@
 .select_nse <- function(select, data, exclude, ignore_case, regex = FALSE, verbose = FALSE) {
   # check if data argument is valid
   if (is.null(data)) {
-    stop(insight::format_message("The 'data' argument must be provided."), call. = FALSE)
+    stop(insight::format_message("The `data` argument must be provided."), call. = FALSE)
   }
 
   # check data frame input
   if (!is.null(data) && !is.data.frame(data)) {
     data <- try(as.data.frame(data), silent = TRUE)
     if (inherits(data, c("try-error", "simpleError"))) {
-      stop(insight::format_message("The 'data' argument must be a data frame, or an object that can be coerced to a data frame."), call. = FALSE)
+      stop(insight::format_message(
+        "The `data` argument must be a data frame, or an object that can be coerced to a data frame."
+      ), call. = FALSE)
     }
   }
 
@@ -189,7 +191,7 @@
       stop("Could not find variable '", from_to[2], "' in data.", call. = FALSE)
     }
     if (negate) {
-      pattern <- colnames(data)[setdiff(1:ncol(data), from:to)]
+      pattern <- colnames(data)[setdiff(seq_len(ncol(data)), from:to)]
     } else {
       pattern <- colnames(data)[from:to]
     }
@@ -240,7 +242,7 @@
       # select last column(s)
       pattern[pattern < 0] <- sort(ncol(data) + pattern[pattern < 0] + 1)
     }
-    pattern <- colnames(data)[intersect(pattern, 1:ncol(data))]
+    pattern <- colnames(data)[intersect(pattern, seq_len(ncol(data)))]
   }
 
   # special token - select all columns?
@@ -288,7 +290,7 @@
 .attach_packages <- function(packages = NULL) {
   if (!is.null(packages)) {
     pkg <- packages$package
-    for (i in 1:length(pkg)) {
+    for (i in seq_along(pkg)) {
       if (isTRUE(packages$namespace[i])) {
         loadNamespace(pkg[i])
       }

diff --git a/man/categorize.Rd b/man/categorize.Rd
diff --git a/tests/testthat/test-data_match.R b/tests/testthat/test-data_match.R
@@ -78,3 +78,22 @@ test_that("data_filter works like slice", {
   out <- data_filter(mtcars, 5:10)
   expect_equal(out, mtcars[5:10, ], ignore_attr = TRUE)
 })
+
+test_that("data_filter gives informative message on errors", {
+  expect_error(
+    data_filter(mtcars, "mpg > 10 || cyl = 4"),
+    "`==`"
+  )
+  expect_error(
+    data_filter(mtcars, filter = mpg > 10 || cyl == 4),
+    "`||`"
+  )
+  expect_error(
+    data_filter(mtcars, filter = mpg > 10 && cyl == 4),
+    "`&&`"
+  )
+  expect_error(
+    data_filter(mtcars, filter = mpg > 10 ? cyl == 4),
+    "syntax"
+  )
+})