Skip to content

Commit

Permalink
fix data_to_numeric for factors with NA
Browse files Browse the repository at this point in the history
  • Loading branch information
strengejacke committed Mar 4, 2022
1 parent 18ed674 commit 9d372e5
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 4 deletions.
3 changes: 2 additions & 1 deletion R/center.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,8 @@ center.numeric <- function(x,
attr(centered_x, "center") <- args$center
attr(centered_x, "scale") <- args$scale
attr(centered_x, "robust") <- robust
centered_x
# labels
.set_back_labels(centered_x, x, include_values = FALSE)
}


Expand Down
48 changes: 45 additions & 3 deletions R/convert_data_to_numeric.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,27 @@ data_to_numeric <- convert_data_to_numeric
#' @export
convert_data_to_numeric.data.frame <- function(x, dummy_factors = TRUE, ...) {
out <- sapply(x, convert_data_to_numeric, dummy_factors = dummy_factors, simplify = FALSE)
as.data.frame(do.call(cbind, out))
# save variable attributes
attr_vars <- lapply(out, attributes)
# "out" is currently a list, bind columns and to data frame
out <- as.data.frame(do.call(cbind, out))
# set back attributes
for (i in colnames(out)) {
if (is.list(attr_vars[[i]])) {
if (is.list(attributes(out[[i]]))) {
attributes(out[[i]]) <- utils::modifyList(attr_vars[[i]], attributes(out[[i]]))
} else {
attributes(out[[i]]) <- attr_vars[[i]]
}
}
}
out
}


#' @export
convert_data_to_numeric.numeric <- function(x, ...) {
as.numeric(x)
.set_back_labels(as.numeric(x), x)
}

#' @export
Expand All @@ -49,9 +63,37 @@ convert_data_to_numeric.factor <- function(x, dummy_factors = TRUE, ...) {
if (dummy_factors) {
out <- as.data.frame(stats::model.matrix(~x, contrasts.arg = list(x = "contr.treatment")))
out[1] <- as.numeric(rowSums(out[2:ncol(out)]) == 0)

# insert back NA rows. if "x" had missing values, model.matrix() creates an
# array with only non-missing values, so some rows are missing. First, we
# need to now which rows are missing (na_values) and the length of the
# original vector (which will be the number of rows in the final data frame)

na_values <- which(is.na(x))
rows_x <- length(x)

if (any(na_values)) {
# iterate all missing values that have
for (i in 1:length(na_values)) {
# if the first observation was missing, add NA row and bind data frame
if (i == 1 && na_values[i] == 1) {
out <- rbind(NA, out)
} else {
# if the last observation was NA, add NA row to data frame
if (na_values[i] == rows_x) {
out <- rbind(out, NA)
} else {
# else, pick rows from beginning to current NA value, add NA,
# and rbind the remaining rows
out <- rbind(out[1:(na_values[i] - 1), ], NA, out[na_values[i]:nrow(out), ])
}
}
}
rownames(out) <- NULL
}
names(out) <- levels(x)
} else {
out <- as.numeric(x)
out <- .set_back_labels(as.numeric(x), x)
}
out
}
Expand Down
Binary file modified data/efc.RData
Binary file not shown.
49 changes: 49 additions & 0 deletions tests/testthat/test-convert_data_to_numeric.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,52 @@ test_that("convert factor to numeric, dummy factors", {
ignore_attr = TRUE
)
})

test_that("convert factor to numeric, dummy factors, with NA", {
x1 <- factor(rep(c("a", "b"), 3))
x2 <- factor(c("a", NA_character_, "a", "b", "a", "b"))
x3 <- factor(c(NA_character_, "b", "a", "b", "a", "b"))
x4 <- factor(c("a", "b", "a", "b", "a", NA_character_))
x5 <- factor(c(NA_character_, "b", "a", "b", "a", NA_character_))
x6 <- factor(c(NA_character_, "b", NA_character_, "b", "a", NA_character_))
x7 <- factor(c(NA_character_, "b", "a", "b", "a", "b", NA_character_, "b", "a", NA_character_, "a", "b", "a", "b", "a", NA_character_))

# same observations are missing
expect_equal(
which(!complete.cases(convert_data_to_numeric(x1, dummy_factors = TRUE))),
which(is.na(x1))
)
expect_equal(
which(!complete.cases(convert_data_to_numeric(x2, dummy_factors = TRUE))),
which(is.na(x2))
)
expect_equal(
which(!complete.cases(convert_data_to_numeric(x3, dummy_factors = TRUE))),
which(is.na(x3))
)
expect_equal(
which(!complete.cases(convert_data_to_numeric(x4, dummy_factors = TRUE))),
which(is.na(x4))
)
expect_equal(
which(!complete.cases(convert_data_to_numeric(x5, dummy_factors = TRUE))),
which(is.na(x5))
)
expect_equal(
which(!complete.cases(convert_data_to_numeric(x6, dummy_factors = TRUE))),
which(is.na(x6))
)
expect_equal(
which(!complete.cases(convert_data_to_numeric(x7, dummy_factors = TRUE))),
which(is.na(x7))
)

# output has same number of observation as input
expect_equal(nrow(convert_data_to_numeric(x1, dummy_factors = TRUE)), length(x1))
expect_equal(nrow(convert_data_to_numeric(x2, dummy_factors = TRUE)), length(x2))
expect_equal(nrow(convert_data_to_numeric(x3, dummy_factors = TRUE)), length(x3))
expect_equal(nrow(convert_data_to_numeric(x4, dummy_factors = TRUE)), length(x4))
expect_equal(nrow(convert_data_to_numeric(x5, dummy_factors = TRUE)), length(x5))
expect_equal(nrow(convert_data_to_numeric(x6, dummy_factors = TRUE)), length(x6))
expect_equal(nrow(convert_data_to_numeric(x7, dummy_factors = TRUE)), length(x7))
})
20 changes: 20 additions & 0 deletions tests/testthat/test-labelled_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,23 @@ test_that("data_addsuffix, labels preserved", {
ignore_attr = TRUE
)
})



# data_to_numeric -----------------------------------

test_that("data_to_numeric, labels preserved", {
x <- data_to_numeric(efc) |> str()
# factor
expect_equal(
attr(x$e42dep_new, "label", exact = TRUE),
attr(efc$e42dep, "label", exact = TRUE),
ignore_attr = TRUE
)
# numeric
expect_equal(
attr(x$c12hour_new, "label", exact = TRUE),
attr(efc$c12hour, "label", exact = TRUE),
ignore_attr = TRUE
)
})

0 comments on commit 9d372e5

Please sign in to comment.