From f16cf420344ec4a2d8eb6601bd97236080b98f12 Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Thu, 7 Dec 2023 08:42:26 -0500 Subject: [PATCH] Excel time to numeric (#479) Add excel_time_to_numeric to convert many time formats to seconds --------- Co-authored-by: Sam Firke Co-authored-by: olivroy <52606734+olivroy@users.noreply.github.com> --- NAMESPACE | 6 + NEWS.md | 2 + R/excel_dates.R | 1 + R/excel_time_to_numeric.R | 166 ++++++++++++++++++++ man/convert_to_date.Rd | 1 + man/excel_numeric_to_date.Rd | 3 + man/excel_time_to_numeric.Rd | 44 ++++++ man/janitor-package.Rd | 1 + man/sas_numeric_to_date.Rd | 3 +- tests/testthat/test-excel_time_to_numeric.R | 152 ++++++++++++++++++ 10 files changed, 378 insertions(+), 1 deletion(-) create mode 100644 R/excel_time_to_numeric.R create mode 100644 man/excel_time_to_numeric.Rd create mode 100644 tests/testthat/test-excel_time_to_numeric.R diff --git a/NAMESPACE b/NAMESPACE index 0067ad1d..27073345 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,11 @@ S3method(clean_names,tbl_graph) S3method(clean_names,tbl_lazy) S3method(describe_class,default) S3method(describe_class,factor) +S3method(excel_time_to_numeric,POSIXct) +S3method(excel_time_to_numeric,POSIXlt) +S3method(excel_time_to_numeric,character) +S3method(excel_time_to_numeric,logical) +S3method(excel_time_to_numeric,numeric) S3method(fisher.test,default) S3method(fisher.test,tabyl) S3method(print,tabyl) @@ -34,6 +39,7 @@ export(convert_to_datetime) export(crosstab) export(describe_class) export(excel_numeric_to_date) +export(excel_time_to_numeric) export(find_header) export(fisher.test) export(get_dupes) diff --git a/NEWS.md b/NEWS.md index cd506af2..3e387eb8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,8 @@ These are all minor breaking changes resulting from enhancements and are not exp * `row_to_names()` now accepts multiple rows as input, and merges them using a new `sep` argument (#536). The default is `sep = "_"`. 
When handling multiple `NA` values, `row_to_names()` ignores them and only merges non-NA values for column names. When all values are `NA`, `row_to_names()` creates a column name of `"NA"`, a character, rather than `NA`.
 
+* The new function `excel_time_to_numeric()` converts times from Excel that do not have accompanying dates into a number of seconds. (#245, thanks to **@billdenney** for the feature.)
+
 ## Bug fixes
 
 * `adorn_totals("row")` now succeeds if the new `name` of the totals row is already a factor level of the input data.frame (#529, thanks @egozoglu for reporting).
diff --git a/R/excel_dates.R b/R/excel_dates.R
index a570d5e6..a505bf10 100644
--- a/R/excel_dates.R
+++ b/R/excel_dates.R
@@ -33,6 +33,7 @@
 #' https://support.microsoft.com/en-us/help/2722715/support-for-the-leap-second).
 #'
 #' @export
+#' @seealso \code{\link{excel_time_to_numeric}}
 #' @examples
 #' excel_numeric_to_date(40000)
 #' excel_numeric_to_date(40000.5) # No time is included
diff --git a/R/excel_time_to_numeric.R b/R/excel_time_to_numeric.R
new file mode 100644
index 00000000..d6366bde
--- /dev/null
+++ b/R/excel_time_to_numeric.R
@@ -0,0 +1,193 @@
+#' Convert a time that may be inconsistently or inconveniently formatted from
+#' Microsoft Excel to a numeric number of seconds between 0 and 86400.
+#'
+#' @details
+#'
+#' `time_value` may be one of the following formats:
+#' \describe{
+#'   \item{numeric}{The input must be a value from 0 to 1 (exclusive of 1). It
+#'     is treated as a fraction of a day and multiplied by 86400.}
+#'   \item{POSIXlt or POSIXct}{The input is expected to be on the day
+#'     1899-12-31. Only the time of day is used: the result is the number of
+#'     seconds since the start of that day. The date itself is not validated.}
+#'   \item{character}{Any of the following (or a mixture of the choices):
+#'     \itemize{
+#'       \item A character string that is a number between 0 and 1 (exclusive
+#'         of 1), optionally in scientific notation such as `"2.5E-2"`. This
+#'         value will be converted like a numeric value.
+#'       \item A character string that looks like a date on 1899-12-31
+#'         (specifically, it must start with `"1899-12-31 "`). The time of day
+#'         following the date is used; if there is none, the result is 0.
+#'       \item A character string that looks like a time. Choices are 12-hour
+#'         time as hour, minute, and optionally second followed by "am" or
+#'         "pm" (case insensitive) or 24-hour time when hour, minute,
+#'         optionally second, and no "am" or "pm" is included.
+#'     }
+#'   }
+#' }
+#'
+#' @param time_value A vector of values to convert (see Details)
+#' @param round_seconds Should the output number of seconds be rounded to an
+#'   integer?
+#' @return A numeric vector of seconds, the same length as `time_value`, with
+#'   `NA` wherever the input is `NA`. Values are >= 0 and < 86400 before
+#'   rounding; with `round_seconds = TRUE` an input within half a second of
+#'   the end of the day rounds to 86400.
+#' @family Date-time cleaning
+#' @seealso \code{\link{excel_numeric_to_date}}
+#' @examples
+#' excel_time_to_numeric(0.5) # a fraction of a day
+#' excel_time_to_numeric(c("8:00pm", "13:05:10", "0.25"))
+#' @export
+excel_time_to_numeric <- function(time_value, round_seconds = TRUE) {
+  UseMethod("excel_time_to_numeric")
+}
+
+#' @export
+excel_time_to_numeric.logical <- function(time_value, round_seconds = TRUE) {
+  # A logical vector is only accepted when it is entirely `NA`; `TRUE` and
+  # `FALSE` are rejected because they have no interpretation as a time.
+  if (all(is.na(time_value))) {
+    rep(NA_real_, length(time_value))
+  } else {
+    stop("If given as a logical vector, all values must be `NA`")
+  }
+}
+
+#' @export
+excel_time_to_numeric.numeric <- function(time_value, round_seconds = TRUE) {
+  # The value is a fraction of a day in [0, 1); `NA`s are allowed.
+  in_range <- is.na(time_value) | (time_value >= 0 & time_value < 1)
+  if (!all(in_range)) {
+    stop("When numeric, all `time_value`s must be between 0 and 1 (exclusive of 1)")
+  }
+  seconds <- time_value * 86400
+  if (round_seconds) {
+    seconds <- round(seconds)
+  }
+  seconds
+}
+
+#' @export
+excel_time_to_numeric.POSIXct <- function(time_value, round_seconds = TRUE) {
+  # Seconds since the start of the day. Using trunc removes timezone
+  # inconsistency. Timezones aren't used in Excel.
+  seconds <- as.numeric(time_value) - as.numeric(trunc(time_value, units = "days"))
+  mask_good_seconds <- is.na(seconds) | (seconds >= 0 & seconds < 86400)
+  if (!all(mask_good_seconds)) {
+    # This should be impossible except for leap seconds
+    stop(sum(!mask_good_seconds), " `time_value`s were not at or above 0 and below 86400.") # nocov
+  }
+  if (round_seconds) {
+    seconds <- round(seconds)
+  }
+  seconds
+}
+
+#' @export
+excel_time_to_numeric.POSIXlt <- function(time_value, round_seconds = TRUE) {
+  # POSIXlt input is converted to POSIXct and handled by that method.
+  excel_time_to_numeric.POSIXct(
+    as.POSIXct(time_value),
+    round_seconds = round_seconds
+  )
+}
+
+#' @export
+excel_time_to_numeric.character <- function(time_value, round_seconds = TRUE) {
+  ret <- rep(NA_real_, length(time_value))
+  patterns <-
+    list(
+      number = "^0(\\.[0-9]*)?$",
+      # SI numbers have to have the form [number]E-[number] because the number
+      # has to be between 0 and 1 and can't be bigger than 1.
+      si_number = "^[1-9](\\.[0-9]*)?E-[0-9]+$",
+      "12hr" = "^([0]?[1-9]|1[0-2]):([0-5][0-9])(?::([0-5][0-9]))? ?([AP]M)$",
+      "24hr" = "^([0-1]?[0-9]|2[0-3]):([0-5][0-9])(?::([0-5][0-9]))?$",
+      # The pattern is anchored at the start (allowing leading whitespace) so
+      # that text before the date is rejected instead of leaking into the
+      # captured hour, e.g. "11899-12-31 08:00" being read as hour "108".
+      # The ".*?" at the end of POSIX is to allow for a time zone, but it allows
+      # for imperfect parsing if there were just a date and a space.
+      # The entire time is optional to allow for midnight which shows as
+      # just the date and time zone.
+      POSIX = "^\\s*1899-12-31 (?:([0-1]?[0-9]|2[0-3]):([0-5][0-9])(?::([0-5][0-9]))?)?.*?$"
+    )
+  mask_na <- is.na(time_value)
+  mask_number <-
+    grepl(pattern = patterns$number, x = time_value) |
+      grepl(pattern = patterns$si_number, x = time_value)
+  mask_POSIX <- grepl(pattern = patterns[["POSIX"]], x = time_value)
+  mask_12hr <- grepl(pattern = patterns[["12hr"]], x = time_value, ignore.case = TRUE)
+  mask_24hr <- grepl(pattern = patterns[["24hr"]], x = time_value)
+  unmatched <- !(mask_na | mask_number | mask_POSIX | mask_12hr | mask_24hr)
+  if (any(unmatched)) {
+    stop(
+      "The following character strings did not match an interpretable ",
+      "character format for time conversion: ",
+      # collapse so that several bad values stay separated in the message
+      paste(unique(time_value[unmatched]), collapse = ", ")
+    )
+  }
+  if (any(mask_number)) {
+    ret[mask_number] <-
+      excel_time_to_numeric.numeric(
+        time_value = as.numeric(time_value[mask_number]),
+        round_seconds = round_seconds
+      )
+  }
+  mask_clock <- mask_12hr | mask_24hr | mask_POSIX
+  if (any(mask_clock)) {
+    # Hour, minute and second are captured as character strings; they are
+    # converted to numbers when the seconds are summed below.
+    hours <- minutes <- seconds <- rep(NA_character_, length(time_value))
+    if (any(mask_POSIX)) {
+      hours[mask_POSIX] <-
+        gsub(pattern = patterns$POSIX, replacement = "\\1", x = time_value[mask_POSIX])
+      minutes[mask_POSIX] <-
+        gsub(pattern = patterns$POSIX, replacement = "\\2", x = time_value[mask_POSIX])
+      seconds[mask_POSIX] <-
+        gsub(pattern = patterns$POSIX, replacement = "\\3", x = time_value[mask_POSIX])
+    }
+    if (any(mask_12hr)) {
+      mask_pm <- rep(FALSE, length(time_value))
+      hours[mask_12hr] <-
+        gsub(pattern = patterns[["12hr"]], replacement = "\\1", x = time_value[mask_12hr], ignore.case = TRUE)
+      minutes[mask_12hr] <-
+        gsub(pattern = patterns[["12hr"]], replacement = "\\2", x = time_value[mask_12hr], ignore.case = TRUE)
+      seconds[mask_12hr] <-
+        gsub(pattern = patterns[["12hr"]], replacement = "\\3", x = time_value[mask_12hr], ignore.case = TRUE)
+      # 12 is 0 hours in the AM and the PM conversion below adds the needed 12
+      # at noon.
+      mask_0_hours <- mask_12hr & (hours %in% "12")
+      hours[mask_0_hours] <- "0"
+      mask_pm[mask_12hr] <-
+        tolower(
+          gsub(pattern = patterns[["12hr"]], replacement = "\\4", x = time_value[mask_12hr], ignore.case = TRUE)
+        ) %in% "pm"
+      hours[mask_pm] <- 12 + as.numeric(hours[mask_pm])
+    }
+    if (any(mask_24hr)) {
+      hours[mask_24hr] <-
+        gsub(pattern = patterns[["24hr"]], replacement = "\\1", x = time_value[mask_24hr])
+      minutes[mask_24hr] <-
+        gsub(pattern = patterns[["24hr"]], replacement = "\\2", x = time_value[mask_24hr])
+      seconds[mask_24hr] <-
+        gsub(pattern = patterns[["24hr"]], replacement = "\\3", x = time_value[mask_24hr])
+    }
+    # Optional groups that did not participate (no seconds, or no time at all
+    # after the date) come back as empty strings; treat them as zero.
+    hours[hours %in% ""] <- "0"
+    minutes[minutes %in% ""] <- "0"
+    seconds[seconds %in% ""] <- "0"
+
+    ret[mask_clock] <-
+      as.numeric(hours[mask_clock]) * 3600 +
+      as.numeric(minutes[mask_clock]) * 60 +
+      as.numeric(seconds[mask_clock])
+  }
+  if (round_seconds) {
+    ret <- round(ret)
+  }
+  ret
+}
diff --git a/man/convert_to_date.Rd b/man/convert_to_date.Rd
index 4765d08a..ac8837d5 100644
--- a/man/convert_to_date.Rd
+++ b/man/convert_to_date.Rd
@@ -71,6 +71,7 @@ convert_to_datetime(
 \seealso{
 Other Date-time cleaning:
 \code{\link{excel_numeric_to_date}()},
+\code{\link{excel_time_to_numeric}()},
 \code{\link{sas_numeric_to_date}()}
 }
 \concept{Date-time cleaning}
diff --git a/man/excel_numeric_to_date.Rd b/man/excel_numeric_to_date.Rd
index 22b49409..eda1d8b1 100644
--- a/man/excel_numeric_to_date.Rd
+++ b/man/excel_numeric_to_date.Rd
@@ -64,8 +64,11 @@ excel_numeric_to_date(40000.521,
 ) # Time with fractional seconds is included
 }
 \seealso{
+\code{\link{excel_time_to_numeric}}
+
 Other Date-time cleaning:
 \code{\link{convert_to_date}()},
+\code{\link{excel_time_to_numeric}()},
 \code{\link{sas_numeric_to_date}()}
 }
 \concept{Date-time cleaning}
diff --git a/man/excel_time_to_numeric.Rd b/man/excel_time_to_numeric.Rd
new file mode 100644
index 00000000..59e9b033
--- /dev/null
+++ b/man/excel_time_to_numeric.Rd
@@ -0,0 +1,44 @@
+% Generated by
roxygen2: do not edit by hand +% Please edit documentation in R/excel_time_to_numeric.R +\name{excel_time_to_numeric} +\alias{excel_time_to_numeric} +\title{Convert a time that may be inconsistently or inconveniently formatted from +Microsoft Excel to a numeric number of seconds between 0 and 86400.} +\usage{ +excel_time_to_numeric(time_value, round_seconds = TRUE) +} +\arguments{ +\item{time_value}{A vector of values to convert (see Details)} + +\item{round_seconds}{Should the output number of seconds be rounded to an +integer?} +} +\value{ +A vector of numbers >= 0 and <86400 +} +\description{ +Convert a time that may be inconsistently or inconveniently formatted from +Microsoft Excel to a numeric number of seconds between 0 and 86400. +} +\details{ +\code{time_value} may be one of the following formats: +\itemize{ +\item{numeric}{The input must be a value from 0 to 1 (exclusive of 1); this value is treated as a fraction of a day and converted to seconds.} +\item{POSIXlt or POSIXct}{The input is expected to be on the day 1899-12-31 (the date itself is not validated). The time of day is extracted and converted to a number of seconds.} +\item{character}{Any of the following (or a mixture of the choices):} +\itemize{ +\item{A character string that is a number between 0 and 1 (exclusive of 1). This value will be converted like a numeric value.} +\item{A character string that looks like a date on 1899-12-31 (specifically, it must start with \code{"1899-12-31 "}), converted like a POSIXct object as described above.} +\item{A character string that looks like a time. 
Choices are 12-hour time as hour, minute, and optionally second followed by "am" or "pm" (case insensitive) or 24-hour time when hour, minute, optionally second, and no "am" or "pm" is included.}
+}
+}
+}
+\seealso{
+\code{\link{excel_numeric_to_date}}
+
+Other Date-time cleaning:
+\code{\link{convert_to_date}()},
+\code{\link{excel_numeric_to_date}()},
+\code{\link{sas_numeric_to_date}()}
+}
+\concept{Date-time cleaning}
diff --git a/man/janitor-package.Rd b/man/janitor-package.Rd
index abc82e51..f58f513a 100644
--- a/man/janitor-package.Rd
+++ b/man/janitor-package.Rd
@@ -48,6 +48,7 @@ Other contributors:
   \item Ryan Knight \email{ryangknight@gmail.com} [contributor]
   \item Malte Grosser \email{malte.grosser@gmail.com} [contributor]
   \item Jonathan Zadra \email{jonathan.zadra@sorensonimpact.com} [contributor]
+  \item Olivier Roy [contributor]
 }
 
 }
diff --git a/man/sas_numeric_to_date.Rd b/man/sas_numeric_to_date.Rd
index 3dffc7af..94ff9af9 100644
--- a/man/sas_numeric_to_date.Rd
+++ b/man/sas_numeric_to_date.Rd
@@ -38,6 +38,7 @@ SAS Date, Time, and Datetime Values reference (retrieved on
 \seealso{
 Other Date-time cleaning:
 \code{\link{convert_to_date}()},
-\code{\link{excel_numeric_to_date}()}
+\code{\link{excel_numeric_to_date}()},
+\code{\link{excel_time_to_numeric}()}
 }
 \concept{Date-time cleaning}
diff --git a/tests/testthat/test-excel_time_to_numeric.R b/tests/testthat/test-excel_time_to_numeric.R
new file mode 100644
index 00000000..42d7ac93
--- /dev/null
+++ b/tests/testthat/test-excel_time_to_numeric.R
@@ -0,0 +1,152 @@
+test_that("excel_time_to_numeric numbers function correctly", {
+  expect_equal(excel_time_to_numeric(0.1), 8640)
+  expect_equal(excel_time_to_numeric(0.1000000001), 8640)
+  expect_equal(excel_time_to_numeric(0), 0)
+  expect_equal(excel_time_to_numeric(NA_real_), NA_real_)
+})
+
+test_that("excel_time_to_numeric POSIX objects extract the correct part of the time", {
+  expect_equal(excel_time_to_numeric(as.POSIXct("1899-12-31 00:01")), 60)
+  expect_equal(excel_time_to_numeric(as.POSIXct("1899-12-31 08:00")), 8 * 3600)
+  expect_equal(excel_time_to_numeric(as.POSIXct("1899-12-31 13:00")), 13 * 3600)
+  expect_equal(excel_time_to_numeric(as.POSIXct("1899-12-31 13:05:10")), 13 * 3600 + 5 * 60 + 10)
+})
+
+test_that("excel_time_to_numeric POSIX objects ignore the time zone", {
+  expect_equal(excel_time_to_numeric(as.POSIXct("1899-12-31 13:00", tz = "EST")), 13 * 3600)
+  expect_equal(excel_time_to_numeric(as.POSIXct("1899-12-31 13:00", tz = "UTC")), 13 * 3600)
+  expect_equal(
+    excel_time_to_numeric(
+      as.POSIXct(c("1899-12-31 13:00", "1899-12-31 13:00"), tz = "EST")
+    ),
+    rep(13 * 3600, 2)
+  )
+})
+
+test_that("excel_time_to_numeric POSIXlt works like POSIXct", {
+  expect_equal(
+    excel_time_to_numeric(as.POSIXct("1899-12-31 13:00", tz = "EST")),
+    excel_time_to_numeric(as.POSIXlt("1899-12-31 13:00", tz = "EST"))
+  )
+})
+
+test_that("excel_time_to_numeric numbers errors when out of range", {
+  expect_error(excel_time_to_numeric(1))
+  expect_error(excel_time_to_numeric(-0.1))
+})
+
+test_that("excel_time_to_numeric logical values return as expected", {
+  expect_equal(excel_time_to_numeric(NA), NA_real_)
+  expect_error(excel_time_to_numeric(c(NA, TRUE)))
+  expect_error(excel_time_to_numeric(TRUE))
+})
+
+test_that("excel_time_to_numeric, character strings of numbers work as expected", {
+  expect_equal(excel_time_to_numeric("0.5"), 12 * 3600)
+  expect_equal(excel_time_to_numeric("0"), 0)
+  expect_equal(excel_time_to_numeric("0."), 0)
+  expect_equal(excel_time_to_numeric("0.000000"), 0)
+  expect_equal(excel_time_to_numeric("0.00001"), 1)
+  expect_equal(
+    excel_time_to_numeric("0.00001", round_seconds = FALSE),
+    0.00001 * 86400
+  )
+  # Confirm scientific notation values
+  expect_equal(
+    excel_time_to_numeric("2.9166666666666664E-2", round_seconds = TRUE),
+    2520
+  )
+})
+
+test_that("excel_time_to_numeric, am/pm times work", {
+  expect_equal(excel_time_to_numeric("8:00am"), 8 * 3600)
+  expect_equal(excel_time_to_numeric("8:00pm"), 20 * 3600)
+  expect_equal(excel_time_to_numeric("8:10am"), 8 * 3600 + 10 * 60)
+  expect_equal(excel_time_to_numeric("8:10:05am"), 8 * 3600 + 10 * 60 + 5)
+  expect_equal(
+    excel_time_to_numeric("12:10:05am"), 10 * 60 + 5,
+    info = "After midnight is treated as 0 not 12."
+  )
+  expect_equal(
+    excel_time_to_numeric("12:10:05pm"), 12 * 3600 + 10 * 60 + 5,
+    info = "After noon is treated as 12."
+  )
+  # Test mixed AM/PM and 24-hour clock values
+  expect_equal(
+    excel_time_to_numeric(c("8:00pm", "8:00", "9:00")),
+    c(20, 8, 9) * 3600
+  )
+})
+
+test_that("excel_time_to_numeric, am/pm times work case insensitively and with spaces", {
+  expect_equal(
+    excel_time_to_numeric("8:00am"),
+    excel_time_to_numeric("8:00AM")
+  )
+  expect_equal(
+    excel_time_to_numeric("8:00am"),
+    excel_time_to_numeric("8:00Am")
+  )
+  expect_equal(
+    excel_time_to_numeric("8:00am"),
+    excel_time_to_numeric("8:00aM")
+  )
+  expect_equal(
+    excel_time_to_numeric("8:00am"),
+    excel_time_to_numeric("8:00 AM")
+  )
+})
+
+test_that("excel_time_to_numeric, 24-hour times work (zero-padded hours or not)", {
+  expect_equal(excel_time_to_numeric("8:00"), 8 * 3600)
+  expect_equal(excel_time_to_numeric("08:00"), 8 * 3600)
+  expect_equal(excel_time_to_numeric("08:10"), 8 * 3600 + 10 * 60)
+  expect_equal(excel_time_to_numeric("8:10:05"), 8 * 3600 + 10 * 60 + 5)
+  expect_equal(excel_time_to_numeric("21:05"), 21 * 3600 + 5 * 60)
+  expect_equal(excel_time_to_numeric("0:05"), 5 * 60)
+  expect_equal(excel_time_to_numeric("00:05"), 5 * 60)
+  expect_equal(excel_time_to_numeric("21:05:20"), 21 * 3600 + 5 * 60 + 20)
+})
+
+test_that("excel_time_to_numeric, POSIX times on 1899-12-31 work", {
+  expect_equal(excel_time_to_numeric("1899-12-31 8:00"), 8 * 3600)
+  expect_equal(excel_time_to_numeric("1899-12-31 08:00"), 8 * 3600)
+  expect_equal(excel_time_to_numeric("1899-12-31 08:10"), 8 * 3600 + 10 * 60)
+  expect_equal(excel_time_to_numeric("1899-12-31 8:10:05"), 8 * 3600 + 10 * 60 + 5)
+  expect_equal(excel_time_to_numeric("1899-12-31 21:05"), 21 * 3600 + 5 * 60)
+  expect_equal(excel_time_to_numeric("1899-12-31 0:05"), 5 * 60)
+  expect_equal(excel_time_to_numeric("1899-12-31 00:05"), 5 * 60)
+  expect_equal(excel_time_to_numeric("1899-12-31 21:05:20"), 21 * 3600 + 5 * 60 + 20)
+})
+
+test_that("excel_time_to_numeric, POSIX times ignore extra text (which is hopefully a time zone)", {
+  expect_equal(excel_time_to_numeric("1899-12-31 8:00 foo"), 8 * 3600)
+  expect_equal(excel_time_to_numeric("1899-12-31 08:00 foo"), 8 * 3600)
+  expect_equal(excel_time_to_numeric("1899-12-31 08:10 foo"), 8 * 3600 + 10 * 60)
+  expect_equal(excel_time_to_numeric("1899-12-31 8:10:05 foo"), 8 * 3600 + 10 * 60 + 5)
+  expect_equal(excel_time_to_numeric("1899-12-31 21:05 foo"), 21 * 3600 + 5 * 60)
+  expect_equal(excel_time_to_numeric("1899-12-31 0:05 foo"), 5 * 60)
+  expect_equal(excel_time_to_numeric("1899-12-31 00:05 foo"), 5 * 60)
+  expect_equal(excel_time_to_numeric("1899-12-31 21:05:20 foo"), 21 * 3600 + 5 * 60 + 20)
+})
+
+test_that("excel_time_to_numeric, POSIX times treat no time as midnight but only if there is a space indicating a mostly-well-formed date-time object.", {
+  # the just-a-space requirement is there because some time formatting puts the
+  # date then a space then the time zone.
+  expect_equal(excel_time_to_numeric("1899-12-31 foo"), 0)
+  expect_error(excel_time_to_numeric("1899-12-31foo"))
+})
+
+test_that("excel_time_to_numeric, invalid character times trigger an error", {
+  expect_error(excel_time_to_numeric("1"))
+  expect_error(excel_time_to_numeric("-0.1"))
+  expect_error(excel_time_to_numeric("0:05:20am"))
+  expect_error(excel_time_to_numeric("1:60:20am"))
+  expect_error(excel_time_to_numeric("1:00:70am"))
+  expect_error(excel_time_to_numeric("13:05:20am"))
+  expect_error(excel_time_to_numeric("13:05:20pm")) # hour 13 is invalid on a 12-hour clock with either suffix
+  expect_error(excel_time_to_numeric("25:05:20"))
+  expect_error(excel_time_to_numeric("23:65:20"))
+  expect_error(excel_time_to_numeric("23:05:90"))
+  expect_error(excel_time_to_numeric("1899-12-30 21:05:20"))
+})