Skip to content

Commit

Permalink
Defer as.Date() coercion to R level (#6107)
Browse files Browse the repository at this point in the history
* Refactor colClasses handling for readability

* remove debug prints

* C-level changes needed

* update git hash again :\

* revert atime test, defer to follow-up

* re-site NEWS
  • Loading branch information
MichaelChirico authored Sep 8, 2024
1 parent e9087ce commit a77e8c2
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 17 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ rowwiseDT(

6. Fixed a segfault in `fcase()`, [#6448](https://github.com/Rdatatable/data.table/issues/6448). Thanks @ethanbsmith for reporting with reprex, @aitap for finding the root cause, and @MichaelChirico for the PR.

7. `fread()` performance improves when specifying `Date` among `colClasses`, [#6105](https://github.com/Rdatatable/data.table/issues/6105). One implication of the change is that the column will be an `IDate` (which also inherits from `Date`), which may affect code strongly relying on the column class to be `Date` exactly; computations with `IDate` and `Date` columns should otherwise be the same. If you strongly prefer the `Date` class, run `as.Date()` explicitly following `fread()`. Thanks @scipima for the report and @MichaelChirico for the fix.

## NOTES

1. Tests run again when some Suggests packages are missing, [#6411](https://github.com/Rdatatable/data.table/issues/6411). Thanks @aadler for the note and @MichaelChirico for the fix.
Expand Down
6 changes: 3 additions & 3 deletions inst/tests/S4.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,14 @@ ans_print = utils::capture.output(print(ans))
if (TZnotUTC) {
test(5.2, options=list(datatable.old.fread.datetime.character=NULL),
fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct"), tz=""),
ans, output=ans_print)
copy(ans)[, a := as.IDate(a)], output=ans_print)
test(5.3, options=list(datatable.old.fread.datetime.character=NULL),
fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA), tz=""),
data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c="2015-01-03 01:02:03"), output=ans_print)
data.table(a=as.IDate("2015-01-01"), b=as.IDate("2015-01-02"), c="2015-01-03 01:02:03"), output=ans_print)
} else {
test(5.4, options=list(datatable.old.fread.datetime.character=NULL),
fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")),
ans<-data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03", tz="UTC")), output=ans_print)
ans<-data.table(a=as.IDate("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03", tz="UTC")), output=ans_print)
test(5.5, options=list(datatable.old.fread.datetime.character=NULL),
fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA)),
ans, output=ans_print)
Expand Down
23 changes: 13 additions & 10 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -11042,20 +11042,20 @@ test(1743.044, fread("a\n1", colClasses=""), data.table(a=1L))
# Issue #1634: 'fread doesn't check colClasses to be valid type'
# Currently using BioGenerics, which doesn't support USE.NAMES
## Date supported character/list colClasses
test(1743.05, sapply(fread("a,b\n2017-01-01,1", colClasses=c("Date", "integer")), class), c(a="Date", b="integer"))
test(1743.06, sapply(fread("a,b\n2017-01-01,1", colClasses=list(Date = 1L, integer = 2L)), class), c(a="Date", b="integer"))
test(1743.07, sapply(fread("a,b\n2017-01-01,2017-01-02", colClasses=list(Date = 1:2)), class), c(a="Date", b="Date"))
test(1743.08, sapply(fread("a,b,c\n2017-01-01,1,1+3i", colClasses=c("Date", "integer", "complex")), class), c(a="Date", b="integer", c="complex"))
test(1743.09, sapply(fread("a,b,c\n2017-01-01,1,1+3i", colClasses=c("Date", "integer", "complex")), class), c(a="Date", b="integer", c="complex"))
test(1743.10, sapply(fread("a,b,c,d\n2017-01-01,1,1+3i,05", colClasses=c("Date", "integer", "complex", NA)), class), c(a="Date",b="integer",c="complex",d="integer"))
test(1743.11, sapply(fread("a,b,c,d\n2017-01-01,1,1+3i,05", colClasses=c("Date", "integer", "complex", "raw")), class), c(a="Date",b="integer",c="complex",d="raw"))
test(1743.05, lapply(fread("a,b\n2017-01-01,1", colClasses=c("Date", "integer")), class), list(a=c("IDate", "Date"), b="integer"))
test(1743.06, lapply(fread("a,b\n2017-01-01,1", colClasses=list(Date = 1L, integer = 2L)), class), list(a=c("IDate", "Date"), b="integer"))
test(1743.07, lapply(fread("a,b\n2017-01-01,2017-01-02", colClasses=list(Date = 1:2)), class), list(a=c("IDate", "Date"), b=c("IDate", "Date")))
test(1743.08, lapply(fread("a,b,c\n2017-01-01,1,1+3i", colClasses=c("Date", "integer", "complex")), class), list(a=c("IDate", "Date"), b="integer", c="complex"))
test(1743.09, lapply(fread("a,b,c\n2017-01-01,1,1+3i", colClasses=c("Date", "integer", "complex")), class), list(a=c("IDate", "Date"), b="integer", c="complex"))
test(1743.10, lapply(fread("a,b,c,d\n2017-01-01,1,1+3i,05", colClasses=c("Date", "integer", "complex", NA)), class), list(a=c("IDate", "Date"), b="integer", c="complex", d="integer"))
test(1743.11, lapply(fread("a,b,c,d\n2017-01-01,1,1+3i,05", colClasses=c("Date", "integer", "complex", "raw")), class), list(a=c("IDate", "Date"), b="integer", c="complex", d="raw"))
test(1743.121, sapply(fread("a,b\n2015-01-01,2015-01-01", colClasses=c(NA,"IDate")), inherits, what="IDate"), c(a=TRUE, b=TRUE))
test(1743.122, fread("a,b\n2015-01-01,2015-01-01", colClasses=c("POSIXct","Date")), data.table(a=as.POSIXct("2015-01-01"), b=as.Date("2015-01-01")))
test(1743.122, fread("a,b\n2015-01-01,2015-01-01", colClasses=c("POSIXct","Date")), data.table(a=as.POSIXct("2015-01-01"), b=as.IDate("2015-01-01")))
test(1743.123, fread("a,b\n1+3i,2015-01-01", colClasses=c(NA,"IDate")), data.table(a="1+3i", b=as.IDate("2015-01-01")))

## Attempts to impose incompatible colClasses is a warning (not an error)
## and does not change the value of the columns
test(1743.13, sapply(fread("a,b\n09/05/98,2015-01-01", colClasses = "Date"), class), y=c(a="character", b="Date"), warning=base_messages$ambiguous_date_fmt)
test(1743.13, lapply(fread("a,b\n09/05/98,2015-01-01", colClasses = "Date"), class), y=list(a="character", b=c("IDate", "Date")), warning=base_messages$ambiguous_date_fmt)

## Just invalid
test(1743.14, options = c(useFancyQuotes = FALSE),
Expand Down Expand Up @@ -11085,6 +11085,9 @@ test(1743.201, fread("A,B,C,D\nA,B,X,4", select=list(factor="A"), colClasses="ch
test(1743.202, fread("A,B,C,D\nA,B,X,4", select=c(A="factor"), colClasses="character"), error="select= is a named vector.*but colClasses= has been provided as well. Please remove colClasses=.")
test(1743.203, fread("A,B,C,D\nA,B,X,4", select=list(character="D", factor="B")), data.table(D="4", B=factor("B")))
test(1743.204, fread("A,B,C,D\nA,B,X,4", select=list(character=4, character=2)), data.table(D="4", B="B"))
## Date+select
test(1743.205, fread("a,b\n2017-01-01,1", colClasses="Date", select="a"), data.table(a=as.IDate("2017-01-01")))
test(1743.206, fread("a,b\n2017-01-01,1", select=list(Date="a")), data.table(a=as.IDate("2017-01-01")))

## factors
test(1743.211, sapply(fread("a,b,c\n2,2,f", colClasses = list(factor = 1L), select = 2:3), class), y = c(b="integer", c="character"))
Expand Down Expand Up @@ -16958,7 +16961,7 @@ test(2150.10, fread(tmp), copy(DT)[ , times := format(times, '%FT%T+00:XX')])
test(2150.11,fread("a,b\n2015-01-01,2015-01-01", colClasses="POSIXct"), # local time for backwards compatibility
data.table(a=as.POSIXct("2015-01-01"), b=as.POSIXct("2015-01-01")))
test(2150.12,fread("a,b\n2015-01-01,2015-01-01", select=c(a="Date",b="POSIXct")), # select colClasses form, for coverage
data.table(a=as.Date("2015-01-01"), b=as.POSIXct("2015-01-01")))
data.table(a=as.IDate("2015-01-01"), b=as.POSIXct("2015-01-01")))
test(2150.13, fread("a,b\n2015-01-01,1.1\n2015-01-02 01:02:03,1.2", tz=""), # no Z, tz="" needed for this test from v1.14.0
if (TZnotUTC) data.table(a=c("2015-01-01","2015-01-02 01:02:03"), b=c(1.1, 1.2))
else data.table(a=setattr(as.POSIXct(c("2015-01-01 00:00:00", "2015-01-02 01:02:03"), tz="UTC"), "tzone", "UTC"), b=c(1.1, 1.2)))
Expand Down
10 changes: 6 additions & 4 deletions src/freadR.c
Original file line number Diff line number Diff line change
Expand Up @@ -335,10 +335,10 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int
type[i]=CT_STRING; // e.g. CT_ISO8601_DATE changed to character here so that as.POSIXct treats the date-only as local time in tests 1743.122 and 2150.11
SET_STRING_ELT(colClassesAs, i, tt);
}
} else {
} else if (type[i] != CT_ISO8601_DATE || tt != char_Date) {
type[i] = typeEnum[w-1]; // freadMain checks bump up only not down
if (w==NUT) SET_STRING_ELT(colClassesAs, i, tt);
}
} // else (when colClasses="Date" and fread found an IDate), don't update type[i] and don't signal any coercion needed on R side
}
} else { // selectColClasses==true
if (!selectInts) internal_error(__func__, "selectInts is NULL but selectColClasses is true"); // # nocov
Expand All @@ -355,7 +355,7 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int
type[y-1]=CT_STRING;
SET_STRING_ELT(colClassesAs, y-1, tt);
}
} else {
} else if (type[i] != CT_ISO8601_DATE || tt != char_Date) {
type[y-1] = typeEnum[w-1];
if (w==NUT) SET_STRING_ELT(colClassesAs, y-1, tt);
}
Expand Down Expand Up @@ -405,7 +405,9 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int
}
if (selectRankD) selectRankD[colIdx-1] = rank++;
// NB: mark as negative to indicate 'seen'
if (colClassType == CT_ISO8601_TIME && type[colIdx-1]!=CT_ISO8601_TIME) {
if (type[colIdx-1]==CT_ISO8601_DATE && colClassType==CT_STRING && STRING_ELT(listNames, i) == char_Date) {
type[colIdx-1] *= -1;
} else if (colClassType == CT_ISO8601_TIME && type[colIdx-1]!=CT_ISO8601_TIME) {
type[colIdx-1] = -CT_STRING; // don't use in-built UTC parser, defer to character and as.POSIXct afterwards which reads in local time
SET_STRING_ELT(colClassesAs, colIdx-1, STRING_ELT(listNames, i));
} else {
Expand Down

0 comments on commit a77e8c2

Please sign in to comment.