Skip to content

Commit

Permalink
Reset dec='\0' to detect numbers while dec is undecided (#6445)
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelChirico authored Sep 16, 2024
1 parent 4d42cdc commit 8f505d5
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 11 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ rowwiseDT(

10. Fixed possible segfault in `setDT(df); attr(df, key) <- value; set(df, ...)`, i.e. adding columns to an object with `set()` that was converted to data.table with `setDT()` and later had attributes add with `attr<-`, [#6410](https://github.com/Rdatatable/data.table/issues/6410). Thanks to @hongyuanjia for the report and @ben-schwen for the PR. Note that `setattr()` should be preferred for adding attributes to a data.table.

11. `fread()` automatically detects timestamps with sub-second accuracy again, [#6440](https://github.com/Rdatatable/data.table/issues/6440). This was a regression due to interference with new `dec='auto'` support. Thanks @kav2k for the concise report and @MichaelChirico for the fix.

## NOTES

1. Tests run again when some Suggests packages are missing, [#6411](https://github.com/Rdatatable/data.table/issues/6411). Thanks @aadler for the note and @MichaelChirico for the fix.
Expand Down
29 changes: 23 additions & 6 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -18549,21 +18549,38 @@ test(2255, as.data.table(DF), output="DF1.V1.*DF1.V2.*DF2.V3.*DF2.V4.*V5")
DT = data.table(a = letters, b = 1:26/6, c = 1:26)
## auto-detect dec=','
fwrite(DT, f <- tempfile(), dec=',', sep=';')
test(2256.1, fread(f), DT)
test(2256.01, fread(f), DT)

fwrite(DT, f, dec=',', sep='|')
test(2256.2, fread(f), DT)
test(2256.02, fread(f), DT)

## auto-detect dec='.'
fwrite(DT, f)
test(2256.3, fread(f), DT)
test(2256.03, fread(f), DT)

## verbose output
test(2256.4, fread(f, verbose=TRUE), DT, output="sep=',' so dec set to '.'")
test(2256.04, fread(f, verbose=TRUE), DT, output="sep=',' so dec set to '.'")

fwrite(DT, f, dec=',', sep=';')
test(2256.5, fread(f, verbose=TRUE), DT, output="dec=',' detected based on a balance of 18")
test(2256.6, fread('a;b\n1,14;5', verbose=TRUE), data.table(a=1.14, b=5L), output="dec=',' detected based on a balance of 1 ")
test(2256.05, fread(f, verbose=TRUE), DT, output="dec=',' detected based on a balance of 18")
test(2256.06, fread('a;b\n1,14;5', verbose=TRUE), data.table(a=1.14, b=5L), output="dec=',' detected based on a balance of 1 ")

## timestamps with subsecond accuracy thrown off by auto-dec #6440
test(2256.07, fread(text="t\n2023-10-12T06:53:53.123Z"), data.table(t=as.POSIXct('2023-10-12 06:53:53.123', tz='UTC')))
### TODO(#6447): sep="\t" shouldn't be needed here, right?
test(2256.08, fread(text="t\n2023-10-12T06:53:53,123Z", sep="\t"), data.table(t=as.POSIXct('2023-10-12 06:53:53.123', tz='UTC')))
test(2256.09, fread(text="x,t\n1.0,2023-10-12T06:53:53.123Z"), data.table(x=1.0, t=as.POSIXct('2023-10-12 06:53:53.123', tz='UTC')))
test(2256.10, fread(text="t,x\n2023-10-12T06:53:53.123Z,1.0"), data.table(t=as.POSIXct('2023-10-12 06:53:53.123', tz='UTC'), x=1.0))
### from PR comment
s = 'CoilID;AntennaID;Time;TagID;Pen;Side;Position;Location;Coil_Y;Coil_X
1;16403160;2023-10-12T10:30:55.270Z;DA2C6411;1;AKB;Litter central;middle;1;6
3;16403160;2023-10-12T10:30:55.270Z;DA459D86;1;AKB;Litter central;middle;1;4
15;16402963;2023-10-12T10:31:00.900Z;DA459D86;1;AKB;Litter central;right;2;3
14;16402963;2023-10-12T10:31:02.240Z;DA2C6411;1;AKB;Litter central;right;2;1
11;16403160;2023-10-12T10:31:02.650Z;DA2C6411;1;AKB;Litter central;middle;2;6'
test(2256.11,
unname(sapply(fread(s, sep=';'), function(x) class(x)[1L])),
c("integer", "integer", "POSIXct", "character", "integer", "character", "character", "character", "integer", "integer"))

# helpful error about deleting during grouping, #1873
DT = data.table(id = c(1, 1, 2, 2), a = 1:4, b = 5:8)
Expand Down
11 changes: 7 additions & 4 deletions src/fread.c
Original file line number Diff line number Diff line change
Expand Up @@ -1072,7 +1072,6 @@ static void parse_iso8601_timestamp(FieldParseContext *ctx)

date_only:

//Rprintf("date=%d\thour=%d\tz_hour=%d\tminute=%d\ttz_minute=%d\tsecond=%.1f\n", date, hour, tz_hour, minute, tz_minute, second);
// cast upfront needed to prevent silent overflow
*target = 86400*(double)date + 3600*(hour - tz_hour) + 60*(minute - tz_minute) + second;

Expand Down Expand Up @@ -1233,9 +1232,13 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped
}
}
ch = fieldStart;
if (autoDec && IS_DEC_TYPE(tmpType[field]) && dec == '.') { // . didn't parse a double; try ,
dec = ',';
continue;
if (autoDec && IS_DEC_TYPE(tmpType[field])) {
if (dec == '.') { // '.' didn't parse a double; try ','
dec = ',';
continue; // i.e., restart since tmpType not incremented
} else if (dec == ',') { // ',' didn't parse a double either; reset
dec = '\0';
}
}
while (++tmpType[field]<CT_STRING && disabled_parsers[tmpType[field]]) {};
*bumped = true;
Expand Down
2 changes: 1 addition & 1 deletion src/fread.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ typedef enum {
NUMTYPE // placeholder for the number of types including drop; used for allocation and loop bounds
} colType;

#define IS_DEC_TYPE(x) ((x) == CT_FLOAT64 || (x) == CT_FLOAT64_EXT) // types where dec matters
#define IS_DEC_TYPE(x) ((x) == CT_FLOAT64 || (x) == CT_FLOAT64_EXT || (x) == CT_ISO8601_TIME) // types where dec matters

extern int8_t typeSize[NUMTYPE];
extern const char typeName[NUMTYPE][10];
Expand Down

0 comments on commit 8f505d5

Please sign in to comment.