Fix naming in j=c() under by= queries with lapply() optimization (#4883)

* added failing tests * successfully fixed 2164.1 but broke others. also, havent fixed named list yet * fixed random broken tests. only the c(a=list(),lapply(.SD,FUN)) situation left * More tests. Some still failing * c(A=list()) shouldn't get a number prefixed * fixed previously failing test. the issue was that I had 2164.1 and 2164.10. oops. also added more tests. all passing now * add to author list * added github link to issue * code style changes * reworked tests. now includes tests that fail only when optimize=0 * fix breaking tests related to inconsistent naming of blank columnames * added more passing tests * cleaned up news item * added another passing test ensuring dt[, c(x[1], list(.),] gets a variable name for the column created by x[1]. previously this could be an empty string column name in some circumstances * fix code style * moved news item to the correct release * fixup * modernize: options= in test() * fix test numbers * copy-edit NEWS * redundant test objects * test in a loop, also test opt=2 * consistent GH name, reduce diff * same * oops, undo that one. Prefer leaving released NEWS alone * Style touch-up, refer to tests over long comment * Extra ')' in DESCRIPTION * nzchar again * Bring test code & result closer together for easier reading * bad copy-paste: optimize=opt * more test style --------- Co-authored-by: Michael Young <[email protected]> Co-authored-by: Michael Chirico <[email protected]>
Rdatatable · Sep 6, 2024 · cf05b4a · cf05b4a
1 parent b566822
commit cf05b4a
Show file tree

Hide file tree

Showing 4 changed files with 116 additions and 4 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -89,5 +89,6 @@ Authors@R: c(
   person("Iago", "Giné-Vázquez",   role="ctb"),
   person("Anirban", "Chetia",      role="ctb"),
   person("Doris", "Amoakohene",    role="ctb"),
-  person("Ivan", "Krylov",         role="ctb")
+  person("Ivan", "Krylov",         role="ctb"),
+  person("Michael","Young",        role="ctb")
   )
diff --git a/NEWS.md b/NEWS.md
@@ -33,6 +33,26 @@ rowwiseDT(
 
 3. The data.table-only attribute `$.internal.selfref` is no longer set for data.frames. [#5286](https://github.com/Rdatatable/data.table/issues/5286). Thanks @OfekShilon for the report and fix.
 
+4. Tagging/naming arguments of `c()` in `j=c()` should now more closely follow base R conventions for concatenation of named lists during grouping, [#2311](https://github.com/Rdatatable/data.table/issues/2311). Naming an `lapply(.SD, FUN)` call as an argument of `c()` in `j` will now always cause that tag to get prepended (with a single dot separator) to the resulting column names. Additionally, naming a `list()` call as an argument of `c()` in `j` will now always cause that tag to get prepended to any names specified within the list call. This bug only affected queries with (1) `by=` grouping (2) `getOption("datatable.optimize") >= 1L` and (3) `lapply(.SD, FUN)` in `j`.
+
+    While the names returned by `data.table` when `j=c()` will now mostly follow base R conventions for concatenating lists, note that names which are completely unspecified will still be named positionally, matching the typical behavior in `j` and `data.table()`. according to position in `j` (e.g. `V1`, `V2`).
+
+    Thanks to @franknarf1 for reporting and @myoung3 for the PR.
+
+    ```r
+    # tag 'mean' prepended to lapply()-named columns
+    names(mtcars[, c(mean=lapply(.SD,sum)), by="cyl", .SDcols=c("am", "carb")])
+    # [1] "cyl" "mean.am" "mean.carb"
+
+    # tag 'mean' is prepended to the first named sublist, 'sum' to the second
+    names(mtcars[, c(mean=list(a=mean(hp), b=mean(wt)), sum=lapply(.SD, sum)), by="cyl", .SDcols=c("am", "carb")])
+    # [1] "cyl" "mean.a" "mean.b" "sum.am" "sum.carb"
+
+    # strict base naming would result in names c("", "b", "c") here
+    names(mtcars[, c(list(mean(hp), b=mean(wt)), c=list(mean(cyl)))])
+    # [1] "V1" "b" "c"
+    ```
+
 ## NOTES
 
 1. Tests run again when some Suggests packages are missing, [#6411](https://github.com/Rdatatable/data.table/issues/6411). Thanks @aadler for the note and @MichaelChirico for the fix.

diff --git a/R/data.table.R b/R/data.table.R
@@ -1697,12 +1697,30 @@ replace_dot_alias = function(e) {
               deparse_ans = .massageSD(this)
               funi = funi + 1L # Fix for #985
               jsubl[[i_]] = as.list(deparse_ans[[1L]][-1L]) # just keep the '.' from list(.)
-              jvnames = c(jvnames, deparse_ans[[2L]])
+              jn__ = deparse_ans[[2L]]
+              if (isTRUE(nzchar(names(jsubl)[i_]))) {
+                # Fix for #2311, prepend named arguments of c() to column names of .SD
+                #   e.g. c(mean=lapply(.SD, mean))
+                jn__ = paste(names(jsubl)[i_], jn__, sep=".") # sep="." for consistency with c(A=list(a=1,b=1))
+              }
+              jvnames = c(jvnames, jn__)
             } else if (this[[1L]] == "list") {
               # also handle c(lapply(.SD, sum), list()) - silly, yes, but can happen
               if (length(this) > 1L) {
                 jl__ = as.list(jsubl[[i_]])[-1L] # just keep the '.' from list(.)
-                jn__ = if (is.null(names(jl__))) rep("", length(jl__)) else names(jl__)
+                if (isTRUE(nzchar(names(jsubl)[i_]))) {
+                  # Fix for #2311, prepend named list arguments of c() to that list's names. See tests 2283.*
+                  njl__ = if (is.null(names(jl__))) rep("", length(jl__)) else names(jl__)
+                  njl__nonblank = nzchar(names(jl__))
+                  if (length(jl__) > 1L) {
+                    jn__ = paste0(names(jsubl)[i_], seq_along(jl__))
+                  } else {
+                    jn__ = names(jsubl)[i_]
+                  }
+                  jn__[njl__nonblank] = paste(names(jsubl)[i_], njl__[njl__nonblank], sep=".")
+                } else {
+                  jn__ = if (is.null(names(jl__))) rep("", length(jl__)) else names(jl__)
+                }
                 idx  = unlist(lapply(jl__, function(x) is.name(x) && x == ".I"))
                 if (any(idx))
                   jn__[idx & !nzchar(jn__)] = "I"  # this & is correct not &&
@@ -1967,7 +1985,10 @@ replace_dot_alias = function(e) {
     if (any(ww)) jvnames[ww] = paste0("V",ww)
     setattr(ans, "names", c(bynames, jvnames))
   } else {
-    setnames(ans,seq_along(bynames),bynames)   # TO DO: reinvestigate bynames flowing from dogroups here and simplify
+    nonbynames = names(ans)[-seq_along(bynames)] #related to 2311. make naming of empty columns names more consistent
+    ww = which(!nzchar(nonbynames))
+    if (length(ww)) nonbynames[ww] = paste0("V", ww)
+    setattr(ans, "names", c(bynames, nonbynames))   # TO DO: reinvestigate bynames flowing from dogroups here and simplify
   }
   if (byjoin && keyby && !bysameorder) {
     if (verbose) {last.started.at=proc.time();catf("setkey() afterwards for keyby=.EACHI ... ");flush.console()}

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -19085,3 +19085,73 @@ test(2282.08, rowwiseDT(A=,B=,1,2,C=,4), error="Header must be the first N argum
 # evaluate arguments in the correct frame
 ncols = 1e6
 test(2282.09, rowwiseDT(A=,ncols), data.table(A=ncols))
+
+# named arguments of c() in j get prepended to lapply(.SD, FUN) #2311
+
+M <- as.data.table(mtcars)
+M[, " " := hp]
+M[, "." := hp]
+
+sdnames <- setdiff(names(M), "cyl")
+sdlist <- vector("list", length(sdnames))
+names(sdlist) <- sdnames
+
+for (opt in c(0, 1, 2)) {
+  test(2283 + opt/10 + 0.001, options=c(datatable.optimize=opt),
+       names(M[, c(m=lapply(.SD, mean)), by="cyl"]),
+       c("cyl", names(c(m=sdlist))))
+  test(2283 + opt/10 + 0.002, options=c(datatable.optimize=opt),
+       names(M[, c(Mpg=list(mpg), lapply(.SD, mean)), by="cyl"]),
+       c("cyl", "Mpg", sdnames))
+  test(2283 + opt/10 + 0.003, options=c(datatable.optimize=opt),
+       names(M[, c(Mpg=list(mpg), m=lapply(.SD, mean)), by="cyl"]),
+       c("cyl", "Mpg", names(c(m=sdlist))))
+  test(2283 + opt/10 + 0.004, options=c(datatable.optimize=opt),
+       names(M[, c(mpg=list(mpg), mpg=lapply(.SD, mean)), by="cyl"]),
+       c("cyl", "mpg", names(c(mpg=sdlist))))
+  test(2283 + opt/10 + 0.005, options=c(datatable.optimize=opt),
+       names(M[, c(list(mpg), lapply(.SD, mean)), by="cyl"]),
+       c("cyl", "V1", sdnames))
+  test(2283 + opt/10 + 0.006, options=c(datatable.optimize=opt),
+       names(M[, c(lapply(.SD, mean), list(mpg)), by="cyl"]),
+       c("cyl", sdnames, sprintf("V%d", length(sdnames)+1L)))
+  test(2283 + opt/10 + 0.007, options=c(datatable.optimize=opt),
+       names(M[, c(lapply(.SD, mean), lapply(.SD, sum)), by="cyl"]),
+       c("cyl", sdnames, sdnames))
+  test(2283 + opt/10 + 0.008, options=c(datatable.optimize=opt),
+       names(M[, c(mean=lapply(.SD, mean), sum=lapply(.SD,  sum)), by="cyl"]),
+       c("cyl", names(c(mean=sdlist, sum=sdlist))))
+  test(2283 + opt/10 + 0.009, options=c(datatable.optimize=opt),
+       names(M[, c(lapply(.SD, mean), sum=lapply(.SD,  sum)), by="cyl"]),
+       c("cyl", sdnames, names(c(sum=sdlist))) )
+  test(2283 + opt/10 + 0.010, options=c(datatable.optimize=opt),
+       names(M[, c(" "=lapply(.SD, mean), "."=lapply(.SD,  sum)), by="cyl"]),
+       c("cyl", names(c(" "=sdlist, "."=sdlist))))
+  test(2283 + opt/10 + 0.011, options=c(datatable.optimize=opt),
+       names(M[, c(A=list(a=mpg, b=hp), lapply(.SD, mean)), by="cyl"]),
+       c("cyl", names(c(A=list(a=0, b=0))), sdnames))
+  test(2283 + opt/10 + 0.012, options=c(datatable.optimize=opt),
+       names(M[, c(A=list(mpg, hp), lapply(.SD, mean)), by="cyl"]),
+       c("cyl", names(c(A=list(0, 0))), sdnames))
+  test(2283 + opt/10 + 0.013, options=c(datatable.optimize=opt),
+       names(M[, c(A=list(mpg, b=hp, wt), lapply(.SD, mean)), by="cyl"]),
+       c("cyl", names(c(A=list(0, b=0, 0))), sdnames))
+  test(2283 + opt/10 + 0.014, options=c(datatable.optimize=opt),
+       names(M[, c(A=list(mpg), lapply(.SD, mean)), by="cyl"]),
+       c("cyl", names(c(A=list(0))), sdnames))
+  test(2283 + opt/10 + 0.015, options=c(datatable.optimize=opt),
+       names(M[, c(" "=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]),
+       c("cyl", names(c(" "=list(" "=0, "."=0, 0))), sdnames))
+  test(2283 + opt/10 + 0.016, options=c(datatable.optimize=opt),
+       names(M[, c("."=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]),
+       c("cyl", names(c("."=list(" "=0, "."=0, 0))), sdnames))
+  test(2283 + opt/10 + 0.017, options=c(datatable.optimize=opt),
+       names(M[, c(list(mpg, b=hp), lapply(.SD, mean)), by="cyl", .SDcols=c("vs", "am")]),
+       c("cyl", "V1", "b", "vs", "am"))
+  test(2283 + opt/10 + 0.018, options=c(datatable.optimize=opt),
+       names(M[, c(list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]),
+       c("cyl", "V1", "b", "vs", "am"))
+  test(2283 + opt/10 + 0.019, options=c(datatable.optimize=opt),
+       names(M[, c(mpg[1], list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]),
+       c("cyl", "V1", "V2", "b", "vs", "am"))
+}