[r] Backport sparse COO writer to 1.14 (#3217)

single-cell-data · Oct 21, 2024 · b5e7e01 · b5e7e01
1 parent 85339bd
commit b5e7e01
Show file tree

Hide file tree

Showing 12 changed files with 687 additions and 91 deletions.
diff --git a/apis/r/NAMESPACE b/apis/r/NAMESPACE
@@ -2,15 +2,25 @@
 
 S3method("[[",MappingBase)
 S3method("[[<-",MappingBase)
+S3method(.is_integerish,Array)
+S3method(.is_integerish,ChunkedArray)
+S3method(.is_integerish,DataType)
+S3method(.is_integerish,Field)
+S3method(.is_integerish,default)
+S3method(.is_integerish,integer64)
 S3method(.read_soma_joinids,SOMADataFrame)
 S3method(.read_soma_joinids,SOMASparseNDArray)
 S3method(as.list,CoordsStrider)
 S3method(as.list,MappingBase)
+S3method(as.logical,Scalar)
 S3method(iterators::nextElem,CoordsStrider)
 S3method(itertools::hasNext,CoordsStrider)
 S3method(length,CoordsStrider)
 S3method(length,MappingBase)
 S3method(names,MappingBase)
+S3method(r_type_from_arrow_type,DataType)
+S3method(r_type_from_arrow_type,Field)
+S3method(r_type_from_arrow_type,Schema)
 S3method(write_soma,Assay)
 S3method(write_soma,DataFrame)
 S3method(write_soma,DimReduc)
@@ -84,6 +94,7 @@ export(has_metadata)
 export(list_datasets)
 export(load_dataset)
 export(matrixZeroBasedView)
+export(r_type_from_arrow_type)
 export(set_log_level)
 export(set_metadata)
 export(show_package_versions)

diff --git a/apis/r/NEWS.md b/apis/r/NEWS.md
@@ -1,3 +1,10 @@
+# Unreleased
+
+## Changes
+
+* Add new Arrow-to-R type mapper
+* Expose block/random writer for sparse arrays [#3204](https://github.com/single-cell-data/TileDB-SOMA/pull/3204)
+
 # tiledbsoma 1.14.3
 
 ## Changes

diff --git a/apis/r/R/SOMASparseNDArray.R b/apis/r/R/SOMASparseNDArray.R
@@ -66,12 +66,14 @@ SOMASparseNDArray <- R6::R6Class(
     #' @param bbox A vector of integers describing the upper bounds of each
     #' dimension of `values`. Generally should be `NULL`.
     #'
+    #' @return Invisibly returns \code{self}
+    #'
     write = function(values, bbox = NULL) {
       stopifnot(
         "'values' must be a matrix" = is_matrix(values),
         "'bbox' must contain two entries" = is.null(bbox) || length(bbox) == length(dim(values)),
         "'bbox' must be a vector of two integers or a list with each entry containg two integers" = is.null(bbox) ||
-          (is_integerish(bbox) || bit64::is.integer64(bbox)) ||
+          .is_integerish(bbox) ||
           (is.list(bbox) && all(vapply_lgl(bbox, function(x, n) length(x) == 2L)))
       )
       # coerce to a TsparseMatrix, which uses 0-based COO indexing
@@ -81,6 +83,12 @@ SOMASparseNDArray <- R6::R6Class(
         j = bit64::as.integer64(values@j),
         x = values@x
       )
+      if (!is.null(private$.type)) {
+        rt <- r_type_from_arrow_type(private$.type)
+        if (rt == 'integer' && rlang::is_integerish(coo$x)) {
+          coo$x <- as.integer(coo$x)
+        }
+      }
       dnames <- self$dimnames()
       colnames(coo) <- c(dnames, self$attrnames())
       ranges <- sapply(
@@ -181,9 +189,9 @@ SOMASparseNDArray <- R6::R6Class(
         self$tiledb_timestamp %||% "now"
       )
 
-      private$.write_coo_dataframe(coo)
+      self$.write_coordinates(coo)
 
-      invisible(self)
+      return(invisible(self))
     },
 
     #' @description Retrieve number of non-zero elements (lifecycle: maturing)
@@ -224,6 +232,96 @@ SOMASparseNDArray <- R6::R6Class(
       )
       # Checking slotwise new shape >= old shape, and <= max_shape, is already done in libtiledbsoma
       tiledbsoma_upgrade_shape(self$uri, shape, private$.soma_context)
+    },
+
+    #' @description Write a COO table to the array
+    #'
+    #' @param values A \code{data.frame} or \code{\link[arrow:Table]{Arrow::Table}}
+    #' with data in COO format; must be named with the dimension and attribute
+    #' labels of the array
+    #'
+    #' @return Invisibly returns \code{self}
+    #'
+    .write_coordinates = function(values) {
+      private$check_open_for_write()
+      dnames <- self$dimnames()
+      attrn <- self$attrnames()
+
+      stopifnot(
+        "'values' must be a data frame or Arrow Table" = is.data.frame(values) ||
+          inherits(values, what = 'Table'),
+        "'values' must have one column for each dimension and the data" = ncol(values) == length(dnames) + 1L,
+        "'values' must be named with the dimension and attribute labels" = is.null(names(values)) ||
+          identical(names(values), c(dnames, attrn))
+      )
+
+      # Arrow Tables cannot have NULL names, so this only applies to dataframes
+      if (is.null(names(values))) {
+        spdl::warn("[SOMASparseNDArray$.write_coordinates] no names on input data frame, assuming <dimensions[...], data> order")
+        names(values) <- c(dnames, attrn)
+      }
+
+      # Check dimensions
+      spdl::debug("[SOMASparseNDArray$.write_coordinates] checking dimension values")
+      for (i in seq_along(dnames)) {
+        dn <- dnames[i]
+        offending <- sprintf("(offending column: '%s')", dn)
+        if (!.is_integerish(values[[dn]])) {
+          stop("All dimension columns must be integerish ", offending)
+        }
+        if (as.logical(min(values[[dn]]) < 0L)) {
+          stop("Dimension columns cannot contain negative values ", offending)
+        }
+        if (as.logical(max(values[[dn]]) >= as.numeric(self$shape()[i]))) {
+          stop("Dimension columns cannot exceed the shape of the array ", offending)
+        }
+      }
+
+      # Check attribute
+      spdl::debug("[SOMASparseNDArray$.write_coordinates] checking data values")
+      if (is.null(private$.type)) {
+        tt <- self$schema()[attrn]$type
+        if (is.null(tt)) {
+          tt <- if (is.data.frame(values)) {
+            arrow::infer_type(values[[attrn]])
+          } else {
+            values[[attrn]]$type
+          }
+        }
+        private$.type <- tt
+      }
+      vt <- if (is.data.frame(values)) {
+        arrow::infer_type(values[[attrn]])
+      } else {
+        values[[attrn]]$type
+      }
+      if ((vrt <- r_type_from_arrow_type(vt)) != (rt <- r_type_from_arrow_type(private$.type))) {
+        stop("The data column must be of type '", rt, "', got '", vrt, "'")
+      }
+
+      # Build our Arrow table and schema
+      fields <- c(
+        lapply(dnames, arrow::field, type = arrow::int64()),
+        arrow::field(attrn, private$.type)
+      )
+      sch <- do.call(arrow::schema, fields)
+      tbl <- arrow::as_arrow_table(values, schema = sch)
+
+      # Write via libtiledbsoma
+      spdl::debug("[SOMASparseNDArray$.write_coordinates] writing arrow table")
+      naap <- nanoarrow::nanoarrow_allocate_array()
+      nasp <- nanoarrow::nanoarrow_allocate_schema()
+      arrow::as_record_batch(tbl)$export_to_c(naap, nasp)
+      writeArrayFromArrow(
+        uri = self$uri,
+        naap = naap,
+        nasp = nasp,
+        ctxxp = private$.soma_context,
+        arraytype = "SOMASparseNDArray",
+        config = NULL,
+        tsvec = self$.tiledb_timestamp_range
+      )
+      return(invisible(self))
     }
 
   ),
@@ -252,53 +350,6 @@ SOMASparseNDArray <- R6::R6Class(
       out
     },
 
-    # @description Ingest COO-formatted dataframe into the TileDB array.
-    # (lifecycle: maturing)
-    # @param values A [`data.frame`].
-    .write_coo_dataframe = function(values) {
-      private$check_open_for_write()
-
-      stopifnot(is.data.frame(values))
-      # private$log_array_ingestion()
-      #arr <- self$object
-      #if (!is.null(self$tiledb_timestamp)) {
-      #    # arr@timestamp <- self$tiledb_timestamp
-      #   arr@timestamp_end <- self$tiledb_timestamp
-      #}
-      nms <- colnames(values)
-
-      ## the 'soma_data' data type may not have been cached, and if so we need to fetch it
-      if (is.null(private$.type)) {
-          ## TODO: replace with a libtiledbsoma accessor as discussed
-          tpstr <- tiledb::datatype(tiledb::attrs(tiledb::schema(self$uri))[["soma_data"]])
-          arstr <- arrow_type_from_tiledb_type(tpstr)
-          private$.type <- arstr
-      }
-
-      arrsch <- arrow::schema(arrow::field(nms[1], arrow::int64()),
-                              arrow::field(nms[2], arrow::int64()),
-                              arrow::field(nms[3], private$.type))
-
-      tbl <- arrow::arrow_table(values, schema = arrsch)
-      spdl::debug(
-        "[SOMASparseNDArray$write] array created, writing to {} at ({})",
-        self$uri,
-        self$tiledb_timestamp %||% "now"
-      )
-      naap <- nanoarrow::nanoarrow_allocate_array()
-      nasp <- nanoarrow::nanoarrow_allocate_schema()
-      arrow::as_record_batch(tbl)$export_to_c(naap, nasp)
-      writeArrayFromArrow(
-        uri = self$uri,
-        naap = naap,
-        nasp = nasp,
-        ctxxp = private$.soma_context,
-        arraytype = "SOMASparseNDArray",
-        config = NULL,
-        tsvec = self$.tiledb_timestamp_range
-      )
-    },
-
     # Internal marking of one or zero based matrices for iterated reads
     zero_based = NA
 

diff --git a/apis/r/R/utils-arrow.R b/apis/r/R/utils-arrow.R
@@ -34,6 +34,11 @@ is_arrow_dictionary <- function(x) {
   is_arrow_object(x) && inherits(x, "Field") && inherits(x$type, "DictionaryType")
 }
 
+#' @method as.logical Scalar
+#' @export
+#'
+as.logical.Scalar <- \(x, ...) as.logical(x$as_vector(), ...)
+
 #' Convert Arrow types to supported TileDB type
 #' List of TileDB types supported in R: https://github.com/TileDB-Inc/TileDB-R/blob/8014da156b5fee5b4cc221d57b4aa7d388abc968/inst/tinytest/test_dim.R#L97-L121
 #' Note: TileDB attrs may be UTF-8; TileDB dims may not.
@@ -114,6 +119,92 @@ arrow_type_from_tiledb_type <- function(x) {
   )
 }
 
+#' Get the \R Type from an Arrow Type
+#'
+#' Get an \R \link[base:typeof]{type} from an Arrow type. This function is
+#' equivalent to \code{\link[base]{typeof}()} rather than
+#' \code{\link[base]{mode}()} or \code{\link[base]{class}()}, and returns the
+#' equivalent \strong{type}. For example, the equivalent \R type to an Arrow
+#' \link[arrow]{dictionary} is \dQuote{\code{integer}}, not
+#' \dQuote{\code{factor}}; likewise, the equivalent \R type to an Arrow 64-bit
+#' integer is \dQuote{\code{double}}
+#'
+#' @param x An \CRANpkg{Arrow} \link[arrow:Schema]{schema},
+#' \link[arrow:Field]{field}, or \link[arrow:infer_type]{data type}
+#'
+#' @return If \code{x} is a \link[arrow:infer_type]{data type}, a single
+#' character value giving the \R \link[base:typeof]{type} of \code{x}; if no
+#' corresponding \R type, returns the \CRANpkg{Arrow} type name
+#'
+#' @return If \code{x} is a \link[arrow:Field]{field}, a single named character
+#' vector with the name being the field name and the value being the \R
+#' \link[base:typeof]{type}
+#'
+#' @return If \code{x} is a \link[arrow:Schema]{schema}, a named vector where
+#' the names are field names and the values are the \R \link[base:typeof]{types}
+#' of each field
+#'
+#' @keywords internal
+#'
+#' @export
+#'
+#' @seealso \code{\link[base]{typeof}()}
+#'
+r_type_from_arrow_type <- function(x) UseMethod('r_type_from_arrow_type')
+
+#' @rdname r_type_from_arrow_type
+#'
+#' @method r_type_from_arrow_type Schema
+#' @export
+#'
+r_type_from_arrow_type.Schema <- function(x) {
+  return(vapply(
+    X = x$names,
+    FUN = function(f) r_type_from_arrow_type(x[[f]]),
+    FUN.VALUE = character(1L),
+    USE.NAMES = TRUE
+  ))
+}
+
+#' @rdname r_type_from_arrow_type
+#'
+#' @method r_type_from_arrow_type Field
+#' @export
+#'
+r_type_from_arrow_type.Field <- function(x) {
+  tt <- r_type_from_arrow_type(x$type)
+  names(x = tt) <- x$name
+  return(tt)
+}
+
+#' @rdname r_type_from_arrow_type
+#'
+#' @method r_type_from_arrow_type DataType
+#' @export
+#'
+r_type_from_arrow_type.DataType <- function(x) {
+  # Types are equivalent to `typeof()`, not `mode()` or `class()`
+  return(switch(
+    EXPR = x$name,
+    int8 = ,
+    int16 = ,
+    int32 = ,
+    dictionary = ,
+    uint8 = ,
+    uint16 = ,
+    uint32 = 'integer',
+    int64 = ,
+    uint64 = ,
+    date32 = ,
+    timestamp = ,
+    float = 'double',
+    bool = 'logical',
+    utf8 = ,
+    large_utf8 = 'character',
+    x$name
+  ))
+}
+
 #' Retrieve limits for Arrow types
 #' @importFrom bit64 lim.integer64
 #' @noRd
@@ -174,10 +265,9 @@ arrow_field_from_tiledb_dim <- function(x) {
 ## With a nod to Kevin Ushey
 #' @noRd
 yoink <- function(package, symbol) {
-    do.call(":::", list(package, symbol))
+  do.call(":::", list(package, symbol))
 }
 
-
 #' Create an Arrow field from a TileDB attribute
 #' @noRd
 arrow_field_from_tiledb_attr <- function(x, arrptr=NULL) {