Skip to content

Commit

Permalink
[r] Backport sparse COO writer to 1.14 (#3217)
Browse files Browse the repository at this point in the history
  • Loading branch information
mojaveazure authored Oct 21, 2024
1 parent 85339bd commit b5e7e01
Show file tree
Hide file tree
Showing 12 changed files with 687 additions and 91 deletions.
11 changes: 11 additions & 0 deletions apis/r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,25 @@

S3method("[[",MappingBase)
S3method("[[<-",MappingBase)
S3method(.is_integerish,Array)
S3method(.is_integerish,ChunkedArray)
S3method(.is_integerish,DataType)
S3method(.is_integerish,Field)
S3method(.is_integerish,default)
S3method(.is_integerish,integer64)
S3method(.read_soma_joinids,SOMADataFrame)
S3method(.read_soma_joinids,SOMASparseNDArray)
S3method(as.list,CoordsStrider)
S3method(as.list,MappingBase)
S3method(as.logical,Scalar)
S3method(iterators::nextElem,CoordsStrider)
S3method(itertools::hasNext,CoordsStrider)
S3method(length,CoordsStrider)
S3method(length,MappingBase)
S3method(names,MappingBase)
S3method(r_type_from_arrow_type,DataType)
S3method(r_type_from_arrow_type,Field)
S3method(r_type_from_arrow_type,Schema)
S3method(write_soma,Assay)
S3method(write_soma,DataFrame)
S3method(write_soma,DimReduc)
Expand Down Expand Up @@ -84,6 +94,7 @@ export(has_metadata)
export(list_datasets)
export(load_dataset)
export(matrixZeroBasedView)
export(r_type_from_arrow_type)
export(set_log_level)
export(set_metadata)
export(show_package_versions)
Expand Down
7 changes: 7 additions & 0 deletions apis/r/NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# Unreleased

## Changes

* Add new Arrow-to-R type mapper
* Expose block/random writer for sparse arrays [#3204](https://github.com/single-cell-data/TileDB-SOMA/pull/3204)

# tiledbsoma 1.14.3

## Changes
Expand Down
151 changes: 101 additions & 50 deletions apis/r/R/SOMASparseNDArray.R
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,14 @@ SOMASparseNDArray <- R6::R6Class(
#' @param bbox A vector of integers describing the upper bounds of each
#' dimension of `values`. Generally should be `NULL`.
#'
#' @return Invisibly returns \code{self}
#'
write = function(values, bbox = NULL) {
stopifnot(
"'values' must be a matrix" = is_matrix(values),
"'bbox' must contain two entries" = is.null(bbox) || length(bbox) == length(dim(values)),
"'bbox' must be a vector of two integers or a list with each entry containg two integers" = is.null(bbox) ||
(is_integerish(bbox) || bit64::is.integer64(bbox)) ||
.is_integerish(bbox) ||
(is.list(bbox) && all(vapply_lgl(bbox, function(x, n) length(x) == 2L)))
)
# coerce to a TsparseMatrix, which uses 0-based COO indexing
Expand All @@ -81,6 +83,12 @@ SOMASparseNDArray <- R6::R6Class(
j = bit64::as.integer64(values@j),
x = values@x
)
if (!is.null(private$.type)) {
rt <- r_type_from_arrow_type(private$.type)
if (rt == 'integer' && rlang::is_integerish(coo$x)) {
coo$x <- as.integer(coo$x)
}
}
dnames <- self$dimnames()
colnames(coo) <- c(dnames, self$attrnames())
ranges <- sapply(
Expand Down Expand Up @@ -181,9 +189,9 @@ SOMASparseNDArray <- R6::R6Class(
self$tiledb_timestamp %||% "now"
)

private$.write_coo_dataframe(coo)
self$.write_coordinates(coo)

invisible(self)
return(invisible(self))
},

#' @description Retrieve number of non-zero elements (lifecycle: maturing)
Expand Down Expand Up @@ -224,6 +232,96 @@ SOMASparseNDArray <- R6::R6Class(
)
# Checking slotwise new shape >= old shape, and <= max_shape, is already done in libtiledbsoma
tiledbsoma_upgrade_shape(self$uri, shape, private$.soma_context)
},

#' @description Write a COO table to the array
#'
#' @param values A \code{data.frame} or \code{\link[arrow:Table]{Arrow::Table}}
#' with data in COO format; must be named with the dimension and attribute
#' labels of the array
#'
#' @return Invisibly returns \code{self}
#'
.write_coordinates = function(values) {
private$check_open_for_write()
dnames <- self$dimnames()
attrn <- self$attrnames()

stopifnot(
"'values' must be a data frame or Arrow Table" = is.data.frame(values) ||
inherits(values, what = 'Table'),
"'values' must have one column for each dimension and the data" = ncol(values) == length(dnames) + 1L,
"'values' must be named with the dimension and attribute labels" = is.null(names(values)) ||
identical(names(values), c(dnames, attrn))
)

# Arrow Tables cannot have NULL names, so this only applies to dataframes
if (is.null(names(values))) {
spdl::warn("[SOMASparseNDArray$.write_coordinates] no names on input data frame, assuming <dimensions[...], data> order")
names(values) <- c(dnames, attrn)
}

# Check dimensions
spdl::debug("[SOMASparseNDArray$.write_coordinates] checking dimension values")
for (i in seq_along(dnames)) {
dn <- dnames[i]
offending <- sprintf("(offending column: '%s')", dn)
if (!.is_integerish(values[[dn]])) {
stop("All dimension columns must be integerish ", offending)
}
if (as.logical(min(values[[dn]]) < 0L)) {
stop("Dimension columns cannot contain negative values ", offending)
}
if (as.logical(max(values[[dn]]) >= as.numeric(self$shape()[i]))) {
stop("Dimension columns cannot exceed the shape of the array ", offending)
}
}

# Check attribute
spdl::debug("[SOMASparseNDArray$.write_coordinates] checking data values")
if (is.null(private$.type)) {
tt <- self$schema()[attrn]$type
if (is.null(tt)) {
tt <- if (is.data.frame(values)) {
arrow::infer_type(values[[attrn]])
} else {
values[[attrn]]$type
}
}
private$.type <- tt
}
vt <- if (is.data.frame(values)) {
arrow::infer_type(values[[attrn]])
} else {
values[[attrn]]$type
}
if ((vrt <- r_type_from_arrow_type(vt)) != (rt <- r_type_from_arrow_type(private$.type))) {
stop("The data column must be of type '", rt, "', got '", vrt, "'")
}

# Build our Arrow table and schema
fields <- c(
lapply(dnames, arrow::field, type = arrow::int64()),
arrow::field(attrn, private$.type)
)
sch <- do.call(arrow::schema, fields)
tbl <- arrow::as_arrow_table(values, schema = sch)

# Write via libtiledbsoma
spdl::debug("[SOMASparseNDArray$.write_coordinates] writing arrow table")
naap <- nanoarrow::nanoarrow_allocate_array()
nasp <- nanoarrow::nanoarrow_allocate_schema()
arrow::as_record_batch(tbl)$export_to_c(naap, nasp)
writeArrayFromArrow(
uri = self$uri,
naap = naap,
nasp = nasp,
ctxxp = private$.soma_context,
arraytype = "SOMASparseNDArray",
config = NULL,
tsvec = self$.tiledb_timestamp_range
)
return(invisible(self))
}

),
Expand Down Expand Up @@ -252,53 +350,6 @@ SOMASparseNDArray <- R6::R6Class(
out
},

# @description Ingest COO-formatted dataframe into the TileDB array.
# (lifecycle: maturing)
# @param values A [`data.frame`].
.write_coo_dataframe = function(values) {
private$check_open_for_write()

stopifnot(is.data.frame(values))
# private$log_array_ingestion()
#arr <- self$object
#if (!is.null(self$tiledb_timestamp)) {
# # arr@timestamp <- self$tiledb_timestamp
# arr@timestamp_end <- self$tiledb_timestamp
#}
nms <- colnames(values)

## the 'soma_data' data type may not have been cached, and if so we need to fetch it
if (is.null(private$.type)) {
## TODO: replace with a libtiledbsoma accessor as discussed
tpstr <- tiledb::datatype(tiledb::attrs(tiledb::schema(self$uri))[["soma_data"]])
arstr <- arrow_type_from_tiledb_type(tpstr)
private$.type <- arstr
}

arrsch <- arrow::schema(arrow::field(nms[1], arrow::int64()),
arrow::field(nms[2], arrow::int64()),
arrow::field(nms[3], private$.type))

tbl <- arrow::arrow_table(values, schema = arrsch)
spdl::debug(
"[SOMASparseNDArray$write] array created, writing to {} at ({})",
self$uri,
self$tiledb_timestamp %||% "now"
)
naap <- nanoarrow::nanoarrow_allocate_array()
nasp <- nanoarrow::nanoarrow_allocate_schema()
arrow::as_record_batch(tbl)$export_to_c(naap, nasp)
writeArrayFromArrow(
uri = self$uri,
naap = naap,
nasp = nasp,
ctxxp = private$.soma_context,
arraytype = "SOMASparseNDArray",
config = NULL,
tsvec = self$.tiledb_timestamp_range
)
},

# Internal marking of one or zero based matrices for iterated reads
zero_based = NA

Expand Down
94 changes: 92 additions & 2 deletions apis/r/R/utils-arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ is_arrow_dictionary <- function(x) {
is_arrow_object(x) && inherits(x, "Field") && inherits(x$type, "DictionaryType")
}

#' @method as.logical Scalar
#' @export
#'
as.logical.Scalar <- \(x, ...) as.logical(x$as_vector(), ...)

#' Convert Arrow types to supported TileDB type
#' List of TileDB types supported in R: https://github.com/TileDB-Inc/TileDB-R/blob/8014da156b5fee5b4cc221d57b4aa7d388abc968/inst/tinytest/test_dim.R#L97-L121
#' Note: TileDB attrs may be UTF-8; TileDB dims may not.
Expand Down Expand Up @@ -114,6 +119,92 @@ arrow_type_from_tiledb_type <- function(x) {
)
}

#' Get the \R Type from an Arrow Type
#'
#' Get an \R \link[base:typeof]{type} from an Arrow type. This function is
#' equivalent to \code{\link[base]{typeof}()} rather than
#' \code{\link[base]{mode}()} or \code{\link[base]{class}()}, and returns the
#' equivalent \strong{type}. For example, the equivalent \R type to an Arrow
#' \link[arrow]{dictionary} is \dQuote{\code{integer}}, not
#' \dQuote{\code{factor}}; likewise, the equivalent \R type to an Arrow 64-bit
#' integer is \dQuote{\code{double}}
#'
#' @param x An \CRANpkg{Arrow} \link[arrow:Schema]{schema},
#' \link[arrow:Field]{field}, or \link[arrow:infer_type]{data type}
#'
#' @return If \code{x} is a \link[arrow:infer_type]{data type}, a single
#' character value giving the \R \link[base:typeof]{type} of \code{x}; if no
#' corresponding \R type, returns the \CRANpkg{Arrow} type name
#'
#' @return If \code{x} is a \link[arrow:Field]{field}, a single named character
#' vector with the name being the field name and the value being the \R
#' \link[base:typeof]{type}
#'
#' @return If \code{x} is a \link[arrow:Schema]{schema}, a named vector where
#' the names are field names and the values are the \R \link[base:typeof]{types}
#' of each field
#'
#' @keywords internal
#'
#' @export
#'
#' @seealso \code{\link[base]{typeof}()}
#'
r_type_from_arrow_type <- function(x) UseMethod('r_type_from_arrow_type')

#' @rdname r_type_from_arrow_type
#'
#' @method r_type_from_arrow_type Schema
#' @export
#'
r_type_from_arrow_type.Schema <- function(x) {
return(vapply(
X = x$names,
FUN = function(f) r_type_from_arrow_type(x[[f]]),
FUN.VALUE = character(1L),
USE.NAMES = TRUE
))
}

#' @rdname r_type_from_arrow_type
#'
#' @method r_type_from_arrow_type Field
#' @export
#'
r_type_from_arrow_type.Field <- function(x) {
tt <- r_type_from_arrow_type(x$type)
names(x = tt) <- x$name
return(tt)
}

#' @rdname r_type_from_arrow_type
#'
#' @method r_type_from_arrow_type DataType
#' @export
#'
r_type_from_arrow_type.DataType <- function(x) {
# Types are equivalent to `typeof()`, not `mode()` or `class()`
return(switch(
EXPR = x$name,
int8 = ,
int16 = ,
int32 = ,
dictionary = ,
uint8 = ,
uint16 = ,
uint32 = 'integer',
int64 = ,
uint64 = ,
date32 = ,
timestamp = ,
float = 'double',
bool = 'logical',
utf8 = ,
large_utf8 = 'character',
x$name
))
}

#' Retrieve limits for Arrow types
#' @importFrom bit64 lim.integer64
#' @noRd
Expand Down Expand Up @@ -174,10 +265,9 @@ arrow_field_from_tiledb_dim <- function(x) {
## With a nod to Kevin Ushey
#' @noRd
yoink <- function(package, symbol) {
do.call(":::", list(package, symbol))
do.call(":::", list(package, symbol))
}


#' Create an Arrow field from a TileDB attribute
#' @noRd
arrow_field_from_tiledb_attr <- function(x, arrptr=NULL) {
Expand Down
Loading

0 comments on commit b5e7e01

Please sign in to comment.