From b5e7e017035e840e2a61317e7563b7c6cdae0117 Mon Sep 17 00:00:00 2001 From: Paul Hoffman Date: Mon, 21 Oct 2024 17:13:41 -0400 Subject: [PATCH] [r] Backport sparse COO writer to 1.14 (#3217) --- apis/r/NAMESPACE | 11 ++ apis/r/NEWS.md | 7 + apis/r/R/SOMASparseNDArray.R | 151 ++++++++++++------ apis/r/R/utils-arrow.R | 94 ++++++++++- apis/r/R/utils.R | 76 +++++++++ apis/r/man/SOMADenseNDArray.Rd | 2 + apis/r/man/SOMANDArrayBase.Rd | 48 ++++++ apis/r/man/SOMASparseNDArray.Rd | 50 ++---- apis/r/man/r_type_from_arrow_type.Rd | 47 ++++++ .../r/tests/testthat/test-SOMASparseNDArray.R | 104 +++++++++++- apis/r/tests/testthat/test-r-arrow-types.R | 93 +++++++++++ apis/r/tests/testthat/test-utils.R | 95 ++++++++++- 12 files changed, 687 insertions(+), 91 deletions(-) create mode 100644 apis/r/man/r_type_from_arrow_type.Rd create mode 100644 apis/r/tests/testthat/test-r-arrow-types.R diff --git a/apis/r/NAMESPACE b/apis/r/NAMESPACE index 522816de8a..83db784869 100644 --- a/apis/r/NAMESPACE +++ b/apis/r/NAMESPACE @@ -2,15 +2,25 @@ S3method("[[",MappingBase) S3method("[[<-",MappingBase) +S3method(.is_integerish,Array) +S3method(.is_integerish,ChunkedArray) +S3method(.is_integerish,DataType) +S3method(.is_integerish,Field) +S3method(.is_integerish,default) +S3method(.is_integerish,integer64) S3method(.read_soma_joinids,SOMADataFrame) S3method(.read_soma_joinids,SOMASparseNDArray) S3method(as.list,CoordsStrider) S3method(as.list,MappingBase) +S3method(as.logical,Scalar) S3method(iterators::nextElem,CoordsStrider) S3method(itertools::hasNext,CoordsStrider) S3method(length,CoordsStrider) S3method(length,MappingBase) S3method(names,MappingBase) +S3method(r_type_from_arrow_type,DataType) +S3method(r_type_from_arrow_type,Field) +S3method(r_type_from_arrow_type,Schema) S3method(write_soma,Assay) S3method(write_soma,DataFrame) S3method(write_soma,DimReduc) @@ -84,6 +94,7 @@ export(has_metadata) export(list_datasets) export(load_dataset) export(matrixZeroBasedView) +export(r_type_from_arrow_type) export(set_log_level) export(set_metadata) export(show_package_versions) diff --git a/apis/r/NEWS.md b/apis/r/NEWS.md index 1067665c76..d197a9c477 100644 --- a/apis/r/NEWS.md +++ b/apis/r/NEWS.md @@ -1,3 +1,10 @@ +# Unreleased + +## Changes + +* Add new Arrow-to-R type mapper +* Expose block/random writer for sparse arrays [#3204](https://github.com/single-cell-data/TileDB-SOMA/pull/3204) + # tiledbsoma 1.14.3 ## Changes diff --git a/apis/r/R/SOMASparseNDArray.R b/apis/r/R/SOMASparseNDArray.R index 4bbeb22b92..251c9cca91 100644 --- a/apis/r/R/SOMASparseNDArray.R +++ b/apis/r/R/SOMASparseNDArray.R @@ -66,12 +66,14 @@ SOMASparseNDArray <- R6::R6Class( #' @param bbox A vector of integers describing the upper bounds of each #' dimension of `values`. Generally should be `NULL`. 
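#'
#' A minimal usage sketch (illustrative values; assumes the array was
#' created with \code{SOMASparseNDArrayCreate()} and is still open for
#' writing):
#'
#'     arr <- SOMASparseNDArrayCreate(tempfile(), arrow::float64(), shape = c(100L, 100L))
#'     mat <- Matrix::rsparsematrix(100L, 100L, density = 0.05)
#'     arr$write(mat)  # bbox = NULL: no bounding box is recorded
#'     # arr$write(mat, bbox = c(100L, 100L))  # or supply per-dimension upper bounds
#'     arr$close()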
#' + #' @return Invisibly returns \code{self} + #' write = function(values, bbox = NULL) { stopifnot( "'values' must be a matrix" = is_matrix(values), "'bbox' must contain two entries" = is.null(bbox) || length(bbox) == length(dim(values)), "'bbox' must be a vector of two integers or a list with each entry containg two integers" = is.null(bbox) || - (is_integerish(bbox) || bit64::is.integer64(bbox)) || + .is_integerish(bbox) || (is.list(bbox) && all(vapply_lgl(bbox, function(x, n) length(x) == 2L))) ) # coerce to a TsparseMatrix, which uses 0-based COO indexing @@ -81,6 +83,12 @@ SOMASparseNDArray <- R6::R6Class( j = bit64::as.integer64(values@j), x = values@x ) + if (!is.null(private$.type)) { + rt <- r_type_from_arrow_type(private$.type) + if (rt == 'integer' && rlang::is_integerish(coo$x)) { + coo$x <- as.integer(coo$x) + } + } dnames <- self$dimnames() colnames(coo) <- c(dnames, self$attrnames()) ranges <- sapply( @@ -181,9 +189,9 @@ SOMASparseNDArray <- R6::R6Class( self$tiledb_timestamp %||% "now" ) - private$.write_coo_dataframe(coo) + self$.write_coordinates(coo) - invisible(self) + return(invisible(self)) }, #' @description Retrieve number of non-zero elements (lifecycle: maturing) @@ -224,6 +232,96 @@ SOMASparseNDArray <- R6::R6Class( ) # Checking slotwise new shape >= old shape, and <= max_shape, is already done in libtiledbsoma tiledbsoma_upgrade_shape(self$uri, shape, private$.soma_context) + }, + + #' @description Write a COO table to the array + #' + #' @param values A \code{data.frame} or \code{\link[arrow:Table]{Arrow::Table}} + #' with data in COO format; must be named with the dimension and attribute + #' labels of the array + #' + #' @return Invisibly returns \code{self} + #' + .write_coordinates = function(values) { + private$check_open_for_write() + dnames <- self$dimnames() + attrn <- self$attrnames() + + stopifnot( + "'values' must be a data frame or Arrow Table" = is.data.frame(values) || + inherits(values, what = 'Table'), + "'values' must have one column for each dimension and the data" = ncol(values) == length(dnames) + 1L, + "'values' must be named with the dimension and attribute labels" = is.null(names(values)) || + identical(names(values), c(dnames, attrn)) + ) + + # Arrow Tables cannot have NULL names, so this only applies to dataframes + if (is.null(names(values))) { + spdl::warn("[SOMASparseNDArray$.write_coordinates] no names on input data frame, assuming order") + names(values) <- c(dnames, attrn) + } + + # Check dimensions + spdl::debug("[SOMASparseNDArray$.write_coordinates] checking dimension values") + for (i in seq_along(dnames)) { + dn <- dnames[i] + offending <- sprintf("(offending column: '%s')", dn) + if (!.is_integerish(values[[dn]])) { + stop("All dimension columns must be integerish ", offending) + } + if (as.logical(min(values[[dn]]) < 0L)) { + stop("Dimension columns cannot contain negative values ", offending) + } + if (as.logical(max(values[[dn]]) >= as.numeric(self$shape()[i]))) { + stop("Dimension columns cannot exceed the shape of the array ", offending) + } + } + + # Check attribute + spdl::debug("[SOMASparseNDArray$.write_coordinates] checking data values") + if (is.null(private$.type)) { + tt <- self$schema()[attrn]$type + if (is.null(tt)) { + tt <- if (is.data.frame(values)) { + arrow::infer_type(values[[attrn]]) + } else { + values[[attrn]]$type + } + } + private$.type <- tt + } + vt <- if (is.data.frame(values)) { + arrow::infer_type(values[[attrn]]) + } else { + values[[attrn]]$type + } + if ((vrt <- r_type_from_arrow_type(vt)) 
!= (rt <- r_type_from_arrow_type(private$.type))) { + stop("The data column must be of type '", rt, "', got '", vrt, "'") + } + + # Build our Arrow table and schema + fields <- c( + lapply(dnames, arrow::field, type = arrow::int64()), + arrow::field(attrn, private$.type) + ) + sch <- do.call(arrow::schema, fields) + tbl <- arrow::as_arrow_table(values, schema = sch) + + # Write via libtiledbsoma + spdl::debug("[SOMASparseNDArray$.write_coordinates] writing arrow table") + naap <- nanoarrow::nanoarrow_allocate_array() + nasp <- nanoarrow::nanoarrow_allocate_schema() + arrow::as_record_batch(tbl)$export_to_c(naap, nasp) + writeArrayFromArrow( + uri = self$uri, + naap = naap, + nasp = nasp, + ctxxp = private$.soma_context, + arraytype = "SOMASparseNDArray", + config = NULL, + tsvec = self$.tiledb_timestamp_range + ) + return(invisible(self)) } ), @@ -252,53 +350,6 @@ SOMASparseNDArray <- R6::R6Class( out }, - # @description Ingest COO-formatted dataframe into the TileDB array. - # (lifecycle: maturing) - # @param values A [`data.frame`]. - .write_coo_dataframe = function(values) { - private$check_open_for_write() - - stopifnot(is.data.frame(values)) - # private$log_array_ingestion() - #arr <- self$object - #if (!is.null(self$tiledb_timestamp)) { - # # arr@timestamp <- self$tiledb_timestamp - # arr@timestamp_end <- self$tiledb_timestamp - #} - nms <- colnames(values) - - ## the 'soma_data' data type may not have been cached, and if so we need to fetch it - if (is.null(private$.type)) { - ## TODO: replace with a libtiledbsoma accessor as discussed - tpstr <- tiledb::datatype(tiledb::attrs(tiledb::schema(self$uri))[["soma_data"]]) - arstr <- arrow_type_from_tiledb_type(tpstr) - private$.type <- arstr - } - - arrsch <- arrow::schema(arrow::field(nms[1], arrow::int64()), - arrow::field(nms[2], arrow::int64()), - arrow::field(nms[3], private$.type)) - - tbl <- arrow::arrow_table(values, schema = arrsch) - spdl::debug( - "[SOMASparseNDArray$write] array created, writing to {} at ({})", - self$uri, - self$tiledb_timestamp %||% "now" - ) - naap <- nanoarrow::nanoarrow_allocate_array() - nasp <- nanoarrow::nanoarrow_allocate_schema() - arrow::as_record_batch(tbl)$export_to_c(naap, nasp) - writeArrayFromArrow( - uri = self$uri, - naap = naap, - nasp = nasp, - ctxxp = private$.soma_context, - arraytype = "SOMASparseNDArray", - config = NULL, - tsvec = self$.tiledb_timestamp_range - ) - }, - # Internal marking of one or zero based matrices for iterated reads zero_based = NA diff --git a/apis/r/R/utils-arrow.R b/apis/r/R/utils-arrow.R index 7ee2ebc2ae..c64099a692 100644 --- a/apis/r/R/utils-arrow.R +++ b/apis/r/R/utils-arrow.R @@ -34,6 +34,11 @@ is_arrow_dictionary <- function(x) { is_arrow_object(x) && inherits(x, "Field") && inherits(x$type, "DictionaryType") } +#' @method as.logical Scalar +#' @export +#' +as.logical.Scalar <- \(x, ...) as.logical(x$as_vector(), ...) + #' Convert Arrow types to supported TileDB type #' List of TileDB types supported in R: https://github.com/TileDB-Inc/TileDB-R/blob/8014da156b5fee5b4cc221d57b4aa7d388abc968/inst/tinytest/test_dim.R#L97-L121 #' Note: TileDB attrs may be UTF-8; TileDB dims may not. @@ -114,6 +119,92 @@ arrow_type_from_tiledb_type <- function(x) { ) } +#' Get the \R Type from an Arrow Type +#' +#' Get an \R \link[base:typeof]{type} from an Arrow type. This function is +#' equivalent to \code{\link[base]{typeof}()} rather than +#' \code{\link[base]{mode}()} or \code{\link[base]{class}()}, and returns the +#' equivalent \strong{type}. 
For example, the equivalent \R type to an Arrow +#' \link[arrow]{dictionary} is \dQuote{\code{integer}}, not +#' \dQuote{\code{factor}}; likewise, the equivalent \R type to an Arrow 64-bit +#' integer is \dQuote{\code{double}} +#' +#' @param x An \CRANpkg{Arrow} \link[arrow:Schema]{schema}, +#' \link[arrow:Field]{field}, or \link[arrow:infer_type]{data type} +#' +#' @return If \code{x} is a \link[arrow:infer_type]{data type}, a single +#' character value giving the \R \link[base:typeof]{type} of \code{x}; if no +#' corresponding \R type, returns the \CRANpkg{Arrow} type name +#' +#' @return If \code{x} is a \link[arrow:Field]{field}, a single named character +#' vector with the name being the field name and the value being the \R +#' \link[base:typeof]{type} +#' +#' @return If \code{x} is a \link[arrow:Schema]{schema}, a named vector where +#' the names are field names and the values are the \R \link[base:typeof]{types} +#' of each field +#' +#' @keywords internal +#' +#' @export +#' +#' @seealso \code{\link[base]{typeof}()} +#' +r_type_from_arrow_type <- function(x) UseMethod('r_type_from_arrow_type') + +#' @rdname r_type_from_arrow_type +#' +#' @method r_type_from_arrow_type Schema +#' @export +#' +r_type_from_arrow_type.Schema <- function(x) { + return(vapply( + X = x$names, + FUN = function(f) r_type_from_arrow_type(x[[f]]), + FUN.VALUE = character(1L), + USE.NAMES = TRUE + )) +} + +#' @rdname r_type_from_arrow_type +#' +#' @method r_type_from_arrow_type Field +#' @export +#' +r_type_from_arrow_type.Field <- function(x) { + tt <- r_type_from_arrow_type(x$type) + names(x = tt) <- x$name + return(tt) +} + +#' @rdname r_type_from_arrow_type +#' +#' @method r_type_from_arrow_type DataType +#' @export +#' +r_type_from_arrow_type.DataType <- function(x) { + # Types are equivalent to `typeof()`, not `mode()` or `class()` + return(switch( + EXPR = x$name, + int8 = , + int16 = , + int32 = , + dictionary = , + uint8 = , + uint16 = , + uint32 = 'integer', + int64 = , + uint64 = , + date32 = , + timestamp = , + float = 'double', + bool = 'logical', + utf8 = , + large_utf8 = 'character', + x$name + )) +} + #' Retrieve limits for Arrow types #' @importFrom bit64 lim.integer64 #' @noRd @@ -174,10 +265,9 @@ arrow_field_from_tiledb_dim <- function(x) { ## With a nod to Kevin Ushey #' @noRd yoink <- function(package, symbol) { - do.call(":::", list(package, symbol)) + do.call(":::", list(package, symbol)) } - #' Create an Arrow field from a TileDB attribute #' @noRd arrow_field_from_tiledb_attr <- function(x, arrptr=NULL) { diff --git a/apis/r/R/utils.R b/apis/r/R/utils.R index dec2f8aa38..1459fad5a4 100644 --- a/apis/r/R/utils.R +++ b/apis/r/R/utils.R @@ -98,6 +98,82 @@ uns_hint <- function(type = c('1d', '2d')) { }) } +#' Is an Object Integerish +#' +#' @inheritParams rlang::is_integerish +#' +#' @return \code{TRUE} if \code{x} is integerish, otherwise \code{FALSE} +#' +#' @keywords internal +#' +#' @noRd +#' +.is_integerish <- function(x, n = NULL, finite = NULL) { + UseMethod(generic = '.is_integerish', object = x) +} + +#' @method .is_integerish default +#' @export +#' +.is_integerish.default <- function(x, n = NULL, finite = NULL) { + return(rlang::is_integerish(x = x, n = n, finite = finite)) +} + +#' @method .is_integerish integer64 +#' @export +#' +.is_integerish.integer64 <- function(x, n = NULL, finite = NULL) { + res <- if (!is.null(x = n)) { + stopifnot( + "'n' must be a single integerish value" = .is_integerish(x = n) && + length(x = n) == 1L && + is.finite(x = n) + ) + length(x = x) == n 
+ } else { + TRUE + } + res <- res && if (!is.null(x = finite)) { + stopifnot(isTRUE(x = finite) || isFALSE(x = finite)) + # In `rlang::is_integerish()`, + # `finite = TRUE`: all values are finite + # `finite = FALSE`: at least one value is infinite + # `bit64::is.infinite()` returns FALSE for NA + ifelse( + test = finite, + yes = all(is.finite(x = x)), + no = any(is.infinite(x = x) | is.na(x = x)) + ) + } else { + TRUE + } + return(res) +} + +#' @method .is_integerish Field +#' @export +#' +.is_integerish.Field <-function(x, n = NULL, finite = NULL) { + return(.is_integerish(x = x$type, n = n, finite = finite)) +} + +#' @method .is_integerish Array +#' @export +#' +.is_integerish.Array <- .is_integerish.Field + +#' @method .is_integerish ChunkedArray +#' @export +#' +.is_integerish.ChunkedArray <- .is_integerish.Field + +#' @method .is_integerish DataType +#' @export +#' +.is_integerish.DataType <-function(x, n = NULL, finite = NULL) { + return(grepl(pattern = '^[u]?int[[:digit:]]{1,2}$', x = x$name)) +} + .maybe_muffle <- function(w, cond = getOption('verbose', default = FALSE)) { if (isTRUE(x = cond)) { warning(warningCondition( diff --git a/apis/r/man/SOMADenseNDArray.Rd b/apis/r/man/SOMADenseNDArray.Rd index e2156ec384..20aef8bbab 100644 --- a/apis/r/man/SOMADenseNDArray.Rd +++ b/apis/r/man/SOMADenseNDArray.Rd @@ -66,8 +66,10 @@ The \code{write} method is currently limited to writing from 2-d matrices.
  • tiledbsoma::TileDBArray$tiledb_schema()
  • tiledbsoma::TileDBArray$used_shape()
  • tiledbsoma::SOMANDArrayBase$create()
+  • tiledbsoma::SOMANDArrayBase$resize()
  • tiledbsoma::SOMANDArrayBase$set_data_type()
  • tiledbsoma::SOMANDArrayBase$tiledbsoma_has_upgraded_shape()
+  • tiledbsoma::SOMANDArrayBase$tiledbsoma_upgrade_shape()
  • }} diff --git a/apis/r/man/SOMANDArrayBase.Rd b/apis/r/man/SOMANDArrayBase.Rd index 1f43ce1306..0722a2eb3e 100644 --- a/apis/r/man/SOMANDArrayBase.Rd +++ b/apis/r/man/SOMANDArrayBase.Rd @@ -17,6 +17,8 @@ Adds NDArray-specific functionality to the \code{\link{SOMAArrayBase}} class. \item \href{#method-SOMANDArrayBase-create}{\code{SOMANDArrayBase$create()}} \item \href{#method-SOMANDArrayBase-set_data_type}{\code{SOMANDArrayBase$set_data_type()}} \item \href{#method-SOMANDArrayBase-tiledbsoma_has_upgraded_shape}{\code{SOMANDArrayBase$tiledbsoma_has_upgraded_shape()}} +\item \href{#method-SOMANDArrayBase-resize}{\code{SOMANDArrayBase$resize()}} +\item \href{#method-SOMANDArrayBase-tiledbsoma_upgrade_shape}{\code{SOMANDArrayBase$tiledbsoma_upgrade_shape()}} \item \href{#method-SOMANDArrayBase-clone}{\code{SOMANDArrayBase$clone()}} } } @@ -119,6 +121,52 @@ Logical } } \if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SOMANDArrayBase-resize}{}}} +\subsection{Method \code{resize()}}{ +Increases the shape of the array as specified. Raises an error +if the new shape is less than the current shape in any dimension. Raises +an error if the new shape exceeds maxshape in any dimension. Raises an +error if the array doesn't already have a shape: in that case please call +tiledbsoma_upgrade_shape. +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{SOMANDArrayBase$resize(new_shape)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{new_shape}}{A vector of integerish, of the same length as the array's \code{ndim}.} +} +\if{html}{\out{
    }} +} +\subsection{Returns}{ +No return value +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SOMANDArrayBase-tiledbsoma_upgrade_shape}{}}} +\subsection{Method \code{tiledbsoma_upgrade_shape()}}{ +Allows the array to have a resizeable shape as described in the +TileDB-SOMA 1.15 release notes. Raises an error if the shape exceeds maxshape in any +dimension. Raises an error if the array already has a shape. +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{SOMANDArrayBase$tiledbsoma_upgrade_shape(shape)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{shape}}{A vector of integerish, of the same length as the array's \code{ndim}.} +} +\if{html}{\out{
    }} +} +\subsection{Returns}{ +No return value +} +} +\if{html}{\out{
    }} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-SOMANDArrayBase-clone}{}}} \subsection{Method \code{clone()}}{ diff --git a/apis/r/man/SOMASparseNDArray.Rd b/apis/r/man/SOMASparseNDArray.Rd index 7a030c04f0..326736a42f 100644 --- a/apis/r/man/SOMASparseNDArray.Rd +++ b/apis/r/man/SOMASparseNDArray.Rd @@ -31,8 +31,7 @@ the object are overwritten and new index values are added. (lifecycle: maturing) \item \href{#method-SOMASparseNDArray-read}{\code{SOMASparseNDArray$read()}} \item \href{#method-SOMASparseNDArray-write}{\code{SOMASparseNDArray$write()}} \item \href{#method-SOMASparseNDArray-nnz}{\code{SOMASparseNDArray$nnz()}} -\item \href{#method-SOMASparseNDArray-resize}{\code{SOMASparseNDArray$resize()}} -\item \href{#method-SOMASparseNDArray-tiledbsoma_upgrade_shape}{\code{SOMASparseNDArray$tiledbsoma_upgrade_shape()}} +\item \href{#method-SOMASparseNDArray-.write_coordinates}{\code{SOMASparseNDArray$.write_coordinates()}} \item \href{#method-SOMASparseNDArray-clone}{\code{SOMASparseNDArray$clone()}} } } @@ -66,8 +65,10 @@ the object are overwritten and new index values are added. (lifecycle: maturing)
  • tiledbsoma::TileDBArray$tiledb_schema()
  • tiledbsoma::TileDBArray$used_shape()
  • tiledbsoma::SOMANDArrayBase$create()
+  • tiledbsoma::SOMANDArrayBase$resize()
  • tiledbsoma::SOMANDArrayBase$set_data_type()
  • tiledbsoma::SOMANDArrayBase$tiledbsoma_has_upgraded_shape()
+  • tiledbsoma::SOMANDArrayBase$tiledbsoma_upgrade_shape()
  • }} @@ -126,6 +127,9 @@ dimension of \code{values}. Generally should be \code{NULL}.} } \if{html}{\out{}} } +\subsection{Returns}{ +Invisibly returns \code{self} +} } \if{html}{\out{
    }} \if{html}{\out{}} @@ -141,49 +145,25 @@ A scalar with the number of non-zero elements } } \if{html}{\out{
    }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-SOMASparseNDArray-resize}{}}} -\subsection{Method \code{resize()}}{ -Increases the shape of the array as specfied. Raises an error -if the new shape is less than the current shape in any dimension. Raises -an error if the new shape exceeds maxshape in any dimension. Raises an -error if the array doesn't already have a shape: in that case please call -tiledbsoma_upgrade_shape. -\subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{SOMASparseNDArray$resize(new_shape)}\if{html}{\out{
    }} -} - -\subsection{Arguments}{ -\if{html}{\out{
    }} -\describe{ -\item{\code{new_shape}}{A vector of integerish, of the same length as the array's \code{ndim}.} -} -\if{html}{\out{
    }} -} -\subsection{Returns}{ -No return value -} -} -\if{html}{\out{
    }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-SOMASparseNDArray-tiledbsoma_upgrade_shape}{}}} -\subsection{Method \code{tiledbsoma_upgrade_shape()}}{ -Allows the array to have a resizeable shape as described in the -TileDB-SOMA 1.15 release notes. Raises an error if the shape exceeds maxshape in any -dimension. Raises an error if the array already has a shape. +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SOMASparseNDArray-.write_coordinates}{}}} +\subsection{Method \code{.write_coordinates()}}{ +Write a COO table to the array \subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{SOMASparseNDArray$tiledbsoma_upgrade_shape(shape)}\if{html}{\out{
    }} +\if{html}{\out{
    }}\preformatted{SOMASparseNDArray$.write_coordinates(values)}\if{html}{\out{
    }} } \subsection{Arguments}{ \if{html}{\out{
    }} \describe{ -\item{\code{shape}}{A vector of integerish, of the same length as the array's \code{ndim}.} +\item{\code{values}}{A \code{data.frame} or \code{\link[arrow:Table]{Arrow::Table}} +with data in COO format; must be named with the dimension and attribute +labels of the array} } \if{html}{\out{
    }} } \subsection{Returns}{ -No return value +Invisibly returns \code{self} } } \if{html}{\out{
    }} diff --git a/apis/r/man/r_type_from_arrow_type.Rd b/apis/r/man/r_type_from_arrow_type.Rd new file mode 100644 index 0000000000..4bb62a59c7 --- /dev/null +++ b/apis/r/man/r_type_from_arrow_type.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-arrow.R +\name{r_type_from_arrow_type} +\alias{r_type_from_arrow_type} +\alias{r_type_from_arrow_type.Schema} +\alias{r_type_from_arrow_type.Field} +\alias{r_type_from_arrow_type.DataType} +\title{Get the \R Type from an Arrow Type} +\usage{ +r_type_from_arrow_type(x) + +\method{r_type_from_arrow_type}{Schema}(x) + +\method{r_type_from_arrow_type}{Field}(x) + +\method{r_type_from_arrow_type}{DataType}(x) +} +\arguments{ +\item{x}{An \CRANpkg{Arrow} \link[arrow:Schema]{schema}, +\link[arrow:Field]{field}, or \link[arrow:infer_type]{data type}} +} +\value{ +If \code{x} is a \link[arrow:infer_type]{data type}, a single +character value giving the \R \link[base:typeof]{type} of \code{x}; if no +corresponding \R type, returns the \CRANpkg{Arrow} type name + +If \code{x} is a \link[arrow:Field]{field}, a single named character +vector with the name being the field name and the value being the \R +\link[base:typeof]{type} + +If \code{x} is a \link[arrow:Schema]{schema}, a named vector where +the names are field names and the values are the \R \link[base:typeof]{types} +of each field +} +\description{ +Get an \R \link[base:typeof]{type} from an Arrow type. This function is +equivalent to \code{\link[base]{typeof}()} rather than +\code{\link[base]{mode}()} or \code{\link[base]{class}()}, and returns the +equivalent \strong{type}. For example, the equivalent \R type to an Arrow +\link[arrow]{dictionary} is \dQuote{\code{integer}}, not +\dQuote{\code{factor}}; likewise, the equivalent \R type to an Arrow 64-bit +integer is \dQuote{\code{double}} +} +\seealso{ +\code{\link[base]{typeof}()} +} +\keyword{internal} diff --git a/apis/r/tests/testthat/test-SOMASparseNDArray.R b/apis/r/tests/testthat/test-SOMASparseNDArray.R index 9777bcaefd..808ff6e977 100644 --- a/apis/r/tests/testthat/test-SOMASparseNDArray.R +++ b/apis/r/tests/testthat/test-SOMASparseNDArray.R @@ -1,3 +1,4 @@ + test_that("SOMASparseNDArray creation", { skip_if(!extended_tests()) uri <- tempfile(pattern="sparse-ndarray") @@ -96,6 +97,105 @@ test_that("SOMASparseNDArray creation", { }) +test_that("SOMASparseNDArray write COO assertions", { + skip_if(!extended_tests()) + uri <- tempfile(pattern = "sparse-ndarray-coo") + shape <- c(10L, 10L) + ndarray <- SOMASparseNDArrayCreate(uri, arrow::int32(), shape = shape) + + expect_s3_class(ndarray, 'SOMASparseNDArray') + expect_equal(ndarray$ndim(), 2L) + mat <- create_sparse_matrix_with_int_dims(10L, 10L) + df <- data.frame( + soma_dim_0 = mat@i, + soma_dim_1 = mat@j, + soma_data = as.integer(mat@x) + ) + + ndarray$reopen("WRITE") + expect_invisible(ndarray$.write_coordinates(df)) + ndarray$close() + + # Test write with Table + tbl <- arrow::as_arrow_table(df) + ndarray <- SOMASparseNDArrayCreate( + tempfile(pattern = "sparse-ndarray-coo-table"), + type = tbl$soma_data$type, + shape = shape + ) + expect_invisible(ndarray$.write_coordinates(tbl)) + ndarray$close() + + # Test write unnamed data frame + udf <- df + names(udf) <- NULL + ndarray <- SOMASparseNDArrayCreate( + uri = tempfile(pattern = "sparse-ndarray-coo-unnamed"), + type = arrow::int32(), + shape = shape + ) + expect_invisible(ndarray$.write_coordinates(udf)) + ndarray$close() + + # Test argument assertions + arr <- 
SOMASparseNDArrayCreate(tempfile(), arrow::int32(), shape = shape) + on.exit(arr$close(), add = TRUE, after = FALSE) + expect_error( + arr$.write_coordinates(mat), + regexp = "'values' must be a data frame or Arrow Table" + ) + expect_error( + arr$.write_coordinates(mtcars), + regexp = "'values' must have one column for each dimension and the data" + ) + + sdf <- df + while (identical(names(sdf), c(ndarray$dimnames(), ndarray$attrnames()))) { + sdf <- sdf[, sample(names(sdf)), drop = FALSE] + } + expect_error( + arr$.write_coordinates(sdf), + regexp = "'values' must be named with the dimension and attribute labels" + ) + + # Test dimension assertions + ddf <- df + ddf$soma_dim_0 <- ddf$soma_dim_0 + 0.1 + expect_error( + arr$.write_coordinates(ddf), + regexp = "All dimension columns must be integerish" + ) + + ndf <- df + ndf$soma_dim_0 <- -ndf$soma_dim_0 + expect_error( + arr$.write_coordinates(ndf), + regexp = "Dimension columns cannot contain negative values" + ) + + bdf <- df + bdf$soma_dim_0 <- bdf$soma_dim_0 * 1000 + expect_error( + arr$.write_coordinates(bdf), + regexp = "Dimension columns cannot exceed the shape of the array" + ) + + # Test attribute assertions + ldf <- df + ldf$soma_data <- TRUE + expect_error( + arr$.write_coordinates(ldf), + regexp = "The data column must be of type 'integer'" + ) + + fdf <- df + fdf$soma_data <- fdf$soma_data + 0.1 + expect_error( + arr$.write_coordinates(fdf), + regexp = "The data column must be of type 'integer'" + ) +}) + test_that("SOMASparseNDArray read_sparse_matrix", { skip_if(!extended_tests()) uri <- tempfile(pattern="sparse-ndarray-3") @@ -611,10 +711,10 @@ test_that("SOMASparseNDArray with failed bounding box", { coo <- data.frame( i = bit64::as.integer64(slot(mat, "i")), j = bit64::as.integer64(slot(mat, "j")), - x = slot(mat, "x") + x = as.integer(slot(mat, "x")) ) names(coo) <- c(ndarray$dimnames(), ndarray$attrnames()) - ndarray$.__enclos_env__$private$.write_coo_dataframe(coo) + expect_invisible(ndarray$.write_coordinates(coo)) ndarray$close() diff --git a/apis/r/tests/testthat/test-r-arrow-types.R b/apis/r/tests/testthat/test-r-arrow-types.R new file mode 100644 index 0000000000..42ef712358 --- /dev/null +++ b/apis/r/tests/testthat/test-r-arrow-types.R @@ -0,0 +1,93 @@ +test_that("Arrow to R types: data type", { + skip_if(!extended_tests()) + skip_if_not_installed('arrow') + + ints <- apply( + expand.grid(c('', 'u'), 'int', c('8', '16', '32')), + MARGIN = 1L, + FUN = paste, + collapse = '' + ) + for (i in c(ints, 'dictionary')) { + f <- get(i, envir = asNamespace('arrow')) + expect_type(rt <- r_type_from_arrow_type(f()), 'character') + expect_length(rt, 1L) + expect_null(names(rt)) + expect_identical( + rt, + 'integer', + label = sprintf('r_type_from_arrow_type(arrow::%s())', i) + ) + } + + dbls <- c('int64', 'uint64', 'date32', 'timestamp' ,'float', 'float32') + for (i in dbls) { + f <- get(i, envir = asNamespace('arrow')) + expect_type(rt <- r_type_from_arrow_type(f()), 'character') + expect_length(rt, 1L) + expect_null(names(rt)) + expect_identical( + rt, + 'double', + label = sprintf('r_type_from_arrow_type(arrow::%s())', i) + ) + } + + for (i in c('bool', 'boolean')) { + f <- get(i, envir = asNamespace('arrow')) + expect_type(rt <- r_type_from_arrow_type(f()), 'character') + expect_length(rt, 1L) + expect_null(names(rt)) + expect_identical( + rt, + 'logical', + label = sprintf('r_type_from_arrow_type(arrow::%s())', i) + ) + } + for (i in c('utf8', 'string', 'large_utf8')) { + f <- get(i, envir = asNamespace('arrow')) + 
expect_type(rt <- r_type_from_arrow_type(f()), 'character') + expect_length(rt, 1L) + expect_null(names(rt)) + expect_identical( + rt, + 'character', + label = sprintf('r_type_from_arrow_type(arrow::%s())', i) + ) + } +}) + +test_that("Arrow to R types: field", { + skip_if(!extended_tests()) + skip_if_not_installed('arrow') + + field <- arrow::field(name = random_name(), type = arrow::int8()) + expect_type(rt <- r_type_from_arrow_type(field), 'character') + expect_length(rt, 1L) + expect_named(rt, field$name) + expect_equivalent(rt, 'integer') +}) + +test_that("Arrow to R types: schema", { + skip_if(!extended_tests()) + + asch <- create_arrow_schema() + expect_type(rt <- r_type_from_arrow_type(asch), 'character') + expect_length(rt, length(asch)) + expect_named(rt, asch$names) + for (fn in names(rt)) { + et <- switch( + EXPR = fn, + foo = 'integer', + soma_joinid = 'double', + bar = 'double', + baz = 'character' + ) + expect_equivalent( + rt[fn], + et, + label = sprintf('r_type_from_arrow_type(schema[[%s]])', fn), + expected.label = dQuote(et, FALSE) + ) + } +}) diff --git a/apis/r/tests/testthat/test-utils.R b/apis/r/tests/testthat/test-utils.R index 261ee26df0..b967a427fe 100644 --- a/apis/r/tests/testthat/test-utils.R +++ b/apis/r/tests/testthat/test-utils.R @@ -29,7 +29,6 @@ test_that("validate read coords", { ) }) - test_that("validate read coords with dimension names", { # assume vector or unnamed list of length 1 corresponds to first dimension @@ -50,7 +49,6 @@ test_that("validate read coords with dimension names", { ) }) - test_that("validate read coords with dimension names and schema", { asch <- create_arrow_schema() @@ -82,3 +80,96 @@ test_that("validate read coords with dimension names and schema", { expect_equal(test_coords$foo, 1:10) expect_equal(test_coords$soma_joinid, as.integer64(1:10)) }) + +test_that("is_integerish: default", { + expect_true(.is_integerish(vector("integer"))) + expect_true(.is_integerish(vector("numeric"))) + types <- c("logical", "complex", "character", "expression", "list", "raw") + for (tt in types) { + expect_false( + .is_integerish(vector(tt)), + label = sprintf(".is_integerish(vector('%s'))", tt) + ) + } +}) + +test_that("is_integerish: integer64", { + # Basic tests + for (n in 0:3) { + expect_true( + .is_integerish(bit64::integer64(length = n)), + label = sprintf(".is_integerish(integer64(length = %i))", n) + ) + expect_true( + .is_integerish(bit64::integer64(length = n), n = n), + label = sprintf(".is_integerish(integer64(length = %i), n = %i)", n, n) + ) + expect_false( + .is_integerish(bit64::integer64(length = n), n = n + 1L), + label = sprintf(".is_integerish(integer64(length = %i), n = %i)", n, n + 1L) + ) + } + + # Test finiteness + expect_true(.is_integerish(bit64::NA_integer64_)) + expect_true(.is_integerish(bit64::NA_integer64_, finite = FALSE)) + expect_false(.is_integerish(bit64::integer64(), finite = FALSE)) + expect_false(.is_integerish(bit64::NA_integer64_, finite = TRUE)) + + # Test large number + expect_true(.is_integerish(bit64::as.integer64((2 ^ 31) + 1L))) +}) + +test_that("is_integerish: arrow::DataType", { + + ints <- paste0("int", c(8, 16, 32, 64)) + for (it in c(ints, paste0("u", ints))) { + f <- get(it, envir = asNamespace("arrow")) + expect_true( + .is_integerish(f()), + label = sprintf(".is_integerish(arrow::%s())", it) + ) + } + + types <- c( + paste0("float", c("", 16, 32, 64)), + paste0("bool", c("", "ean")), + "string", + paste0(c("", "large_"), "utf8"), + paste0("date", c(32, 64)), + paste0("time", c(32, 64, 
"stamp")) + ) + for (at in types) { + f <- get(at, envir = asNamespace("arrow")) + expect_false( + .is_integerish(f()), + label = sprintf(".is_integerish(arrow::%s())", at) + ) + } +}) + +test_that("is_integerish: arrow::Field", { + sch <- create_arrow_schema() + for (i in names(sch)) { + label <- sprintf(".is_integerish(sch[['%s']])", i) + switch( + EXPR = i, + foo = , + soma_joinid = expect_true(.is_integerish(sch[[i]]), label = label), + expect_false(.is_integerish(sch[[i]]), label = label) + ) + } +}) + +test_that("is_integerish: arrow::Arrays", { + tbl <- create_arrow_table() + for (i in names(tbl)) { + label <- sprintf(".is_integerish(tbl[['%s']])", i) + switch( + EXPR = i, + foo = , + soma_joinid = expect_true(.is_integerish(tbl[[i]]), label = label), + expect_false(.is_integerish(tbl[[i]]), label = label) + ) + } +})