[python] Support `tiledbsoma.io.update_obs` with post-nullable non-string attributes (#2719)

* [python] Support `tiledbsoma.io.update_obs` with post-nullable non-string attributes

* fix unit-test case
johnkerl authored Jun 12, 2024
1 parent b56390e commit a9eb694
Showing 4 changed files with 51 additions and 38 deletions.
55 changes: 30 additions & 25 deletions apis/python/src/tiledbsoma/_arrow_types.py
@@ -229,8 +229,33 @@ def df_to_arrow(df: pd.DataFrame) -> pa.Table:
     nullable_fields = set()
     # Not for name, col in df.items() since we need df[k] on the left-hand sides
     for key in df:
-        if df[key].isnull().any():
-            nullable_fields.add(key)
+        # Make attributes nullable. Context:
+        # * df_to_arrow is _solely_ for use of tiledbsoma.io
+        #   o Anyone calling the SOMA API directly has user-provided Arrow
+        #     schema which must be respected
+        #   o Anyone calling tiledbsoma.io -- including from_h5ad/from_anndata,
+        #     and update_obs/update_var -- does not provide an Arrow schema
+        #     explicitly. We compute an Arrow schema for them here.
+        # * Even when the _initial_ data is all non-null down a particular
+        #   string column, there are two ways a _subsequent_ write can provide
+        #   nulls: append-mode ingest, or, update_obs/update_var wherein the new
+        #   data has nulls even when the data used at schema-create time was
+        #   non-null.
+        # * We have no way of knowing at initial ingest time whether or not
+        #   users will later be appending, or updating, with null data.
+        # * Note that Arrow has a per-field nullable flag in its schema metadata
+        #   -- and so do TileDB array schemas.
+        #
+        # Note in particular this is for the use of tiledbsoma.io:
+        #
+        # * In the tiledbsoma API (e.g. DataFrame.create) the user passes an
+        #   Arrow schema and we respect it as-is. They specify nullability, or
+        #   not, as they wish.
+        # * In tiledbsoma.io, the user-provided inputs are AnnData objects.
+        #   We compute the Arrow schema _for_ them. And we must accommodate
+        #   reasonable/predictable needs.
+
+        nullable_fields.add(key)
 
     # Handle special cases for all null columns where the dtype is "object"
     # or "category" and must be explicitly casted to the correct pandas
@@ -264,31 +289,11 @@ def df_to_arrow(df: pd.DataFrame) -> pa.Table:
                 values=column, categories=categories, ordered=ordered
             )
 
-    # Always make string columns nullable. Context:
-    # * df_to_arrow is _solely_ for use of tiledbsoma.io
-    #   o Anyone calling the SOMA API directly has user-provided Arrow schema which
-    #     must be respected
-    #   o Anyone calling tiledbsoma.io -- including from_h5ad/from_anndata, and
-    #     update_obs/update_var -- does not provide an Arrow schema explicitly.
-    #     We compute an Arrow schema for them here.
-    # * Even when the _initial_ data is all non-null down a particular string
-    #   column, there are two ways a _subsequent_ write can provide nulls:
-    #   append-mode ingest, or, update_obs/update_var wherein the new data has
-    #   nulls even when the data used at schema-create time was non-null.
-    # * We have no way of knowing at initial ingest time whether or not users
-    #   will later be appending, or updating, with null data.
-    # * Note that Arrow has a per-field nullable flag in its schema metadata
-    #   -- and so do TileDB array schemas.
-    for key in df:
-        if is_string_dtype(df[key].dtype):
-            nullable_fields.add(key)
-
     arrow_table = pa.Table.from_pandas(df)
 
-    if nullable_fields:
-        md = arrow_table.schema.metadata
-        md.update(dict.fromkeys(nullable_fields, "nullable"))
-        arrow_table = arrow_table.replace_schema_metadata(md)
+    md = arrow_table.schema.metadata
+    md.update(dict.fromkeys(nullable_fields, "nullable"))
+    arrow_table = arrow_table.replace_schema_metadata(md)
 
     # For tiledbsoma.io (for which this method exists) _any_ dataset can be appended to
     # later on. This means that on fresh ingest we must use a larger bit-width than
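The net effect on df_to_arrow is captured by this minimal standalone sketch (df_to_arrow_sketch is an illustrative name, not the library's API): every column, string or not, is marked nullable in the Arrow schema metadata, since nulls may arrive in a later append or update.

import pandas as pd
import pyarrow as pa

def df_to_arrow_sketch(df: pd.DataFrame) -> pa.Table:
    # Mark every field nullable: at ingest time we cannot know whether a
    # later append-mode ingest or update_obs/update_var will carry nulls.
    nullable_fields = set(df.keys())

    arrow_table = pa.Table.from_pandas(df)

    # Record nullability in the Arrow schema metadata, as the diff above does.
    md = dict(arrow_table.schema.metadata or {})
    md.update(dict.fromkeys(nullable_fields, "nullable"))
    return arrow_table.replace_schema_metadata(md)

df = pd.DataFrame({"batch_id": ["a", "b"], "myfloat": [1.0, 2.0]})
assert b"myfloat" in df_to_arrow_sketch(df).schema.metadata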
17 changes: 11 additions & 6 deletions apis/python/src/tiledbsoma/io/ingest.py
@@ -54,7 +54,7 @@
     eta,
     logging,
 )
-from .._arrow_types import df_to_arrow, is_string_dtypelike, tiledb_type_from_arrow_type
+from .._arrow_types import df_to_arrow, tiledb_type_from_arrow_type
 from .._collection import AnyTileDBCollection, CollectionBase
 from .._common_nd_array import NDArray
 from .._constants import SOMA_JOINID
@@ -1557,23 +1557,28 @@ def _update_dataframe(
         # An update can create (or drop) columns, or mutate existing ones. A
         # brand-new column might have nulls in it -- or it might not. And a
         # subsequent mutator-update might set null values to non-null -- or vice
-        # versa. Therefore we must be careful to set nullability for all types
-        # we want to be nullable: principal use-case being pd.NA / NaN in
-        # string columns which map to TileDB nullity.
+        # versa. Therefore we must be careful to set nullability for all types.
         #
         # Note: this must match what DataFrame.create does:
         # * DataFrame.create sets nullability for obs/var columns on initial ingest
         # * Here, we set nullability for obs/var columns on update_obs
         # Users should get the same behavior either way.
-        nullable = is_string_dtypelike(dtype)
+        #
+        # Note: this is specific to tiledbsoma.io.
+        # * In the SOMA API -- e.g. soma.DataFrame.create -- users bring their
+        #   own Arrow schema (including nullabilities) and we must do what they
+        #   say.
+        # * In the tiledbsoma.io API, users bring their AnnData objects, and
+        #   we compute Arrow schemas on their behalf, and we must accommodate
+        #   reasonable/predictable needs.
 
         se.add_attribute(
             tiledb.Attr(
                 name=add_key,
                 dtype=dtype,
                 filters=filters,
                 enum_label=enum_label,
-                nullable=nullable,
+                nullable=True,
             )
         )
 
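A usage sketch of what this enables, mirroring the test below (the /tmp/exp URI is hypothetical, and adata stands for the AnnData object previously ingested there via tiledbsoma.io.from_anndata):

import numpy as np
import tiledbsoma
import tiledbsoma.io

uri = "/tmp/exp"  # hypothetical: an Experiment created earlier from adata

# Add a non-string (float) column with non-null values, then update:
adata.obs["myfloat"] = 12.34
with tiledbsoma.Experiment.open(uri, "w") as exp:
    tiledbsoma.io.update_obs(exp, adata.obs)

# Null the same column out in a second update. The attribute was created
# with nullable=True, so this now works for non-string dtypes as well:
adata.obs["myfloat"] = np.nan
with tiledbsoma.Experiment.open(uri, "w") as exp:
    tiledbsoma.io.update_obs(exp, adata.obs)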
2 changes: 1 addition & 1 deletion apis/python/tests/test_basic_anndata_io.py
@@ -783,7 +783,7 @@ def test_null_obs(conftest_pbmc_small, tmp_path: Path):
         # ensure that `isnullable` reflects the null-ness
         # of the Pandas data frame
         for k in conftest_pbmc_small.obs:
-            assert obs.attr(k).isnullable == conftest_pbmc_small.obs[k].isnull().any()
+            assert obs.attr(k).isnullable
 
 
 def test_export_obsm_with_holes(h5ad_file_with_obsm_holes, tmp_path):
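A quick way to spot-check the same invariant outside the test suite, assuming obs_uri is the (hypothetical) path to the underlying obs array:

import tiledb

obs_uri = "/tmp/exp/obs"  # hypothetical

with tiledb.open(obs_uri) as A:
    # Every attribute is now created nullable, whether or not the
    # ingested column contained any nulls.
    for i in range(A.schema.nattr):
        assert A.schema.attr(i).isnullable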
15 changes: 9 additions & 6 deletions apis/python/tests/test_update_dataframes.py
@@ -276,14 +276,14 @@ def test_update_non_null_to_null(tmp_path, conftest_pbmc3k_adata, separate_ingest):
     # Two ways to test:
     #
     # One way:
-    # * Create a string column with non-nulls before from_anndata
-    # * Ingest, having that new column with non-nulls
-    # * Call update_obs to set the column to have nulls
+    # * Create columns with non-nulls before from_anndata
+    # * Ingest, having those new columns with non-nulls
+    # * Call update_obs to set the columns to have nulls
     #
     # Other way:
-    # * Ingest, without any new column
-    # * Call update_obs once to add the new column with non-null values
-    # * Call update_obs again to add the new column with null values
+    # * Ingest, without any new columns
+    # * Call update_obs once to add the new columns with non-null values
+    # * Call update_obs again to add the new columns with null values
 
     if separate_ingest:
         tiledbsoma.io.from_anndata(
@@ -294,10 +294,12 @@
         )
 
         conftest_pbmc3k_adata.obs["batch_id"] = "testing"
+        conftest_pbmc3k_adata.obs["myfloat"] = 12.34
         verify_updates(uri, conftest_pbmc3k_adata.obs, conftest_pbmc3k_adata.var)
 
     else:
         conftest_pbmc3k_adata.obs["batch_id"] = "testing"
+        conftest_pbmc3k_adata.obs["myfloat"] = 12.34
 
         tiledbsoma.io.from_anndata(
             uri,
@@ -307,6 +309,7 @@
         )
 
     conftest_pbmc3k_adata.obs["batch_id"] = pd.NA
+    conftest_pbmc3k_adata.obs["myfloat"] = np.nan
     # We need nan_safe since pd.NA != pd.NA
     verify_updates(
         uri, conftest_pbmc3k_adata.obs, conftest_pbmc3k_adata.var, nan_safe=True