[python] Support `tiledbsoma.io.update_obs` with post-nullable non-string attributes (#2719)

* [python] Support `tiledbsoma.io.update_obs` with post-nullable non-string attributes

* fix unit-test case
johnkerl authored Jun 12, 2024
1 parent b56390e commit a9eb694
Showing 4 changed files with 51 additions and 38 deletions.
55 changes: 30 additions & 25 deletions apis/python/src/tiledbsoma/_arrow_types.py
@@ -229,8 +229,33 @@ def df_to_arrow(df: pd.DataFrame) -> pa.Table:
     nullable_fields = set()
     # Not for name, col in df.items() since we need df[k] on the left-hand sides
     for key in df:
-        if df[key].isnull().any():
-            nullable_fields.add(key)
+        # Make attributes nullable. Context:
+        # * df_to_arrow is _solely_ for use of tiledbsoma.io
+        #   o Anyone calling the SOMA API directly has user-provided Arrow
+        #     schema which must be respected
+        #   o Anyone calling tiledbsoma.io -- including from_h5ad/from_anndata,
+        #     and update_obs/update_var -- does not provide an Arrow schema
+        #     explicitly. We compute an Arrow schema for them here.
+        # * Even when the _initial_ data is all non-null down a particular
+        #   string column, there are two ways a _subsequent_ write can provide
+        #   nulls: append-mode ingest, or, update_obs/update_var wherein the new
+        #   data has nulls even when the data used at schema-create time was
+        #   non-null.
+        # * We have no way of knowing at initial ingest time whether or not
+        #   users will later be appending, or updating, with null data.
+        # * Note that Arrow has a per-field nullable flag in its schema metadata
+        #   -- and so do TileDB array schemas.
+        #
+        # Note in particular this is for the use of tiledbsoma.io:
+        #
+        # * In the tiledbsoma API (e.g. DataFrame.create) the user passes an
+        #   Arrow schema and we respect it as-is. They specify nullability, or
+        #   not, as they wish.
+        # * In tiledbsoma.io, the user-provided inputs are AnnData objects.
+        #   We compute the Arrow schema _for_ them. And we must accommodate
+        #   reasonable/predictable needs.
+
+        nullable_fields.add(key)
 
     # Handle special cases for all null columns where the dtype is "object"
     # or "category" and must be explicitly casted to the correct pandas
@@ -264,31 +289,11 @@ def df_to_arrow(df: pd.DataFrame) -> pa.Table:
                 values=column, categories=categories, ordered=ordered
             )
 
-    # Always make string columns nullable. Context:
-    # * df_to_arrow is _solely_ for use of tiledbsoma.io
-    #   o Anyone calling the SOMA API directly has user-provided Arrow schema which
-    #     must be respected
-    #   o Anyone calling tiledbsoma.io -- including from_h5ad/from_anndata, and
-    #     update_obs/update_var -- does not provide an Arrow schema explicitly.
-    #     We compute an Arrow schema for them here.
-    # * Even when the _initial_ data is all non-null down a particular string
-    #   column, there are two ways a _subsequent_ write can provide nulls:
-    #   append-mode ingest, or, update_obs/update_var wherein the new data has
-    #   nulls even when the data used at schema-create time was non-null.
-    # * We have no way of knowing at initial ingest time whether or not users
-    #   will later be appending, or updating, with null data.
-    # * Note that Arrow has a per-field nullable flag in its schema metadata
-    #   -- and so do TileDB array schemas.
-    for key in df:
-        if is_string_dtype(df[key].dtype):
-            nullable_fields.add(key)
-
     arrow_table = pa.Table.from_pandas(df)
 
-    if nullable_fields:
-        md = arrow_table.schema.metadata
-        md.update(dict.fromkeys(nullable_fields, "nullable"))
-        arrow_table = arrow_table.replace_schema_metadata(md)
+    md = arrow_table.schema.metadata
+    md.update(dict.fromkeys(nullable_fields, "nullable"))
+    arrow_table = arrow_table.replace_schema_metadata(md)
 
     # For tiledbsoma.io (for which this method exists) _any_ dataset can be appended to
     # later on. This means that on fresh ingest we must use a larger bit-width than
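The net effect on df_to_arrow is captured by this minimal standalone sketch (df_to_arrow_sketch is an illustrative name, not the library's API): every column, string or not, is marked nullable in the Arrow schema metadata, since nulls may arrive in a later append or update.

import pandas as pd
import pyarrow as pa

def df_to_arrow_sketch(df: pd.DataFrame) -> pa.Table:
    # Mark every field nullable: at ingest time we cannot know whether a
    # later append-mode ingest or update_obs/update_var will carry nulls.
    nullable_fields = set(df.keys())

    arrow_table = pa.Table.from_pandas(df)

    # Record nullability in the Arrow schema metadata, as the diff above does.
    md = dict(arrow_table.schema.metadata or {})
    md.update(dict.fromkeys(nullable_fields, "nullable"))
    return arrow_table.replace_schema_metadata(md)

df = pd.DataFrame({"batch_id": ["a", "b"], "myfloat": [1.0, 2.0]})
assert b"myfloat" in df_to_arrow_sketch(df).schema.metadata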
17 changes: 11 additions & 6 deletions apis/python/src/tiledbsoma/io/ingest.py
@@ -54,7 +54,7 @@
     eta,
     logging,
 )
-from .._arrow_types import df_to_arrow, is_string_dtypelike, tiledb_type_from_arrow_type
+from .._arrow_types import df_to_arrow, tiledb_type_from_arrow_type
 from .._collection import AnyTileDBCollection, CollectionBase
 from .._common_nd_array import NDArray
 from .._constants import SOMA_JOINID
@@ -1557,23 +1557,28 @@ def _update_dataframe(
         # An update can create (or drop) columns, or mutate existing ones. A
         # brand-new column might have nulls in it -- or it might not. And a
         # subsequent mutator-update might set null values to non-null -- or vice
-        # versa. Therefore we must be careful to set nullability for all types
-        # we want to be nullable: principal use-case being pd.NA / NaN in
-        # string columns which map to TileDB nullity.
+        # versa. Therefore we must be careful to set nullability for all types.
         #
         # Note: this must match what DataFrame.create does:
         # * DataFrame.create sets nullability for obs/var columns on initial ingest
         # * Here, we set nullability for obs/var columns on update_obs
         # Users should get the same behavior either way.
-        nullable = is_string_dtypelike(dtype)
+        #
+        # Note: this is specific to tiledbsoma.io.
+        # * In the SOMA API -- e.g. soma.DataFrame.create -- users bring their
+        #   own Arrow schema (including nullabilities) and we must do what they
+        #   say.
+        # * In the tiledbsoma.io API, users bring their AnnData objects, and
+        #   we compute Arrow schemas on their behalf, and we must accommodate
+        #   reasonable/predictable needs.
 
         se.add_attribute(
             tiledb.Attr(
                 name=add_key,
                 dtype=dtype,
                 filters=filters,
                 enum_label=enum_label,
-                nullable=nullable,
+                nullable=True,
             )
         )
 
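A usage sketch of what this enables, mirroring the test below (the /tmp/exp URI is hypothetical, and adata stands for the AnnData object previously ingested there via tiledbsoma.io.from_anndata):

import numpy as np
import tiledbsoma
import tiledbsoma.io

uri = "/tmp/exp"  # hypothetical: an Experiment created earlier from adata

# Add a non-string (float) column with non-null values, then update:
adata.obs["myfloat"] = 12.34
with tiledbsoma.Experiment.open(uri, "w") as exp:
    tiledbsoma.io.update_obs(exp, adata.obs)

# Null the same column out in a second update. The attribute was created
# with nullable=True, so this now works for non-string dtypes as well:
adata.obs["myfloat"] = np.nan
with tiledbsoma.Experiment.open(uri, "w") as exp:
    tiledbsoma.io.update_obs(exp, adata.obs)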
2 changes: 1 addition & 1 deletion apis/python/tests/test_basic_anndata_io.py
@@ -783,7 +783,7 @@ def test_null_obs(conftest_pbmc_small, tmp_path: Path):
         # ensure that `isnullable` reflects the null-ness
         # of the Pandas data frame
         for k in conftest_pbmc_small.obs:
-            assert obs.attr(k).isnullable == conftest_pbmc_small.obs[k].isnull().any()
+            assert obs.attr(k).isnullable
 
 
 def test_export_obsm_with_holes(h5ad_file_with_obsm_holes, tmp_path):
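A quick way to spot-check the same invariant outside the test suite, assuming obs_uri is the (hypothetical) path to the underlying obs array:

import tiledb

obs_uri = "/tmp/exp/obs"  # hypothetical

with tiledb.open(obs_uri) as A:
    # Every attribute is now created nullable, whether or not the
    # ingested column contained any nulls.
    for i in range(A.schema.nattr):
        assert A.schema.attr(i).isnullable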
15 changes: 9 additions & 6 deletions apis/python/tests/test_update_dataframes.py
@@ -276,14 +276,14 @@ def test_update_non_null_to_null(tmp_path, conftest_pbmc3k_adata, separate_ingest):
     # Two ways to test:
     #
     # One way:
-    # * Create a string column with non-nulls before from_anndata
-    # * Ingest, having that new column with non-nulls
-    # * Call update_obs to set the column to have nulls
+    # * Create columns with non-nulls before from_anndata
+    # * Ingest, having those new columns with non-nulls
+    # * Call update_obs to set the columns to have nulls
     #
     # Other way:
-    # * Ingest, without any new column
-    # * Call update_obs once to add the new column with non-null values
-    # * Call update_obs again to add the new column with null values
+    # * Ingest, without any new columns
+    # * Call update_obs once to add the new columns with non-null values
+    # * Call update_obs again to add the new columns with null values
 
     if separate_ingest:
         tiledbsoma.io.from_anndata(
@@ -294,10 +294,12 @@
         )
 
         conftest_pbmc3k_adata.obs["batch_id"] = "testing"
+        conftest_pbmc3k_adata.obs["myfloat"] = 12.34
         verify_updates(uri, conftest_pbmc3k_adata.obs, conftest_pbmc3k_adata.var)
 
     else:
         conftest_pbmc3k_adata.obs["batch_id"] = "testing"
+        conftest_pbmc3k_adata.obs["myfloat"] = 12.34
 
         tiledbsoma.io.from_anndata(
             uri,
@@ -307,6 +309,7 @@
         )
 
     conftest_pbmc3k_adata.obs["batch_id"] = pd.NA
+    conftest_pbmc3k_adata.obs["myfloat"] = np.nan
     # We need nan_safe since pd.NA != pd.NA
     verify_updates(
         uri, conftest_pbmc3k_adata.obs, conftest_pbmc3k_adata.var, nan_safe=True