[python] Specially handle cast for object and category dtypes

* "object" and "category" need explicit casting, as the converter functions pa.infer_type and pa.from_numpy_dtype do not map to the correct associated pandas extension dtype
* Use None instead of pa.null, as pa.null can only be used for numerics
single-cell-data · Jan 29, 2024 · 76a8056 · 76a8056
1 parent 8567e84
commit 76a8056
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 25 deletions.
diff --git a/apis/python/src/tiledbsoma/_arrow_types.py b/apis/python/src/tiledbsoma/_arrow_types.py
@@ -189,34 +189,28 @@ def tiledb_schema_to_arrow(
 
 def df_to_arrow(df: pd.DataFrame) -> pa.Table:
     """
-    Categoricals are not yet well supported, so we must flatten.
-    Also replace Numpy/Pandas-style nulls with Arrow-style nulls.
+    Handle special cases where pa.Table.from_pandas is not sufficient.
     """
     null_fields = set()
     # Not for name, col in df.items() since we need df[k] on the left-hand sides
     for k in df:
         if df[k].isnull().any():
-            if df[k].isnull().all():
-                # Special case: Pandas dtype is string, but the values are
-                # math.NaN, for which pa.infer_type fails with "Could not
-                # convert <NA> with type NAType".
-                #
-                # Note: with
-                #   anndata.obs['new_col'] = pd.Series(data=np.nan, dtype=np.dtype(str))
-                # the dtype comes in to us via `tiledbsoma.io.from_anndata` not
-                # as `pd.StringDtype()` but rather as `object`.
-                if df[k].dtype == pd.StringDtype() or df[k].dtype.name == "object":
-                    df[k] = pd.Series([None] * df.shape[0], dtype=pd.StringDtype())
-                else:
-                    df[k] = pa.nulls(df.shape[0], pa.infer_type(df[k]))
-            else:
-                df[k].where(
-                    df[k].notnull(),
-                    pd.Series(pa.nulls(df[k].isnull().sum(), pa.infer_type(df[k]))),
-                    inplace=True,
-                )
             null_fields.add(k)
 
+        # Handle special cases for all null columns where the dtype is "object"
+        # or "category" and must be explicitly cast to the correct pandas
+        # extension dtype.
+        #
+        # Note: with
+        #   anndata.obs['new_col'] = pd.Series(data=np.nan, dtype=np.dtype(str))
+        # the dtype comes in to us via `tiledbsoma.io.from_anndata` not
+        # as `pd.StringDtype()` but rather as `object`.
+        if df[k].isnull().all():
+            if df[k].dtype.name == "object":
+                df[k] = pd.Series([None] * df.shape[0], dtype=pd.StringDtype())
+            elif df[k].dtype.name == "category":
+                df[k] = pd.Series([None] * df.shape[0], dtype=pd.CategoricalDtype())
+
     # For categoricals, it's possible to get
     #   TypeError: Object of type bool_ is not JSON serializable
     # deep within library functions. Debugging reveals that this happens when

diff --git a/apis/python/tests/test_basic_anndata_io.py b/apis/python/tests/test_basic_anndata_io.py
@@ -694,20 +694,30 @@ def test_null_obs(adata, tmp_path: Path):
     output_path = tmp_path.as_uri()
     seed = 42
     #   Create column of all null values
-    adata.obs["empty_all"] = pd.Categorical(
+    adata.obs["empty_categorical_all"] = pd.Categorical(
         [np.NaN] * adata.n_obs, dtype=pd.CategoricalDtype(categories=[], ordered=False)
     )
+    adata.obs["empty_extension_all"] = pd.Series(
+        [np.nan] * adata.n_obs, dtype=pd.Int64Dtype()
+    )
     #   Create column of partially-null values
     rng = np.random.RandomState(seed)
-    adata.obs["empty_partial"] = rng.choice((np.NaN, 1.0), adata.n_obs, True)
+    adata.obs["empty_categorical_partial"] = rng.choice(
+        (np.NaN, 1.0), adata.n_obs, True
+    )
+    adata.obs["empty_extension_partial"] = pd.Series(
+        [1] * adata.n_obs + [np.nan], dtype=pd.Int64Dtype()
+    )
     uri = tiledbsoma.io.from_anndata(
         output_path, adata, "RNA", ingest_mode="write", X_kind=tiledbsoma.SparseNDArray
     )
     exp = tiledbsoma.Experiment.open(uri)
     with tiledb.open(exp.obs.uri, "r") as obs:
         #   Explicitly check columns created above
-        assert obs.attr("empty_all").isnullable
-        assert obs.attr("empty_partial").isnullable
+        assert obs.attr("empty_categorical_all").isnullable
+        assert obs.attr("empty_categorical_partial").isnullable
+        assert obs.attr("empty_extension_all").isnullable
+        assert obs.attr("empty_extension_partial").isnullable
         #   For every column in the data frame
         #   ensure that `isnullable` reflects the null-ness
         #   of the Pandas data frame