Skip to content

Commit

Permalink
[python] Specially handle cast for object and category dtypes
Browse files Browse the repository at this point in the history
* "object" and "category" need explicit casting as the converter
  functions pa.infer_type and pa.from_numpy_dtype do not map to the
  correct associated pandas extension dtype
* Use None instead of pa.null as pa.null can only be used for numerics
  • Loading branch information
nguyenv committed Jan 29, 2024
1 parent 8567e84 commit 76a8056
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 25 deletions.
36 changes: 15 additions & 21 deletions apis/python/src/tiledbsoma/_arrow_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,34 +189,28 @@ def tiledb_schema_to_arrow(

def df_to_arrow(df: pd.DataFrame) -> pa.Table:
"""
Categoricals are not yet well supported, so we must flatten.
Also replace Numpy/Pandas-style nulls with Arrow-style nulls.
Handle special cases where pa.Table.from_pandas is not sufficient.
"""
null_fields = set()
# Not for name, col in df.items() since we need df[k] on the left-hand sides
for k in df:
if df[k].isnull().any():
if df[k].isnull().all():
# Special case: Pandas dtype is string, but the values are
# math.NaN, for which pa.infer_type fails with "Could not
# convert <NA> with type NAType".
#
# Note: with
# anndata.obs['new_col'] = pd.Series(data=np.nan, dtype=np.dtype(str))
# the dtype comes in to us via `tiledbsoma.io.from_anndata` not
# as `pd.StringDtype()` but rather as `object`.
if df[k].dtype == pd.StringDtype() or df[k].dtype.name == "object":
df[k] = pd.Series([None] * df.shape[0], dtype=pd.StringDtype())
else:
df[k] = pa.nulls(df.shape[0], pa.infer_type(df[k]))
else:
df[k].where(
df[k].notnull(),
pd.Series(pa.nulls(df[k].isnull().sum(), pa.infer_type(df[k]))),
inplace=True,
)
null_fields.add(k)

# Handle special cases for all-null columns where the dtype is "object"
# or "category" and must be explicitly cast to the correct pandas
# extension dtype.
#
# Note: with
# anndata.obs['new_col'] = pd.Series(data=np.nan, dtype=np.dtype(str))
# the dtype comes in to us via `tiledbsoma.io.from_anndata` not
# as `pd.StringDtype()` but rather as `object`.
if df[k].isnull().all():
if df[k].dtype.name == "object":
df[k] = pd.Series([None] * df.shape[0], dtype=pd.StringDtype())
elif df[k].dtype.name == "category":
df[k] = pd.Series([None] * df.shape[0], dtype=pd.CategoricalDtype())

# For categoricals, it's possible to get
# TypeError: Object of type bool_ is not JSON serializable
# deep within library functions. Debugging reveals that this happens when
Expand Down
18 changes: 14 additions & 4 deletions apis/python/tests/test_basic_anndata_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,20 +694,30 @@ def test_null_obs(adata, tmp_path: Path):
output_path = tmp_path.as_uri()
seed = 42
# Create column of all null values
adata.obs["empty_all"] = pd.Categorical(
adata.obs["empty_categorical_all"] = pd.Categorical(
[np.NaN] * adata.n_obs, dtype=pd.CategoricalDtype(categories=[], ordered=False)
)
adata.obs["empty_extension_all"] = pd.Series(
[np.nan] * adata.n_obs, dtype=pd.Int64Dtype()
)
# Create column of partially-null values
rng = np.random.RandomState(seed)
adata.obs["empty_partial"] = rng.choice((np.NaN, 1.0), adata.n_obs, True)
adata.obs["empty_categorical_partial"] = rng.choice(
(np.NaN, 1.0), adata.n_obs, True
)
adata.obs["empty_extension_partial"] = pd.Series(
[1] * adata.n_obs + [np.nan], dtype=pd.Int64Dtype()
)
uri = tiledbsoma.io.from_anndata(
output_path, adata, "RNA", ingest_mode="write", X_kind=tiledbsoma.SparseNDArray
)
exp = tiledbsoma.Experiment.open(uri)
with tiledb.open(exp.obs.uri, "r") as obs:
# Explicitly check columns created above
assert obs.attr("empty_all").isnullable
assert obs.attr("empty_partial").isnullable
assert obs.attr("empty_categorical_all").isnullable
assert obs.attr("empty_categorical_partial").isnullable
assert obs.attr("empty_extension_all").isnullable
assert obs.attr("empty_extension_partial").isnullable
# For every column in the data frame
# ensure that `isnullable` reflects the null-ness
# of the Pandas data frame
Expand Down

0 comments on commit 76a8056

Please sign in to comment.