Provide arrow schema on HiPSCat catalog creation (#383)
* Include the arrow schema in catalog operations

* Resolve circular import issue

* Allow schema to be provided in from_dataframe

* Remove filtering of columns from arrow schema
camposandro committed Jul 31, 2024
1 parent 28375b7 commit efdd685
Showing 15 changed files with 168 additions and 32 deletions.
9 changes: 6 additions & 3 deletions src/lsdb/catalog/catalog.py
@@ -20,6 +20,7 @@
from lsdb.dask.crossmatch_catalog_data import crossmatch_catalog_data
from lsdb.dask.join_catalog_data import join_catalog_data_on, join_catalog_data_through
from lsdb.dask.partition_indexer import PartitionIndexer
from lsdb.io.schema import get_arrow_schema
from lsdb.types import DaskDFPixelMap


@@ -199,7 +200,7 @@ def crossmatch(
ra_column=self.hc_structure.catalog_info.ra_column + suffixes[0],
dec_column=self.hc_structure.catalog_info.dec_column + suffixes[0],
)
hc_catalog = hc.catalog.Catalog(new_catalog_info, alignment.pixel_tree)
hc_catalog = hc.catalog.Catalog(new_catalog_info, alignment.pixel_tree, schema=get_arrow_schema(ddf))
return Catalog(ddf, ddf_map, hc_catalog)

def cone_search(self, ra: float, dec: float, radius_arcsec: float, fine: bool = True) -> Catalog:
@@ -418,7 +419,9 @@ def join(
ra_column=self.hc_structure.catalog_info.ra_column + suffixes[0],
dec_column=self.hc_structure.catalog_info.dec_column + suffixes[0],
)
hc_catalog = hc.catalog.Catalog(new_catalog_info, alignment.pixel_tree)
hc_catalog = hc.catalog.Catalog(
new_catalog_info, alignment.pixel_tree, schema=get_arrow_schema(ddf)
)
return Catalog(ddf, ddf_map, hc_catalog)
if left_on is None or right_on is None:
raise ValueError("Either both of left_on and right_on, or through must be set")
@@ -439,5 +442,5 @@ def join(
ra_column=self.hc_structure.catalog_info.ra_column + suffixes[0],
dec_column=self.hc_structure.catalog_info.dec_column + suffixes[0],
)
hc_catalog = hc.catalog.Catalog(new_catalog_info, alignment.pixel_tree)
hc_catalog = hc.catalog.Catalog(new_catalog_info, alignment.pixel_tree, schema=get_arrow_schema(ddf))
return Catalog(ddf, ddf_map, hc_catalog)
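The practical effect of the `catalog.py` changes: results of `crossmatch` and `join` now carry a pyarrow schema derived from the Dask meta, where previously the `hc.catalog.Catalog` was built without one. A minimal sketch of observing this (the catalog paths are hypothetical; assumes two local HiPSCat catalogs):

```python
import lsdb

# Hypothetical paths -- substitute your own HiPSCat catalogs.
left = lsdb.read_hipscat("./small_sky")
right = lsdb.read_hipscat("./small_sky_xmatch")

# The result's hipscat structure now exposes a pyarrow schema built
# from the Dask DataFrame meta (see get_arrow_schema below).
xmatched = left.crossmatch(right, radius_arcsec=1)
print(xmatched.hc_structure.schema)  # suffixed columns from both catalogs
```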
17 changes: 17 additions & 0 deletions src/lsdb/io/schema.py
@@ -0,0 +1,17 @@
from __future__ import annotations

import dask.dataframe as dd
import pyarrow as pa


def get_arrow_schema(ddf: dd.DataFrame) -> pa.Schema:
    """Constructs the pyarrow schema from the meta of a Dask DataFrame.

    Args:
        ddf (dd.DataFrame): A Dask DataFrame.

    Returns:
        The arrow schema for the provided Dask DataFrame.
    """
    # pylint: disable=protected-access
    return pa.Schema.from_pandas(ddf._meta)
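A quick sketch of how this helper behaves: it reads only the zero-row `_meta` of the Dask DataFrame, so no partitions are computed (the columns below are hypothetical):

```python
import dask.dataframe as dd
import pandas as pd

from lsdb.io.schema import get_arrow_schema

# Only the zero-row meta is inspected; nothing is computed.
df = pd.DataFrame({"ra": [10.0, 20.0], "dec": [-45.0, -30.0]})
ddf = dd.from_pandas(df, npartitions=1)

print(get_arrow_schema(ddf))  # ra: double, dec: double
```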
10 changes: 9 additions & 1 deletion src/lsdb/loaders/dataframe/dataframe_catalog_loader.py
@@ -10,6 +10,7 @@
import hipscat as hc
import numpy as np
import pandas as pd
import pyarrow as pa
from hipscat.catalog import CatalogType
from hipscat.catalog.catalog_info import CatalogInfo
from hipscat.pixel_math import HealpixPixel, generate_histogram
@@ -18,6 +19,7 @@
from mocpy import MOC

from lsdb.catalog.catalog import Catalog
from lsdb.io.schema import get_arrow_schema
from lsdb.loaders.dataframe.from_dataframe_utils import (
_append_partition_information_to_dataframe,
_generate_dask_dataframe,
@@ -30,6 +32,7 @@
class DataframeCatalogLoader:
"""Creates a HiPSCat formatted Catalog from a Pandas Dataframe"""

# pylint: disable=too-many-arguments
def __init__(
self,
dataframe: pd.DataFrame,
@@ -41,6 +44,7 @@ def __init__(
should_generate_moc: bool = True,
moc_max_order: int = 10,
use_pyarrow_types: bool = True,
schema: pa.Schema | None = None,
**kwargs,
) -> None:
"""Initializes a DataframeCatalogLoader
@@ -59,6 +63,8 @@ def __init__(
moc_max_order (int): if generating a MOC, what to use as the max order. Defaults to 10.
use_pyarrow_types (bool): If True, the data is backed by pyarrow, otherwise we keep the
original data types. Defaults to True.
schema (pa.Schema): the arrow schema to create the catalog with. If None, the schema is
automatically inferred from the provided DataFrame using `pa.Schema.from_pandas`.
**kwargs: Arguments to pass to the creation of the catalog info.
"""
self.dataframe = dataframe
@@ -70,6 +76,7 @@ def __init__(
self.should_generate_moc = should_generate_moc
self.moc_max_order = moc_max_order
self.use_pyarrow_types = use_pyarrow_types
self.schema = schema

def _calculate_threshold(self, partition_size: int | None = None, threshold: int | None = None) -> int:
"""Calculates the number of pixels per HEALPix pixel (threshold) for the
@@ -130,7 +137,8 @@ def load_catalog(self) -> Catalog:
ddf, ddf_pixel_map, total_rows = self._generate_dask_df_and_map(pixel_list)
self.catalog_info = dataclasses.replace(self.catalog_info, total_rows=total_rows)
moc = self._generate_moc() if self.should_generate_moc else None
hc_structure = hc.catalog.Catalog(self.catalog_info, pixel_list, moc=moc)
schema = self.schema if self.schema is not None else get_arrow_schema(ddf)
hc_structure = hc.catalog.Catalog(self.catalog_info, pixel_list, moc=moc, schema=schema)
return Catalog(ddf, ddf_pixel_map, hc_structure)

def _set_hipscat_index(self):
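The net effect in `load_catalog`: an explicitly provided schema wins, and otherwise the schema is inferred from the Dask meta via `get_arrow_schema`. A sketch under that assumption (the loader is internal and normally reached through `lsdb.from_dataframe`; the toy data is hypothetical):

```python
import pandas as pd
import pyarrow as pa

from lsdb.loaders.dataframe.dataframe_catalog_loader import DataframeCatalogLoader

# Hypothetical toy data; the loader expects ra/dec columns by default.
df = pd.DataFrame({"ra": [10.0, 20.0], "dec": [-45.0, -30.0]})

# With schema=None (the default), the catalog structure is built with
# a schema inferred from the Dask meta.
catalog = DataframeCatalogLoader(df, catalog_name="demo", catalog_type="object").load_catalog()
assert isinstance(catalog.hc_structure.schema, pa.Schema)
```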
5 changes: 5 additions & 0 deletions src/lsdb/loaders/dataframe/from_dataframe.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import pandas as pd
import pyarrow as pa

from lsdb.catalog import Catalog
from lsdb.loaders.dataframe.dataframe_catalog_loader import DataframeCatalogLoader
@@ -21,6 +22,7 @@ def from_dataframe(
should_generate_moc: bool = True,
moc_max_order: int = 10,
use_pyarrow_types: bool = True,
schema: pa.Schema | None = None,
**kwargs,
) -> Catalog:
"""Load a catalog from a Pandas Dataframe in CSV format.
@@ -46,6 +48,8 @@
moc_max_order (int): if generating a MOC, what to use as the max order. Defaults to 10.
use_pyarrow_types (bool): If True, the data is backed by pyarrow, otherwise we keep the
original data types. Defaults to True.
schema (pa.Schema): the arrow schema to create the catalog with. If None, the schema is
automatically inferred from the provided DataFrame using `pa.Schema.from_pandas`.
**kwargs: Arguments to pass to the creation of the catalog info.
Returns:
@@ -61,6 +65,7 @@
should_generate_moc=should_generate_moc,
moc_max_order=moc_max_order,
use_pyarrow_types=use_pyarrow_types,
schema=schema,
**kwargs,
).load_catalog()
if margin_threshold:
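With the new keyword, a caller can pin exact arrow types up front instead of relying on inference. A minimal sketch (hypothetical columns; `float32` chosen deliberately to differ from the pandas default):

```python
import pandas as pd
import pyarrow as pa

import lsdb

df = pd.DataFrame({"ra": [10.0, 20.0], "dec": [-45.0, -30.0]})

# Pin the catalog's arrow types explicitly rather than inferring
# them from the DataFrame via pa.Schema.from_pandas.
schema = pa.schema([("ra", pa.float32()), ("dec", pa.float32())])
catalog = lsdb.from_dataframe(df, catalog_name="demo", catalog_type="object", schema=schema)

print(catalog.hc_structure.schema)  # ra: float, dec: float
```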
3 changes: 2 additions & 1 deletion src/lsdb/loaders/dataframe/margin_catalog_generator.py
@@ -44,6 +44,7 @@ def __init__(
self.margin_threshold = margin_threshold
self.margin_order = self._set_margin_order(margin_order)
self.use_pyarrow_types = use_pyarrow_types
self.schema = catalog.hc_structure.schema

def _set_margin_order(self, margin_order: int | None) -> int:
"""Calculate the order of the margin cache to be generated. If not provided
@@ -79,7 +80,7 @@ def create_catalog(self) -> MarginCatalog | None:
ddf, ddf_pixel_map, total_rows = self._generate_dask_df_and_map(pixels, partitions)
margin_pixels = list(ddf_pixel_map.keys())
margin_catalog_info = self._create_catalog_info(total_rows)
margin_structure = hc.catalog.MarginCatalog(margin_catalog_info, margin_pixels)
margin_structure = hc.catalog.MarginCatalog(margin_catalog_info, margin_pixels, schema=self.schema)
return MarginCatalog(ddf, ddf_pixel_map, margin_structure)

def _get_margins(self) -> Tuple[List[HealpixPixel], List[pd.DataFrame]]:
40 changes: 24 additions & 16 deletions src/lsdb/loaders/hipscat/abstract_catalog_loader.py
@@ -7,6 +7,7 @@
import hipscat as hc
import numpy as np
import pandas as pd
import pyarrow as pa
from hipscat.catalog.healpix_dataset.healpix_dataset import HealpixDataset as HCHealpixDataset
from hipscat.io.file_io import file_io
from hipscat.pixel_math import HealpixPixel
@@ -45,7 +46,13 @@ def load_catalog(self) -> CatalogTypeVar | None:

def _load_hipscat_catalog(self, catalog_type: Type[HCCatalogTypeVar]) -> HCCatalogTypeVar:
"""Load `hipscat` library catalog object with catalog metadata and partition data"""
return catalog_type.read_from_hipscat(self.path, storage_options=self.storage_options)
hc_catalog = catalog_type.read_from_hipscat(self.path, storage_options=self.storage_options)
if hc_catalog.schema is None:
raise ValueError(
"The catalog schema could not be loaded from metadata."
" Ensure your catalog has _common_metadata or _metadata files"
)
return hc_catalog

def _load_dask_df_and_map(self, catalog: HCHealpixDataset) -> Tuple[dd.DataFrame, DaskDFPixelMap]:
"""Load Dask DF from parquet files and make dict of HEALPix pixel to partition index"""
@@ -71,29 +78,30 @@ def _get_paths_from_pixels(
def _load_df_from_paths(
self, catalog: HCHealpixDataset, paths: List[hc.io.FilePointer], divisions: Tuple[int, ...] | None
) -> dd.DataFrame:
dask_meta_schema = self._load_metadata_schema(catalog)
if self.config.columns:
dask_meta_schema = dask_meta_schema[self.config.columns]
kwargs = dict(self.config.kwargs)
if self.config.dtype_backend is not None:
kwargs["dtype_backend"] = self.config.dtype_backend
dask_meta_schema = self._create_dask_meta_schema(catalog.schema)
if len(paths) > 0:
return dd.from_map(
file_io.read_parquet_file_to_pandas,
paths,
columns=self.config.columns,
divisions=divisions,
meta=dask_meta_schema,
schema=catalog.schema,
storage_options=self.storage_options,
**kwargs,
**self._get_kwargs(),
)
return dd.from_pandas(dask_meta_schema, npartitions=1)

def _load_metadata_schema(self, catalog: HCHealpixDataset) -> pd.DataFrame:
metadata_pointer = hc.io.paths.get_common_metadata_pointer(catalog.catalog_base_dir)
metadata = file_io.read_parquet_metadata(metadata_pointer, storage_options=self.storage_options)
return (
metadata.schema.to_arrow_schema()
.empty_table()
.to_pandas(types_mapper=self.config.get_dtype_mapper())
)
def _create_dask_meta_schema(self, schema: pa.Schema) -> pd.DataFrame:
"""Creates the Dask meta DataFrame from the HiPSCat catalog schema."""
dask_meta_schema = schema.empty_table().to_pandas(types_mapper=self.config.get_dtype_mapper())
if self.config.columns is not None:
dask_meta_schema = dask_meta_schema[self.config.columns]
return dask_meta_schema

def _get_kwargs(self) -> dict:
"""Constructs additional arguments for the `read_parquet` call"""
kwargs = dict(self.config.kwargs)
if self.config.dtype_backend is not None:
kwargs["dtype_backend"] = self.config.dtype_backend
return kwargs
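`_create_dask_meta_schema` replaces the old `_load_metadata_schema`: instead of re-reading `_common_metadata` from storage, the meta is built from the arrow schema the catalog already holds. The core conversion, sketched standalone with hypothetical columns:

```python
import pyarrow as pa

# An arrow schema round-trips to a zero-row pandas DataFrame that
# Dask can use as its meta: correct columns and dtypes, no data.
schema = pa.schema([("ra", pa.float64()), ("dec", pa.float64())])
meta = schema.empty_table().to_pandas()

print(len(meta), meta.dtypes.to_dict())  # 0 rows; ra/dec are float64
```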
2 changes: 1 addition & 1 deletion src/lsdb/loaders/hipscat/association_catalog_loader.py
@@ -22,6 +22,6 @@ def load_catalog(self) -> AssociationCatalog:
return AssociationCatalog(dask_df, dask_df_pixel_map, hc_catalog)

def _load_empty_dask_df_and_map(self, hc_catalog):
dask_meta_schema = self._load_metadata_schema(hc_catalog)
dask_meta_schema = self._create_dask_meta_schema(hc_catalog.schema)
ddf = dd.from_pandas(dask_meta_schema, npartitions=1)
return ddf, {}
22 changes: 13 additions & 9 deletions src/lsdb/loaders/hipscat/hipscat_catalog_loader.py
@@ -2,9 +2,10 @@

import hipscat as hc

import lsdb
from lsdb.catalog.catalog import Catalog, MarginCatalog
from lsdb.loaders.hipscat.abstract_catalog_loader import AbstractCatalogLoader
from lsdb.loaders.hipscat.hipscat_loading_config import HipscatLoadingConfig
from lsdb.loaders.hipscat.margin_catalog_loader import MarginCatalogLoader


class HipscatCatalogLoader(AbstractCatalogLoader[Catalog]):
@@ -40,6 +41,7 @@ def _filter_hipscat_catalog(self, hc_catalog: hc.catalog.Catalog) -> hc.catalog.Catalog:
filtered_catalog.pixel_tree,
catalog_path=hc_catalog.catalog_path,
moc=filtered_catalog.moc,
schema=filtered_catalog.schema,
storage_options=hc_catalog.storage_options,
)

@@ -53,13 +55,15 @@ def _load_margin_catalog(self) -> MarginCatalog | None:
# pylint: disable=protected-access
margin_catalog = margin_catalog.search(self.config.search_filter)
elif self.config.margin_cache is not None:
margin_catalog = lsdb.read_hipscat(
path=self.config.margin_cache,
catalog_type=MarginCatalog,
search_filter=self.config.search_filter,
margin_cache=None,
dtype_backend=self.config.dtype_backend,
margin_catalog = MarginCatalogLoader(
str(self.config.margin_cache),
HipscatLoadingConfig(
search_filter=self.config.search_filter,
columns=self.config.columns,
margin_cache=None,
dtype_backend=self.config.dtype_backend,
**self.config.kwargs,
),
storage_options=self.storage_options,
**self.config.kwargs,
)
).load_catalog()
return margin_catalog
1 change: 1 addition & 0 deletions src/lsdb/loaders/hipscat/margin_catalog_loader.py
@@ -34,5 +34,6 @@ def _filter_hipscat_catalog(self, hc_catalog: hc.catalog.MarginCatalog) -> hc.catalog.MarginCatalog:
filtered_catalog.catalog_info,
filtered_catalog.pixel_tree,
catalog_path=hc_catalog.catalog_path,
schema=filtered_catalog.schema,
storage_options=hc_catalog.storage_options,
)
6 changes: 6 additions & 0 deletions tests/conftest.py
@@ -24,6 +24,7 @@
SMALL_SKY_TO_ORDER1_SOURCE_NAME = "small_sky_to_o1source"
SMALL_SKY_TO_ORDER1_SOURCE_SOFT_NAME = "small_sky_to_o1source_soft"
SMALL_SKY_ORDER1_CSV = "small_sky_order1.csv"
SMALL_SKY_NO_METADATA = "small_sky_no_metadata"
XMATCH_CORRECT_FILE = "xmatch_correct.csv"
XMATCH_CORRECT_005_FILE = "xmatch_correct_0_005.csv"
XMATCH_CORRECT_002_005_FILE = "xmatch_correct_002_005.csv"
@@ -200,6 +201,11 @@ def small_sky_order3_source_margin_catalog(test_data_dir):
return lsdb.read_hipscat(test_data_dir / SMALL_SKY_ORDER3_SOURCE_MARGIN_NAME)


@pytest.fixture
def small_sky_no_metadata_dir(test_data_dir):
return test_data_dir / "raw" / SMALL_SKY_NO_METADATA


@pytest.fixture
def xmatch_expected_dir(test_data_dir):
return test_data_dir / "raw" / "xmatch_expected"
8 changes: 8 additions & 0 deletions tests/data/raw/small_sky_no_metadata/catalog_info.json
@@ -0,0 +1,8 @@
{
"catalog_name": "small_sky",
"catalog_type": "object",
"total_rows": 131,
"epoch": "J2000",
"ra_column": "ra",
"dec_column": "dec"
}
2 changes: 2 additions & 0 deletions tests/data/raw/small_sky_no_metadata/partition_info.csv
@@ -0,0 +1,2 @@
Norder,Npix,Dir
0,11,0
53 changes: 53 additions & 0 deletions tests/data/raw/small_sky_no_metadata/provenance_info.json
@@ -0,0 +1,53 @@
{
"catalog_name": "small_sky",
"catalog_type": "object",
"total_rows": 131,
"epoch": "J2000",
"ra_column": "ra",
"dec_column": "dec",
"version": "0.2.7.dev15+g85ec4a0",
"generation_date": "2024.03.06",
"tool_args": {
"tool_name": "hipscat_import",
"version": "0.2.5.dev5+g0733afb",
"runtime_args": {
"catalog_name": "small_sky",
"output_path": ".",
"output_artifact_name": "small_sky",
"tmp_dir": "/tmp/user/11115/tmphywoxno9",
"overwrite": true,
"dask_tmp": "",
"dask_n_workers": 1,
"dask_threads_per_worker": 1,
"catalog_path": "./small_sky",
"tmp_path": "/tmp/user/11115/tmphywoxno9/small_sky/intermediate",
"epoch": "J2000",
"catalog_type": "object",
"input_path": null,
"input_paths": [
"small_sky_order1/small_sky_order1.csv"
],
"input_file_list": [
"small_sky_order1/small_sky_order1.csv"
],
"ra_column": "ra",
"dec_column": "dec",
"use_hipscat_index": false,
"sort_columns": null,
"constant_healpix_order": -1,
"highest_healpix_order": 7,
"pixel_threshold": 1000000,
"mapping_healpix_order": 7,
"debug_stats_only": false,
"file_reader_info": {
"input_reader_type": "CsvReader",
"chunksize": 500000,
"header": "infer",
"schema_file": null,
"separator": ",",
"column_names": null,
"type_map": {}
}
}
}
}