Clean up unused methods. Prepare for file_system argument. (#315)
* Clean up unused methods. Prepare for file_system argument.

* Increase test coverage

* Update src/hipscat/io/file_io/file_pointer.py

Co-authored-by: Konstantin Malanchev <[email protected]>

---------

Co-authored-by: Konstantin Malanchev <[email protected]>
delucchi-cmu and hombit authored Jul 26, 2024
1 parent 2c625ce commit 3234fcf
Showing 12 changed files with 69 additions and 77 deletions.
6 changes: 3 additions & 3 deletions src/.pylintrc
```diff
@@ -493,16 +493,16 @@ score=yes
 ignore-comments=yes
 
 # Docstrings are removed from the similarity computation
-ignore-docstrings=yes
+ignore-docstrings=no
 
 # Imports are removed from the similarity computation
 ignore-imports=yes
 
 # Signatures are removed from the similarity computation
-ignore-signatures=yes
+ignore-signatures=no
 
 # Minimum lines number of a similarity.
-min-similarity-lines=6
+min-similarity-lines=8
 
 
 [SPELLING]
```
8 changes: 2 additions & 6 deletions src/hipscat/io/__init__.py
```diff
@@ -4,6 +4,7 @@
 from .parquet_metadata import (
     read_row_group_fragments,
     row_group_stat_single_value,
+    write_parquet_metadata,
     write_parquet_metadata_for_batches,
 )
 from .paths import (
@@ -18,9 +19,4 @@
     pixel_catalog_file,
     pixel_directory,
 )
-from .write_metadata import (
-    write_catalog_info,
-    write_parquet_metadata,
-    write_partition_info,
-    write_provenance_info,
-)
+from .write_metadata import write_catalog_info, write_partition_info, write_provenance_info
```
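Taken together, these import changes keep the public name stable while moving its backing implementation from `write_metadata` to `parquet_metadata`. A minimal sketch of the resulting `hipscat.io` surface, grounded in the diff above:

```python
# `write_parquet_metadata` keeps its public name; only the module backing it moves.
from hipscat.io import write_parquet_metadata

# The remaining writers are still re-exported from write_metadata:
from hipscat.io import write_catalog_info, write_partition_info, write_provenance_info
```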
2 changes: 1 addition & 1 deletion src/hipscat/io/file_io/__init__.py
```diff
@@ -1,13 +1,13 @@
 from .file_io import (
     delete_file,
     load_csv_to_pandas,
+    load_csv_to_pandas_generator,
     load_json_file,
     load_parquet_to_pandas,
     load_text_file,
     make_directory,
     read_fits_image,
     read_parquet_dataset,
-    read_parquet_file,
     read_parquet_file_to_pandas,
     read_parquet_metadata,
     remove_directory,
```
40 changes: 26 additions & 14 deletions src/hipscat/io/file_io/file_io.py
```diff
@@ -3,7 +3,7 @@
 import json
 import tempfile
 import warnings
-from typing import Any, Dict, Tuple, Union
+from typing import Any, Dict, Generator, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -143,9 +143,32 @@ def load_csv_to_pandas(
     Returns:
         pandas dataframe loaded from CSV
     """
-    file_system, file_pointer = get_fs(file_pointer, storage_options=storage_options)
-    with file_system.open(file_pointer, "r") as csv_file:
-        frame = pd.read_csv(csv_file, **kwargs)
-    return frame
+    pd_storage_option = unnest_headers_for_pandas(storage_options)
+    return pd.read_csv(file_pointer, storage_options=pd_storage_option, **kwargs)
+
+
+def load_csv_to_pandas_generator(
+    file_pointer: FilePointer,
+    chunksize=10_000,
+    *,
+    storage_options: Union[Dict[Any, Any], None] = None,
+    **kwargs,
+) -> Generator[pd.DataFrame, None, None]:
+    """Load a csv file to a pandas dataframe, in chunks
+
+    Args:
+        file_pointer: location of csv file to load
+        chunksize: number of rows to load per chunk
+        file_system: fsspec or pyarrow filesystem, default None
+        storage_options: dictionary that contains abstract filesystem credentials
+        **kwargs: arguments to pass to pandas `read_csv` loading method
+    Returns:
+        generator of pandas dataframes, one chunk of the CSV at a time
+    """
+    file_system, file_pointer = get_fs(file_pointer, storage_options=storage_options)
+    with file_system.open(file_pointer, "r") as csv_file:
+        with pd.read_csv(csv_file, chunksize=chunksize, **kwargs) as reader:
+            yield from reader
 
 
 def load_parquet_to_pandas(
@@ -251,17 +274,6 @@ def read_parquet_dataset(
     return (source, dataset)
 
 
-def read_parquet_file(file_pointer: FilePointer, storage_options: Union[Dict[Any, Any], None] = None):
-    """Read parquet file from file pointer.
-
-    Args:
-        file_pointer: location of file to read metadata from
-        storage_options: dictionary that contains abstract filesystem credentials
-    """
-    file_system, file_pointer = get_fs(file_pointer, storage_options=storage_options)
-    return pq.ParquetFile(file_pointer, filesystem=file_system)
-
-
 def write_parquet_metadata(
     schema: Any,
     file_pointer: FilePointer,
```
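The new generator mirrors `load_csv_to_pandas` but streams the file in `chunksize`-row pieces, so a large CSV never has to be resident in memory all at once. A minimal usage sketch (the path is illustrative; the test added further below exercises the same pattern):

```python
import pandas as pd

from hipscat.io.file_io import load_csv_to_pandas, load_csv_to_pandas_generator

csv_path = "catalog/partition_info.csv"  # illustrative path

# Whole-file read: pandas resolves the path itself, with any credentials
# forwarded via storage_options.
frame = load_csv_to_pandas(csv_path)

# Chunked read: each yielded DataFrame holds at most `chunksize` rows.
pieces = list(load_csv_to_pandas_generator(csv_path, chunksize=10_000))
combined = pd.concat(pieces, ignore_index=True)
assert len(combined) == len(frame)
```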
4 changes: 2 additions & 2 deletions src/hipscat/io/file_io/file_pointer.py
```diff
@@ -71,8 +71,8 @@ def get_file_pointer_for_fs(protocol: str, file_pointer: FilePointer) -> FilePointer:
             split_pointer = file_pointer.split("file://")[1]
         else:
             split_pointer = file_pointer
-    elif protocol == "https":
-        # https should include the protocol in the file path
+    elif protocol.startswith("http"):
+        # http/https should include the protocol in the file path
         split_pointer = file_pointer  # don't split
     else:
         split_pointer = file_pointer.split(f"{protocol}://")[1]
```
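To make the protocol handling concrete, here is a reduced, self-contained sketch of the branch being changed. `strip_protocol` is a stand-in name for illustration, not the library function:

```python
def strip_protocol(protocol: str, file_pointer: str) -> str:
    """Reduced model of get_file_pointer_for_fs's protocol handling."""
    if protocol == "file":
        # local paths may or may not carry an explicit file:// prefix
        return file_pointer.split("file://")[1] if "file://" in file_pointer else file_pointer
    if protocol.startswith("http"):
        # http/https pointers keep the protocol in the path; this commit
        # extends the existing https behavior to plain http
        return file_pointer
    # all other protocols (s3, abfs, ...) drop the scheme prefix
    return file_pointer.split(f"{protocol}://")[1]


assert strip_protocol("http", "http://example.org/catalog") == "http://example.org/catalog"
assert strip_protocol("https", "https://example.org/catalog") == "https://example.org/catalog"
assert strip_protocol("abfs", "abfs://container/catalog") == "container/catalog"
```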
16 changes: 0 additions & 16 deletions src/hipscat/io/write_metadata.py
```diff
@@ -11,7 +11,6 @@
 import pandas as pd
 
 from hipscat.io import file_io, paths
-from hipscat.io.parquet_metadata import write_parquet_metadata as wpm
 from hipscat.pixel_math.healpix_pixel import HealpixPixel
 
 
@@ -127,21 +126,6 @@ def write_partition_info(
     )
 
 
-def write_parquet_metadata(catalog_path, storage_options: Union[Dict[Any, Any], None] = None):
-    """Generate parquet metadata, using the already-partitioned parquet files
-    for this catalog
-
-    Args:
-        catalog_path (str): base path for the catalog
-        storage_options: dictionary that contains abstract filesystem credentials
-    """
-    wpm(
-        catalog_path=catalog_path,
-        storage_options=storage_options,
-        output_path=catalog_path,
-    )
-
-
 def write_fits_map(catalog_path, histogram: np.ndarray, storage_options: Union[Dict[Any, Any], None] = None):
     """Write the object spatial distribution information to a healpix FITS file.
```
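A migration sketch for callers of the removed wrapper: it simply forwarded to the `parquet_metadata` implementation, writing the metadata back into the catalog directory. Calling that implementation directly is equivalent; the keyword names below are taken from the removed wrapper's own call, and the path is illustrative:

```python
from hipscat.io.parquet_metadata import write_parquet_metadata

catalog_path = "path/to/catalog"  # illustrative path
write_parquet_metadata(catalog_path=catalog_path, output_path=catalog_path)
```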
6 changes: 0 additions & 6 deletions tests/conftest.py
```diff
@@ -19,7 +19,6 @@
 ALMANAC_DIR_NAME = "almanac"
 SMALL_SKY_DIR_NAME = "small_sky"
 SMALL_SKY_ORDER1_DIR_NAME = "small_sky_order1"
-SMALL_SKY_TO_SMALL_SKY_ORDER1_DIR_NAME = "small_sky_to_small_sky_order1"
 SMALL_SKY_SOURCE_OBJECT_INDEX_DIR_NAME = "small_sky_source_object_index"
 
 TEST_DIR = os.path.dirname(__file__)
@@ -47,11 +46,6 @@ def small_sky_order1_dir(test_data_dir):
     return test_data_dir / SMALL_SKY_ORDER1_DIR_NAME
 
 
-@pytest.fixture
-def small_sky_to_small_sky_order1_dir(test_data_dir):
-    return test_data_dir / SMALL_SKY_TO_SMALL_SKY_ORDER1_DIR_NAME
-
-
 @pytest.fixture
 def small_sky_source_object_index_dir(test_data_dir):
     return test_data_dir / SMALL_SKY_SOURCE_OBJECT_INDEX_DIR_NAME
```
16 changes: 16 additions & 0 deletions tests/hipscat/catalog/loaders/test_read_from_hipscat.py
```diff
@@ -9,3 +9,19 @@ def test_read_from_hipscat_wrong_catalog_type(small_sky_dir):
         read_from_hipscat(small_sky_dir, catalog_type=CatalogType.ASSOCIATION)
     with pytest.raises(NotImplementedError, match="load catalog of type"):
         read_from_hipscat(small_sky_dir, catalog_type="unknown")
+
+
+def test_read_hipscat_branches(
+    small_sky_dir,
+    small_sky_order1_dir,
+    association_catalog_path,
+    small_sky_source_object_index_dir,
+    margin_catalog_path,
+    small_sky_source_dir,
+):
+    read_from_hipscat(small_sky_dir)
+    read_from_hipscat(small_sky_order1_dir)
+    read_from_hipscat(association_catalog_path)
+    read_from_hipscat(small_sky_source_object_index_dir)
+    read_from_hipscat(margin_catalog_path)
+    read_from_hipscat(small_sky_source_dir)
```
3 changes: 2 additions & 1 deletion tests/hipscat/io/conftest.py
```diff
@@ -3,6 +3,7 @@
 import re
 
 import numpy.testing as npt
+import pyarrow.parquet as pq
 import pytest
 
 from hipscat.io import file_io
@@ -64,7 +65,7 @@ def check_parquet_schema(file_name, expected_schema, expected_num_row_groups=1):
 
     assert schema.equals(expected_schema, check_metadata=False)
 
-    parquet_file = file_io.read_parquet_file(file_name)
+    parquet_file = pq.ParquetFile(file_name)
     assert parquet_file.metadata.num_row_groups == expected_num_row_groups
 
     for row_index in range(0, parquet_file.metadata.num_row_groups):
```
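The removed `file_io.read_parquet_file` helper was a thin wrapper around pyarrow, so tests now open files through pyarrow directly. A minimal equivalent for a local file (the path is illustrative; for remote stores pyarrow also accepts a `filesystem` argument):

```python
import pyarrow.parquet as pq

parquet_file = pq.ParquetFile("catalog/Norder=0/Dir=0/Npix=11.parquet")
print(parquet_file.metadata.num_row_groups)  # row-group count from the footer
print(parquet_file.schema_arrow)             # schema as a pyarrow.Schema
```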
17 changes: 17 additions & 0 deletions tests/hipscat/io/file_io/test_file_io.py
```diff
@@ -7,6 +7,8 @@
 from hipscat.io.file_io import (
     delete_file,
     get_file_pointer_from_path,
+    load_csv_to_pandas,
+    load_csv_to_pandas_generator,
     load_json_file,
     load_parquet_to_pandas,
     make_directory,
@@ -115,6 +117,21 @@ def test_load_json(small_sky_dir):
     assert loaded_json_dict == json_dict
 
 
+def test_load_csv_to_pandas(small_sky_source_dir):
+    partition_info_path = small_sky_source_dir / "partition_info.csv"
+    frame = load_csv_to_pandas(partition_info_path)
+    assert len(frame) == 14
+
+
+def test_load_csv_to_pandas_generator(small_sky_source_dir):
+    partition_info_path = small_sky_source_dir / "partition_info.csv"
+    num_reads = 0
+    for frame in load_csv_to_pandas_generator(partition_info_path, chunksize=7):
+        assert len(frame) == 7
+        num_reads += 1
+    assert num_reads == 2
+
+
 def test_load_parquet_to_pandas(small_sky_dir):
     pixel_data_path = pixel_catalog_file(small_sky_dir, 0, 11)
     parquet_df = pd.read_parquet(pixel_data_path)
```
27 changes: 0 additions & 27 deletions tests/hipscat/io/test_write_metadata.py
```diff
@@ -1,6 +1,5 @@
 """Tests of file IO (reads and writes)"""
 
-import shutil
 from pathlib import Path
 
 import numpy as np
@@ -178,32 +177,6 @@ def test_write_partition_info_float(assert_text_file_matches, tmp_path):
     assert_text_file_matches(expected_lines, metadata_filename)
 
 
-def test_write_parquet_metadata(tmp_path, small_sky_dir, small_sky_schema, check_parquet_schema):
-    """Copy existing catalog and create new metadata files for it"""
-    catalog_base_dir = tmp_path / "catalog"
-    shutil.copytree(
-        small_sky_dir,
-        catalog_base_dir,
-    )
-    io.write_parquet_metadata(catalog_base_dir)
-    check_parquet_schema(catalog_base_dir / "_metadata", small_sky_schema)
-    ## _common_metadata has 0 row groups
-    check_parquet_schema(
-        catalog_base_dir / "_common_metadata",
-        small_sky_schema,
-        0,
-    )
-    ## Re-write - should still have the same properties.
-    io.write_parquet_metadata(catalog_base_dir)
-    check_parquet_schema(catalog_base_dir / "_metadata", small_sky_schema)
-    ## _common_metadata has 0 row groups
-    check_parquet_schema(
-        catalog_base_dir / "_common_metadata",
-        small_sky_schema,
-        0,
-    )
-
-
 def test_read_write_fits_point_map(tmp_path):
     """Check that we write and can read a FITS file for spatial distribution."""
     initial_histogram = hist.empty_histogram(1)
```
1 change: 0 additions & 1 deletion tests/hipscat/pixel_math/test_partition_stats.py
```diff
@@ -169,7 +169,6 @@ def test_alignment_small_sky_order2(drop_empty_siblings):
         (0, 11, 131),
     ]
     expected[176:192] = tuples
-    print(result[176:192])
 
     npt.assert_array_equal(result, expected)
```
