Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 35 additions & 6 deletions python/pyarrow/_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -474,8 +474,14 @@ cdef class ColumnChunkMetaData(_Weakrefable):
has_dictionary_page: {self.has_dictionary_page}
dictionary_page_offset: {self.dictionary_page_offset}
data_page_offset: {self.data_page_offset}
has_index_page: {self.has_index_page}
index_page_offset: {self.index_page_offset}
has_bloom_filter: {self.has_bloom_filter}
bloom_filter_offset: {self.bloom_filter_offset}
total_compressed_size: {self.total_compressed_size}
total_uncompressed_size: {self.total_uncompressed_size}"""
total_uncompressed_size: {self.total_uncompressed_size}
has_offset_index: {self.has_offset_index}
has_column_index: {self.has_column_index}"""

def to_dict(self):
"""
Expand Down Expand Up @@ -506,8 +512,14 @@ cdef class ColumnChunkMetaData(_Weakrefable):
has_dictionary_page=self.has_dictionary_page,
dictionary_page_offset=self.dictionary_page_offset,
data_page_offset=self.data_page_offset,
has_index_page=self.has_index_page,
index_page_offset=self.index_page_offset,
has_bloom_filter=self.has_bloom_filter,
bloom_filter_offset=self.bloom_filter_offset,
total_compressed_size=self.total_compressed_size,
total_uncompressed_size=self.total_uncompressed_size
total_uncompressed_size=self.total_uncompressed_size,
has_offset_index=self.has_offset_index,
has_column_index=self.has_column_index
)
return d

Expand Down Expand Up @@ -627,13 +639,30 @@ cdef class ColumnChunkMetaData(_Weakrefable):

@property
def has_index_page(self):
"""Not yet supported."""
raise NotImplementedError('not supported in parquet-cpp')
"""Whether there is an index data present in the column chunk (bool)."""
return self.metadata.has_index_page()

@property
def index_page_offset(self):
"""Not yet supported."""
raise NotImplementedError("parquet-cpp doesn't return valid values")
"""Offset of index page relative to beginning of the file (int or None)."""
return self.metadata.index_page_offset() if self.has_index_page else None

@property
def has_bloom_filter(self):
    """
    Whether the column chunk has a bloom filter (bool).

    True exactly when the chunk metadata reports a bloom filter offset.
    """
    maybe_offset = self.metadata.bloom_filter_offset()
    return maybe_offset.has_value()

@property
def bloom_filter_offset(self):
    """
    Offset of the bloom filter relative to the beginning of the file
    (int or None).

    None is returned when the chunk metadata carries no bloom filter offset.
    """
    maybe_offset = self.metadata.bloom_filter_offset()
    if not maybe_offset.has_value():
        return None
    return maybe_offset.value()

@property
def bloom_filter_length(self):
    """
    Length of the bloom filter (int or None).

    None is returned when the chunk metadata carries no bloom filter length.
    """
    maybe_length = self.metadata.bloom_filter_length()
    if maybe_length.has_value():
        return maybe_length.value()
    return None

@property
def total_compressed_size(self):
Expand Down
3 changes: 3 additions & 0 deletions python/pyarrow/includes/libparquet.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -364,9 +364,12 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
const vector[ParquetEncoding]& encodings() const
c_bool Equals(const CColumnChunkMetaData&) const

optional[int64_t] bloom_filter_offset() const
optional[int64_t] bloom_filter_length() const
int64_t has_dictionary_page() const
int64_t dictionary_page_offset() const
int64_t data_page_offset() const
c_bool has_index_page() const
int64_t index_page_offset() const
int64_t total_compressed_size() const
int64_t total_uncompressed_size() const
Expand Down
38 changes: 34 additions & 4 deletions python/pyarrow/tests/parquet/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,10 +139,12 @@ def test_parquet_metadata_api():
assert col_meta.data_page_offset > 0
assert col_meta.total_compressed_size > 0
assert col_meta.total_uncompressed_size > 0
with pytest.raises(NotImplementedError):
col_meta.has_index_page
with pytest.raises(NotImplementedError):
col_meta.index_page_offset
assert col_meta.has_index_page is False
assert col_meta.index_page_offset is None
assert col_meta.has_bloom_filter is False
assert col_meta.bloom_filter_offset is None
assert col_meta.has_offset_index is False
assert col_meta.has_column_index is False


def test_parquet_metadata_lifetime(tempdir):
Expand Down Expand Up @@ -814,3 +816,31 @@ def msg(c):

with pytest.raises(TypeError, match=msg("FileMetaData")):
pq.FileMetaData()


def test_column_metadata_with_bloom_filter(parquet_test_datadir):
    """Check column-chunk metadata of a file written with a bloom filter."""
    path = (parquet_test_datadir /
            'data_index_bloom_encoding_with_length.parquet')
    col_meta = pq.read_metadata(path).row_group(0).column(0)

    # Dictionary page is present; the legacy index page is not.
    assert col_meta.has_dictionary_page is True
    assert col_meta.dictionary_page_offset == 4
    assert col_meta.has_index_page is False
    assert col_meta.index_page_offset is None

    # Bloom filter and page indexes are all reported for this file.
    assert col_meta.has_bloom_filter is True
    assert col_meta.has_column_index is True
    assert col_meta.has_offset_index is True
    assert col_meta.bloom_filter_offset == 253
    assert col_meta.bloom_filter_length == 2064


def test_column_metadata_with_index_page(parquet_test_datadir):
metadata = pq.read_metadata(parquet_test_datadir /
'nan_in_stats.parquet')
assert metadata.row_group(0).column(0).has_dictionary_page is True
assert metadata.row_group(0).column(0).dictionary_page_offset == 4
assert metadata.row_group(0).column(0).has_index_page is True
assert metadata.row_group(0).column(0).index_page_offset == 0
Copy link
Member Author

@raulcd raulcd Nov 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It feels odd that `has_index_page` is True while `index_page_offset` is 0, but this is a C++ issue (or an issue with the file itself); we are just acting as a pass-through here.

assert metadata.row_group(0).column(0).has_bloom_filter is False
assert metadata.row_group(0).column(0).has_column_index is False
assert metadata.row_group(0).column(0).has_offset_index is False
assert metadata.row_group(0).column(0).bloom_filter_offset is None
assert metadata.row_group(0).column(0).bloom_filter_length is None
Loading