Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-2962: Set dictionary_page_offset even when encoding_stats are missing #3012

Merged
merged 2 commits into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -557,8 +557,9 @@ private void addRowGroup(
columnMetaData.getTotalUncompressedSize(),
columnMetaData.getTotalSize(),
columnMetaData.getFirstDataPageOffset());
if (columnMetaData.getEncodingStats() != null
&& columnMetaData.getEncodingStats().hasDictionaryPages()) {
if ((columnMetaData.getEncodingStats() != null
&& columnMetaData.getEncodingStats().hasDictionaryPages())
|| columnMetaData.hasDictionaryPage()) {
metaData.setDictionary_page_offset(columnMetaData.getDictionaryPageOffset());
}
long bloomFilterOffset = columnMetaData.getBloomFilterOffset();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,16 @@ public void testSchemaConverterDecimal() {
/**
 * Builds metadata with a PLAIN_DICTIONARY dictionary encoding and a PLAIN data
 * encoding, then hands it to the shared helper of the same name for checking.
 */
@Test
public void testParquetMetadataConverterWithDictionary() throws IOException {
  testParquetMetadataConverterWithDictionary(
      createParquetMetaData(Encoding.PLAIN_DICTIONARY, Encoding.PLAIN));
}

/**
 * Same check as the dictionary test, but the metadata is created with the
 * {@code includeEncodingStats} flag set to {@code false}.
 */
@Test
public void testParquetMetadataConverterWithDictionaryAndWithoutEncodingStats() throws IOException {
  testParquetMetadataConverterWithDictionary(
      createParquetMetaData(Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, false));
}

private void testParquetMetadataConverterWithDictionary(ParquetMetadata parquetMetaData) throws IOException {
ParquetMetadataConverter converter = new ParquetMetadataConverter();
FileMetaData fmd1 = converter.toParquetMetadata(1, parquetMetaData);

Expand Down Expand Up @@ -1283,18 +1292,32 @@ private static Statistics<?> createStatsTyped(PrimitiveType type, BigInteger min
}

/**
 * Convenience overload that delegates to the three-argument variant with
 * encoding stats included.
 *
 * @param dicEncoding dictionary encoding, forwarded unchanged
 * @param dataEncoding data encoding, forwarded unchanged
 * @return the metadata produced by the three-argument overload
 */
private static ParquetMetadata createParquetMetaData(Encoding dicEncoding, Encoding dataEncoding) {
  final boolean includeEncodingStats = true;
  return createParquetMetaData(dicEncoding, dataEncoding, includeEncodingStats);
}

private static ParquetMetadata createParquetMetaData(
Encoding dicEncoding, Encoding dataEncoding, boolean includeEncodingStats) {
MessageType schema = parseMessageType("message schema { optional int32 col (INT_32); }");
org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData =
new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap<String, String>(), null);
List<BlockMetaData> blockMetaDataList = new ArrayList<BlockMetaData>();
BlockMetaData blockMetaData = new BlockMetaData();
EncodingStats.Builder builder = new EncodingStats.Builder();
if (dicEncoding != null) {
builder.addDictEncoding(dicEncoding).build();
EncodingStats es = null;
if (includeEncodingStats) {
EncodingStats.Builder builder = new EncodingStats.Builder();
if (dicEncoding != null) {
builder.addDictEncoding(dicEncoding).build();
}
builder.addDataEncoding(dataEncoding);
es = builder.build();
}
builder.addDataEncoding(dataEncoding);
EncodingStats es = builder.build();
Set<org.apache.parquet.column.Encoding> e = new HashSet<org.apache.parquet.column.Encoding>();
if (!includeEncodingStats) {
if (dicEncoding != null) {
e.add(dicEncoding);
}
e.add(dataEncoding);
}
PrimitiveTypeName t = PrimitiveTypeName.INT32;
ColumnPath p = ColumnPath.get("col");
CompressionCodecName c = CompressionCodecName.UNCOMPRESSED;
Expand Down