Skip to content

Commit 3b5fb4b

Browse files
authored
apacheGH-2988: Supports disabling statistics for specific columns (apache#2989)
1 parent 3ac860e commit 3b5fb4b

File tree

8 files changed

+353
-18
lines changed

8 files changed

+353
-18
lines changed

parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ public class ParquetProperties {
6363
public static final double DEFAULT_BLOOM_FILTER_FPP = 0.01;
6464
public static final boolean DEFAULT_ADAPTIVE_BLOOM_FILTER_ENABLED = false;
6565
public static final int DEFAULT_BLOOM_FILTER_CANDIDATES_NUMBER = 5;
66+
public static final boolean DEFAULT_STATISTICS_ENABLED = true;
6667

6768
public static final boolean DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED = true;
6869

@@ -122,6 +123,7 @@ public static WriterVersion fromString(String name) {
122123
private final boolean pageWriteChecksumEnabled;
123124
private final ColumnProperty<ByteStreamSplitMode> byteStreamSplitEnabled;
124125
private final Map<String, String> extraMetaData;
126+
private final ColumnProperty<Boolean> statistics;
125127

126128
private ParquetProperties(Builder builder) {
127129
this.pageSizeThreshold = builder.pageSize;
@@ -149,6 +151,7 @@ private ParquetProperties(Builder builder) {
149151
this.pageWriteChecksumEnabled = builder.pageWriteChecksumEnabled;
150152
this.byteStreamSplitEnabled = builder.byteStreamSplitEnabled.build();
151153
this.extraMetaData = builder.extraMetaData;
154+
this.statistics = builder.statistics.build();
152155
}
153156

154157
public static Builder builder() {
@@ -330,6 +333,10 @@ public Map<String, String> getExtraMetaData() {
330333
return extraMetaData;
331334
}
332335

336+
public boolean getStatisticsEnabled(ColumnDescriptor column) {
337+
return statistics.getValue(column);
338+
}
339+
333340
@Override
334341
public String toString() {
335342
return "Parquet page size to " + getPageSizeThreshold() + '\n'
@@ -372,6 +379,7 @@ public static class Builder {
372379
private boolean pageWriteChecksumEnabled = DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED;
373380
private final ColumnProperty.Builder<ByteStreamSplitMode> byteStreamSplitEnabled;
374381
private Map<String, String> extraMetaData = new HashMap<>();
382+
private final ColumnProperty.Builder<Boolean> statistics;
375383

376384
private Builder() {
377385
enableDict = ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_IS_DICTIONARY_ENABLED);
@@ -387,6 +395,7 @@ private Builder() {
387395
ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_ADAPTIVE_BLOOM_FILTER_ENABLED);
388396
numBloomFilterCandidates =
389397
ColumnProperty.<Integer>builder().withDefaultValue(DEFAULT_BLOOM_FILTER_CANDIDATES_NUMBER);
398+
statistics = ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_STATISTICS_ENABLED);
390399
}
391400

392401
private Builder(ParquetProperties toCopy) {
@@ -409,6 +418,7 @@ private Builder(ParquetProperties toCopy) {
409418
this.maxBloomFilterBytes = toCopy.maxBloomFilterBytes;
410419
this.byteStreamSplitEnabled = ColumnProperty.builder(toCopy.byteStreamSplitEnabled);
411420
this.extraMetaData = toCopy.extraMetaData;
421+
this.statistics = ColumnProperty.builder(toCopy.statistics);
412422
}
413423

414424
/**
@@ -657,6 +667,18 @@ public Builder withExtraMetaData(Map<String, String> extraMetaData) {
657667
return this;
658668
}
659669

670+
/**
671+
* Enables or disables statistics for the given column. Statistics are enabled for all columns by default.
672+
*
673+
* @param columnPath the given column
674+
* @param enabled enable or disable
675+
* @return this builder for method chaining
676+
*/
677+
public Builder withStatisticsEnabled(String columnPath, boolean enabled) {
678+
this.statistics.withValue(columnPath, enabled);
679+
return this;
680+
}
681+
660682
public ParquetProperties build() {
661683
ParquetProperties properties = new ParquetProperties(this);
662684
// we pass a constructed but uninitialized factory to ParquetProperties above as currently

parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnValueCollector.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,19 +36,23 @@
3636
class ColumnValueCollector {
3737

3838
private final ColumnDescriptor path;
39+
private final boolean statisticsEnabled;
3940
private BloomFilterWriter bloomFilterWriter;
4041
private BloomFilter bloomFilter;
4142
private Statistics<?> statistics;
4243
private SizeStatistics.Builder sizeStatisticsBuilder;
4344

4445
ColumnValueCollector(ColumnDescriptor path, BloomFilterWriter bloomFilterWriter, ParquetProperties props) {
4546
this.path = path;
47+
this.statisticsEnabled = props.getStatisticsEnabled(path);
4648
resetPageStatistics();
4749
initBloomFilter(bloomFilterWriter, props);
4850
}
4951

5052
void resetPageStatistics() {
51-
this.statistics = Statistics.createStats(path.getPrimitiveType());
53+
this.statistics = statisticsEnabled
54+
? Statistics.createStats(path.getPrimitiveType())
55+
: Statistics.noopStats(path.getPrimitiveType());
5256
this.sizeStatisticsBuilder = SizeStatistics.newBuilder(
5357
path.getPrimitiveType(), path.getMaxRepetitionLevel(), path.getMaxDefinitionLevel());
5458
}
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.parquet.column.statistics;
20+
21+
import org.apache.parquet.io.api.Binary;
22+
import org.apache.parquet.schema.PrimitiveType;
23+
24+
/**
25+
* A no-op statistics implementation that ignores all updates and always reports itself as empty.
26+
*/
27+
class NoopStatistics<T extends Comparable<T>> extends Statistics<T> {
28+
29+
NoopStatistics(PrimitiveType type) {
30+
super(type);
31+
}
32+
33+
@Override
34+
public void updateStats(int value) {}
35+
36+
@Override
37+
public void updateStats(long value) {}
38+
39+
@Override
40+
public void updateStats(float value) {}
41+
42+
@Override
43+
public void updateStats(double value) {}
44+
45+
@Override
46+
public void updateStats(boolean value) {}
47+
48+
@Override
49+
public void updateStats(Binary value) {}
50+
51+
@Override
52+
public boolean equals(Object other) {
53+
if (other == this) return true;
54+
if (!(other instanceof Statistics)) return false;
55+
Statistics stats = (Statistics) other;
56+
return type().equals(stats.type());
57+
}
58+
59+
@Override
60+
public int hashCode() {
61+
return 31 * type().hashCode();
62+
}
63+
64+
@Override
65+
protected void mergeStatisticsMinMax(Statistics stats) {}
66+
67+
@Override
68+
public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes) {}
69+
70+
@Override
71+
public T genericGetMin() {
72+
throw new UnsupportedOperationException(
73+
"genericGetMin is not supported by " + getClass().getName());
74+
}
75+
76+
@Override
77+
public T genericGetMax() {
78+
throw new UnsupportedOperationException(
79+
"genericGetMax is not supported by " + getClass().getName());
80+
}
81+
82+
@Override
83+
public byte[] getMaxBytes() {
84+
throw new UnsupportedOperationException(
85+
"getMaxBytes is not supported by " + getClass().getName());
86+
}
87+
88+
@Override
89+
public byte[] getMinBytes() {
90+
throw new UnsupportedOperationException(
91+
"getMinBytes is not supported by " + getClass().getName());
92+
}
93+
94+
@Override
95+
String stringify(T value) {
96+
throw new UnsupportedOperationException(
97+
"stringify is not supported by " + getClass().getName());
98+
}
99+
100+
@Override
101+
public boolean isSmallerThan(long size) {
102+
throw new UnsupportedOperationException(
103+
"isSmallerThan is not supported by " + getClass().getName());
104+
}
105+
106+
@Override
107+
public long getNumNulls() {
108+
return -1;
109+
}
110+
111+
@Override
112+
public boolean isEmpty() {
113+
return true;
114+
}
115+
116+
@Override
117+
public boolean hasNonNullValue() {
118+
return false;
119+
}
120+
121+
@Override
122+
public boolean isNumNullsSet() {
123+
return false;
124+
}
125+
126+
@Override
127+
public Statistics<T> copy() {
128+
return new NoopStatistics<>(this.type());
129+
}
130+
}

parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,15 @@ public static Statistics<?> createStats(Type type) {
252252
}
253253
}
254254

255+
/**
256+
* Creates a no-op {@code NoopStatistics} instance. It is used only when the user disables statistics for the specified column.
257+
* @param type type of the column
258+
* @return a noop statistics
259+
*/
260+
public static Statistics<?> noopStats(Type type) {
261+
return new NoopStatistics<>(type.asPrimitiveType());
262+
}
263+
255264
/**
256265
* Returns a builder to create new statistics object. Used to read the statistics from the parquet file.
257266
*

parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,11 @@
3030
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
3131
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
3232
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96;
33+
import static org.apache.parquet.schema.Type.Repetition.REQUIRED;
3334
import static org.junit.Assert.assertArrayEquals;
3435
import static org.junit.Assert.assertEquals;
3536
import static org.junit.Assert.assertFalse;
37+
import static org.junit.Assert.assertThrows;
3638
import static org.junit.Assert.assertTrue;
3739

3840
import java.nio.ByteBuffer;
@@ -900,4 +902,29 @@ public void testSpecBuilderForDouble() {
900902
assertEquals(0, Double.compare(-0.0, (Double) stats.genericGetMin()));
901903
assertEquals(0, Double.compare(0.0, (Double) stats.genericGetMax()));
902904
}
905+
906+
@Test
907+
public void testNoopStatistics() {
908+
// No-op statistics must ignore every update and remain empty
909+
integerArray = new int[] {1, 3, 14, 54, 66, 8, 0, 23, 54};
910+
Statistics<?> stats = Statistics.noopStats(new PrimitiveType(REQUIRED, INT32, "int32"));
911+
assertTrue(stats.isEmpty());
912+
913+
for (int i : integerArray) {
914+
stats.updateStats(i);
915+
}
916+
917+
assertEquals(stats.getNumNulls(), -1);
918+
assertFalse(stats.hasNonNullValue());
919+
assertFalse(stats.isNumNullsSet());
920+
assertTrue(stats.isEmpty());
921+
922+
assertThrows(UnsupportedOperationException.class, stats::genericGetMax);
923+
assertThrows(UnsupportedOperationException.class, stats::genericGetMin);
924+
assertThrows(UnsupportedOperationException.class, stats::getMaxBytes);
925+
assertThrows(UnsupportedOperationException.class, stats::getMinBytes);
926+
assertThrows(UnsupportedOperationException.class, stats::maxAsString);
927+
assertThrows(UnsupportedOperationException.class, stats::minAsString);
928+
assertThrows(UnsupportedOperationException.class, () -> stats.isSmallerThan(0));
929+
}
903930
}

parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -897,6 +897,18 @@ public SELF config(String property, String value) {
897897
return self();
898898
}
899899

900+
/**
901+
* Enables or disables statistics for the specified column. Statistics are enabled for all columns by default.
902+
*
903+
* @param columnPath the path of the column (dot-string)
904+
* @param enabled whether to calculate statistics for the column
905+
* @return this builder for method chaining
906+
*/
907+
public SELF withStatisticsEnabled(String columnPath, boolean enabled) {
908+
encodingPropsBuilder.withStatisticsEnabled(columnPath, enabled);
909+
return self();
910+
}
911+
900912
/**
901913
* Build a {@link ParquetWriter} with the accumulated configuration.
902914
*

0 commit comments

Comments
 (0)