Skip to content

Commit

Permalink
[T2] Wide column metadata improvements
Browse files Browse the repository at this point in the history
1. Make `ColumnMetaData.type` optional
2. Make `ColumnMetaData.path_in_schema` optional
3. Add `ColumnMetaData.schema_index`. This is the index of the element in `FileMetaData.schema` that this column corresponds to. It allows a sparse representation of columns in a row group.
  • Loading branch information
alkis committed May 29, 2024
1 parent 384bedd commit 9f5b94e
Showing 1 changed file with 40 additions and 24 deletions.
64 changes: 40 additions & 24 deletions src/main/thrift/parquet.thrift
Original file line number Diff line number Diff line change
Expand Up @@ -242,43 +242,36 @@ struct SizeStatistics {
* All fields are optional.
*/
struct Statistics {
/**
* DEPRECATED: min and max value of the column. Use min_value and max_value.
*
* Values are encoded using PLAIN encoding, except that variable-length byte
* arrays do not include a length prefix.
*
* These fields encode min and max values determined by signed comparison
* only. New files should use the correct order for a column's logical type
* and store the values in the min_value and max_value fields.
*
* To support older readers, these may be set when the column order is
* signed.
*/
/* DEPRECATED: do not use */
1: optional binary max;
2: optional binary min;
/** count of null values in the column */
3: optional i64 null_count;
/** count of distinct values occurring */
4: optional i64 distinct_count;
/**
* Lower and upper bound values for the column, determined by its ColumnOrder.
* Only one pair of max_value/min_value, max1/min1, max2/min2, max4/min4,
* max8/min8 can be set. The pair is determined by the physical type of the
* column. Floating point values are bit-cast to integers. Variable length
* values are set in min_value/max_value.
*
* Min and Max are the lower and upper bound values for the column,
* respectively, as determined by its ColumnOrder.
*
* These may be the actual minimum and maximum values found on a page or column
* chunk, but can also be (more compact) values that do not exist on a page or
* column chunk. For example, instead of storing "Blart Versenwald III", a writer
* may set min_value="B", max_value="C". Such more compact values must still be
* valid values within the column's logical type.
*
* Values are encoded using PLAIN encoding, except that variable-length byte
* arrays do not include a length prefix.
*/
5: optional binary max_value;
6: optional binary min_value;
/** If true, max_value is the actual maximum value for a column */
7: optional bool is_max_value_exact;
/** If true, min_value is the actual minimum value for a column */
8: optional bool is_min_value_exact;
9: optional i64 max8;
10: optional i64 min8;
}

/** Empty structs to use as logical type annotations */
Expand Down Expand Up @@ -490,7 +483,7 @@ enum Encoding {
// GROUP_VAR_INT = 1;

/**
* Deprecated: Dictionary encoding. The values in the dictionary are encoded in the
* DEPRECATED: Dictionary encoding. The values in the dictionary are encoded in the
* plain type.
* in a data page use RLE_DICTIONARY instead.
* in a Dictionary page use PLAIN instead
Expand Down Expand Up @@ -772,15 +765,25 @@ struct PageEncodingStats {
* Description for column metadata
*/
struct ColumnMetaData {
/** Type of this column **/
1: required Type type
/**
* DEPRECATED: can be found in SchemaElement
*
* Writers MUST NOT omit this field until 2025-10-01.
* Readers MUST ignore this field before 2025-10-01.
*/
1: optional Type type

/** Set of all encodings used for this column. The purpose is to validate
* whether we can decode those pages. **/
2: required list<Encoding> encodings

/** Path in schema **/
3: required list<string> path_in_schema
/**
* DEPRECATED: can be found in SchemaElement
*
* Writers MUST NOT omit this field until 2025-10-01.
* Readers MUST ignore this field before 2025-10-01.
*/
3: optional list<string> path_in_schema

/** Compression codec **/
4: required CompressionCodec codec
Expand Down Expand Up @@ -810,9 +813,13 @@ struct ColumnMetaData {
/** optional statistics for this column chunk */
12: optional Statistics statistics;

/** Set of all encodings used for pages in this column chunk.
/**
* DEPRECATED: use is_fully_dict_encoded instead
*
* Set of all encodings used for pages in this column chunk.
* This information can be used to determine if all data pages are
* dictionary encoded for example **/
* dictionary encoded for example
*/
13: optional list<PageEncodingStats> encoding_stats;

/** Byte offset from beginning of file to Bloom filter data. **/
Expand All @@ -833,6 +840,15 @@ struct ColumnMetaData {
* filter pushdown.
*/
16: optional SizeStatistics size_statistics;

/** True if all pages in this column chunk are dictionary encoded */
17: optional bool is_fully_dict_encoded;
/**
* The index into FileMetaData.schema (list<SchemaElement>) for this column.
* This implies that ColumnMetaData can be sparse in a rowgroup, if for example
* a column does not have any data pages in a rowgroup.
*/
18: optional i32 schema_index;
}

struct EncryptionWithFooterKey {
Expand Down

0 comments on commit 9f5b94e

Please sign in to comment.