-
Notifications
You must be signed in to change notification settings - Fork 4.8k
HIVE-29544: Fix Vectorized Parquet reading Struct columns with all fields null #6408
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -64,6 +64,8 @@ public void readBatch( | |
| int total, | ||
| ColumnVector column, | ||
| TypeInfo columnType) throws IOException { | ||
| this.currentDefLevels = new int[total]; | ||
| this.defLevelIndex = 0; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you elaborate on this change: I mean, I can see that passing and handling definition level to the struct reader solves the current issue, but this looks like fixing another bug, is it the case? |
||
| int rowId = 0; | ||
| while (total > 0) { | ||
| // Compute the number of values we want to read in this page. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,9 +29,11 @@ | |
| public class VectorizedStructColumnReader implements VectorizedColumnReader { | ||
|
|
||
| private final List<VectorizedColumnReader> fieldReaders; | ||
| private final int structDefLevel; | ||
|
|
||
| public VectorizedStructColumnReader(List<VectorizedColumnReader> fieldReaders) { | ||
| public VectorizedStructColumnReader(List<VectorizedColumnReader> fieldReaders, int structDefLevel) { | ||
| this.fieldReaders = fieldReaders; | ||
| this.structDefLevel = structDefLevel; | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -46,14 +48,35 @@ | |
| fieldReaders.get(i) | ||
| .readBatch(total, vectors[i], structTypeInfo.getAllStructFieldTypeInfos().get(i)); | ||
| structColumnVector.isRepeating = structColumnVector.isRepeating && vectors[i].isRepeating; | ||
| } | ||
| int[] defLevels = null; | ||
| for (VectorizedColumnReader reader : fieldReaders) { | ||
| defLevels = reader.getDefinitionLevels(); | ||
| if (defLevels != null) { | ||
| break; | ||
| } | ||
| } | ||
|
Comment on lines
+53
to
+58
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. seems like repeated logic, could this be handled with |
||
|
|
||
| for (int j = 0; j < vectors[i].isNull.length; j++) { | ||
| structColumnVector.isNull[j] = | ||
| (i == 0) ? vectors[i].isNull[j] : structColumnVector.isNull[j] && vectors[i].isNull[j]; | ||
| // Evaluate struct nullability using Parquet Definition Levels | ||
| if (defLevels != null) { | ||
| for (int j = 0; j < total; j++) { | ||
| if (defLevels[j] < structDefLevel) { | ||
| // The D-Level boundary crossed the struct. The whole struct is null. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: is |
||
| structColumnVector.isNull[j] = true; | ||
| structColumnVector.noNulls = false; | ||
| } | ||
| } | ||
| structColumnVector.noNulls = | ||
| (i == 0) ? vectors[i].noNulls : structColumnVector.noNulls && vectors[i].noNulls; | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public int[] getDefinitionLevels() { | ||
| for (VectorizedColumnReader reader : fieldReaders) { | ||
| int[] defLevels = reader.getDefinitionLevels(); | ||
| if (defLevels != null) { | ||
| return defLevels; | ||
| } | ||
| } | ||
| return null; | ||
|
Check warning on line 80 in ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedStructColumnReader.java
|
||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -317,14 +317,15 @@ protected static void writeData(ParquetWriter<Group> writer, boolean isDictionar | |
| g.addGroup("nsf").append("c", intVal).append("d", intVal); | ||
| g.append("e", doubleVal); | ||
|
|
||
| Group some_null_g = group.addGroup("struct_field_some_null"); | ||
| if (i % 2 != 0) { | ||
| some_null_g.append("f", intVal); | ||
| } | ||
| if (i % 3 != 0) { | ||
| some_null_g.append("g", doubleVal); | ||
| if (i % 6 != 0) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. even if I like the applied math here, which is |
||
| Group some_null_g = group.addGroup("struct_field_some_null"); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. as you're already touching this part, please fix |
||
| if (i % 2 != 0) { | ||
| some_null_g.append("f", intVal); | ||
| } | ||
| if (i % 3 != 0) { | ||
| some_null_g.append("g", doubleVal); | ||
| } | ||
| } | ||
|
|
||
| Group mapGroup = group.addGroup("map_field"); | ||
| if (i % 13 != 1) { | ||
| mapGroup.addGroup("map").append("key", binary).append("value", "abc"); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,31 @@ | ||
| SET hive.vectorized.execution.enabled=true; | ||
| set hive.vectorized.execution.reduce.enabled=true; | ||
| SET hive.fetch.task.conversion=none; | ||
|
|
||
| CREATE TABLE test_parquet_struct_nulls ( | ||
| id INT, | ||
| st_prim STRUCT<x:INT, y:INT> | ||
| ) STORED AS PARQUET; | ||
|
|
||
| INSERT INTO test_parquet_struct_nulls VALUES | ||
| (1, named_struct('x', CAST(NULL AS INT), 'y', CAST(NULL AS INT))), | ||
| (2, if(1=0, named_struct('x', 1, 'y', 1), null)), | ||
| (3, named_struct('x', 3, 'y', CAST(NULL AS INT))), | ||
| (4, named_struct('x', 4, 'y', 4)); | ||
|
|
||
| -- Test A: Full table scan to check JSON representation | ||
| SELECT * FROM test_parquet_struct_nulls ORDER BY id; | ||
|
|
||
| -- Test B: Verify IS NULL evaluates correctly | ||
| SELECT id FROM test_parquet_struct_nulls WHERE st_prim IS NULL; | ||
|
|
||
| -- Test C: Verify IS NOT NULL evaluates correctly | ||
| SELECT id FROM test_parquet_struct_nulls WHERE st_prim IS NOT NULL ORDER BY id; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. instead of ORDER BY, what about: |
||
|
|
||
| -- Test D: Verify field-level null evaluation inside a valid struct | ||
| SELECT id FROM test_parquet_struct_nulls WHERE st_prim IS NOT NULL AND st_prim.x IS NULL; | ||
|
|
||
| -- Validate withou vectorization | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: |
||
| SET hive.vectorized.execution.enabled=true; | ||
| SET hive.vectorized.execution.reduce.enabled=false; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. isn't this simpler and closer to the fact "without vectorization" |
||
| SELECT * FROM test_parquet_struct_nulls ORDER BY id; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,85 @@ | ||
| PREHOOK: query: CREATE TABLE test_parquet_struct_nulls ( | ||
| id INT, | ||
| st_prim STRUCT<x:INT, y:INT> | ||
| ) STORED AS PARQUET | ||
| PREHOOK: type: CREATETABLE | ||
| PREHOOK: Output: database:default | ||
| PREHOOK: Output: default@test_parquet_struct_nulls | ||
| POSTHOOK: query: CREATE TABLE test_parquet_struct_nulls ( | ||
| id INT, | ||
| st_prim STRUCT<x:INT, y:INT> | ||
| ) STORED AS PARQUET | ||
| POSTHOOK: type: CREATETABLE | ||
| POSTHOOK: Output: database:default | ||
| POSTHOOK: Output: default@test_parquet_struct_nulls | ||
| PREHOOK: query: INSERT INTO test_parquet_struct_nulls VALUES | ||
| (1, named_struct('x', CAST(NULL AS INT), 'y', CAST(NULL AS INT))), | ||
| (2, if(1=0, named_struct('x', 1, 'y', 1), null)), | ||
| (3, named_struct('x', 3, 'y', CAST(NULL AS INT))), | ||
| (4, named_struct('x', 4, 'y', 4)) | ||
| PREHOOK: type: QUERY | ||
| PREHOOK: Input: _dummy_database@_dummy_table | ||
| PREHOOK: Output: default@test_parquet_struct_nulls | ||
| POSTHOOK: query: INSERT INTO test_parquet_struct_nulls VALUES | ||
| (1, named_struct('x', CAST(NULL AS INT), 'y', CAST(NULL AS INT))), | ||
| (2, if(1=0, named_struct('x', 1, 'y', 1), null)), | ||
| (3, named_struct('x', 3, 'y', CAST(NULL AS INT))), | ||
| (4, named_struct('x', 4, 'y', 4)) | ||
| POSTHOOK: type: QUERY | ||
| POSTHOOK: Input: _dummy_database@_dummy_table | ||
| POSTHOOK: Output: default@test_parquet_struct_nulls | ||
| POSTHOOK: Lineage: test_parquet_struct_nulls.id SCRIPT [] | ||
| POSTHOOK: Lineage: test_parquet_struct_nulls.st_prim SCRIPT [] | ||
| PREHOOK: query: SELECT * FROM test_parquet_struct_nulls ORDER BY id | ||
| PREHOOK: type: QUERY | ||
| PREHOOK: Input: default@test_parquet_struct_nulls | ||
| #### A masked pattern was here #### | ||
| POSTHOOK: query: SELECT * FROM test_parquet_struct_nulls ORDER BY id | ||
| POSTHOOK: type: QUERY | ||
| POSTHOOK: Input: default@test_parquet_struct_nulls | ||
| #### A masked pattern was here #### | ||
| 1 {"x":null,"y":null} | ||
| 2 NULL | ||
| 3 {"x":3,"y":null} | ||
| 4 {"x":4,"y":4} | ||
| PREHOOK: query: SELECT id FROM test_parquet_struct_nulls WHERE st_prim IS NULL | ||
| PREHOOK: type: QUERY | ||
| PREHOOK: Input: default@test_parquet_struct_nulls | ||
| #### A masked pattern was here #### | ||
| POSTHOOK: query: SELECT id FROM test_parquet_struct_nulls WHERE st_prim IS NULL | ||
| POSTHOOK: type: QUERY | ||
| POSTHOOK: Input: default@test_parquet_struct_nulls | ||
| #### A masked pattern was here #### | ||
| 2 | ||
| PREHOOK: query: SELECT id FROM test_parquet_struct_nulls WHERE st_prim IS NOT NULL ORDER BY id | ||
| PREHOOK: type: QUERY | ||
| PREHOOK: Input: default@test_parquet_struct_nulls | ||
| #### A masked pattern was here #### | ||
| POSTHOOK: query: SELECT id FROM test_parquet_struct_nulls WHERE st_prim IS NOT NULL ORDER BY id | ||
| POSTHOOK: type: QUERY | ||
| POSTHOOK: Input: default@test_parquet_struct_nulls | ||
| #### A masked pattern was here #### | ||
| 1 | ||
| 3 | ||
| 4 | ||
| PREHOOK: query: SELECT id FROM test_parquet_struct_nulls WHERE st_prim IS NOT NULL AND st_prim.x IS NULL | ||
| PREHOOK: type: QUERY | ||
| PREHOOK: Input: default@test_parquet_struct_nulls | ||
| #### A masked pattern was here #### | ||
| POSTHOOK: query: SELECT id FROM test_parquet_struct_nulls WHERE st_prim IS NOT NULL AND st_prim.x IS NULL | ||
| POSTHOOK: type: QUERY | ||
| POSTHOOK: Input: default@test_parquet_struct_nulls | ||
| #### A masked pattern was here #### | ||
| 1 | ||
| PREHOOK: query: SELECT * FROM test_parquet_struct_nulls ORDER BY id | ||
| PREHOOK: type: QUERY | ||
| PREHOOK: Input: default@test_parquet_struct_nulls | ||
| #### A masked pattern was here #### | ||
| POSTHOOK: query: SELECT * FROM test_parquet_struct_nulls ORDER BY id | ||
| POSTHOOK: type: QUERY | ||
| POSTHOOK: Input: default@test_parquet_struct_nulls | ||
| #### A masked pattern was here #### | ||
| 1 {"x":null,"y":null} | ||
| 2 NULL | ||
| 3 {"x":3,"y":null} | ||
| 4 {"x":4,"y":4} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is the same problem applies to another complex types? better to have more test cases and maybe even rename the qfile to
parquet_complex_..., maybeparquet_complex_types_null_vectorization.qto clearly distinguish from the already existingparquet_complex_types_vectorization.qe.g.:
LIST:
or MAP ("same" as struct but not fixed schema)
or even nested struct to validate the logic on deeper recursive callpaths:
and nested data can contain NULL on different levels where you patch is actually hit I assume, e.g.:
I would appreciate a lot if this patch could validate all of these