Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ POSTHOOK: Output: hdfs://### HDFS PATH ###
3 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general NULL
4 {"x":100,"y":99} NULL 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general NULL
5 {"x":100,"y":99} custom_name 30 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general NULL
6 NULL unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general NULL
6 {"x":null,"y":null} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general NULL
7 NULL null NULL 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general NULL
8 NULL null NULL 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}}
9 NULL null NULL 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":null,"address":{"street":null,"city":"Bangalore"}}
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ public abstract class BaseVectorizedColumnReader implements VectorizedColumnRead
protected int definitionLevel;
protected int repetitionLevel;

protected int[] currentDefLevels;
protected int defLevelIndex = 0;

/**
* Repetition/Definition/Value readers.
*/
Expand Down Expand Up @@ -154,6 +157,9 @@ public BaseVectorizedColumnReader(
/**
 * Advances the repetition- and definition-level readers by one value and
 * records the definition level into {@code currentDefLevels} when a batch
 * buffer is active (it is allocated per batch by subclasses; null means
 * level tracking is off for this reader).
 */
protected void readRepetitionAndDefinitionLevels() {
repetitionLevel = repetitionLevelColumn.nextInt();
definitionLevel = definitionLevelColumn.nextInt();
// Guard both for a missing buffer and for overflow past the batch size.
if (currentDefLevels != null && defLevelIndex < currentDefLevels.length) {
currentDefLevels[defLevelIndex++] = definitionLevel;
}
valuesRead++;
}

Expand Down Expand Up @@ -309,4 +315,9 @@ int nextInt() {
return 0;
}
}

/**
 * Returns the definition levels captured during the current batch, or
 * {@code null} when tracking is disabled. NOTE(review): null is intentional
 * here — parent struct readers probe children with a null check to find the
 * first one that tracks levels, so returning an empty array (as SonarCloud
 * suggests) would change that protocol.
 */
@Override
public int[] getDefinitionLevels() {
return currentDefLevels;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,8 @@
int total,
ColumnVector column,
TypeInfo columnType) throws IOException;

default int[] getDefinitionLevels() {
return null;

Check warning on line 41 in ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedColumnReader.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Return an empty array instead of null.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ1YguXxrVvptaKBtDF1&open=AZ1YguXxrVvptaKBtDF1&pullRequest=6408
}
}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does the same problem apply to other complex types? better to have more test cases and maybe even rename the qfile to parquet_complex_..., maybe parquet_complex_types_null_vectorization.q to clearly distinguish it from the already existing parquet_complex_types_vectorization.q

e.g.:
LIST:

NULL vs. [null, null] or [1, null]

or MAP ("same" as struct but not fixed schema)

NULL vs. { "a": 1, "b": null } and  { "a": 1, "b": null, "c": null }

or even nested struct to validate the logic on deeper recursive callpaths:

CREATE TABLE test_parquet_nested_struct_nulls (
    id INT,
    st_prim STRUCT<x:INT, y:INT>,
    st_nested STRUCT<x:INT, y:STRUCT<v:INT, w:INT>>
) STORED AS PARQUET;

and nested data can contain NULL on different levels where your patch is actually hit, I assume, e.g.:

NULL
{x: 1, y: NULL}
{x: 1, y: {v: 2, w: NULL}
{x: 1, y: {v: 2, w: 3}

I would appreciate a lot if this patch could validate all of these

Original file line number Diff line number Diff line change
Expand Up @@ -459,14 +459,14 @@ private void checkEndOfRowGroup() throws IOException {
columnReaders[i] =
buildVectorizedParquetReader(columnTypesList.get(colsToInclude.get(i)), types.get(i),
pages, requestedSchema.getColumns(), skipTimestampConversion, writerTimezone, skipProlepticConversion,
legacyConversionEnabled, 0);
legacyConversionEnabled, 0, 0);
}
}
} else {
for (int i = 0; i < types.size(); ++i) {
columnReaders[i] = buildVectorizedParquetReader(columnTypesList.get(i), types.get(i), pages,
requestedSchema.getColumns(), skipTimestampConversion, writerTimezone, skipProlepticConversion,
legacyConversionEnabled, 0);
legacyConversionEnabled, 0, 0);
}
}

Expand Down Expand Up @@ -522,7 +522,12 @@ private VectorizedColumnReader buildVectorizedParquetReader(
ZoneId writerTimezone,
boolean skipProlepticConversion,
boolean legacyConversionEnabled,
int depth) throws IOException {
int depth, int currentDefLevel) throws IOException {

int typeDefLevel = currentDefLevel;
if (type.isRepetition(Type.Repetition.OPTIONAL) || type.isRepetition(Type.Repetition.REPEATED)) {
typeDefLevel++;
}
List<ColumnDescriptor> descriptors =
getAllColumnDescriptorByType(depth, type, columnDescriptors);
// Support for schema evolution: if the column from the current
Expand All @@ -549,8 +554,8 @@ private VectorizedColumnReader buildVectorizedParquetReader(
List<Type> types = type.asGroupType().getFields();
for (int i = 0; i < fieldTypes.size(); i++) {
VectorizedColumnReader r =
buildVectorizedParquetReader(fieldTypes.get(i), types.get(i), pages, descriptors,
skipTimestampConversion, writerTimezone, skipProlepticConversion, legacyConversionEnabled, depth + 1);
buildVectorizedParquetReader(fieldTypes.get(i), types.get(i), pages, descriptors, skipTimestampConversion,
writerTimezone, skipProlepticConversion, legacyConversionEnabled, depth + 1, typeDefLevel);
if (r != null) {
fieldReaders.add(r);
} else {
Expand All @@ -559,7 +564,7 @@ private VectorizedColumnReader buildVectorizedParquetReader(
.getTypeName() + " and Parquet type" + types.get(i).toString());
}
}
return new VectorizedStructColumnReader(fieldReaders);
return new VectorizedStructColumnReader(fieldReaders, typeDefLevel);
case LIST:
checkListColumnSupport(((ListTypeInfo) typeInfo).getListElementTypeInfo());
if (columnDescriptors == null || columnDescriptors.isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ public void readBatch(
int total,
ColumnVector column,
TypeInfo columnType) throws IOException {
this.currentDefLevels = new int[total];
this.defLevelIndex = 0;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you elaborate on this change: I mean, I can see that passing and handling definition level to the struct reader solves the current issue, but this looks like fixing another bug, is it the case?

int rowId = 0;
while (total > 0) {
// Compute the number of values we want to read in this page.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@
public class VectorizedStructColumnReader implements VectorizedColumnReader {

private final List<VectorizedColumnReader> fieldReaders;
private final int structDefLevel;

public VectorizedStructColumnReader(List<VectorizedColumnReader> fieldReaders) {
/**
 * Creates a struct reader that delegates to one child reader per struct field.
 *
 * @param fieldReaders one reader per struct field, in schema order
 * @param structDefLevel the definition level at which this struct itself is
 *        defined; batch rows whose definition level falls below it are
 *        treated as a null struct
 */
public VectorizedStructColumnReader(List<VectorizedColumnReader> fieldReaders, int structDefLevel) {
this.fieldReaders = fieldReaders;
this.structDefLevel = structDefLevel;
}

@Override
Expand All @@ -46,14 +48,35 @@
fieldReaders.get(i)
.readBatch(total, vectors[i], structTypeInfo.getAllStructFieldTypeInfos().get(i));
structColumnVector.isRepeating = structColumnVector.isRepeating && vectors[i].isRepeating;
}
int[] defLevels = null;
for (VectorizedColumnReader reader : fieldReaders) {
defLevels = reader.getDefinitionLevels();
if (defLevels != null) {
break;
}
}
Comment on lines +53 to +58
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems like repeated logic, could this be handled with getDefinitionLevels?


for (int j = 0; j < vectors[i].isNull.length; j++) {
structColumnVector.isNull[j] =
(i == 0) ? vectors[i].isNull[j] : structColumnVector.isNull[j] && vectors[i].isNull[j];
// Evaluate struct nullability using Parquet Definition Levels
if (defLevels != null) {
for (int j = 0; j < total; j++) {
if (defLevels[j] < structDefLevel) {
// The D-Level boundary crossed the struct. The whole struct is null.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: is D-Level a well-known abbreviation? if so, it's fine; otherwise we can use Definition Level, similarly to the comment above

structColumnVector.isNull[j] = true;
structColumnVector.noNulls = false;
}
}
structColumnVector.noNulls =
(i == 0) ? vectors[i].noNulls : structColumnVector.noNulls && vectors[i].noNulls;
}
}

@Override
public int[] getDefinitionLevels() {
for (VectorizedColumnReader reader : fieldReaders) {
int[] defLevels = reader.getDefinitionLevels();
if (defLevels != null) {
return defLevels;
}
}
return null;

Check warning on line 80 in ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedStructColumnReader.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Return an empty array instead of null.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ1YguSNrVvptaKBtDFx&open=AZ1YguSNrVvptaKBtDFx&pullRequest=6408
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -317,14 +317,15 @@ protected static void writeData(ParquetWriter<Group> writer, boolean isDictionar
g.addGroup("nsf").append("c", intVal).append("d", intVal);
g.append("e", doubleVal);

Group some_null_g = group.addGroup("struct_field_some_null");
if (i % 2 != 0) {
some_null_g.append("f", intVal);
}
if (i % 3 != 0) {
some_null_g.append("g", doubleVal);
if (i % 6 != 0) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

even if I like the applied math here, which is i % 2 != 0 || i % 3 != 0 + De Morgan law's applied to prevent unnecessary group.addGroup call, I feel we can afford this kind of verbosity in the unit test to be easier to read

Group some_null_g = group.addGroup("struct_field_some_null");
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as you're already touching this part, please fix some_null_g variable naming

if (i % 2 != 0) {
some_null_g.append("f", intVal);
}
if (i % 3 != 0) {
some_null_g.append("g", doubleVal);
}
}

Group mapGroup = group.addGroup("map_field");
if (i % 13 != 1) {
mapGroup.addGroup("map").append("key", binary).append("value", "abc");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
SET hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled=true;
SET hive.fetch.task.conversion=none;

CREATE TABLE test_parquet_struct_nulls (
id INT,
st_prim STRUCT<x:INT, y:INT>
) STORED AS PARQUET;

INSERT INTO test_parquet_struct_nulls VALUES
(1, named_struct('x', CAST(NULL AS INT), 'y', CAST(NULL AS INT))),
(2, if(1=0, named_struct('x', 1, 'y', 1), null)),
(3, named_struct('x', 3, 'y', CAST(NULL AS INT))),
(4, named_struct('x', 4, 'y', 4));

-- Test A: Full table scan to check JSON representation
SELECT * FROM test_parquet_struct_nulls ORDER BY id;

-- Test B: Verify IS NULL evaluates correctly
SELECT id FROM test_parquet_struct_nulls WHERE st_prim IS NULL;

-- Test C: Verify IS NOT NULL evaluates correctly
SELECT id FROM test_parquet_struct_nulls WHERE st_prim IS NOT NULL ORDER BY id;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

instead of ORDER BY, what about:

-- SORT_QUERY_RESULTS


-- Test D: Verify field-level null evaluation inside a valid struct
SELECT id FROM test_parquet_struct_nulls WHERE st_prim IS NOT NULL AND st_prim.x IS NULL;

-- Validate without vectorization
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: without

SET hive.vectorized.execution.enabled=true;
SET hive.vectorized.execution.reduce.enabled=false;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isn't this simpler and closer to the fact "without vectorization"

SET hive.vectorized.execution.enabled=false;

SELECT * FROM test_parquet_struct_nulls ORDER BY id;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-- SORT_QUERY_RESULTS

Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
PREHOOK: query: CREATE TABLE test_parquet_struct_nulls (
id INT,
st_prim STRUCT<x:INT, y:INT>
) STORED AS PARQUET
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@test_parquet_struct_nulls
POSTHOOK: query: CREATE TABLE test_parquet_struct_nulls (
id INT,
st_prim STRUCT<x:INT, y:INT>
) STORED AS PARQUET
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@test_parquet_struct_nulls
PREHOOK: query: INSERT INTO test_parquet_struct_nulls VALUES
(1, named_struct('x', CAST(NULL AS INT), 'y', CAST(NULL AS INT))),
(2, if(1=0, named_struct('x', 1, 'y', 1), null)),
(3, named_struct('x', 3, 'y', CAST(NULL AS INT))),
(4, named_struct('x', 4, 'y', 4))
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@test_parquet_struct_nulls
POSTHOOK: query: INSERT INTO test_parquet_struct_nulls VALUES
(1, named_struct('x', CAST(NULL AS INT), 'y', CAST(NULL AS INT))),
(2, if(1=0, named_struct('x', 1, 'y', 1), null)),
(3, named_struct('x', 3, 'y', CAST(NULL AS INT))),
(4, named_struct('x', 4, 'y', 4))
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@test_parquet_struct_nulls
POSTHOOK: Lineage: test_parquet_struct_nulls.id SCRIPT []
POSTHOOK: Lineage: test_parquet_struct_nulls.st_prim SCRIPT []
PREHOOK: query: SELECT * FROM test_parquet_struct_nulls ORDER BY id
PREHOOK: type: QUERY
PREHOOK: Input: default@test_parquet_struct_nulls
#### A masked pattern was here ####
POSTHOOK: query: SELECT * FROM test_parquet_struct_nulls ORDER BY id
POSTHOOK: type: QUERY
POSTHOOK: Input: default@test_parquet_struct_nulls
#### A masked pattern was here ####
1 {"x":null,"y":null}
2 NULL
3 {"x":3,"y":null}
4 {"x":4,"y":4}
PREHOOK: query: SELECT id FROM test_parquet_struct_nulls WHERE st_prim IS NULL
PREHOOK: type: QUERY
PREHOOK: Input: default@test_parquet_struct_nulls
#### A masked pattern was here ####
POSTHOOK: query: SELECT id FROM test_parquet_struct_nulls WHERE st_prim IS NULL
POSTHOOK: type: QUERY
POSTHOOK: Input: default@test_parquet_struct_nulls
#### A masked pattern was here ####
2
PREHOOK: query: SELECT id FROM test_parquet_struct_nulls WHERE st_prim IS NOT NULL ORDER BY id
PREHOOK: type: QUERY
PREHOOK: Input: default@test_parquet_struct_nulls
#### A masked pattern was here ####
POSTHOOK: query: SELECT id FROM test_parquet_struct_nulls WHERE st_prim IS NOT NULL ORDER BY id
POSTHOOK: type: QUERY
POSTHOOK: Input: default@test_parquet_struct_nulls
#### A masked pattern was here ####
1
3
4
PREHOOK: query: SELECT id FROM test_parquet_struct_nulls WHERE st_prim IS NOT NULL AND st_prim.x IS NULL
PREHOOK: type: QUERY
PREHOOK: Input: default@test_parquet_struct_nulls
#### A masked pattern was here ####
POSTHOOK: query: SELECT id FROM test_parquet_struct_nulls WHERE st_prim IS NOT NULL AND st_prim.x IS NULL
POSTHOOK: type: QUERY
POSTHOOK: Input: default@test_parquet_struct_nulls
#### A masked pattern was here ####
1
PREHOOK: query: SELECT * FROM test_parquet_struct_nulls ORDER BY id
PREHOOK: type: QUERY
PREHOOK: Input: default@test_parquet_struct_nulls
#### A masked pattern was here ####
POSTHOOK: query: SELECT * FROM test_parquet_struct_nulls ORDER BY id
POSTHOOK: type: QUERY
POSTHOOK: Input: default@test_parquet_struct_nulls
#### A masked pattern was here ####
1 {"x":null,"y":null}
2 NULL
3 {"x":3,"y":null}
4 {"x":4,"y":4}
Loading