Skip to content

Commit 5b750f1

Browse files
committed
Merge branch 'master' into PARQUET-2171-vector-io-integrated
2 parents cb4eee0 + 45e14a5 commit 5b750f1

File tree

106 files changed

+3336
-649
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

106 files changed

+3336
-649
lines changed

.github/dependabot.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,3 @@ updates:
2929
schedule:
3030
interval: "weekly"
3131
day: "sunday"
32-
open-pull-requests-limit: 5

parquet-arrow/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
<url>https://parquet.apache.org</url>
3434

3535
<properties>
36-
<arrow.version>0.10.0</arrow.version>
36+
<arrow.version>14.0.1</arrow.version>
3737
</properties>
3838

3939
<dependencies>

parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/List3Levels.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
import static org.apache.parquet.schema.Type.Repetition.REPEATED;
2222

2323
import org.apache.parquet.schema.GroupType;
24-
import org.apache.parquet.schema.OriginalType;
24+
import org.apache.parquet.schema.LogicalTypeAnnotation;
2525
import org.apache.parquet.schema.Type;
2626

2727
/**
@@ -41,7 +41,7 @@ class List3Levels {
4141
* @param list the Parquet List
4242
*/
4343
public List3Levels(GroupType list) {
44-
if (list.getOriginalType() != OriginalType.LIST || list.getFields().size() != 1) {
44+
if (list.getLogicalTypeAnnotation() != LogicalTypeAnnotation.listType() || list.getFields().size() != 1) {
4545
throw new IllegalArgumentException("invalid list type: " + list);
4646
}
4747
this.list = list;
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.parquet.arrow.schema;
20+
21+
import org.apache.parquet.schema.GroupType;
22+
import org.apache.parquet.schema.LogicalTypeAnnotation;
23+
import org.apache.parquet.schema.OriginalType;
24+
import org.apache.parquet.schema.Type;
25+
26+
import static org.apache.parquet.schema.Type.Repetition.REPEATED;
27+
28+
/**
29+
* Represents a standard 3 levels Parquet map
30+
* - optional map
31+
* - repeated key_value
32+
* - required key, optional value
33+
*/
34+
class Map3Levels {
35+
private final GroupType map;
36+
private final GroupType repeated;
37+
private final Type key;
38+
private final Type value;
39+
40+
/**
41+
* Will validate the structure of the map
42+
* @param map the Parquet map
43+
*/
44+
public Map3Levels(GroupType map) {
45+
if (map.getLogicalTypeAnnotation() != LogicalTypeAnnotation.mapType() || map.getFields().size() != 1) {
46+
throw new IllegalArgumentException("invalid map type: " + map);
47+
}
48+
this.map = map;
49+
Type repeatedField = map.getFields().get(0);
50+
if (repeatedField.isPrimitive() || !repeatedField.isRepetition(REPEATED) || repeatedField.asGroupType().getFields().size() != 2) {
51+
throw new IllegalArgumentException("invalid map key: " + map);
52+
}
53+
this.repeated = repeatedField.asGroupType();
54+
this.key = repeated.getFields().get(0);
55+
this.value = repeated.getFields().get(1);
56+
}
57+
58+
/**
59+
* @return the root map element (an optional group with two children)
60+
*/
61+
public GroupType getMap() {
62+
return map;
63+
}
64+
65+
/**
66+
* @return repeated level, single child of map
67+
*/
68+
public GroupType getRepeated() {
69+
return repeated;
70+
}
71+
72+
/**
73+
* @return the key level
74+
*/
75+
public Type getKey() {
76+
return key;
77+
}
78+
79+
/**
80+
* @return the element level, single child of repeated.
81+
*/
82+
public Type getValue() {
83+
return value;
84+
}
85+
86+
}

parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java

Lines changed: 101 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
import static org.apache.parquet.schema.Type.Repetition.REQUIRED;
4343

4444
import java.util.ArrayList;
45+
import java.util.Collections;
4546
import java.util.List;
4647
import java.util.Optional;
4748

@@ -64,6 +65,7 @@
6465
import org.apache.arrow.vector.types.pojo.ArrowType.Union;
6566
import org.apache.arrow.vector.types.pojo.ArrowType.Utf8;
6667
import org.apache.arrow.vector.types.pojo.Field;
68+
import org.apache.arrow.vector.types.pojo.FieldType;
6769
import org.apache.arrow.vector.types.pojo.Schema;
6870
import org.apache.parquet.arrow.schema.SchemaMapping.ListTypeMapping;
6971
import org.apache.parquet.arrow.schema.SchemaMapping.PrimitiveTypeMapping;
@@ -158,6 +160,11 @@ public TypeMapping visit(org.apache.arrow.vector.types.pojo.ArrowType.List type)
158160
return createListTypeMapping();
159161
}
160162

163+
@Override
164+
public TypeMapping visit(ArrowType.LargeList largeList) {
165+
return createListTypeMapping();
166+
}
167+
161168
@Override
162169
public TypeMapping visit(org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList type) {
163170
return createListTypeMapping();
@@ -179,6 +186,17 @@ public TypeMapping visit(Union type) {
179186
return new UnionTypeMapping(field, addToBuilder(parquetTypes, Types.buildGroup(OPTIONAL)).named(fieldName), parquetTypes);
180187
}
181188

189+
@Override
190+
public TypeMapping visit(ArrowType.Map map) {
191+
if (children.size() != 2) {
192+
throw new IllegalArgumentException("Map fields must have exactly two children: " + field);
193+
}
194+
TypeMapping keyChild = fromArrow(children.get(0), "key");
195+
TypeMapping valueChild = fromArrow(children.get(1), "value");
196+
GroupType groupType = Types.optionalMap().key(keyChild.getParquetType()).value(valueChild.getParquetType()).named(fieldName);
197+
return new SchemaMapping.MapTypeMapping(field, new Map3Levels(groupType), keyChild, valueChild);
198+
}
199+
182200
@Override
183201
public TypeMapping visit(Int type) {
184202
boolean signed = type.getIsSigned();
@@ -214,11 +232,21 @@ public TypeMapping visit(Utf8 type) {
214232
return primitive(BINARY, stringType());
215233
}
216234

235+
@Override
236+
public TypeMapping visit(ArrowType.LargeUtf8 largeUtf8) {
237+
return primitive(BINARY, stringType());
238+
}
239+
217240
@Override
218241
public TypeMapping visit(Binary type) {
219242
return primitive(BINARY);
220243
}
221244

245+
@Override
246+
public TypeMapping visit(ArrowType.LargeBinary largeBinary) {
247+
return primitive(BINARY);
248+
}
249+
222250
@Override
223251
public TypeMapping visit(Bool type) {
224252
return primitive(BOOLEAN);
@@ -289,6 +317,16 @@ public TypeMapping visit(Interval type) {
289317
return primitiveFLBA(12, LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance());
290318
}
291319

320+
@Override
321+
public TypeMapping visit(ArrowType.Duration duration) {
322+
return primitiveFLBA(12, LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance());
323+
}
324+
325+
@Override
326+
public TypeMapping visit(ArrowType.ExtensionType type) {
327+
return ArrowTypeVisitor.super.visit(type);
328+
}
329+
292330
@Override
293331
public TypeMapping visit(ArrowType.FixedSizeBinary fixedSizeBinary) {
294332
return primitive(BINARY);
@@ -358,7 +396,7 @@ private TypeMapping fromParquet(Type type, String name, Repetition repetition) {
358396
if (repetition == REPEATED) {
359397
// case where we have a repeated field that is not in a List/Map
360398
TypeMapping child = fromParquet(type, null, REQUIRED);
361-
Field arrowField = new Field(name, false, new ArrowType.List(), asList(child.getArrowField()));
399+
Field arrowField = new Field(name, FieldType.notNullable(new ArrowType.List()), Collections.singletonList(child.getArrowField()));
362400
return new RepeatedTypeMapping(arrowField, type, child);
363401
}
364402
if (type.isPrimitive()) {
@@ -376,18 +414,32 @@ private TypeMapping fromParquet(Type type, String name, Repetition repetition) {
376414
private TypeMapping fromParquetGroup(GroupType type, String name) {
377415
LogicalTypeAnnotation logicalType = type.getLogicalTypeAnnotation();
378416
if (logicalType == null) {
417+
final FieldType field;
418+
if (type.isRepetition(OPTIONAL)) {
419+
field = FieldType.nullable(new Struct());
420+
} else {
421+
field = FieldType.notNullable(new Struct());
422+
}
379423
List<TypeMapping> typeMappings = fromParquet(type.getFields());
380-
Field arrowField = new Field(name, type.isRepetition(OPTIONAL), new Struct(), fields(typeMappings));
424+
Field arrowField = new Field(name, field, fields(typeMappings));
381425
return new StructTypeMapping(arrowField, type, typeMappings);
382426
} else {
383427
return logicalType.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<TypeMapping>() {
384428
@Override
385429
public Optional<TypeMapping> visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) {
386430
List3Levels list3Levels = new List3Levels(type);
387431
TypeMapping child = fromParquet(list3Levels.getElement(), null, list3Levels.getElement().getRepetition());
388-
Field arrowField = new Field(name, type.isRepetition(OPTIONAL), new ArrowType.List(), asList(child.getArrowField()));
432+
Field arrowField = new Field(name, FieldType.nullable(new ArrowType.List()), Collections.singletonList(child.getArrowField()));
389433
return of(new ListTypeMapping(arrowField, list3Levels, child));
390434
}
435+
@Override
436+
public Optional<TypeMapping> visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) {
437+
Map3Levels map3levels = new Map3Levels(type);
438+
TypeMapping keyType = fromParquet(map3levels.getKey(), null, map3levels.getKey().getRepetition());
439+
TypeMapping valueType = fromParquet(map3levels.getValue(), null, map3levels.getValue().getRepetition());
440+
Field arrowField = new Field(name, FieldType.nullable(new ArrowType.Map(false)), asList(keyType.getArrowField(), valueType.getArrowField()));
441+
return of(new SchemaMapping.MapTypeMapping(arrowField, map3levels, keyType, valueType));
442+
}
391443
}).orElseThrow(() -> new UnsupportedOperationException("Unsupported type " + type));
392444
}
393445
}
@@ -401,7 +453,12 @@ private TypeMapping fromParquetPrimitive(final PrimitiveType type, final String
401453
return type.getPrimitiveTypeName().convert(new PrimitiveType.PrimitiveTypeNameConverter<TypeMapping, RuntimeException>() {
402454

403455
private TypeMapping field(ArrowType arrowType) {
404-
Field field = new Field(name, type.isRepetition(OPTIONAL), arrowType, null);
456+
final Field field;
457+
if (type.isRepetition(OPTIONAL)) {
458+
field = Field.nullable(name, arrowType);
459+
} else {
460+
field = Field.notNullable(name, arrowType);
461+
}
405462
return new PrimitiveTypeMapping(field, type);
406463
}
407464

@@ -607,6 +664,11 @@ public TypeMapping visit(org.apache.arrow.vector.types.pojo.ArrowType.List type)
607664
return createListTypeMapping(type);
608665
}
609666

667+
@Override
668+
public TypeMapping visit(ArrowType.LargeList largeList) {
669+
return createListTypeMapping(largeList);
670+
}
671+
610672
@Override
611673
public TypeMapping visit(org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList type) {
612674
return createListTypeMapping(type);
@@ -639,6 +701,26 @@ public TypeMapping visit(Union type) {
639701
return new UnionTypeMapping(arrowField, groupType, map(arrowField.getChildren(), groupType.getFields()));
640702
}
641703

704+
@Override
705+
public TypeMapping visit(ArrowType.Map map) {
706+
if (arrowField.getChildren().size() != 2) {
707+
throw new IllegalArgumentException("Invalid map type: " + map);
708+
}
709+
if (parquetField.isPrimitive()) {
710+
throw new IllegalArgumentException("Parquet type not a group: " + parquetField);
711+
}
712+
Map3Levels map3levels = new Map3Levels(parquetField.asGroupType());
713+
if (arrowField.getChildren().size() != 2) {
714+
throw new IllegalArgumentException("invalid arrow map: " + arrowField);
715+
}
716+
Field keyChild = arrowField.getChildren().get(0);
717+
Field valueChild = arrowField.getChildren().get(1);
718+
return new SchemaMapping.MapTypeMapping(arrowField, map3levels,
719+
map(keyChild, map3levels.getKey()),
720+
map(valueChild, map3levels.getValue())
721+
);
722+
}
723+
642724
@Override
643725
public TypeMapping visit(Int type) {
644726
return primitive();
@@ -654,11 +736,21 @@ public TypeMapping visit(Utf8 type) {
654736
return primitive();
655737
}
656738

739+
@Override
740+
public TypeMapping visit(ArrowType.LargeUtf8 largeUtf8) {
741+
return primitive();
742+
}
743+
657744
@Override
658745
public TypeMapping visit(Binary type) {
659746
return primitive();
660747
}
661748

749+
@Override
750+
public TypeMapping visit(ArrowType.LargeBinary largeBinary) {
751+
return primitive();
752+
}
753+
662754
@Override
663755
public TypeMapping visit(Bool type) {
664756
return primitive();
@@ -689,6 +781,11 @@ public TypeMapping visit(Interval type) {
689781
return primitive();
690782
}
691783

784+
@Override
785+
public TypeMapping visit(ArrowType.Duration duration) {
786+
return primitive();
787+
}
788+
692789
@Override
693790
public TypeMapping visit(ArrowType.FixedSizeBinary fixedSizeBinary) {
694791
return primitive();

0 commit comments

Comments
 (0)