From b15ce9126dcc596c953fe202e6834cebdfca6080 Mon Sep 17 00:00:00 2001
From: Laughing
Date: Wed, 3 Jul 2024 09:12:54 +0800
Subject: [PATCH] HIVE-28262:Single column use MultiDelimitSerDe parse column error (#5252)(Liu Weizheng, reviewed by Butao Zhang)

---
 .../hadoop/hive/serde2/lazy/LazyStruct.java   |  4 +-
 .../hive/serde2/lazy/TestLazyStruct.java      | 80 +++++++++++++++++++
 2 files changed, 82 insertions(+), 2 deletions(-)
 create mode 100644 serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyStruct.java

diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
index 2848d348e30d..3bd92cc7b5b8 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
@@ -300,7 +300,7 @@ public void parseMultiDelimit(byte[] rawRow, byte[] fieldDelimit) {
     // first field always starts from 0, even when missing
     startPosition[0] = 0;
     for (int i = 1; i <= fields.length; i++) {
-      if (fields.length > 1 && delimitIndexes[i - 1] != -1) {
+      if (delimitIndexes[i - 1] != -1) {
         int start = delimitIndexes[i - 1] + fieldDelimit.length;
         startPosition[i] = start - i * diff;
       } else {
@@ -313,7 +313,7 @@ public void parseMultiDelimit(byte[] rawRow, byte[] fieldDelimit) {
 
   // find all the indexes of the sub byte[]
   private int[] findIndexes(byte[] array, byte[] target) {
-    if (fields.length <= 1) {
+    if (fields.length < 1) {
       return new int[0];
     }
     int[] indexes = new int[fields.length];
diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyStruct.java b/serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyStruct.java
new file mode 100644
index 000000000000..a1f9b695f07a
--- /dev/null
+++ b/serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyStruct.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.serde2.lazy;
+
+import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyObjectInspectorParameters;
+import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyObjectInspectorParametersImpl;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.Text;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * TestLazyStruct.
+ */
+public class TestLazyStruct {
+
+  /**
+   * Verifies HIVE-28262: a struct with a single column must still be split correctly
+   * when the row is parsed with a multi-character field delimiter.
+   */
+  @Test
+  public void testParseMultiDelimit() throws Exception {
+    // single field named id
+    List<String> structFieldNames = new ArrayList<>();
+    structFieldNames.add("id");
+    // field type is string
+    List<TypeInfo> fieldTypes = new ArrayList<>();
+    PrimitiveTypeInfo primitiveTypeInfo = new PrimitiveTypeInfo();
+    primitiveTypeInfo.setTypeName("string");
+    fieldTypes.add(primitiveTypeInfo);
+    // separators + escapeChar => "|"
+    byte[] separators = new byte[]{124, 2, 3, 4, 5, 6, 7, 8};
+
+    // sequence =>"\N"
+    Text sequence = new Text();
+    sequence.set(new byte[]{92, 78});
+
+    // create lazy object inspector parameters
+    LazyObjectInspectorParameters lazyObjectInspectorParameters = new LazyObjectInspectorParametersImpl(false, (byte) '0',
+        false, null, separators, sequence);
+    // create a lazy struct inspector
+    ObjectInspector lazyStructInspector = LazyFactory.createLazyStructInspector(structFieldNames, fieldTypes, lazyObjectInspectorParameters);
+    LazyStruct lazyStruct = (LazyStruct) LazyFactory.createLazyObject(lazyStructInspector);
+
+    // original row data
+    String rowData = "1|@|";
+    // row field delimiter
+    String fieldDelimiter = "|@|";
+
+    // parse the row using the multi-character delimiter
+    lazyStruct.parseMultiDelimit(rowData.getBytes(StandardCharsets.UTF_8),
+        fieldDelimiter.getBytes(StandardCharsets.UTF_8));
+
+    // check the first field and second field start position index
+    // before fix result: 0,1
+    // after fix result: 0,2
+    Assert.assertArrayEquals(new int[]{0, 2}, lazyStruct.startPosition);
+  }
+}
\ No newline at end of file