From ecb81ab44d14c9193116ba6a095d1a81d538cdde Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 31 Mar 2023 14:38:16 +0000 Subject: [PATCH 1/5] Try to update arrow library with 11.0.0 release --- contrib/arrow | 2 +- contrib/arrow-cmake/CMakeLists.txt | 25 +++++++++++++++++++++---- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/contrib/arrow b/contrib/arrow index d03245f801f7..1f1b3d35fb6e 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit d03245f801f798c63ee9a7d2b8914a9e5c5cd666 +Subproject commit 1f1b3d35fb6eb73e6492d3afd8a85cde848d174f diff --git a/contrib/arrow-cmake/CMakeLists.txt b/contrib/arrow-cmake/CMakeLists.txt index 4181f916d63e..161988870752 100644 --- a/contrib/arrow-cmake/CMakeLists.txt +++ b/contrib/arrow-cmake/CMakeLists.txt @@ -202,6 +202,7 @@ set(ARROW_SRCS "${LIBRARY_DIR}/builder.cc" "${LIBRARY_DIR}/buffer.cc" "${LIBRARY_DIR}/chunked_array.cc" + "${LIBRARY_DIR}/chunk_resolver.cc" "${LIBRARY_DIR}/compare.cc" "${LIBRARY_DIR}/config.cc" "${LIBRARY_DIR}/datum.cc" @@ -268,6 +269,10 @@ set(ARROW_SRCS "${LIBRARY_DIR}/util/uri.cc" "${LIBRARY_DIR}/util/utf8.cc" "${LIBRARY_DIR}/util/value_parsing.cc" + "${LIBRARY_DIR}/util/byte_size.cc" + "${LIBRARY_DIR}/util/debug.cc" + "${LIBRARY_DIR}/util/tracing.cc" + "${LIBRARY_DIR}/util/atfork_internal.cc" "${LIBRARY_DIR}/vendored/base64.cpp" "${LIBRARY_DIR}/vendored/datetime/tz.cpp" @@ -301,9 +306,11 @@ set(ARROW_SRCS "${LIBRARY_DIR}/compute/exec/source_node.cc" "${LIBRARY_DIR}/compute/exec/sink_node.cc" "${LIBRARY_DIR}/compute/exec/order_by_impl.cc" + "${LIBRARY_DIR}/compute/exec/partition_util.cc" "${LIBRARY_DIR}/compute/function.cc" "${LIBRARY_DIR}/compute/function_internal.cc" "${LIBRARY_DIR}/compute/kernel.cc" + "${LIBRARY_DIR}/compute/light_array.cc" "${LIBRARY_DIR}/compute/registry.cc" "${LIBRARY_DIR}/compute/kernels/aggregate_basic.cc" "${LIBRARY_DIR}/compute/kernels/aggregate_mode.cc" @@ -317,21 +324,28 @@ set(ARROW_SRCS 
"${LIBRARY_DIR}/compute/kernels/scalar_cast_boolean.cc" "${LIBRARY_DIR}/compute/kernels/scalar_cast_dictionary.cc" "${LIBRARY_DIR}/compute/kernels/scalar_cast_internal.cc" + "${LIBRARY_DIR}/compute/kernels/scalar_cast_extension.cc" "${LIBRARY_DIR}/compute/kernels/scalar_cast_nested.cc" "${LIBRARY_DIR}/compute/kernels/scalar_cast_numeric.cc" "${LIBRARY_DIR}/compute/kernels/scalar_cast_string.cc" "${LIBRARY_DIR}/compute/kernels/scalar_cast_temporal.cc" "${LIBRARY_DIR}/compute/kernels/scalar_compare.cc" "${LIBRARY_DIR}/compute/kernels/scalar_nested.cc" + "${LIBRARY_DIR}/compute/kernels/scalar_random.cc" + "${LIBRARY_DIR}/compute/kernels/scalar_round.cc" "${LIBRARY_DIR}/compute/kernels/scalar_set_lookup.cc" - "${LIBRARY_DIR}/compute/kernels/scalar_string.cc" "${LIBRARY_DIR}/compute/kernels/scalar_temporal_binary.cc" "${LIBRARY_DIR}/compute/kernels/scalar_temporal_unary.cc" "${LIBRARY_DIR}/compute/kernels/scalar_validity.cc" "${LIBRARY_DIR}/compute/kernels/scalar_if_else.cc" + "${LIBRARY_DIR}/compute/kernels/scalar_string_ascii.cc" + "${LIBRARY_DIR}/compute/kernels/scalar_string_utf8.cc" "${LIBRARY_DIR}/compute/kernels/util_internal.cc" "${LIBRARY_DIR}/compute/kernels/vector_array_sort.cc" + "${LIBRARY_DIR}/compute/kernels/vector_cumulative_ops.cc" "${LIBRARY_DIR}/compute/kernels/vector_hash.cc" + "${LIBRARY_DIR}/compute/kernels/vector_rank.cc" + "${LIBRARY_DIR}/compute/kernels/vector_select_k.cc" "${LIBRARY_DIR}/compute/kernels/vector_nested.cc" "${LIBRARY_DIR}/compute/kernels/vector_replace.cc" "${LIBRARY_DIR}/compute/kernels/vector_selection.cc" @@ -340,13 +354,15 @@ set(ARROW_SRCS "${LIBRARY_DIR}/compute/exec/union_node.cc" "${LIBRARY_DIR}/compute/exec/key_hash.cc" "${LIBRARY_DIR}/compute/exec/key_map.cc" - "${LIBRARY_DIR}/compute/exec/key_compare.cc" - "${LIBRARY_DIR}/compute/exec/key_encode.cc" "${LIBRARY_DIR}/compute/exec/util.cc" "${LIBRARY_DIR}/compute/exec/hash_join_dict.cc" "${LIBRARY_DIR}/compute/exec/hash_join.cc" 
"${LIBRARY_DIR}/compute/exec/hash_join_node.cc" "${LIBRARY_DIR}/compute/exec/task_util.cc" + "${LIBRARY_DIR}/compute/row/encode_internal.cc" + "${LIBRARY_DIR}/compute/row/grouper.cc" + "${LIBRARY_DIR}/compute/row/compare_internal.cc" + "${LIBRARY_DIR}/compute/row/row_internal.cc" "${LIBRARY_DIR}/ipc/dictionary.cc" "${LIBRARY_DIR}/ipc/feather.cc" @@ -357,7 +373,8 @@ set(ARROW_SRCS "${LIBRARY_DIR}/ipc/writer.cc" "${ARROW_SRC_DIR}/arrow/adapters/orc/adapter.cc" - "${ARROW_SRC_DIR}/arrow/adapters/orc/adapter_util.cc" + "${ARROW_SRC_DIR}/arrow/adapters/orc/util.cc" + "${ARROW_SRC_DIR}/arrow/adapters/orc/options.cc" ) add_definitions(-DARROW_WITH_LZ4) From 14e0269e953da0a493a6d3214d387718724e2f48 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 31 Mar 2023 15:56:54 +0000 Subject: [PATCH 2/5] Fix build --- .../Formats/Impl/ParquetBlockOutputFormat.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp index 759f773a574b..b4ed337fa732 100644 --- a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp @@ -95,14 +95,14 @@ void ParquetBlockOutputFormat::consume(Chunk chunk) builder.version(getParquetVersion(format_settings)); builder.compression(getParquetCompression(format_settings.parquet.output_compression_method)); auto props = builder.build(); - auto status = parquet::arrow::FileWriter::Open( + auto result = parquet::arrow::FileWriter::Open( *arrow_table->schema(), arrow::default_memory_pool(), sink, - props, /*parquet::default_writer_properties(),*/ - &file_writer); - if (!status.ok()) - throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while opening a table: {}", status.ToString()); + props); + if (!result.ok()) + throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while opening a table: {}", result.status().ToString()); + file_writer = 
std::move(result.ValueOrDie()); } // TODO: calculate row_group_size depending on a number of rows and table size From 2573c9c9b85fc1ecb1e743753e2c2bf62910fb2e Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 31 Mar 2023 18:37:12 +0000 Subject: [PATCH 3/5] Fix tests --- .../Formats/Impl/ArrowFieldIndexUtil.h | 18 +++++------------- .../Formats/Impl/ORCBlockInputFormat.cpp | 2 +- .../Formats/Impl/ParquetBlockInputFormat.cpp | 2 +- 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/src/Processors/Formats/Impl/ArrowFieldIndexUtil.h b/src/Processors/Formats/Impl/ArrowFieldIndexUtil.h index ba39d94fcf1c..5aded52c6122 100644 --- a/src/Processors/Formats/Impl/ArrowFieldIndexUtil.h +++ b/src/Processors/Formats/Impl/ArrowFieldIndexUtil.h @@ -21,9 +21,7 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; } -/// For ORC format, index_nested_type = true, a nested type takes one index count. And the -/// the start index for ORC format should be 1, since index 0 indicates to select all columns. -template <bool index_nested_type> + class ArrowFieldIndexUtil { public: @@ -46,9 +44,7 @@ class ArrowFieldIndexUtil calculateFieldIndices(const arrow::Schema & schema) { std::unordered_map<std::string, std::pair<int, int>> result; - // For format like ORC, index = 0 indicates to select all columns, so we skip 0 and start - // from 1. - int index_start = index_nested_type; + int index_start = 0; for (int i = 0; i < schema.num_fields(); ++i) { const auto & field = schema.field(i); @@ -94,17 +90,16 @@ class ArrowFieldIndexUtil } /// Count the number of indices for types. - /// For orc format, index_nested_type is true, a complex type takes one index.
size_t countIndicesForType(std::shared_ptr<arrow::DataType> type) { if (type->id() == arrow::Type::LIST) { - return countIndicesForType(static_cast<arrow::ListType *>(type.get())->value_type()) + index_nested_type; + return countIndicesForType(static_cast<arrow::ListType *>(type.get())->value_type()); } if (type->id() == arrow::Type::STRUCT) { - int indices = index_nested_type; + int indices = 0; auto * struct_type = static_cast<arrow::StructType *>(type.get()); for (int i = 0; i != struct_type->num_fields(); ++i) indices += countIndicesForType(struct_type->field(i)->type()); @@ -114,7 +109,7 @@ class ArrowFieldIndexUtil if (type->id() == arrow::Type::MAP) { auto * map_type = static_cast<arrow::MapType *>(type.get()); - return countIndicesForType(map_type->key_type()) + countIndicesForType(map_type->item_type()) + index_nested_type; + return countIndicesForType(map_type->key_type()) + countIndicesForType(map_type->item_type()) ; } return 1; @@ -144,8 +139,6 @@ class ArrowFieldIndexUtil index_info.first = current_start_index; if (field_type->id() == arrow::Type::STRUCT) { - current_start_index += index_nested_type; - auto * struct_type = static_cast<arrow::StructType *>(field_type.get()); for (int i = 0, n = struct_type->num_fields(); i < n; ++i) { @@ -161,7 +154,6 @@ class ArrowFieldIndexUtil const auto * list_type = static_cast<arrow::ListType *>(field_type.get()); const auto value_field = list_type->value_field(); auto index_snapshot = current_start_index; - current_start_index += index_nested_type; calculateFieldIndices(*value_field, field_name, current_start_index, result, name_prefix); // The nested struct field has the same name as this list field. // rewrite it back to the original value.
diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp index caa9c1e1c0f1..b41864865daa 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp @@ -130,7 +130,7 @@ void ORCBlockInputFormat::prepareReader() format_settings.null_as_default, format_settings.orc.case_insensitive_column_matching); - ArrowFieldIndexUtil<true> field_util( + ArrowFieldIndexUtil field_util( format_settings.orc.case_insensitive_column_matching, format_settings.orc.allow_missing_columns); include_indices = field_util.findRequiredIndices(getPort().getHeader(), *schema); diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 210b4485d174..1f14e052fc25 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -139,7 +139,7 @@ void ParquetBlockInputFormat::prepareReader() format_settings.null_as_default, format_settings.parquet.case_insensitive_column_matching); - ArrowFieldIndexUtil<false> field_util( + ArrowFieldIndexUtil field_util( format_settings.parquet.case_insensitive_column_matching, format_settings.parquet.allow_missing_columns); column_indices = field_util.findRequiredIndices(getPort().getHeader(), *schema); From 3c5530ad03fd4732c0a281072e2122334276b443 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 3 Apr 2023 14:02:45 +0000 Subject: [PATCH 4/5] Fix tests, update working with column indexes in ORC format --- .../Formats/Impl/ORCBlockInputFormat.cpp | 15 +++++++++++---- .../0_stateless/00900_long_parquet_load.reference | 7 +++++-- .../datapage_v2.snappy.parquet.columns | 2 +- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp index b41864865daa..81d274ee380f 100644 ---
a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp @@ -130,10 +130,17 @@ void ORCBlockInputFormat::prepareReader() format_settings.null_as_default, format_settings.orc.case_insensitive_column_matching); - ArrowFieldIndexUtil field_util( - format_settings.orc.case_insensitive_column_matching, - format_settings.orc.allow_missing_columns); - include_indices = field_util.findRequiredIndices(getPort().getHeader(), *schema); + const bool ignore_case = format_settings.orc.case_insensitive_column_matching; + std::unordered_set<String> nested_table_names; + if (format_settings.orc.import_nested) + nested_table_names = Nested::getAllTableNames(getPort().getHeader(), ignore_case); + + for (int i = 0; i < schema->num_fields(); ++i) + { + const auto & name = schema->field(i)->name(); + if (getPort().getHeader().has(name, ignore_case) || nested_table_names.contains(ignore_case ? boost::to_lower_copy(name) : name)) + include_indices.push_back(i); + } } ORCSchemaReader::ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) diff --git a/tests/queries/0_stateless/00900_long_parquet_load.reference b/tests/queries/0_stateless/00900_long_parquet_load.reference index 72ec99ad2c69..1ca2fbc2fd68 100644 --- a/tests/queries/0_stateless/00900_long_parquet_load.reference +++ b/tests/queries/0_stateless/00900_long_parquet_load.reference @@ -92,8 +92,11 @@ idx10 ['This','is','a','test'] 123 1 456 2 === Try load data from datapage_v2.snappy.parquet -Code: 33. DB::ParsingEx---tion: Error while reading Parquet data: IOError: Unknown encoding type.: While executing ParquetBlockInputFormat: data for INSERT was parsed from stdin: (in query: INSERT INTO parquet_load FORMAT Parquet).
(CANNOT_READ_ALL_DATA) - +abc 1 2 1 [1,2,3] +abc 2 3 1 [] +abc 3 4 1 [] +\N 4 5 0 [1,2,3] +abc 5 2 1 [1,2] === Try load data from datatype-date32.parquet 1925-01-01 1949-10-01 diff --git a/tests/queries/0_stateless/data_parquet/datapage_v2.snappy.parquet.columns b/tests/queries/0_stateless/data_parquet/datapage_v2.snappy.parquet.columns index c6bb5057cc29..dc094bef8ede 100644 --- a/tests/queries/0_stateless/data_parquet/datapage_v2.snappy.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/datapage_v2.snappy.parquet.columns @@ -1 +1 @@ -`a` Nullable(String), `b` Array(Nullable(Int32)), `c` Nullable(Float64), `d` Nullable(UInt8), `e` Array(Nullable(Int32)) +`a` Nullable(String), `b` Nullable(Int32), `c` Nullable(Float64), `d` Nullable(UInt8), `e` Array(Nullable(Int32)) From 16306bfa8d97b31ec2e4543dcb8781ee3b28c5a6 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 3 Apr 2023 19:05:22 +0000 Subject: [PATCH 5/5] Try fix compatibility check, add expf and scalbnf implementation from musl --- base/glibc-compatibility/musl/expf.c | 81 +++++++++++++++++++++++++ base/glibc-compatibility/musl/scalbnf.c | 31 ++++++++++ 2 files changed, 112 insertions(+) create mode 100644 base/glibc-compatibility/musl/expf.c create mode 100644 base/glibc-compatibility/musl/scalbnf.c diff --git a/base/glibc-compatibility/musl/expf.c b/base/glibc-compatibility/musl/expf.c new file mode 100644 index 000000000000..0a59236d1c02 --- /dev/null +++ b/base/glibc-compatibility/musl/expf.c @@ -0,0 +1,81 @@ +/* origin: FreeBSD /usr/src/lib/msun/src/e_expf.c */ +/* + * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com. + */ +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunPro, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. 
+ * ==================================================== + */ + +#include "libm.h" + +static const float + half[2] = {0.5,-0.5}, + ln2hi = 6.9314575195e-1f, /* 0x3f317200 */ + ln2lo = 1.4286067653e-6f, /* 0x35bfbe8e */ + invln2 = 1.4426950216e+0f, /* 0x3fb8aa3b */ + /* + * Domain [-0.34568, 0.34568], range ~[-4.278e-9, 4.447e-9]: + * |x*(exp(x)+1)/(exp(x)-1) - p(x)| < 2**-27.74 + */ + P1 = 1.6666625440e-1f, /* 0xaaaa8f.0p-26 */ + P2 = -2.7667332906e-3f; /* -0xb55215.0p-32 */ + +float expf(float x) +{ + float_t hi, lo, c, xx, y; + int k, sign; + uint32_t hx; + + GET_FLOAT_WORD(hx, x); + sign = hx >> 31; /* sign bit of x */ + hx &= 0x7fffffff; /* high word of |x| */ + + /* special cases */ + if (hx >= 0x42aeac50) { /* if |x| >= -87.33655f or NaN */ + if (hx >= 0x42b17218 && !sign) { /* x >= 88.722839f */ + /* overflow */ + x *= 0x1p127f; + return x; + } + if (sign) { + /* underflow */ + FORCE_EVAL(-0x1p-149f/x); + if (hx >= 0x42cff1b5) /* x <= -103.972084f */ + return 0; + } + } + + /* argument reduction */ + if (hx > 0x3eb17218) { /* if |x| > 0.5 ln2 */ + if (hx > 0x3f851592) /* if |x| > 1.5 ln2 */ + k = invln2*x + half[sign]; + else + k = 1 - sign - sign; + hi = x - k*ln2hi; /* k*ln2hi is exact here */ + lo = k*ln2lo; + x = hi - lo; + } else if (hx > 0x39000000) { /* |x| > 2**-14 */ + k = 0; + hi = x; + lo = 0; + } else { + /* raise inexact */ + FORCE_EVAL(0x1p127f + x); + return 1 + x; + } + + /* x is now in primary range */ + xx = x*x; + c = x - xx*(P1+xx*P2); + y = 1 + (x*c/(2-c) - lo + hi); + if (k == 0) + return y; + return scalbnf(y, k); +} \ No newline at end of file diff --git a/base/glibc-compatibility/musl/scalbnf.c b/base/glibc-compatibility/musl/scalbnf.c new file mode 100644 index 000000000000..cf56cacfb5f8 --- /dev/null +++ b/base/glibc-compatibility/musl/scalbnf.c @@ -0,0 +1,31 @@ +#include <math.h> +#include <stdint.h> + +float scalbnf(float x, int n) +{ + union {float f; uint32_t i;} u; + float_t y = x; + + if (n > 127) { + y *= 0x1p127f; + n -= 127; + if (n > 127) {
+ y *= 0x1p127f; + n -= 127; + if (n > 127) + n = 127; + } + } else if (n < -126) { + y *= 0x1p-126f; + n += 126; + if (n < -126) { + y *= 0x1p-126f; + n += 126; + if (n < -126) + n = -126; + } + } + u.i = (uint32_t)(0x7f+n)<<23; + x = y * u.f; + return x; +}