diff --git a/be/src/exec/sink/writer/iceberg/iceberg_partition_path.cpp b/be/src/exec/sink/writer/iceberg/iceberg_partition_path.cpp new file mode 100644 index 00000000000000..ecde5ffba3f126 --- /dev/null +++ b/be/src/exec/sink/writer/iceberg/iceberg_partition_path.cpp @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "exec/sink/writer/iceberg/iceberg_partition_path.h" + +namespace doris { + +namespace { + +bool is_unescaped_url_encoder_char(unsigned char ch) { + return ('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z') || ('0' <= ch && ch <= '9') || + ch == '.' || ch == '-' || ch == '*' || ch == '_'; +} + +char hex_digit(unsigned char value) { + return value < 10 ? static_cast('0' + value) : static_cast('A' + value - 10); +} + +} // namespace + +std::string IcebergPartitionPath::escape(const std::string& path) { + std::string escaped; + escaped.reserve(path.size()); + for (unsigned char ch : path) { + if (is_unescaped_url_encoder_char(ch)) { + escaped.push_back(static_cast(ch)); + } else if (ch == ' ') { + escaped.push_back('+'); + } else { + escaped.push_back('%'); + escaped.push_back(hex_digit(ch >> 4)); + escaped.push_back(hex_digit(ch & 0x0F)); + } + } + return escaped; +} + +} // namespace doris diff --git a/be/src/exec/sink/writer/iceberg/iceberg_partition_path.h b/be/src/exec/sink/writer/iceberg/iceberg_partition_path.h new file mode 100644 index 00000000000000..0b34493e22848c --- /dev/null +++ b/be/src/exec/sink/writer/iceberg/iceberg_partition_path.h @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +namespace doris { + +class IcebergPartitionPath { +public: + // Match Iceberg Java PartitionSpec.partitionToPath, which uses URLEncoder with UTF-8. + static std::string escape(const std::string& path); + +private: + IcebergPartitionPath() = default; +}; + +} // namespace doris diff --git a/be/src/exec/sink/writer/iceberg/viceberg_table_writer.cpp b/be/src/exec/sink/writer/iceberg/viceberg_table_writer.cpp index 5584491e99c452..828f6cdf86da42 100644 --- a/be/src/exec/sink/writer/iceberg/viceberg_table_writer.cpp +++ b/be/src/exec/sink/writer/iceberg/viceberg_table_writer.cpp @@ -25,10 +25,10 @@ #include "core/column/column_vector.h" #include "core/data_type/data_type_nullable.h" #include "core/data_type_serde/data_type_serde.h" +#include "exec/sink/writer/iceberg/iceberg_partition_path.h" #include "exec/sink/writer/iceberg/partition_transformers.h" #include "exec/sink/writer/iceberg/viceberg_partition_writer.h" #include "exec/sink/writer/iceberg/viceberg_sort_writer.h" -#include "exec/sink/writer/vhive_utils.h" #include "exprs/vexpr.h" #include "exprs/vexpr_context.h" #include "format/table/iceberg/partition_spec_parser.h" @@ -516,7 +516,7 @@ std::string VIcebergTableWriter::_partition_to_path(const doris::iceberg::Struct } std::string VIcebergTableWriter::_escape(const std::string& path) { - return VHiveUtils::escape_path_name(path); + return IcebergPartitionPath::escape(path); } std::vector VIcebergTableWriter::_partition_values( diff --git a/be/test/exec/sink/writer/iceberg/iceberg_partition_path_test.cpp b/be/test/exec/sink/writer/iceberg/iceberg_partition_path_test.cpp new file mode 100644 index 00000000000000..fc024bfb213443 --- /dev/null +++ b/be/test/exec/sink/writer/iceberg/iceberg_partition_path_test.cpp @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "exec/sink/writer/iceberg/iceberg_partition_path.h" + +#include + +namespace doris { + +class IcebergPartitionPathTest : public testing::Test {}; + +TEST_F(IcebergPartitionPathTest, test_escape_matches_iceberg_partition_spec_path_encoding) { + EXPECT_EQ("", IcebergPartitionPath::escape("")); + EXPECT_EQ("abcXYZ012.-*_", IcebergPartitionPath::escape("abcXYZ012.-*_")); + EXPECT_EQ("with+space", IcebergPartitionPath::escape("with space")); + EXPECT_EQ("slash%2Fcolon%3Aequals%3Dpercent%25question%3F", + IcebergPartitionPath::escape("slash/colon:equals=percent%question?")); + EXPECT_EQ("quote%22hash%23brackets%5B%5Dcaret%5E", + IcebergPartitionPath::escape("quote\"hash#brackets[]caret^")); + EXPECT_EQ("tilde%7Ebang%21plus%2B", IcebergPartitionPath::escape("tilde~bang!plus+")); + EXPECT_EQ( + "with%CC%81combining+character", + IcebergPartitionPath::escape(std::string("with") + "\xCC\x81" + "combining character")); +} + +} // namespace doris diff --git a/regression-test/suites/external_table_p0/iceberg/write/test_iceberg_write_partition_path.groovy b/regression-test/suites/external_table_p0/iceberg/write/test_iceberg_write_partition_path.groovy new file mode 100644 index 00000000000000..5a1946e20aa2bb --- /dev/null +++ b/regression-test/suites/external_table_p0/iceberg/write/test_iceberg_write_partition_path.groovy @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_iceberg_write_partition_path", "p0,external,iceberg,external_docker,external_docker_iceberg") { + String enabled = context.config.otherConfigs.get("enableIcebergTest") + if (enabled == null || !enabled.equalsIgnoreCase("true")) { + logger.info("disable iceberg test.") + return + } + + String catalog_name = "test_iceberg_write_partition_path" + String db_name = "test_partition_path_db" + String table_name = "test_partition_path_tbl" + String rest_port = context.config.otherConfigs.get("iceberg_rest_uri_port") + String minio_port = context.config.otherConfigs.get("iceberg_minio_port") + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + sql """drop catalog if exists ${catalog_name}""" + sql """ + create catalog if not exists ${catalog_name} properties ( + "type" = "iceberg", + "iceberg.catalog.type" = "rest", + "uri" = "http://${externalEnvIp}:${rest_port}", + "s3.access_key" = "admin", + "s3.secret_key" = "password", + "s3.endpoint" = "http://${externalEnvIp}:${minio_port}", + "s3.region" = "us-east-1" + ) + """ + + try { + sql """switch ${catalog_name}""" + sql """drop database if exists ${db_name} force""" + sql """create database ${db_name}""" + sql """use ${db_name}""" + + sql """drop table if exists ${table_name}""" + sql """ + create table ${table_name} ( + id bigint, + part_col string + ) engine=iceberg + partition by list (part_col) () + properties ( + "format-version" = "2", + "write-format" = "parquet", + "write.format.default" = "parquet" + ) + """ + + sql """ + insert into ${table_name} values + (1, concat('with', unhex('CC81'), 'combining character')), + (2, 'slash/colon:equals=percent%question?') + """ + + List> rows = sql """select id, hex(part_col) from ${table_name} order by id""" + assertEquals(2, rows.size()) + assertEquals("1", rows[0][0].toString()) + assertEquals("77697468CC81636F6D62696E696E6720636861726163746572", rows[0][1].toString()) + assertEquals("2", rows[1][0].toString()) + assertEquals("736C6173682F636F6C6F6E3A657175616C733D70657263656E74257175657374696F6E3F", rows[1][1].toString()) + + List filePaths = sql("""select file_path from ${table_name}\$files order by file_path""") + .collect { row -> row[0].toString() } + logger.info("Iceberg partition file paths: ${filePaths}") + assertTrue(filePaths.any { path -> path.contains("part_col=with%CC%81combining+character") }, + "Expected Iceberg URL-encoded UTF-8 partition path, actual paths: ${filePaths}") + assertTrue(filePaths.any { path -> path.contains("part_col=slash%2Fcolon%3Aequals%3Dpercent%25question%3F") }, + "Expected Iceberg URL-encoded special-character partition path, actual paths: ${filePaths}") + } finally { + sql """drop database if exists ${catalog_name}.${db_name} force""" + sql """drop catalog if exists ${catalog_name}""" + } +}