From c4894e5620655d4020dab3b924a2d7bd5b50aa18 Mon Sep 17 00:00:00 2001 From: kevinyhzou <37431499+KevinyhZou@users.noreply.github.com> Date: Fri, 17 Jan 2025 12:13:01 +0800 Subject: [PATCH] [GLUTEN-8529][CH]Fix get_json_object when path has asterisk (#8540) * fix get_json_object diff * fix get_json_object * fix ut * fix ci --- .../GlutenFunctionValidateSuite.scala | 11 +++++++++++ .../Functions/SparkFunctionGetJsonObject.h | 18 ++++++++++++++---- .../clickhouse/ClickHouseTestSettings.scala | 9 --------- .../clickhouse/ClickHouseTestSettings.scala | 13 +++---------- .../clickhouse/ClickHouseTestSettings.scala | 9 --------- .../clickhouse/ClickHouseTestSettings.scala | 9 --------- 6 files changed, 28 insertions(+), 41 deletions(-) diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala index 3eb9b6e36909..f84557e6e97c 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala @@ -370,6 +370,17 @@ class GlutenFunctionValidateSuite extends GlutenClickHouseWholeStageTransformerS " get_json_object(string_field1, '$.a') is not null") { _ => } } + test("Test get_json_object 12") { + runQueryAndCompare( + "SELECT get_json_object(string_field1, '$.a[*].y') from json_test where int_field1 = 7") { + _ => + } + runQueryAndCompare( + "select get_json_object(string_field1, '$.a[*].z.n.p') from json_test where int_field1 = 7") { + _ => + } + } + test("Test covar_samp") { runQueryAndCompare("SELECT covar_samp(double_field1, int_field1) from json_test") { _ => } } diff --git a/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h b/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h index fa9b78194be0..1399b422856c 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h +++ b/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h @@ -462,8 +462,8 @@ class GetJsonObjectImpl static size_t getNumberOfIndexArguments(const DB::ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - bool insertResultToColumn(DB::IColumn & dest, const Element & root, DB::GeneratorJSONPath & generator_json_path, bool) - { + bool insertResultToColumn(DB::IColumn & dest, const Element & root, DB::GeneratorJSONPath & generator_json_path, bool path_has_asterisk) + { Element current_element = root; DB::VisitorStatus status; std::stringstream out; // STYLE_CHECK_ALLOW_STD_STRING_STREAM @@ -501,10 +501,14 @@ class GetJsonObjectImpl if (elements[0].isString()) { auto str = elements[0].getString(); + if (path_has_asterisk) + { + str = "\"" + std::string(str) + "\""; + } serializer.addRawString(str); } else - { + { serializer.addElement(elements[0]); } } @@ -684,6 +688,7 @@ class FlattenJSONStringOnRequiredFunction : public DB::IFunction std::vector json_path_asts; std::vector required_fields; + std::vector path_has_asterisk; const auto & first_column = arguments[0]; if (const auto * required_fields_col = typeid_cast(arguments[1].column.get())) { @@ -694,6 +699,11 @@ class FlattenJSONStringOnRequiredFunction : public DB::IFunction { auto normalized_field = JSONPathNormalizer::normalize(field); // LOG_ERROR(getLogger("JSONPatch"), "xxx field {} -> {}", field, normalized_field); + if(normalized_field.find("[*]") != std::string::npos) + path_has_asterisk.emplace_back(true); + else + path_has_asterisk.emplace_back(false); + required_fields.push_back(normalized_field); tuple_columns.emplace_back(str_type->createColumn()); @@ -776,7 +786,7 @@ class FlattenJSONStringOnRequiredFunction : public DB::IFunction for (size_t j = 0; j < tuple_size; ++j) { generator_json_paths[j]->reinitialize(); - if (!impl.insertResultToColumn(*tuple_columns[j], document, *generator_json_paths[j], true)) + if (!impl.insertResultToColumn(*tuple_columns[j], document, *generator_json_paths[j], path_has_asterisk[j])) { tuple_columns[j]->insertDefault(); } diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 8c62e3b0fd9b..a8eaab5486ca 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -757,16 +757,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK-35728: Check multiply/divide of day-time intervals of any fields by numeric") .exclude("SPARK-35778: Check multiply/divide of year-month intervals of any fields by numeric") enableSuite[GlutenJsonExpressionsSuite] - .exclude("$.store.book[*]") - .exclude("$.store.book[*].category") - .exclude("$.store.book[*].isbn") - .exclude("$.store.basket[*]") - .exclude("$.store.basket[*][0]") - .exclude("$.store.basket[0][*]") - .exclude("$.store.basket[*][*]") .exclude("$.store.basket[0][*].b") - .exclude("$.zip code") - .exclude("$.fb:testid") .exclude("preserve newlines") .exclude("escape") .exclude("$..no_recursive") diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index f91841b991c7..b979651ae2bf 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -385,16 +385,9 @@ class ClickHouseTestSettings extends BackendTestSettings { enableSuite[GlutenJoinSuite].exclude( "SPARK-36794: Ignore duplicated key when building relation for semi/anti hash join") enableSuite[GlutenJsonExpressionsSuite] - .exclude("$.store.book[*]") - .exclude("$.store.book[*].category") - .exclude("$.store.book[*].isbn") - .exclude("$.store.basket[*]") - .exclude("$.store.basket[*][0]") - .exclude("$.store.basket[0][*]") - .exclude("$.store.basket[*][*]") - .exclude("$.store.basket[0][*].b") - .exclude("$.zip code") - .exclude("$.fb:testid") + .exclude( + "$.store.basket[0][*].b" + ) // issue: https://github.com/apache/incubator-gluten/issues/8529 .exclude("from_json - invalid data") .exclude("from_json - input=object, schema=array, output=array of single row") .exclude("from_json - input=empty object, schema=array, output=array of single row with null") diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 9ebcadf53118..2c315d8aae89 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -385,16 +385,7 @@ class ClickHouseTestSettings extends BackendTestSettings { enableSuite[GlutenJoinSuite].exclude( "SPARK-36794: Ignore duplicated key when building relation for semi/anti hash join") enableSuite[GlutenJsonExpressionsSuite] - .exclude("$.store.book[*]") - .exclude("$.store.book[*].category") - .exclude("$.store.book[*].isbn") - .exclude("$.store.basket[*]") - .exclude("$.store.basket[*][0]") - .exclude("$.store.basket[0][*]") - .exclude("$.store.basket[*][*]") .exclude("$.store.basket[0][*].b") - .exclude("$.zip code") - .exclude("$.fb:testid") .exclude("from_json - invalid data") .exclude("from_json - input=object, schema=array, output=array of single row") .exclude("from_json - input=empty object, schema=array, output=array of single row with null") diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index f482ad921ee3..308c56f40548 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -385,16 +385,7 @@ class ClickHouseTestSettings extends BackendTestSettings { enableSuite[GlutenJoinSuite].exclude( "SPARK-36794: Ignore duplicated key when building relation for semi/anti hash join") enableSuite[GlutenJsonExpressionsSuite] - .exclude("$.store.book[*]") - .exclude("$.store.book[*].category") - .exclude("$.store.book[*].isbn") - .exclude("$.store.basket[*]") - .exclude("$.store.basket[*][0]") - .exclude("$.store.basket[0][*]") - .exclude("$.store.basket[*][*]") .exclude("$.store.basket[0][*].b") - .exclude("$.zip code") - .exclude("$.fb:testid") .exclude("from_json - invalid data") .exclude("from_json - input=object, schema=array, output=array of single row") .exclude("from_json - input=empty object, schema=array, output=array of single row with null")