Skip to content

Commit

Permalink
[GLUTEN-8325][CH] Fix miss matched result for $ and . in reg expr…
Browse files Browse the repository at this point in the history
…ession (apache#8345)

align the meaning of . and $ in vanila and ch
  • Loading branch information
lgbo-ustc authored Dec 26, 2024
1 parent 5d1d0ba commit 547f8b7
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,30 @@ class GlutenClickhouseStringFunctionsSuite extends GlutenClickHouseWholeStageTra
}
}

test("GLUTEN-8325 $. missmatched") {
withTable("regexp_string_end") {
sql("create table regexp_string_end(a String) using parquet")
sql("""
|insert into regexp_string_end
| values ('@abc'),('@abc\n'),('@abc\nsdd'), ('sfsd\n@abc'), ('sdfsdf\n@abc\n'),
| ('sdfsdf\n@abc\nsdf'), ('sdfsdf@abc\nsdf\n'), ('sdfsdf@abc'), ('sdfsdf@abc\n')
|""".stripMargin)
runQueryAndCompare("""
|select
|regexp_extract(a, '@(.*?)($)', 1),
|regexp_extract(a, '@(.*?)(f|$)', 1),
|regexp_extract(a, '^@(.*?)(f|$)', 1)
|from regexp_string_end""".stripMargin) { _ => }

runQueryAndCompare("""
|select
|regexp_extract(a, '@(.*)($)', 1),
|regexp_extract(a, '@(.*)(f|$)', 1),
|regexp_extract(a, '^@(.*?)(f|$)', 1)
|from regexp_string_end""".stripMargin) { _ => }
}
}

test("replace") {
val tableName = "replace_table"
withTable(tableName) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,16 @@
#include <DataTypes/DataTypeString.h>
#include <IO/ReadBufferFromString.h>
#include <Parser/FunctionParser.h>
#include <Poco/Logger.h>
#include <Common/logger_useful.h>
#include <Common/re2.h>

namespace DB
{
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
}

Expand All @@ -37,29 +40,26 @@ namespace local_engine
class FunctionParserRegexpExtract : public FunctionParser
{
public:
explicit FunctionParserRegexpExtract(ParserContextPtr parser_context_) : FunctionParser(parser_context_) {}
explicit FunctionParserRegexpExtract(ParserContextPtr parser_context_) : FunctionParser(parser_context_) { }
~FunctionParserRegexpExtract() override = default;

static constexpr auto name = "regexp_extract";
String getName() const override { return name; }

const ActionsDAG::Node * parse(
const substrait::Expression_ScalarFunction & substrait_func,
ActionsDAG & actions_dag) const override
const ActionsDAG::Node * parse(const substrait::Expression_ScalarFunction & substrait_func, ActionsDAG & actions_dag) const override
{
const auto & args = substrait_func.arguments();
if (args.size() != 3)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires 3 arguments", getName());
if(args[1].value().has_literal())

if (args[1].value().has_literal())
{
const auto & literal_expr = args[1].value().literal();
if (literal_expr.has_string())
{
std::string expr_str = literal_expr.string();
size_t expr_size = expr_str.size();
if (expr_str.data()[expr_size - 1] == '$')
expr_str.replace(expr_str.find_last_of("$"), 1, "(?:(\n)*)$");
/// FIXEDME: This only works for RE2
expr_str = adaptPatternForRE2(expr_str);

String sparkRegexp = adjustSparkRegexpRule(expr_str);
const auto * regex_expr_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(), sparkRegexp);
Expand All @@ -76,6 +76,21 @@ class FunctionParserRegexpExtract : public FunctionParser
}

private:
String adaptPatternForRE2(const String & pattern_) const
{
LOG_DEBUG(getLogger("FunctionParserRegexpExtract"), "xxx original pattern: {}", pattern_);
String res = pattern_;
// adaptation for $, see issue #8325. equal two cases in re2: $ and \n$, but not include strings which contains \n in middle.
static const std::string replaced_str = "($|\\\\n$)";
static const re2::RE2 replace_dollar_pattern("([^\\\\])(\\$)");
re2::RE2::GlobalReplace(&res, replace_dollar_pattern, "\\1" + replaced_str);
LOG_DEBUG(getLogger("FunctionParserRegexpExtract"), "xxx adaption for $: {}", res);

// adaption for `.` . Need to remove flag s.
res = "(?-s)" + res;
return res;
}

String adjustSparkRegexpRule(String & str) const
{
const auto left_bracket_pos = str.find('[');
Expand Down Expand Up @@ -142,7 +157,6 @@ class FunctionParserRegexpExtract : public FunctionParser
strs.pop();
strs.top().append("[").append(back);
}

return strs.top();
}
};
Expand Down

0 comments on commit 547f8b7

Please sign in to comment.