Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GLUTEN-8528][CH]Support approx_count_distinct #8550

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from

Conversation

taiyang-li
Copy link
Contributor

What changes were proposed in this pull request?

(Please fill in changes proposed in this fix)

(Fixes: #8528)

How was this patch tested?

(Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests)

(If this patch involves UI changes, please attach a screenshot; otherwise, remove this)

@taiyang-li taiyang-li changed the title [GLUTEN-8528][CH]Support approx count distinct [GLUTEN-8528][CH]Support approx_count_distinct Jan 16, 2025
@github-actions github-actions bot added CORE works for Gluten Core CLICKHOUSE labels Jan 16, 2025
Copy link

#8528

Copy link

Run Gluten ClickHouse CI on ARM

@taiyang-li
Copy link
Contributor Author

@CodiumAI-Agent /review

@CodiumAI-Agent
Copy link

PR Reviewer Guide 🔍

Here are some key observations to aid the review process:

🎫 Ticket compliance analysis 🔶

8528 - Partially compliant

Compliant requirements:

  • Support the approx_count_distinct function.

Non-compliant requirements:

[]

Requires further human verification:

[]

⏱️ Estimated effort to review: 4 🔵🔵🔵🔵⚪
🧪 PR contains tests
🔒 No security concerns identified
⚡ Recommended focus areas for review

Possible Issue

Verify that the logic for handling HyperLogLogPlusPlus in the Partial and Complete aggregation modes is correct, especially the addition of relativeSDLiteral to the nodes.

val nodes = aggregateFunc.children.toList.map(
  expr => {
    ExpressionConverter
      .replaceWithExpressionTransformer(expr, child.output)
      .doTransform(args)
  })

val extraNodes = aggregateFunc match {
  case hll: HyperLogLogPlusPlus =>
    val relativeSDLiteral = Literal(hll.relativeSD)
    Seq(
      ExpressionConverter
        .replaceWithExpressionTransformer(relativeSDLiteral, child.output)
        .doTransform(args))
  case _ => Seq.empty
}

nodes ++ extraNodes
Edge Case Handling

Ensure that the parser correctly handles all edge cases, such as invalid argument counts or non-literal second arguments for approx_count_distinct.

Array parseFunctionParameters(
    const CommonFunctionInfo & func_info, ActionsDAG::NodeRawConstPtrs & arg_nodes, ActionsDAG & actions_dag) const override
{
    if (func_info.phase == substrait::AGGREGATION_PHASE_INITIAL_TO_INTERMEDIATE
        || func_info.phase == substrait::AGGREGATION_PHASE_INITIAL_TO_RESULT
        || func_info.phase == substrait::AGGREGATION_PHASE_UNSPECIFIED)
    {
        const auto & arguments = func_info.arguments;
        const size_t num_args = arguments.size();
        const size_t num_nodes = arg_nodes.size();
        if (num_args != num_nodes || num_args > 2 || num_args < 1 || num_nodes > 2 || num_nodes < 1)
            throw Exception(
                ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
                "Function {} takes 1 or 2 arguments in phase {}",
                getName(),
                magic_enum::enum_name(func_info.phase));

        Array params;
        if (num_args == 2)
        {
            const auto & relative_sd_arg = arguments[1].value();
            if (relative_sd_arg.has_literal())
            {
                auto [_, field] = parseLiteral(relative_sd_arg.literal());
                params.push_back(std::move(field));
            }
            else
                throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second argument of function {} must be literal", getName());
        }
        else
        {
            params.push_back(0.05);
        }

        const auto & expr_arg = arg_nodes[0];
        const auto * is_null_node = toFunctionNode(actions_dag, "isNull", {expr_arg});
        const auto * hash_node = toFunctionNode(actions_dag, "sparkXxHash64", {expr_arg});
        const auto * null_node
            = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeNullable>(std::make_shared<DataTypeUInt64>()), {});
        const auto * if_node = toFunctionNode(actions_dag, "if", {is_null_node, null_node, hash_node});
        /// Replace the first argument expr with if(isNull(expr), null, sparkXxHash64(expr))
        arg_nodes[0] = if_node;
        arg_nodes.resize(1);

        return params;
    }
    else
    {
        if (arg_nodes.size() != 1)
            throw Exception(
                ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
                "Function {} takes 1 argument in phase {}",
                getName(),
                magic_enum::enum_name(func_info.phase));

        const auto & result_type = arg_nodes[0]->result_type;
        const auto * aggregate_function_type = checkAndGetDataType<DataTypeAggregateFunction>(result_type.get());
        if (!aggregate_function_type)
            throw Exception(
                ErrorCodes::BAD_ARGUMENTS,
                "The first argument type of function {} in phase {} must be AggregateFunction, but is {}",
                getName(),
                magic_enum::enum_name(func_info.phase),
                result_type->getName());

        return aggregate_function_type->getParameters();
    }
Test Coverage

Confirm that the tests cover all critical scenarios for HyperLogLogPlusPlus, including serialization, merging, and edge cases.

#include <gtest/gtest.h>
#include <AggregateFunctions/AggregateFunctionUniqHyperLogLogPlusPlus.h>
#include "IO/ReadBufferFromString.h"

using namespace DB;

static std::vector<UInt64> random_uint64s
    = {17956993516945311251ULL,
       4306050051188505054ULL,
       14289061765075743502ULL,
       16763375724458316157ULL,
       6144297519955185930ULL,
       18446472757487308114ULL,
       16923578592198257123ULL,
       13557354668567515845ULL,
       15328387702200001967ULL,
       15878166530370497646ULL};

static void initSmallHLL(HyperLogLogPlusPlusData & hll)
{
    for (auto x : random_uint64s)
        hll.add(x);
}

static void initLargeHLL(HyperLogLogPlusPlusData & hll)
{
    for (auto x : random_uint64s)
    {
        for (size_t i = 0; i < 100; ++i)
            hll.add(x * (i+1));
    }
}

TEST(HyperLogLogPlusPlusDataTest, Small)
{
    HyperLogLogPlusPlusData hll;
    initSmallHLL(hll);
    EXPECT_EQ(hll.query(), 10);
}

TEST(HyperLogLogPlusPlusDataTest, Large)
{
    HyperLogLogPlusPlusData hll;
    initLargeHLL(hll);
    EXPECT_EQ(hll.query(), 806);
}

TEST(HyperLogLogPlusPlusDataTest, Merge) {
    HyperLogLogPlusPlusData hll1;
    initSmallHLL(hll1);

    HyperLogLogPlusPlusData hll2;
    initLargeHLL(hll2);

    hll1.merge(hll2);
    EXPECT_EQ(hll1.query(), 806);
}

TEST(HyperLogLogPlusPlusDataTest, SerializeAndDeserialize) {
    HyperLogLogPlusPlusData hll1;
    initLargeHLL(hll1);

    WriteBufferFromOwnString write_buffer;
    hll1.serialize(write_buffer);

    ReadBufferFromString read_buffer(write_buffer.str());
    HyperLogLogPlusPlusData hll2;
    hll2.deserialize(read_buffer);

    EXPECT_EQ(hll2.query(), 806);
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
CLICKHOUSE CORE works for Gluten Core
Projects
None yet
Development

Successfully merging this pull request may close these issues.

[CH] support approx_count_distinct
2 participants