CodiumAI-Agent commented on PR #8550: URL: https://github.com/apache/incubator-gluten/pull/8550#issuecomment-2636325426
## PR Reviewer Guide ๐ Here are some key observations to aid the review process: <table> <tr><td> **๐ซ Ticket compliance analysis ๐ถ** **[8528](https://github.com/apache/incubator-gluten/issues/8528) - Partially compliant** Compliant requirements: - Support the `approx_count_distinct` function. Non-compliant requirements: [] Requires further human verification: [] </td></tr> <tr><td>โฑ๏ธ <strong>Estimated effort to review</strong>: 4 ๐ต๐ต๐ต๐ตโช</td></tr> <tr><td>๐งช <strong>PR contains tests</strong></td></tr> <tr><td>๐ <strong>No security concerns identified</strong></td></tr> <tr><td>โก <strong>Recommended focus areas for review</strong><br><br> <details><summary><a href='https://github.com/apache/incubator-gluten/pull/8550/files#diff-a600bba8c3e169329dafd07a3806300a4be4371e1808dea9389a2606293fcdd2R272-R289'><strong>Possible Issue</strong></a> Verify that the logic for handling `HyperLogLogPlusPlus` in the `Partial` and `Complete` aggregation modes is correct, especially the addition of `relativeSDLiteral` to the nodes.</summary> ```scala val nodes = aggregateFunc.children.toList.map( expr => { ExpressionConverter .replaceWithExpressionTransformer(expr, child.output) .doTransform(args) }) val extraNodes = aggregateFunc match { case hll: HyperLogLogPlusPlus => val relativeSDLiteral = Literal(hll.relativeSD) Seq( ExpressionConverter .replaceWithExpressionTransformer(relativeSDLiteral, child.output) .doTransform(args)) case _ => Seq.empty } nodes ++ extraNodes ``` </details> <details><summary><a href='https://github.com/apache/incubator-gluten/pull/8550/files#diff-a4849d6dba3b9258b586813a72c5a2ec05f82b223c741611c662506615a221eeR75-R141'><strong>Edge Case Handling</strong></a> Ensure that the parser correctly handles all edge cases, such as invalid argument counts or non-literal second arguments for `approx_count_distinct`.</summary> ```c++ Array parseFunctionParameters( const CommonFunctionInfo & func_info, ActionsDAG::NodeRawConstPtrs & arg_nodes, ActionsDAG & actions_dag) const override { if (func_info.phase == substrait::AGGREGATION_PHASE_INITIAL_TO_INTERMEDIATE || func_info.phase == substrait::AGGREGATION_PHASE_INITIAL_TO_RESULT || func_info.phase == substrait::AGGREGATION_PHASE_UNSPECIFIED) { const auto & arguments = func_info.arguments; const size_t num_args = arguments.size(); const size_t num_nodes = arg_nodes.size(); if (num_args != num_nodes || num_args > 2 || num_args < 1 || num_nodes > 2 || num_nodes < 1) throw Exception( ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} takes 1 or 2 arguments in phase {}", getName(), magic_enum::enum_name(func_info.phase)); Array params; if (num_args == 2) { const auto & relative_sd_arg = arguments[1].value(); if (relative_sd_arg.has_literal()) { auto [_, field] = parseLiteral(relative_sd_arg.literal()); params.push_back(std::move(field)); } else throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second argument of function {} must be literal", getName()); } else { params.push_back(0.05); } const auto & expr_arg = arg_nodes[0]; const auto * is_null_node = toFunctionNode(actions_dag, "isNull", {expr_arg}); const auto * hash_node = toFunctionNode(actions_dag, "sparkXxHash64", {expr_arg}); const auto * null_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeNullable>(std::make_shared<DataTypeUInt64>()), {}); const auto * if_node = toFunctionNode(actions_dag, "if", {is_null_node, null_node, hash_node}); /// Replace the first argument expr with if(isNull(expr), null, sparkXxHash64(expr)) arg_nodes[0] = if_node; arg_nodes.resize(1); return params; } else { if (arg_nodes.size() != 1) throw Exception( ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} takes 1 argument in phase {}", getName(), magic_enum::enum_name(func_info.phase)); const auto & result_type = arg_nodes[0]->result_type; const auto * aggregate_function_type = checkAndGetDataType<DataTypeAggregateFunction>(result_type.get()); if (!aggregate_function_type) throw Exception( ErrorCodes::BAD_ARGUMENTS, "The first argument type of function {} in phase {} must be AggregateFunction, but is {}", getName(), magic_enum::enum_name(func_info.phase), result_type->getName()); return aggregate_function_type->getParameters(); } ``` </details> <details><summary><a href='https://github.com/apache/incubator-gluten/pull/8550/files#diff-a248447f92eab84bf5b4420bec4999aa77dd1b0d9e7d6b87efe6c8b22ad46822R1-R72'><strong>Test Coverage</strong></a> Confirm that the tests cover all critical scenarios for `HyperLogLogPlusPlus`, including serialization, merging, and edge cases.</summary> ```c++ #include <gtest/gtest.h> #include <AggregateFunctions/AggregateFunctionUniqHyperLogLogPlusPlus.h> #include "IO/ReadBufferFromString.h" using namespace DB; static std::vector<UInt64> random_uint64s = {17956993516945311251ULL, 4306050051188505054ULL, 14289061765075743502ULL, 16763375724458316157ULL, 6144297519955185930ULL, 18446472757487308114ULL, 16923578592198257123ULL, 13557354668567515845ULL, 15328387702200001967ULL, 15878166530370497646ULL}; static void initSmallHLL(HyperLogLogPlusPlusData & hll) { for (auto x : random_uint64s) hll.add(x); } static void initLargeHLL(HyperLogLogPlusPlusData & hll) { for (auto x : random_uint64s) { for (size_t i = 0; i < 100; ++i) hll.add(x * (i+1)); } } TEST(HyperLogLogPlusPlusDataTest, Small) { HyperLogLogPlusPlusData hll; initSmallHLL(hll); EXPECT_EQ(hll.query(), 10); } TEST(HyperLogLogPlusPlusDataTest, Large) { HyperLogLogPlusPlusData hll; initLargeHLL(hll); EXPECT_EQ(hll.query(), 806); } TEST(HyperLogLogPlusPlusDataTest, Merge) { HyperLogLogPlusPlusData hll1; initSmallHLL(hll1); HyperLogLogPlusPlusData hll2; initLargeHLL(hll2); hll1.merge(hll2); EXPECT_EQ(hll1.query(), 806); } TEST(HyperLogLogPlusPlusDataTest, SerializeAndDeserialize) { HyperLogLogPlusPlusData hll1; initLargeHLL(hll1); WriteBufferFromOwnString write_buffer; hll1.serialize(write_buffer); ReadBufferFromString read_buffer(write_buffer.str()); HyperLogLogPlusPlusData hll2; hll2.deserialize(read_buffer); EXPECT_EQ(hll2.query(), 806); } ``` </details> </td></tr> </table> -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
