CodiumAI-Agent commented on PR #8550:
URL: 
https://github.com/apache/incubator-gluten/pull/8550#issuecomment-2636325426

   ## PR Reviewer Guide ๐Ÿ”
   
   Here are some key observations to aid the review process:
   
   <table>
   <tr><td>
   
   **๐ŸŽซ Ticket compliance analysis ๐Ÿ”ถ**
   
   
   
   **[8528](https://github.com/apache/incubator-gluten/issues/8528) - Partially 
compliant**
   
   Compliant requirements:
   
   - Support the `approx_count_distinct` function.
   
   Non-compliant requirements:
   
   []
   
   Requires further human verification:
   
   []
   
   
   
   </td></tr>
   <tr><td>โฑ๏ธ&nbsp;<strong>Estimated effort to review</strong>: 4 
๐Ÿ”ต๐Ÿ”ต๐Ÿ”ต๐Ÿ”ตโšช</td></tr>
   <tr><td>๐Ÿงช&nbsp;<strong>PR contains tests</strong></td></tr>
   <tr><td>๐Ÿ”’&nbsp;<strong>No security concerns identified</strong></td></tr>
   <tr><td>โšก&nbsp;<strong>Recommended focus areas for review</strong><br><br>
   
   <details><summary><a 
href='https://github.com/apache/incubator-gluten/pull/8550/files#diff-a600bba8c3e169329dafd07a3806300a4be4371e1808dea9389a2606293fcdd2R272-R289'><strong>Possible
 Issue</strong></a>
   
   Verify that the logic for handling `HyperLogLogPlusPlus` in the `Partial` 
and `Complete` aggregation modes is correct, especially the addition of 
`relativeSDLiteral` to the nodes.</summary>
   
   ```scala
   val nodes = aggregateFunc.children.toList.map(
     expr => {
       ExpressionConverter
         .replaceWithExpressionTransformer(expr, child.output)
         .doTransform(args)
     })
   
   val extraNodes = aggregateFunc match {
     case hll: HyperLogLogPlusPlus =>
       val relativeSDLiteral = Literal(hll.relativeSD)
       Seq(
         ExpressionConverter
           .replaceWithExpressionTransformer(relativeSDLiteral, child.output)
           .doTransform(args))
     case _ => Seq.empty
   }
   
   nodes ++ extraNodes
   ```
   
   </details>
   
   <details><summary><a 
href='https://github.com/apache/incubator-gluten/pull/8550/files#diff-a4849d6dba3b9258b586813a72c5a2ec05f82b223c741611c662506615a221eeR75-R141'><strong>Edge
 Case Handling</strong></a>
   
   Ensure that the parser correctly handles all edge cases, such as invalid 
argument counts or non-literal second arguments for 
`approx_count_distinct`.</summary>
   
   ```c++
   Array parseFunctionParameters(
       const CommonFunctionInfo & func_info, ActionsDAG::NodeRawConstPtrs & 
arg_nodes, ActionsDAG & actions_dag) const override
   {
       if (func_info.phase == 
substrait::AGGREGATION_PHASE_INITIAL_TO_INTERMEDIATE
           || func_info.phase == substrait::AGGREGATION_PHASE_INITIAL_TO_RESULT
           || func_info.phase == substrait::AGGREGATION_PHASE_UNSPECIFIED)
       {
           const auto & arguments = func_info.arguments;
           const size_t num_args = arguments.size();
           const size_t num_nodes = arg_nodes.size();
           if (num_args != num_nodes || num_args > 2 || num_args < 1 || 
num_nodes > 2 || num_nodes < 1)
               throw Exception(
                   ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
                   "Function {} takes 1 or 2 arguments in phase {}",
                   getName(),
                   magic_enum::enum_name(func_info.phase));
   
           Array params;
           if (num_args == 2)
           {
               const auto & relative_sd_arg = arguments[1].value();
               if (relative_sd_arg.has_literal())
               {
                   auto [_, field] = parseLiteral(relative_sd_arg.literal());
                   params.push_back(std::move(field));
               }
               else
                   throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second argument 
of function {} must be literal", getName());
           }
           else
           {
               params.push_back(0.05);
           }
   
           const auto & expr_arg = arg_nodes[0];
           const auto * is_null_node = toFunctionNode(actions_dag, "isNull", 
{expr_arg});
           const auto * hash_node = toFunctionNode(actions_dag, 
"sparkXxHash64", {expr_arg});
           const auto * null_node
               = addColumnToActionsDAG(actions_dag, 
std::make_shared<DataTypeNullable>(std::make_shared<DataTypeUInt64>()), {});
           const auto * if_node = toFunctionNode(actions_dag, "if", 
{is_null_node, null_node, hash_node});
           /// Replace the first argument expr with if(isNull(expr), null, 
sparkXxHash64(expr))
           arg_nodes[0] = if_node;
           arg_nodes.resize(1);
   
           return params;
       }
       else
       {
           if (arg_nodes.size() != 1)
               throw Exception(
                   ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
                   "Function {} takes 1 argument in phase {}",
                   getName(),
                   magic_enum::enum_name(func_info.phase));
   
           const auto & result_type = arg_nodes[0]->result_type;
           const auto * aggregate_function_type = 
checkAndGetDataType<DataTypeAggregateFunction>(result_type.get());
           if (!aggregate_function_type)
               throw Exception(
                   ErrorCodes::BAD_ARGUMENTS,
                   "The first argument type of function {} in phase {} must be 
AggregateFunction, but is {}",
                   getName(),
                   magic_enum::enum_name(func_info.phase),
                   result_type->getName());
   
           return aggregate_function_type->getParameters();
       }
   ```
   
   </details>
   
   <details><summary><a 
href='https://github.com/apache/incubator-gluten/pull/8550/files#diff-a248447f92eab84bf5b4420bec4999aa77dd1b0d9e7d6b87efe6c8b22ad46822R1-R72'><strong>Test
 Coverage</strong></a>
   
   Confirm that the tests cover all critical scenarios for 
`HyperLogLogPlusPlus`, including serialization, merging, and edge 
cases.</summary>
   
   ```c++
   #include <gtest/gtest.h>
   #include <AggregateFunctions/AggregateFunctionUniqHyperLogLogPlusPlus.h>
   #include "IO/ReadBufferFromString.h"
   
   using namespace DB;
   
   static std::vector<UInt64> random_uint64s
       = {17956993516945311251ULL,
          4306050051188505054ULL,
          14289061765075743502ULL,
          16763375724458316157ULL,
          6144297519955185930ULL,
          18446472757487308114ULL,
          16923578592198257123ULL,
          13557354668567515845ULL,
          15328387702200001967ULL,
          15878166530370497646ULL};
   
   static void initSmallHLL(HyperLogLogPlusPlusData & hll)
   {
       for (auto x : random_uint64s)
           hll.add(x);
   }
   
   static void initLargeHLL(HyperLogLogPlusPlusData & hll)
   {
       for (auto x : random_uint64s)
       {
           for (size_t i = 0; i < 100; ++i)
               hll.add(x * (i+1));
       }
   }
   
   TEST(HyperLogLogPlusPlusDataTest, Small)
   {
       HyperLogLogPlusPlusData hll;
       initSmallHLL(hll);
       EXPECT_EQ(hll.query(), 10);
   }
   
   TEST(HyperLogLogPlusPlusDataTest, Large)
   {
       HyperLogLogPlusPlusData hll;
       initLargeHLL(hll);
       EXPECT_EQ(hll.query(), 806);
   }
   
   TEST(HyperLogLogPlusPlusDataTest, Merge) {
       HyperLogLogPlusPlusData hll1;
       initSmallHLL(hll1);
   
       HyperLogLogPlusPlusData hll2;
       initLargeHLL(hll2);
   
       hll1.merge(hll2);
       EXPECT_EQ(hll1.query(), 806);
   }
   
   TEST(HyperLogLogPlusPlusDataTest, SerializeAndDeserialize) {
       HyperLogLogPlusPlusData hll1;
       initLargeHLL(hll1);
   
       WriteBufferFromOwnString write_buffer;
       hll1.serialize(write_buffer);
   
       ReadBufferFromString read_buffer(write_buffer.str());
       HyperLogLogPlusPlusData hll2;
       hll2.deserialize(read_buffer);
   
       EXPECT_EQ(hll2.query(), 806);
   }
   
   ```
   
   </details>
   
   </td></tr>
   </table>
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to