CodiumAI-Agent commented on PR #9047: URL: https://github.com/apache/incubator-gluten/pull/9047#issuecomment-2735763563
## PR Reviewer Guide ๐ Here are some key observations to aid the review process: <table> <tr><td> **๐ซ Ticket compliance analysis ๐ถ** **[9044](https://github.com/apache/incubator-gluten/issues/9044) - Partially compliant** Compliant requirements: - Correct handling of virtual columns with new project step. - Removal of redundant explode/split logic in commands. Non-compliant requirements: - None Requires further human verification: - Validate downstream impacts of altered virtual column processing. </td></tr> <tr><td>โฑ๏ธ <strong>Estimated effort to review</strong>: 4 ๐ต๐ต๐ต๐ตโช</td></tr> <tr><td>๐งช <strong>PR contains tests</strong></td></tr> <tr><td>๐ <strong>No security concerns identified</strong></td></tr> <tr><td>โก <strong>Recommended focus areas for review</strong><br><br> <details><summary><a href='https://github.com/apache/incubator-gluten/pull/9047/files#diff-0ffbe729d1267ba46b9657884656a86a31afa4a42204b74e34f12ace7c45fdedR151-R160'><strong>Behavior Change</strong></a> The updated logic in the UpdateCommand omits the explode(split(...)) pattern when processing input_file_name. Review to ensure that this change yields consistent and correct file name parsing across edge cases. </summary> ```scala // that are part of Update condition Nil } else if (dataPredicates.isEmpty) { // Case 2: Update all the rows from the files that are in the specified partitions // when the data filter is empty candidateFiles .map(f => TouchedFileWithDV(f.path, f, newDeletionVector = null, deletedRows = 0L)) } else { // Case 3: Find all the affected files using the user-specified condition val fileIndex = new TahoeBatchFileIndex( ``` </details> <details><summary><a href='https://github.com/apache/incubator-gluten/pull/9047/files#diff-12aad738c70f1bc1d2a5cb226664b69484165258b1e8241205553cb0018b9b85R169-R229'><strong>Virtual Columns</strong></a> A new virtual columns project step has been added to insert and then remove virtual columns. Confirm that the concatenation using the new function and subsequent adjustments do not inadvertently drop required information or affect query results. </summary> ```c++ steps.emplace_back(PlanUtil::renamePlanHeader( *query_plan, [&original_input](const Block & input, NamesWithAliases & aliases) { aliases = buildNamesWithAliases(input, original_input); }, "Rename MergeTree Output")); } // add virtual columns step if (auto step = MergeTreeRelParser::addVirtualColumnsProjectStep(*query_plan, rel, merge_tree_table.absolute_path); step.has_value()) steps.emplace_back(step.value()); return query_plan; } std::optional<DB::IQueryPlanStep *> MergeTreeRelParser::addVirtualColumnsProjectStep(DB::QueryPlan & plan, const substrait::ReadRel & rel, const std::string & path) { if (!rel.has_base_schema() || rel.base_schema().names_size() < 1) return std::nullopt; bool contains_input_file_name = false; bool contains_input_file_block_start = false; bool contains_input_file_block_length = false; for (const auto & name : rel.base_schema().names()) { if (name == FileMetaColumns::INPUT_FILE_NAME) contains_input_file_name = true; if (name == FileMetaColumns::INPUT_FILE_BLOCK_START) contains_input_file_block_start = true; if (name == FileMetaColumns::INPUT_FILE_BLOCK_LENGTH) contains_input_file_block_length = true; } if (!contains_input_file_name && !contains_input_file_block_start && !contains_input_file_block_length) return std::nullopt; const auto & header = plan.getCurrentHeader(); DB::ActionsDAG actions_dag(header.getNamesAndTypesList()); if (contains_input_file_name) { auto concat_func = FunctionFactory::instance().get("concat", context); DB::ActionsDAG::NodeRawConstPtrs args; const auto string_type = std::make_shared<DB::DataTypeString>(); const auto * path_node = &actions_dag.addColumn(DB::ColumnWithTypeAndName(string_type->createColumnConst(1, path + "/"), string_type, "path")); args.emplace_back(path_node); const auto & part_name = actions_dag.findInOutputs(VIRTUAL_COLUMN_PART); args.emplace_back(&part_name); actions_dag.addOrReplaceInOutputs(actions_dag.addFunction(concat_func, args, FileMetaColumns::INPUT_FILE_NAME)); } if (contains_input_file_block_start) { const auto int64_type = std::make_shared<DB::DataTypeInt64>(); actions_dag.addOrReplaceInOutputs(actions_dag.addColumn(DB::ColumnWithTypeAndName(int64_type->createColumnConst(1, -1), int64_type, FileMetaColumns::INPUT_FILE_BLOCK_START))); } if (contains_input_file_block_length) { const auto int64_type = std::make_shared<DB::DataTypeInt64>(); actions_dag.addOrReplaceInOutputs(actions_dag.addColumn(DB::ColumnWithTypeAndName(int64_type->createColumnConst(1, -1), int64_type, FileMetaColumns::INPUT_FILE_BLOCK_LENGTH))); } if (contains_input_file_name) actions_dag.removeUnusedResult(VIRTUAL_COLUMN_PART); auto step = std::make_unique<DB::ExpressionStep>(plan.getCurrentHeader(), std::move(actions_dag)); step->setStepDescription("Add virtual columns"); std::optional<DB::IQueryPlanStep *> result = step.get(); plan.addStep(std::move(step)); ``` </details> </td></tr> </table> -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
