This is an automated email from the ASF dual-hosted git repository.

englefly pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new c655b1a64d5 [fix](lazy materialize) set uniqueId for lazy materialized slots (#61029)
c655b1a64d5 is described below

commit c655b1a64d5e2fd21cbda2e28b6fcdd48b918873
Author: minghong <[email protected]>
AuthorDate: Tue Mar 17 13:05:42 2026 +0800

    [fix](lazy materialize) set uniqueId for lazy materialized slots (#61029)
    
    This PR has two functional changes and one large documentation/renaming effort:
    
    BE bug fix (materialization_opertor.cpp): Moves block_maps inside the per-relation loop to prevent stale entries when a backend returns a non-empty block for one relation but an empty block for another. This is a correct and important fix.
    
    FE fix (PhysicalLazyMaterialize.java): Uses SlotReference.withColumn(originalColumn) so that createSlotDesc can write colUniqueId into the thrift SlotDescriptor. This is the core fix referenced by the PR title. The approach is sound — withColumn creates an immutable copy with the column set, which flows through createSlotDesc → setColumn → toThrift → setColUniqueId.
    
    Renaming and documentation: lazyTableIdxs/idxs → lazyBaseColumnIndices/columnIdxsLists with extensive Javadoc. The naming is clearer.
---
 be/src/exec/operator/materialization_opertor.cpp   |  9 +++-
 .../glue/translator/PhysicalPlanTranslator.java    |  2 +-
 .../plans/physical/PhysicalLazyMaterialize.java    | 57 ++++++++++++++++++----
 .../apache/doris/planner/MaterializationNode.java  | 54 ++++++++++++++++++--
 .../hive/test_hive_topn_lazy_mat.groovy            |  6 +--
 .../tvf/test_tvf_topn_lazy_mat.groovy              |  4 +-
 6 files changed, 109 insertions(+), 23 deletions(-)

diff --git a/be/src/exec/operator/materialization_opertor.cpp b/be/src/exec/operator/materialization_opertor.cpp
index 39c4fbcbb76..f75e445f6f5 100644
--- a/be/src/exec/operator/materialization_opertor.cpp
+++ b/be/src/exec/operator/materialization_opertor.cpp
@@ -54,9 +54,14 @@ void MaterializationSharedState::get_block(Block* block) {
 }
 
 Status MaterializationSharedState::merge_multi_response() {
-    std::unordered_map<int64_t, std::pair<Block, int>> block_maps;
-
     for (int i = 0; i < block_order_results.size(); ++i) {
+        // block_maps must be rebuilt for each relation (each i), because a backend that
+        // returned a non-empty block for relation i-1 may return an empty block for
+        // relation i (e.g. it holds rows only from one of the two tables in a UNION ALL).
+        // Keeping block_maps across iterations would leave stale entries from the previous
+        // relation and miss entries for the current one, causing the
+        // "backend_id not found in block_maps" error.
+        std::unordered_map<int64_t, std::pair<Block, int>> block_maps;
         for (auto& [backend_id, rpc_struct] : rpc_struct_map) {
             Block partial_block;
             size_t uncompressed_size = 0;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java
index 2c1518fdbaa..4100fab5c23 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java
@@ -2779,7 +2779,7 @@ public class PhysicalPlanTranslator extends DefaultPlanVisitor<PlanFragment, Pla
 
         materializeNode.setLazyColumns(materialize.getLazyColumns());
         materializeNode.setLocations(materialize.getLazySlotLocations());
-        materializeNode.setIdxs(materialize.getlazyTableIdxs());
+        materializeNode.setColumnIdxsLists(materialize.getLazyBaseColumnIndices());
 
         List<Boolean> rowStoreFlags = new ArrayList<>();
         for (Relation relation : materialize.getRelations()) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalLazyMaterialize.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalLazyMaterialize.java
index 5738a377823..25d39cc02b1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalLazyMaterialize.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalLazyMaterialize.java
@@ -62,11 +62,44 @@ public class PhysicalLazyMaterialize<CHILD_TYPE extends Plan> extends PhysicalUn
 
     private final List<Slot> materializeInput;
     private final List<Slot> materializeOutput;
-    // used for BE
+    /**
+     * The following four fields are used by BE to perform the actual lazy fetch.
+     * They are indexed by relation: index i corresponds to relations.get(i).
+     *
+     * Example:
+     *   SQL: SELECT t1.a, t1.b, t2.c, t2.d FROM t1 JOIN t2 ON ... WHERE t1.a > 5
+     *   Assume t1.b and t2.d are lazily materialized (fetched after filtering).
+     *
+     *   materializedSlots (non-lazy, computed eagerly) = [t1.a, t2.c]
+     *   Output slot order = [t1.a(0), t2.c(1), t1.b(2), t2.d(3)]
+     *
+     *   rowIdList         = [row_id_t1, row_id_t2]
+     *                       The row-id slots passed to BE to locate the original rows.
+     *
+     *   relations         = [t1, t2]
+     *
+     *   lazyColumns       = [[Column(b)],          [Column(d)]]
+     *                       For each relation, the Column objects to be lazily fetched.
+     *
+     *   lazyBaseColumnIndices = [[colIdxOf(b) in t1],  [colIdxOf(d) in t2]]
+     *                       For each relation, the physical column index inside the table
+     *                       for each lazy column (used by BE to locate the column on disk).
+     *
+     *   lazySlotLocations = [[2],                  [3]]
+     *                       A two-level array: the outer level is indexed by relation
+     *                       (same as relations / rowIdList), and the inner level lists the
+     *                       output-tuple position for each lazy column of that relation.
+     *                       Two levels are needed because a single relation can have
+     *                       multiple lazy columns.  For example, if both t1.b and t1.e
+     *                       were lazy, the entry for t1 would be [2, 4] (positions of b
+     *                       and e in the output tuple), while t2 remains [3].
+     *                       BE uses each position to know which output slot to fill in
+     *                       after fetching the column value from disk.
+     */
     private final List<Slot> rowIdList;
     private List<List<Column>> lazyColumns = new ArrayList<>();
     private List<List<Integer>> lazySlotLocations = new ArrayList<>();
-    private List<List<Integer>> lazyTableIdxs = new ArrayList<>();
+    private List<List<Integer>> lazyBaseColumnIndices = new ArrayList<>();
 
     private final List<Relation> relations;
 
@@ -101,7 +134,7 @@ public class PhysicalLazyMaterialize<CHILD_TYPE extends Plan> extends PhysicalUn
         this.materializedSlots = ImmutableList.copyOf(materializedSlots);
         this.materializeMap = materializeMap;
         lazySlotLocations = new ArrayList<>();
-        lazyTableIdxs = new ArrayList<>();
+        lazyBaseColumnIndices = new ArrayList<>();
         lazyColumns = new ArrayList<>();
 
         ImmutableList.Builder<Slot> outputBuilder = ImmutableList.builder();
@@ -126,15 +159,19 @@ public class PhysicalLazyMaterialize<CHILD_TYPE extends Plan> extends PhysicalUn
 
             List<Column> lazyColumnForRel = new ArrayList<>();
             lazyColumns.add(lazyColumnForRel);
-            List<Integer> lazyIdxForRel = new ArrayList<>();
-            lazyTableIdxs.add(lazyIdxForRel);
+            List<Integer> lazyBaseColumnIdxForRel = new ArrayList<>();
+            lazyBaseColumnIndices.add(lazyBaseColumnIdxForRel);
 
             List<Integer> lazySlotLocationForRel = new ArrayList<>();
             lazySlotLocations.add(lazySlotLocationForRel);
             for (Slot lazySlot : relationToLazySlotMap.get(rel)) {
-                outputBuilder.add(lazySlot);
-                lazyColumnForRel.add(materializeMap.get(lazySlot).baseSlot.getOriginalColumn().get());
-                lazyIdxForRel.add(relationTable.getBaseColumnIdxByName(lazySlot.getName()));
+                // Set originalColumn on the lazy slot so that createSlotDesc can write
+                // colUniqueId into the thrift SlotDescriptor — BE needs it to resolve
+                // the column during remote fetch.
+                Column originalColumn = materializeMap.get(lazySlot).baseSlot.getOriginalColumn().get();
+                outputBuilder.add(((SlotReference) lazySlot).withColumn(originalColumn));
+                lazyColumnForRel.add(originalColumn);
+                lazyBaseColumnIdxForRel.add(relationTable.getBaseColumnIdxByName(lazySlot.getName()));
                 lazySlotLocationForRel.add(loc);
                 loc++;
             }
@@ -250,8 +287,8 @@ public class PhysicalLazyMaterialize<CHILD_TYPE extends Plan> extends PhysicalUn
         return lazySlotLocations;
     }
 
-    public List<List<Integer>> getlazyTableIdxs() {
-        return lazyTableIdxs;
+    public List<List<Integer>> getLazyBaseColumnIndices() {
+        return lazyBaseColumnIndices;
     }
 
     public List<Slot> getRowIds() {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/MaterializationNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/MaterializationNode.java
index 40d16a88faf..1acb473d851 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/planner/MaterializationNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/planner/MaterializationNode.java
@@ -71,6 +71,50 @@ import java.util.List;
  * }
  */
 public class MaterializationNode extends PlanNode {
+    /**
+     * Example to illustrate the fields below:
+     *
+     *   SQL:  SELECT pk, col_date FROM t ORDER BY pk LIMIT 10
+     *         (col_date is chosen for lazy materialization)
+     *
+     *   The child plan produces:  [pk(eager), row_id_t]
+     *   row_id_t is a "global row-id" slot that encodes where the real col_date lives on disk.
+     *
+     *   materializeTupleDescriptor (intermediate tuple, e.g. tuple 5):
+     *     index 0 → pk          (eager slot, already in the child block)
+     *     index 1 → col_date    (lazy slot, to be fetched remotely via row_id)
+     *
+     *   outputTupleDesc (final output tuple, e.g. tuple 6):
+     *     contains the columns actually returned to the parent node after
+     *     applying projectList on top of materializeTupleDescriptor.
+     *     In simple cases it is the same set of columns.
+     *     Example: projectList = [pk#0, col_date#1]  →  outputTupleDesc = {pk, col_date}
+     *
+     *   Relationship:
+     *     materializeTupleDescriptor  =  "working" tuple that holds eager + lazy columns,
+     *                                    used to build the RPC fetch request and place the
+     *                                    fetched values back into the block.
+     *     outputTupleDesc             =  "public" tuple visible to the parent; produced by
+     *                                    projecting materializeTupleDescriptor through projectList.
+     *
+     *   locations (= PhysicalLazyMaterialize.lazySlotLocations):
+     *     For each relation, the index of each lazy slot inside materializeTupleDescriptor.
+     *     In the example above: [[1]]
+     *       → outer list index 0 = first (only) relation t
+     *       → inner value 1      = col_date is at index 1 in materializeTupleDescriptor.slots()
+     *     BE uses this to call  slots[location]->to_protobuf(...)  when building the fetch
+     *     request so the remote side knows the schema of what to return.
+     *
+     *   columnIdxsLists (= PhysicalLazyMaterialize.lazyBaseColumnIndices):
+     *     For each relation, the physical column index in the table for each lazy column.
+     *     In the example above: [[3]]   (col_date is at index 3 in t, 0-indexed)
+     *     BE sends this index to the storage layer so it can locate the column on disk
+     *     without needing the full schema.
+     *
+     *   If two columns b and c from table t were both lazy, and the column indices in the
+     *   table are 2 and 5 respectively, and they occupy slots at index 1 and 2 in the tuple:
+     *     locations = [[1, 2]],  columnIdxsLists = [[2, 5]]
+     */
     private TPaloNodesInfo nodesInfo;
     private TupleDescriptor materializeTupleDescriptor;
 
@@ -79,7 +123,7 @@ public class MaterializationNode extends PlanNode {
     private List<List<Column>> lazyColumns;
 
     private List<List<Integer>> locations;
-    private List<List<Integer>> idxs;
+    private List<List<Integer>> columnIdxsLists;
 
     private List<Boolean> rowStoreFlags;
 
@@ -124,7 +168,7 @@ public class MaterializationNode extends PlanNode {
         }
         output.append(detailPrefix).append("column_descs_lists").append(lazyColumns).append("\n");
         output.append(detailPrefix).append("locations: ").append(locations).append("\n");
-        output.append(detailPrefix).append("table_idxs: ").append(idxs).append("\n");
+        output.append(detailPrefix).append("column_idxs_lists: ").append(columnIdxsLists).append("\n");
         output.append(detailPrefix).append("row_ids: ").append(rowIds).append("\n");
         output.append(detailPrefix).append("isTopMaterializeNode: ").append(isTopMaterializeNode).append("\n");
         printNestedColumns(output, detailPrefix, outputTupleDesc);
@@ -152,7 +196,7 @@ public class MaterializationNode extends PlanNode {
         msg.materialization_node.setColumnDescsLists(thriftCols);
 
         msg.materialization_node.setSlotLocsLists(locations);
-        msg.materialization_node.setColumnIdxsLists(idxs);
+        msg.materialization_node.setColumnIdxsLists(columnIdxsLists);
         msg.materialization_node.setFetchRowStores(rowStoreFlags);
         msg.materialization_node.setGcIdMap(isTopMaterializeNode);
     }
@@ -169,8 +213,8 @@ public class MaterializationNode extends PlanNode {
         this.locations = locations;
     }
 
-    public void setIdxs(List<List<Integer>> idxs) {
-        this.idxs = idxs;
+    public void setColumnIdxsLists(List<List<Integer>> columnIdxsLists) {
+        this.columnIdxsLists = columnIdxsLists;
     }
 
     public void setRowStoreFlags(List<Boolean> rowStoreFlags) {
diff --git a/regression-test/suites/external_table_p0/hive/test_hive_topn_lazy_mat.groovy b/regression-test/suites/external_table_p0/hive/test_hive_topn_lazy_mat.groovy
index e90ab9b8631..761cd1cddf9 100644
--- a/regression-test/suites/external_table_p0/hive/test_hive_topn_lazy_mat.groovy
+++ b/regression-test/suites/external_table_p0/hive/test_hive_topn_lazy_mat.groovy
@@ -210,7 +210,7 @@ suite("test_hive_topn_lazy_mat", "p0,external") {
             contains("projectList:[id, name, value, active, score, file_id]")
             contains("column_descs_lists[[`name` text NULL, `value` double NULL, `active` boolean NULL, `score` double NULL, `file_id` int NULL]]")
             contains("locations: [[1, 2, 3, 4, 5]]")
-            contains("table_idxs: [[1, 2, 3, 4, 5]]")
+            contains("column_idxs_lists: [[1, 2, 3, 4, 5]]")
             contains("row_ids: [__DORIS_GLOBAL_ROWID_COL__orc_topn_lazy_mat_table]")
         }
        
@@ -219,7 +219,7 @@ suite("test_hive_topn_lazy_mat", "p0,external") {
             contains("projectList:[file_id, id]")
             contains("column_descs_lists[[`id` int NULL, `file_id` int NULL]]")
             contains("locations: [[1, 2]]")
-            contains("table_idxs: [[0, 5]]")
+            contains("column_idxs_lists: [[0, 5]]")
             contains("row_ids: [__DORIS_GLOBAL_ROWID_COL__orc_topn_lazy_mat_table]")
         }
 
@@ -229,7 +229,7 @@ suite("test_hive_topn_lazy_mat", "p0,external") {
             contains("projectList:[name, length(a.name), value, id, name, value, active, score, file_id, id, name, value, active, score, file_id]")
             contains("column_descs_lists[[`name` text NULL, `value` double NULL, `active` boolean NULL, `score` double NULL, `file_id` int NULL], [`value` double NULL, `active` boolean NULL, `score` double NULL, `file_id` int NULL]]")
             contains("locations: [[5, 6, 7, 8, 9], [10, 11, 12, 13]]")
-            contains("table_idxs: [[1, 2, 3, 4, 5], [2, 3, 4, 5]]")
+            contains("column_idxs_lists: [[1, 2, 3, 4, 5], [2, 3, 4, 5]]")
             contains("row_ids: [__DORIS_GLOBAL_ROWID_COL__orc_topn_lazy_mat_table, __DORIS_GLOBAL_ROWID_COL__parquet_topn_lazy_mat_table]")
         }
 
diff --git a/regression-test/suites/external_table_p0/tvf/test_tvf_topn_lazy_mat.groovy b/regression-test/suites/external_table_p0/tvf/test_tvf_topn_lazy_mat.groovy
index b1db8faa943..d084631c772 100644
--- a/regression-test/suites/external_table_p0/tvf/test_tvf_topn_lazy_mat.groovy
+++ b/regression-test/suites/external_table_p0/tvf/test_tvf_topn_lazy_mat.groovy
@@ -148,7 +148,7 @@ suite("test_tvf_topn_lazy_mat", "p0,external") {
 
             contains("column_descs_lists[[`name` text NULL, `value` double NULL, `active` boolean NULL, `score` double NULL]]")
             contains("locations: [[1, 2, 3, 4]]")
-            contains("table_idxs: [[1, 2, 3, 4]]")
+            contains("column_idxs_lists: [[1, 2, 3, 4]]")
             contains("row_ids: [__DORIS_GLOBAL_ROWID_COL__hdfs]")
             contains("isTopMaterializeNode: true")
             contains("SlotDescriptor{id=0, col=id, colUniqueId=-1, type=bigint, nullable=true")
@@ -163,7 +163,7 @@ suite("test_tvf_topn_lazy_mat", "p0,external") {
             contains("projectList:[name, value, score]")
             contains("column_descs_lists[[`name` text NULL, `value` double NULL, `score` double NULL]]")
             contains("locations: [[1, 2, 3]]")
-            contains("table_idxs: [[1, 2, 4]]")
+            contains("column_idxs_lists: [[1, 2, 4]]")
             contains("row_ids: [__DORIS_GLOBAL_ROWID_COL__hdfs]")
             contains("isTopMaterializeNode: true")
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to