This is an automated email from the ASF dual-hosted git repository.
englefly pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new c655b1a64d5 [fix](lazy materialize) set uniqueId for lazy materialized
slots (#61029)
c655b1a64d5 is described below
commit c655b1a64d5e2fd21cbda2e28b6fcdd48b918873
Author: minghong <[email protected]>
AuthorDate: Tue Mar 17 13:05:42 2026 +0800
[fix](lazy materialize) set uniqueId for lazy materialized slots (#61029)
This PR has two functional changes and one large documentation/renaming
effort:
BE bug fix (materialization_opertor.cpp): Moves block_maps inside the
per-relation loop to prevent stale entries when a backend returns a non-empty
block for one relation but an empty block for another. This is a correct and
important fix.
FE fix (PhysicalLazyMaterialize.java): Uses
SlotReference.withColumn(originalColumn) so that createSlotDesc can write
colUniqueId into the thrift SlotDescriptor. This is the core fix referenced by
the PR title. The approach is sound — withColumn creates an immutable copy with
the column set, which flows through createSlotDesc → setColumn → toThrift →
setColUniqueId.
Renaming and documentation: lazyTableIdxs/idxs →
lazyBaseColumnIndices/columnIdxsLists with extensive Javadoc. The naming is
clearer.
---
be/src/exec/operator/materialization_opertor.cpp | 9 +++-
.../glue/translator/PhysicalPlanTranslator.java | 2 +-
.../plans/physical/PhysicalLazyMaterialize.java | 57 ++++++++++++++++++----
.../apache/doris/planner/MaterializationNode.java | 54 ++++++++++++++++++--
.../hive/test_hive_topn_lazy_mat.groovy | 6 +--
.../tvf/test_tvf_topn_lazy_mat.groovy | 4 +-
6 files changed, 109 insertions(+), 23 deletions(-)
diff --git a/be/src/exec/operator/materialization_opertor.cpp
b/be/src/exec/operator/materialization_opertor.cpp
index 39c4fbcbb76..f75e445f6f5 100644
--- a/be/src/exec/operator/materialization_opertor.cpp
+++ b/be/src/exec/operator/materialization_opertor.cpp
@@ -54,9 +54,14 @@ void MaterializationSharedState::get_block(Block* block) {
}
Status MaterializationSharedState::merge_multi_response() {
- std::unordered_map<int64_t, std::pair<Block, int>> block_maps;
-
for (int i = 0; i < block_order_results.size(); ++i) {
+ // block_maps must be rebuilt for each relation (each i), because a
backend that
+ // returned a non-empty block for relation i-1 may return an empty
block for
+ // relation i (e.g. it holds rows only from one of the two tables in a
UNION ALL).
+ // Keeping block_maps across iterations would leave stale entries from
the previous
+ // relation and miss entries for the current one, causing the
+ // "backend_id not found in block_maps" error.
+ std::unordered_map<int64_t, std::pair<Block, int>> block_maps;
for (auto& [backend_id, rpc_struct] : rpc_struct_map) {
Block partial_block;
size_t uncompressed_size = 0;
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java
index 2c1518fdbaa..4100fab5c23 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java
@@ -2779,7 +2779,7 @@ public class PhysicalPlanTranslator extends
DefaultPlanVisitor<PlanFragment, Pla
materializeNode.setLazyColumns(materialize.getLazyColumns());
materializeNode.setLocations(materialize.getLazySlotLocations());
- materializeNode.setIdxs(materialize.getlazyTableIdxs());
+
materializeNode.setColumnIdxsLists(materialize.getLazyBaseColumnIndices());
List<Boolean> rowStoreFlags = new ArrayList<>();
for (Relation relation : materialize.getRelations()) {
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalLazyMaterialize.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalLazyMaterialize.java
index 5738a377823..25d39cc02b1 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalLazyMaterialize.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalLazyMaterialize.java
@@ -62,11 +62,44 @@ public class PhysicalLazyMaterialize<CHILD_TYPE extends
Plan> extends PhysicalUn
private final List<Slot> materializeInput;
private final List<Slot> materializeOutput;
- // used for BE
+ /**
+ * The following four fields are used by BE to perform the actual lazy
fetch.
+ * They are indexed by relation: index i corresponds to relations.get(i).
+ *
+ * Example:
+ * SQL: SELECT t1.a, t1.b, t2.c, t2.d FROM t1 JOIN t2 ON ... WHERE t1.a
> 5
+ * Assume t1.b and t2.d are lazily materialized (fetched after
filtering).
+ *
+ * materializedSlots (non-lazy, computed eagerly) = [t1.a, t2.c]
+ * Output slot order = [t1.a(0), t2.c(1), t1.b(2), t2.d(3)]
+ *
+ * rowIdList = [row_id_t1, row_id_t2]
+ * The row-id slots passed to BE to locate the
original rows.
+ *
+ * relations = [t1, t2]
+ *
+ * lazyColumns = [[Column(b)], [Column(d)]]
+ * For each relation, the Column objects to be
lazily fetched.
+ *
+ * lazyBaseColumnIndices = [[colIdxOf(b) in t1], [colIdxOf(d) in t2]]
+ * For each relation, the physical column index
inside the table
+ * for each lazy column (used by BE to locate the
column on disk).
+ *
+ * lazySlotLocations = [[2], [3]]
+ * A two-level array: the outer level is indexed by
relation
+ * (same as relations / rowIdList), and the inner
level lists the
+ * output-tuple position for each lazy column of
that relation.
+ * Two levels are needed because a single relation
can have
+ * multiple lazy columns. For example, if both t1.b
and t1.e
+ * were lazy, the entry for t1 would be [2, 3]
(positions of b
+ * and e in the output tuple), and t2 would shift to [4],
+ * because positions are assigned consecutively, relation by relation.
+ * BE uses each position to know which output slot
to fill in
+ * after fetching the column value from disk.
+ */
private final List<Slot> rowIdList;
private List<List<Column>> lazyColumns = new ArrayList<>();
private List<List<Integer>> lazySlotLocations = new ArrayList<>();
- private List<List<Integer>> lazyTableIdxs = new ArrayList<>();
+ private List<List<Integer>> lazyBaseColumnIndices = new ArrayList<>();
private final List<Relation> relations;
@@ -101,7 +134,7 @@ public class PhysicalLazyMaterialize<CHILD_TYPE extends
Plan> extends PhysicalUn
this.materializedSlots = ImmutableList.copyOf(materializedSlots);
this.materializeMap = materializeMap;
lazySlotLocations = new ArrayList<>();
- lazyTableIdxs = new ArrayList<>();
+ lazyBaseColumnIndices = new ArrayList<>();
lazyColumns = new ArrayList<>();
ImmutableList.Builder<Slot> outputBuilder = ImmutableList.builder();
@@ -126,15 +159,19 @@ public class PhysicalLazyMaterialize<CHILD_TYPE extends
Plan> extends PhysicalUn
List<Column> lazyColumnForRel = new ArrayList<>();
lazyColumns.add(lazyColumnForRel);
- List<Integer> lazyIdxForRel = new ArrayList<>();
- lazyTableIdxs.add(lazyIdxForRel);
+ List<Integer> lazyBaseColumnIdxForRel = new ArrayList<>();
+ lazyBaseColumnIndices.add(lazyBaseColumnIdxForRel);
List<Integer> lazySlotLocationForRel = new ArrayList<>();
lazySlotLocations.add(lazySlotLocationForRel);
for (Slot lazySlot : relationToLazySlotMap.get(rel)) {
- outputBuilder.add(lazySlot);
-
lazyColumnForRel.add(materializeMap.get(lazySlot).baseSlot.getOriginalColumn().get());
-
lazyIdxForRel.add(relationTable.getBaseColumnIdxByName(lazySlot.getName()));
+ // Set originalColumn on the lazy slot so that createSlotDesc
can write
+ // colUniqueId into the thrift SlotDescriptor — BE needs it to
resolve
+ // the column during remote fetch.
+ Column originalColumn =
materializeMap.get(lazySlot).baseSlot.getOriginalColumn().get();
+ outputBuilder.add(((SlotReference)
lazySlot).withColumn(originalColumn));
+ lazyColumnForRel.add(originalColumn);
+
lazyBaseColumnIdxForRel.add(relationTable.getBaseColumnIdxByName(lazySlot.getName()));
lazySlotLocationForRel.add(loc);
loc++;
}
@@ -250,8 +287,8 @@ public class PhysicalLazyMaterialize<CHILD_TYPE extends
Plan> extends PhysicalUn
return lazySlotLocations;
}
- public List<List<Integer>> getlazyTableIdxs() {
- return lazyTableIdxs;
+ public List<List<Integer>> getLazyBaseColumnIndices() {
+ return lazyBaseColumnIndices;
}
public List<Slot> getRowIds() {
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/planner/MaterializationNode.java
b/fe/fe-core/src/main/java/org/apache/doris/planner/MaterializationNode.java
index 40d16a88faf..1acb473d851 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/planner/MaterializationNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/planner/MaterializationNode.java
@@ -71,6 +71,50 @@ import java.util.List;
* }
*/
public class MaterializationNode extends PlanNode {
+ /**
+ * Example to illustrate the fields below:
+ *
+ * SQL: SELECT pk, col_date FROM t ORDER BY pk LIMIT 10
+ * (col_date is chosen for lazy materialization)
+ *
+ * The child plan produces: [pk(eager), row_id_t]
+ * row_id_t is a "global row-id" slot that encodes where the real
col_date lives on disk.
+ *
+ * materializeTupleDescriptor (intermediate tuple, e.g. tuple 5):
+ * index 0 → pk (eager slot, already in the child block)
+ * index 1 → col_date (lazy slot, to be fetched remotely via row_id)
+ *
+ * outputTupleDesc (final output tuple, e.g. tuple 6):
+ * contains the columns actually returned to the parent node after
+ * applying projectList on top of materializeTupleDescriptor.
+ * In simple cases it is the same set of columns.
+ * Example: projectList = [pk#0, col_date#1] → outputTupleDesc =
{pk, col_date}
+ *
+ * Relationship:
+ * materializeTupleDescriptor = "working" tuple that holds eager +
lazy columns,
+ * used to build the RPC fetch request
and place the
+ * fetched values back into the block.
+ * outputTupleDesc = "public" tuple visible to the
parent; produced by
+ * projecting
materializeTupleDescriptor through projectList.
+ *
+ * locations (= PhysicalLazyMaterialize.lazySlotLocations):
+ * For each relation, the index of each lazy slot inside
materializeTupleDescriptor.
+ * In the example above: [[1]]
+ * → outer list index 0 = first (only) relation t
+ * → inner value 1 = col_date is at index 1 in
materializeTupleDescriptor.slots()
+ * BE uses this to call slots[location]->to_protobuf(...) when
building the fetch
+ * request so the remote side knows the schema of what to return.
+ *
+ * columnIdxsLists (= PhysicalLazyMaterialize.lazyBaseColumnIndices):
+ * For each relation, the physical column index in the table for each
lazy column.
+ * In the example above: [[3]] (col_date is at index 3 in t,
0-indexed, i.e. the 4th column)
+ * BE sends this index to the storage layer so it can locate the
column on disk
+ * without needing the full schema.
+ *
+ * If two columns b and c from table t were both lazy, and the column
indices in the
+ * table are 2 and 5 respectively, and they occupy slots at index 1 and
2 in the tuple:
+ * locations = [[1, 2]], columnIdxsLists = [[2, 5]]
+ */
private TPaloNodesInfo nodesInfo;
private TupleDescriptor materializeTupleDescriptor;
@@ -79,7 +123,7 @@ public class MaterializationNode extends PlanNode {
private List<List<Column>> lazyColumns;
private List<List<Integer>> locations;
- private List<List<Integer>> idxs;
+ private List<List<Integer>> columnIdxsLists;
private List<Boolean> rowStoreFlags;
@@ -124,7 +168,7 @@ public class MaterializationNode extends PlanNode {
}
output.append(detailPrefix).append("column_descs_lists").append(lazyColumns).append("\n");
output.append(detailPrefix).append("locations:
").append(locations).append("\n");
- output.append(detailPrefix).append("table_idxs:
").append(idxs).append("\n");
+ output.append(detailPrefix).append("column_idxs_lists:
").append(columnIdxsLists).append("\n");
output.append(detailPrefix).append("row_ids:
").append(rowIds).append("\n");
output.append(detailPrefix).append("isTopMaterializeNode:
").append(isTopMaterializeNode).append("\n");
printNestedColumns(output, detailPrefix, outputTupleDesc);
@@ -152,7 +196,7 @@ public class MaterializationNode extends PlanNode {
msg.materialization_node.setColumnDescsLists(thriftCols);
msg.materialization_node.setSlotLocsLists(locations);
- msg.materialization_node.setColumnIdxsLists(idxs);
+ msg.materialization_node.setColumnIdxsLists(columnIdxsLists);
msg.materialization_node.setFetchRowStores(rowStoreFlags);
msg.materialization_node.setGcIdMap(isTopMaterializeNode);
}
@@ -169,8 +213,8 @@ public class MaterializationNode extends PlanNode {
this.locations = locations;
}
- public void setIdxs(List<List<Integer>> idxs) {
- this.idxs = idxs;
+ public void setColumnIdxsLists(List<List<Integer>> columnIdxsLists) {
+ this.columnIdxsLists = columnIdxsLists;
}
public void setRowStoreFlags(List<Boolean> rowStoreFlags) {
diff --git
a/regression-test/suites/external_table_p0/hive/test_hive_topn_lazy_mat.groovy
b/regression-test/suites/external_table_p0/hive/test_hive_topn_lazy_mat.groovy
index e90ab9b8631..761cd1cddf9 100644
---
a/regression-test/suites/external_table_p0/hive/test_hive_topn_lazy_mat.groovy
+++
b/regression-test/suites/external_table_p0/hive/test_hive_topn_lazy_mat.groovy
@@ -210,7 +210,7 @@ suite("test_hive_topn_lazy_mat", "p0,external") {
contains("projectList:[id, name, value, active, score, file_id]")
contains("column_descs_lists[[`name` text NULL, `value` double
NULL, `active` boolean NULL, `score` double NULL, `file_id` int NULL]]")
contains("locations: [[1, 2, 3, 4, 5]]")
- contains("table_idxs: [[1, 2, 3, 4, 5]]")
+ contains("column_idxs_lists: [[1, 2, 3, 4, 5]]")
contains("row_ids:
[__DORIS_GLOBAL_ROWID_COL__orc_topn_lazy_mat_table]")
}
@@ -219,7 +219,7 @@ suite("test_hive_topn_lazy_mat", "p0,external") {
contains("projectList:[file_id, id]")
contains("column_descs_lists[[`id` int NULL, `file_id` int NULL]]")
contains("locations: [[1, 2]]")
- contains("table_idxs: [[0, 5]]")
+ contains("column_idxs_lists: [[0, 5]]")
contains("row_ids:
[__DORIS_GLOBAL_ROWID_COL__orc_topn_lazy_mat_table]")
}
@@ -229,7 +229,7 @@ suite("test_hive_topn_lazy_mat", "p0,external") {
contains("projectList:[name, length(a.name), value, id, name,
value, active, score, file_id, id, name, value, active, score, file_id]")
contains("column_descs_lists[[`name` text NULL, `value` double
NULL, `active` boolean NULL, `score` double NULL, `file_id` int NULL], [`value`
double NULL, `active` boolean NULL, `score` double NULL, `file_id` int NULL]]")
contains("locations: [[5, 6, 7, 8, 9], [10, 11, 12, 13]]")
- contains("table_idxs: [[1, 2, 3, 4, 5], [2, 3, 4, 5]]")
+ contains("column_idxs_lists: [[1, 2, 3, 4, 5], [2, 3, 4, 5]]")
contains("row_ids:
[__DORIS_GLOBAL_ROWID_COL__orc_topn_lazy_mat_table,
__DORIS_GLOBAL_ROWID_COL__parquet_topn_lazy_mat_table]")
}
diff --git
a/regression-test/suites/external_table_p0/tvf/test_tvf_topn_lazy_mat.groovy
b/regression-test/suites/external_table_p0/tvf/test_tvf_topn_lazy_mat.groovy
index b1db8faa943..d084631c772 100644
--- a/regression-test/suites/external_table_p0/tvf/test_tvf_topn_lazy_mat.groovy
+++ b/regression-test/suites/external_table_p0/tvf/test_tvf_topn_lazy_mat.groovy
@@ -148,7 +148,7 @@ suite("test_tvf_topn_lazy_mat", "p0,external") {
contains("column_descs_lists[[`name` text NULL, `value` double
NULL, `active` boolean NULL, `score` double NULL]]")
contains("locations: [[1, 2, 3, 4]]")
- contains("table_idxs: [[1, 2, 3, 4]]")
+ contains("column_idxs_lists: [[1, 2, 3, 4]]")
contains("row_ids: [__DORIS_GLOBAL_ROWID_COL__hdfs]")
contains("isTopMaterializeNode: true")
contains("SlotDescriptor{id=0, col=id, colUniqueId=-1,
type=bigint, nullable=true")
@@ -163,7 +163,7 @@ suite("test_tvf_topn_lazy_mat", "p0,external") {
contains("projectList:[name, value, score]")
contains("column_descs_lists[[`name` text NULL, `value` double
NULL, `score` double NULL]]")
contains("locations: [[1, 2, 3]]")
- contains("table_idxs: [[1, 2, 4]]")
+ contains("column_idxs_lists: [[1, 2, 4]]")
contains("row_ids: [__DORIS_GLOBAL_ROWID_COL__hdfs]")
contains("isTopMaterializeNode: true")
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]