This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new cff565f2b25 [Enhancement](explain)Display deleteFileNum for FileScanNode when explain verbose (#60308)
cff565f2b25 is described below
commit cff565f2b259700accf53b6d2cb445f180d9f254
Author: daidai <[email protected]>
AuthorDate: Mon Feb 2 16:54:28 2026 +0800
[Enhancement](explain)Display deleteFileNum for FileScanNode when explain verbose (#60308)
### What problem does this PR solve?
Problem Summary:
This PR enhances the EXPLAIN VERBOSE output of file scan nodes by adding the following metrics:
`dataFileNum=xxx, deleteFileNum=xxx, deleteSplitNum=xxx`
This is especially useful for Iceberg, Paimon, and Hive ACID tables. These metrics provide more visibility into the underlying file and split layout, helping users tune parameters and reason about query performance.
Details:
- `dataFileNum`: the number of distinct data files that need to be read. This is not equivalent to the number of splits, since a single data file can be divided into multiple splits.
- `deleteFileNum`: the number of distinct delete files that need to be read.
- `deleteSplitNum`: the total number of delete files read across all data splits. This is tracked separately because the relationship between data files and delete files is many-to-many: one data file may be associated with multiple delete files, and one delete file may apply to multiple data files.

Using deleteSplitNum / dataSplitNum, users can estimate the average number of delete splits that need to be read per data split.
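For instance, in the example below deleteSplitNum=220 and inputSplitNum=220 (one scan range per data split), so each data split reads 220 / 220 = 1 delete split on average.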
Example:
```
mysql> explain verbose select * from iceberg.format_v3.dv_test_1w;
+-----------------------------------------------------------------------------------------------------------------------------------------------+
| Explain String(Nereids Planner)                                                                                                                 |
+-----------------------------------------------------------------------------------------------------------------------------------------------+
| PLAN FRAGMENT 0
|   OUTPUT EXPRS:
|     id[#0]
|     grp[#1]
|     value[#2]
|     ts[#3]
|   PARTITION: RANDOM
|
|   HAS_COLO_PLAN_NODE: false
|
|   VRESULT SINK
|      MYSQL_PROTOCOL
|
|   0:VICEBERG_SCAN_NODE(32)
|      table: iceberg.format_v3.dv_test_1w
|      inputSplitNum=220, totalFileSize=720774, scanRanges=220
|      partition=0/0
|      backends:
|        1769590309070
|          s3://warehouse/wh/format_v3/dv_test_1w/data/00004-51-fc462f9a-d42a-404d-adfc-c8d2781c8d04-0-00001.parquet start: 4 length: 2672
|          s3://warehouse/wh/format_v3/dv_test_1w/data/00003-50-fc462f9a-d42a-404d-adfc-c8d2781c8d04-0-00001.parquet start: 4 length: 2852
|          s3://warehouse/wh/format_v3/dv_test_1w/data/00000-47-fc462f9a-d42a-404d-adfc-c8d2781c8d04-0-00001.parquet start: 4 length: 2894
|          ... other 216 files ...
|          s3://warehouse/wh/format_v3/dv_test_1w/data/00001-48-fc462f9a-d42a-404d-adfc-c8d2781c8d04-0-00001.parquet start: 58397 length: 13894
|      dataFileNum=10, deleteFileNum=1, deleteSplitNum=220
|      cardinality=33334, numNodes=1
|      pushdown agg=NONE
|      tuple ids: 0
|
| Tuples:
| TupleDescriptor{id=0, tbl=dv_test_1w}
|   SlotDescriptor{id=0, col=id, colUniqueId=1, type=bigint, nullable=true, isAutoIncrement=false, subColPath=null, virtualColumn=null}
|   SlotDescriptor{id=1, col=grp, colUniqueId=2, type=int, nullable=true, isAutoIncrement=false, subColPath=null, virtualColumn=null}
|   SlotDescriptor{id=2, col=value, colUniqueId=3, type=int, nullable=true, isAutoIncrement=false, subColPath=null, virtualColumn=null}
|   SlotDescriptor{id=3, col=ts, colUniqueId=4, type=datetimev2(6), nullable=true, isAutoIncrement=false, subColPath=null, virtualColumn=null}
|
| ========== STATISTICS ==========
+-----------------------------------------------------------------------------------------------------------------------------------------------+
```
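Reading the new line in this plan: dataFileNum=10 with inputSplitNum=220 means each data file was divided into about 22 scan ranges on average, and deleteFileNum=1 with deleteSplitNum=220 means the single delete file is applied to every one of the 220 data splits.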
---
.../org/apache/doris/datasource/FileScanNode.java | 32 +++++++++++++++++
.../doris/datasource/hive/source/HiveScanNode.java | 31 +++++++++++++++++
.../datasource/iceberg/source/IcebergScanNode.java | 40 ++++++++++++++++++++++
.../datasource/paimon/source/PaimonScanNode.java | 22 ++++++++++++
.../hive/test_transactional_hive.groovy | 8 +++++
.../iceberg/test_iceberg_position_delete.groovy | 10 ++++++
.../paimon/test_paimon_deletion_vector_oss.groovy | 9 +++++
7 files changed, 152 insertions(+)
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java
index a7aa0f607ac..1c1d6ac6720 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java
@@ -54,8 +54,10 @@ import com.google.common.collect.Multimap;
import java.util.Collections;
import java.util.Comparator;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.stream.Collectors;
/**
@@ -99,6 +101,17 @@ public abstract class FileScanNode extends ExternalScanNode {
return totalFileSize;
}
+    /**
+     * Get all delete files for the given file range.
+     * @param rangeDesc the file range descriptor
+     * @return list of delete file paths (formatted strings)
+     */
+    protected List<String> getDeleteFiles(TFileRangeDesc rangeDesc) {
+        // Default implementation: return empty list
+        // Subclasses should override this method
+        return Collections.emptyList();
+    }
+
    @Override
    public String getNodeExplainString(String prefix, TExplainLevel detailLevel) {
        StringBuilder output = new StringBuilder();
@@ -139,6 +152,21 @@ public abstract class FileScanNode extends ExternalScanNode {
                    return Long.compare(o1.getStartOffset(), o2.getStartOffset());
                }
            });
+
+            // A data file may be divided into multiple splits, so a set is used to remove duplicates.
+            Set<String> dataFilesSet = new HashSet<>();
+            // A delete file might be used by multiple data files, so a set is used to remove duplicates.
+            Set<String> deleteFilesSet = new HashSet<>();
+            // The average number of delete splits read per data split can be estimated
+            // as deleteSplitNum / dataSplitNum (fileRangeDescs.size()).
+            long deleteSplitNum = 0;
+            for (TFileRangeDesc fileRangeDesc : fileRangeDescs) {
+                dataFilesSet.add(fileRangeDesc.getPath());
+                List<String> deleteFiles = getDeleteFiles(fileRangeDesc);
+                deleteFilesSet.addAll(deleteFiles);
+                deleteSplitNum += deleteFiles.size();
+            }
+
            // 3. if size <= 4, print all. if size > 4, print first 3 and last 1
            int size = fileRangeDescs.size();
            if (size <= 4) {
@@ -164,6 +192,10 @@ public abstract class FileScanNode extends ExternalScanNode {
                    .append(" length: ").append(file.getSize())
                    .append("\n");
            }
+                output.append(prefix).append("  ").append("dataFileNum=").append(dataFilesSet.size())
+                        .append(", deleteFileNum=").append(deleteFilesSet.size())
+                        .append(", deleteSplitNum=").append(deleteSplitNum)
+                        .append("\n");
}
}
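The base-class hook above is what each table format overrides. For a connector not touched by this PR, an override would follow the same guard-and-collect pattern; a minimal hypothetical sketch (FooScanNode, TFooFileDesc, isSetFooParams, and getDeletePaths are illustrative names only, not real Doris APIs):

```java
// Hypothetical override in an imaginary FooScanNode extends FileQueryScanNode.
// Only the pattern (null/isSet guards, then collect paths) mirrors the real
// Hive/Iceberg/Paimon overrides below; the Foo* names are made up for illustration.
@Override
protected List<String> getDeleteFiles(TFileRangeDesc rangeDesc) {
    List<String> deleteFiles = new ArrayList<>();
    if (rangeDesc == null || !rangeDesc.isSetTableFormatParams()) {
        return deleteFiles; // this split carries no table-format metadata
    }
    TTableFormatFileDesc params = rangeDesc.getTableFormatParams();
    if (params == null || !params.isSetFooParams()) {
        return deleteFiles; // not a Foo split
    }
    List<String> paths = params.getFooParams().getDeletePaths(); // hypothetical getter
    if (paths != null) {
        deleteFiles.addAll(paths); // one entry per delete file referenced by this split
    }
    return deleteFiles;
}
```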
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
index 776ff234cd6..2e8f3735f6b 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
@@ -490,6 +490,37 @@ public class HiveScanNode extends FileQueryScanNode {
}
}
+    @Override
+    protected List<String> getDeleteFiles(TFileRangeDesc rangeDesc) {
+        List<String> deleteFiles = new ArrayList<>();
+        if (rangeDesc == null || !rangeDesc.isSetTableFormatParams()) {
+            return deleteFiles;
+        }
+        TTableFormatFileDesc tableFormatParams = rangeDesc.getTableFormatParams();
+        if (tableFormatParams == null || !tableFormatParams.isSetTransactionalHiveParams()) {
+            return deleteFiles;
+        }
+        TTransactionalHiveDesc hiveParams = tableFormatParams.getTransactionalHiveParams();
+        if (hiveParams == null || !hiveParams.isSetDeleteDeltas()) {
+            return deleteFiles;
+        }
+        List<TTransactionalHiveDeleteDeltaDesc> deleteDeltas = hiveParams.getDeleteDeltas();
+        if (deleteDeltas == null) {
+            return deleteFiles;
+        }
+        // Format: {directory_location}/{file_name}
+        for (TTransactionalHiveDeleteDeltaDesc deleteDelta : deleteDeltas) {
+            if (deleteDelta != null && deleteDelta.isSetDirectoryLocation()
+                    && deleteDelta.isSetFileNames() && deleteDelta.getFileNames() != null) {
+                String directoryLocation = deleteDelta.getDirectoryLocation();
+                for (String fileName : deleteDelta.getFileNames()) {
+                    deleteFiles.add(directoryLocation + "/" + fileName);
+                }
+            }
+        }
+        return deleteFiles;
+    }
+
@Override
protected Map<String, String> getLocationProperties() {
return hmsTable.getBackendStorageProperties();
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
index 26326d8ee7d..647c2e014b9 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
@@ -277,6 +277,46 @@ public class IcebergScanNode extends FileQueryScanNode {
rangeDesc.setTableFormatParams(tableFormatFileDesc);
}
+    @Override
+    protected List<String> getDeleteFiles(TFileRangeDesc rangeDesc) {
+        List<String> deleteFiles = new ArrayList<>();
+        if (rangeDesc == null || !rangeDesc.isSetTableFormatParams()) {
+            return deleteFiles;
+        }
+        TTableFormatFileDesc tableFormatParams = rangeDesc.getTableFormatParams();
+        if (tableFormatParams == null || !tableFormatParams.isSetIcebergParams()) {
+            return deleteFiles;
+        }
+        TIcebergFileDesc icebergParams = tableFormatParams.getIcebergParams();
+        if (icebergParams == null || !icebergParams.isSetDeleteFiles()) {
+            return deleteFiles;
+        }
+        List<TIcebergDeleteFileDesc> icebergDeleteFiles = icebergParams.getDeleteFiles();
+        if (icebergDeleteFiles == null) {
+            return deleteFiles;
+        }
+        for (TIcebergDeleteFileDesc deleteFile : icebergDeleteFiles) {
+            if (deleteFile != null && deleteFile.isSetPath()) {
+                deleteFiles.add(deleteFile.getPath());
+            }
+        }
+        return deleteFiles;
+    }
+
+    private String getDeleteFileContentType(int content) {
+        // Iceberg file content type: 0: data, 1: position delete, 2: equality delete, 3: deletion vector
+        switch (content) {
+            case 1:
+                return "position_delete";
+            case 2:
+                return "equality_delete";
+            case 3:
+                return "deletion_vector";
+            default:
+                return "unknown";
+        }
+    }
+
@Override
public List<Split> getSplits(int numBackends) throws UserException {
try {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java
index b0f659ae52c..4dffb618ffc 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java
@@ -274,6 +274,28 @@ public class PaimonScanNode extends FileQueryScanNode {
rangeDesc.setTableFormatParams(tableFormatFileDesc);
}
+    @Override
+    protected List<String> getDeleteFiles(TFileRangeDesc rangeDesc) {
+        List<String> deleteFiles = new ArrayList<>();
+        if (rangeDesc == null || !rangeDesc.isSetTableFormatParams()) {
+            return deleteFiles;
+        }
+        TTableFormatFileDesc tableFormatParams = rangeDesc.getTableFormatParams();
+        if (tableFormatParams == null || !tableFormatParams.isSetPaimonParams()) {
+            return deleteFiles;
+        }
+        TPaimonFileDesc paimonParams = tableFormatParams.getPaimonParams();
+        if (paimonParams == null || !paimonParams.isSetDeletionFile()) {
+            return deleteFiles;
+        }
+        TPaimonDeletionFileDesc deletionFile = paimonParams.getDeletionFile();
+        if (deletionFile != null && deletionFile.isSetPath()) {
+            // Format: path [offset: offset, length: length]
+            deleteFiles.add(deletionFile.getPath());
+        }
+        return deleteFiles;
+    }
+
@Override
public List<Split> getSplits(int numBackends) throws UserException {
boolean forceJniScanner = sessionVariable.isForceJniScanner();
diff --git a/regression-test/suites/external_table_p0/hive/test_transactional_hive.groovy b/regression-test/suites/external_table_p0/hive/test_transactional_hive.groovy
index 568bb632dec..adc97540665 100644
--- a/regression-test/suites/external_table_p0/hive/test_transactional_hive.groovy
+++ b/regression-test/suites/external_table_p0/hive/test_transactional_hive.groovy
@@ -142,6 +142,13 @@ suite("test_transactional_hive", "p0,external,hive,external_docker,external_dock
qt_count_5 """ select count(*) from orc_acid_major; """ //3
}
+    def test_explain_verbose = {
+        explain {
+            sql ("select count(*) from orc_full_acid")
+            verbose (true)
+            contains "deleteFileNum"
+        }
+    }
String enabled = context.config.otherConfigs.get("enableHiveTest")
if (enabled == null || !enabled.equalsIgnoreCase("true")) {
@@ -177,6 +184,7 @@ suite("test_transactional_hive", "p0,external,hive,external_docker,external_dock
test_acid_count()
+ test_explain_verbose()
q01_par_limit()
diff --git a/regression-test/suites/external_table_p0/iceberg/test_iceberg_position_delete.groovy b/regression-test/suites/external_table_p0/iceberg/test_iceberg_position_delete.groovy
index d793cef3568..91bc48a6758 100644
--- a/regression-test/suites/external_table_p0/iceberg/test_iceberg_position_delete.groovy
+++ b/regression-test/suites/external_table_p0/iceberg/test_iceberg_position_delete.groovy
@@ -165,6 +165,16 @@ suite("test_iceberg_position_delete", "p0,external,doris,external_docker,externa
assertTrue(iceberg_position_gen_7.size() == 5632)
// sql """drop catalog ${catalog_name}"""
+
+    def test_explain_verbose = {
+        explain {
+            sql ("select name from iceberg_position_gen_data where id != 5;")
+            verbose (true)
+            contains "deleteFileNum"
+        }
+    }
+    test_explain_verbose()
+
}
/*
diff --git a/regression-test/suites/external_table_p0/paimon/test_paimon_deletion_vector_oss.groovy b/regression-test/suites/external_table_p0/paimon/test_paimon_deletion_vector_oss.groovy
index 71a4d971169..76574aae528 100644
--- a/regression-test/suites/external_table_p0/paimon/test_paimon_deletion_vector_oss.groovy
+++ b/regression-test/suites/external_table_p0/paimon/test_paimon_deletion_vector_oss.groovy
@@ -53,8 +53,17 @@ suite("test_paimon_deletion_vector_oss", "p0,external,doris,external_docker,exte
            qt_6 """select * from deletion_vector_parquet where id > 2 order by id;"""
        }
+    def test_explain_verbose = {
+        explain {
+            sql ("select * from deletion_vector_orc;")
+            verbose (true)
+            contains "deleteFileNum"
+        }
+    }
+
test_cases("false")
test_cases("true")
+ test_explain_verbose()
} finally {
sql """set force_jni_scanner=false"""
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]