This is an automated email from the ASF dual-hosted git repository.

CRZbulabula pushed a commit to branch fix/npe-stale-fragment-instance-context
in repository https://gitbox.apache.org/repos/asf/iotdb.git

commit 5805534097103ebe890348c4701919515ccf40ea
Author: Yongzao <[email protected]>
AuthorDate: Mon May 25 11:56:11 2026 +0800

    Fix NPE when retried query reuses stale FragmentInstanceContext
    
    When QueryExecution.retry() re-plans a query, doDistributionPlan()
    creates fresh PlanFragmentId objects with nextFragmentInstanceId reset
    to 0. Because the queryId is unchanged, the retry generates fragment
    instance IDs identical to those of the first execution (e.g.
    queryId_11.0).
    
    FragmentInstanceManager.instanceContext retains completed contexts for
    5 minutes for statistics caching. When a retry dispatches the same
    fragment instance (FI) ID, instanceContext.computeIfAbsent() returns
    the stale context, whose releaseResource() has already run and set
    dataRegion to null. New drivers then throw an NPE at
    dataRegion.tryReadLock() inside
    FragmentInstanceContext.initQueryDataSource().
    
    Fix: replace computeIfAbsent() with compute() in
    execDataQueryFragmentInstance() so that a released context
    (dataRegion == null) is atomically replaced with a fresh one carrying
    the new dataRegion reference.
    
    Defensive fix: add a null guard for dataRegion in
    getSharedQueryDataSource() so that it returns null (which DataDriver
    treats as an aborted fragment instance) instead of throwing an NPE.
---
 .../fragment/FragmentInstanceContext.java          |  4 +++
 .../fragment/FragmentInstanceManager.java          | 30 ++++++++++++++--------
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/fragment/FragmentInstanceContext.java
 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/fragment/FragmentInstanceContext.java
index 2a0373cf6fd..ee177dde02e 100644
--- 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/fragment/FragmentInstanceContext.java
+++ 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/fragment/FragmentInstanceContext.java
@@ -781,6 +781,10 @@ public class FragmentInstanceContext extends QueryContext {
 
   public synchronized IQueryDataSource getSharedQueryDataSource() throws 
QueryProcessException {
     if (sharedQueryDataSource == null) {
+      if (dataRegion == null) {
+        // Context was released (releaseResource() already ran). Signal 
aborted to the driver.
+        return null;
+      }
       switch (queryDataSourceType) {
         case SERIES_SCAN:
           if (initQueryDataSource(sourcePaths)) {
diff --git 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/fragment/FragmentInstanceManager.java
 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/fragment/FragmentInstanceManager.java
index 37e5c6c0858..520d98c3f81 100644
--- 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/fragment/FragmentInstanceManager.java
+++ 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/fragment/FragmentInstanceManager.java
@@ -151,19 +151,27 @@ public class FragmentInstanceManager {
                 DataNodeQueryContext dataNodeQueryContext =
                     getOrCreateDataNodeQueryContext(instanceId.getQueryId(), 
dataNodeFINum);
 
+                // Use compute() instead of computeIfAbsent() to handle the 
retry scenario:
+                // QueryExecution.retry() re-creates the distribution plan 
with PlanFragmentId
+                // counters reset to 0, generating fragment instance IDs 
identical to the first
+                // execution. instanceContext retains released contexts 
(dataRegion == null) for
+                // statistics caching. Without this check, a retried FI reuses 
the stale context
+                // and NPEs at dataRegion.tryReadLock().
                 FragmentInstanceContext context =
-                    instanceContext.computeIfAbsent(
+                    instanceContext.compute(
                         instanceId,
-                        fragmentInstanceId ->
-                            createFragmentInstanceContext(
-                                fragmentInstanceId,
-                                stateMachine,
-                                instance.getSessionInfo(),
-                                dataRegion,
-                                instance.getGlobalTimePredicate(),
-                                dataNodeQueryContextMap,
-                                instance.isDebug(),
-                                instance.isVerbose()));
+                        (fiId, existingContext) ->
+                            (existingContext == null || 
existingContext.getDataRegion() == null)
+                                ? createFragmentInstanceContext(
+                                    fiId,
+                                    stateMachine,
+                                    instance.getSessionInfo(),
+                                    dataRegion,
+                                    instance.getGlobalTimePredicate(),
+                                    dataNodeQueryContextMap,
+                                    instance.isDebug(),
+                                    instance.isVerbose())
+                                : existingContext);
                 context.setHighestPriority(instance.isHighestPriority());
 
                 try {

Reply via email to