Copilot commented on code in PR #61233:
URL: https://github.com/apache/doris/pull/61233#discussion_r2918082631


##########
fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudTabletRebalancer.java:
##########
@@ -1599,73 +1591,98 @@ private void handleWarmupCompletion(InfightTask task, 
String clusterId, boolean
             return;
         }
 
-        updateClusterToBeMap(task.pickedTablet, task.destBe, clusterId, infos);
+        updateClusterToBeMap(task.pickedTabletId, task.destBe, clusterId, 
infos);
 
         if (LOG.isDebugEnabled()) {
-            LOG.debug("remove tablet {}-{}", clusterId, 
task.pickedTablet.getId());
+            LOG.debug("remove tablet {}-{}", clusterId, task.pickedTabletId);
         }
-        tabletToInfightTask.remove(new 
InfightTablet(task.pickedTablet.getId(), clusterId));
+        tabletToInfightTask.remove(new InfightTablet(task.pickedTabletId, 
clusterId));
 
         if (BalanceTypeEnum.SYNC_WARMUP.equals(currentBalanceType)) {
             try {
                 // send sync cache rpc again, ignore the result, the best 
effort to sync some new data
-                sendPreHeatingRpc(task.pickedTablet, task.srcBe, task.destBe);
+                
sendPreHeatingRpc(Collections.singletonList(task.pickedTabletId), task.srcBe, 
task.destBe);
             } catch (Exception e) {
                 LOG.warn("Failed to preheat tablet {} from {} to {}, "
                                 + "help msg change fe config 
cloud_warm_up_for_rebalance_type to without_warmup, ",
-                        task.pickedTablet.getId(), task.srcBe, task.destBe, e);
+                        task.pickedTabletId, task.srcBe, task.destBe, e);
             }
         }
     }
 
-    private void updateBeToTablets(Tablet pickedTablet, long srcBe, long 
destBe,
-                                   ConcurrentHashMap<Long, Set<Tablet>> 
globalBeToTablets,
-                                   ConcurrentHashMap<Long, 
ConcurrentHashMap<Long, Set<Tablet>>> beToTabletsInTable,
+    private void updateBeToTablets(long tabletId, long srcBe, long destBe,
+                                   ConcurrentHashMap<Long, Set<Long>> 
globalBeToTablets,
+                                   ConcurrentHashMap<Long, 
ConcurrentHashMap<Long, Set<Long>>> beToTabletsInTable,
                                    ConcurrentHashMap<Long, 
ConcurrentHashMap<Long, ConcurrentHashMap<Long,
-                                       Set<Tablet>>>> partToTablets) {
-        CloudReplica replica = ((CloudTablet) pickedTablet).getCloudReplica();
-        long tableId = replica.getTableId();
-        long partId = replica.getPartitionId();
-        long indexId = replica.getIndexId();
+                                       Set<Long>>>> partToTablets) {
+        TabletMeta tabletMeta = 
Env.getCurrentEnv().getTabletInvertedIndex().getTabletMeta(tabletId);
+        if (tabletMeta == null) {
+            LOG.warn("tablet {} meta not found in inverted index, skip 
updateBeToTablets", tabletId);
+            return;
+        }
+        long tableId = tabletMeta.getTableId();
+        long partId = tabletMeta.getPartitionId();
+        long indexId = tabletMeta.getIndexId();
 
-        globalBeToTablets.get(srcBe).remove(pickedTablet);
-        beToTabletsInTable.get(tableId).get(srcBe).remove(pickedTablet);
-        partToTablets.get(partId).get(indexId).get(srcBe).remove(pickedTablet);
+        globalBeToTablets.get(srcBe).remove(tabletId);
+        beToTabletsInTable.get(tableId).get(srcBe).remove(tabletId);
+        partToTablets.get(partId).get(indexId).get(srcBe).remove(tabletId);
 

Review Comment:
   `updateBeToTablets()` assumes all nested maps/sets exist 
(`globalBeToTablets.get(srcBe)`, `beToTabletsInTable.get(tableId).get(srcBe)`, 
etc.) and will NPE if a warmup task becomes stale (e.g., tablet/table dropped 
or route info rebuilt without this tablet) or if any of these maps don’t 
contain the expected keys. Please make the removals null-safe (check each level 
before `remove`) and decide how to clean up/abort when the mapping is missing.
   ```suggestion
           // Remove tablet from srcBe mappings; all lookups and removals must 
be null-safe.
           Set<Long> globalSrcTablets = globalBeToTablets.get(srcBe);
           if (globalSrcTablets == null || !globalSrcTablets.remove(tabletId)) {
               LOG.debug("skip updateBeToTablets for tablet {}: srcBe {} not 
found in globalBeToTablets "
                               + "or tablet not present", tabletId, srcBe);
               return;
           }
   
           ConcurrentHashMap<Long, Set<Long>> tableBeToTablets = 
beToTabletsInTable.get(tableId);
           if (tableBeToTablets == null) {
               LOG.debug("skip updateBeToTablets for tablet {}: tableId {} not 
found in beToTabletsInTable",
                       tabletId, tableId);
               return;
           }
           Set<Long> tableSrcTablets = tableBeToTablets.get(srcBe);
           if (tableSrcTablets == null || !tableSrcTablets.remove(tabletId)) {
               LOG.debug("skip updateBeToTablets for tablet {}: srcBe {} not 
found in beToTabletsInTable for table {} "
                               + "or tablet not present", tabletId, srcBe, 
tableId);
               return;
           }
   
           ConcurrentHashMap<Long, ConcurrentHashMap<Long, Set<Long>>> 
indexMapByPart = partToTablets.get(partId);
           if (indexMapByPart == null) {
               LOG.debug("skip updateBeToTablets for tablet {}: partId {} not 
found in partToTablets",
                       tabletId, partId);
               return;
           }
           ConcurrentHashMap<Long, Set<Long>> beToTabletsByIndex = 
indexMapByPart.get(indexId);
           if (beToTabletsByIndex == null) {
               LOG.debug("skip updateBeToTablets for tablet {}: indexId {} not 
found in partToTablets for part {}",
                       tabletId, indexId, partId);
               return;
           }
           Set<Long> partSrcTablets = beToTabletsByIndex.get(srcBe);
           if (partSrcTablets == null || !partSrcTablets.remove(tabletId)) {
               LOG.debug("skip updateBeToTablets for tablet {}: srcBe {} not 
found in partToTablets for "
                               + "part {}, index {} or tablet not present", 
tabletId, srcBe, partId, indexId);
               return;
           }
   
           // Only after successful removals from all src mappings do we add 
the tablet to destBe mappings.
   ```



##########
fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudTabletRebalancer.java:
##########
@@ -2067,49 +2089,58 @@ public void addTabletMigrationTask(Long srcBe, Long 
dstBe) {
      * replica location info will be updated in both master and follower FEs.
      */
     private void migrateTablets(Long srcBe, Long dstBe) {
-        // get tablets
-        Set<Tablet> tablets = beToTabletsGlobal.get(srcBe);
-        if (tablets == null || tablets.isEmpty()) {
+        // get tabletIds
+        Set<Long> tabletIds = beToTabletsGlobal.get(srcBe);
+        if (tabletIds == null || tabletIds.isEmpty()) {
             LOG.info("smooth upgrade srcBe={} does not have any tablets, set 
inactive", srcBe);
             ((CloudEnv) 
Env.getCurrentEnv()).getCloudUpgradeMgr().setBeStateInactive(srcBe);
             return;
         }
         List<UpdateCloudReplicaInfo> infos = new ArrayList<>();
-        for (Tablet tablet : tablets) {
-            // get replica
-            CloudReplica cloudReplica = ((CloudTablet) 
tablet).getCloudReplica();
-            Backend be = cloudSystemInfoService.getBackend(srcBe);
-            if (be == null) {
-                LOG.info("src backend {} not found", srcBe);
+        for (Long tabletId : tabletIds) {
+            TabletMeta tabletMeta = 
Env.getCurrentEnv().getTabletInvertedIndex().getTabletMeta(tabletId);
+            if (tabletMeta == null) {
+                LOG.warn("tablet {} meta not found in inverted index, skip 
migration", tabletId);
                 continue;
             }
-            // populate to followers
-            Database db = 
Env.getCurrentInternalCatalog().getDbNullable(cloudReplica.getDbId());
+            Database db = 
Env.getCurrentInternalCatalog().getDbNullable(tabletMeta.getDbId());
             if (db == null) {
-                long beId;
-                try {
-                    beId = cloudReplica.getBackendId();
-                } catch (ComputeGroupException e) {
-                    if (LOG.isDebugEnabled()) {
-                        LOG.debug("get backend failed cloudReplica {}", 
cloudReplica, e);
-                    }
-                    beId = -1;
-                }
-                LOG.error("get null db from replica, tabletId={}, 
partitionId={}, beId={}",
-                        cloudReplica.getTableId(), 
cloudReplica.getPartitionId(), beId);
+                LOG.error("get null db from tablet meta, tabletId={}, 
dbId={}", tabletId, tabletMeta.getDbId());
                 continue;
             }
-            OlapTable table = (OlapTable) 
db.getTableNullable(cloudReplica.getTableId());
+            OlapTable table = (OlapTable) 
db.getTableNullable(tabletMeta.getTableId());
             if (table == null) {
                 continue;
             }
 
+            Backend be = cloudSystemInfoService.getBackend(srcBe);
+            if (be == null) {
+                LOG.info("src backend {} not found", srcBe);
+                continue;
+            }
+
             String clusterId = be.getCloudClusterId();
             String clusterName = be.getCloudClusterName();

Review Comment:
   `migrateTablets()` calls `cloudSystemInfoService.getBackend(srcBe)` inside 
the per-tablet loop, so if `srcBe` is missing it will log once per tablet and 
do redundant lookups even when it exists. Fetch the `Backend` (and derived 
`clusterId/clusterName`) once before iterating `tabletIds`, and return early if 
it’s null.



##########
fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudTabletRebalancer.java:
##########
@@ -844,7 +839,7 @@ public void checkInflightWarmUpCacheAsync() {
                 List<InfightTablet> toRemove = new LinkedList<>();
                 for (InfightTask task : entry.getValue()) {
                     for (InfightTablet key : tabletToInfightTask.keySet()) {
-                        toRemove.add(new 
InfightTablet(task.pickedTablet.getId(), key.clusterId));
+                        toRemove.add(new InfightTablet(task.pickedTabletId, 
key.clusterId));
                     }

Review Comment:
   `checkInflightWarmUpCacheAsync()` builds `toRemove` by pairing each task’s 
tabletId with *every* clusterId in `tabletToInfightTask.keySet()`. This can 
remove inflight tasks for the same tablet in other compute groups (different 
clusterId) that are unrelated to the dead `destBackend`, and it’s also O(n²). 
Consider retaining the `InfightTablet` keys when grouping by destBe (e.g., 
store keys in `beToInfightTasks`) or add `clusterId` into `InfightTask` so you 
can remove exactly the keys corresponding to `entry.getValue()`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to