Copilot commented on code in PR #61233:
URL: https://github.com/apache/doris/pull/61233#discussion_r2918082631
##########
fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudTabletRebalancer.java:
##########
@@ -1599,73 +1591,98 @@ private void handleWarmupCompletion(InfightTask task,
String clusterId, boolean
return;
}
- updateClusterToBeMap(task.pickedTablet, task.destBe, clusterId, infos);
+ updateClusterToBeMap(task.pickedTabletId, task.destBe, clusterId,
infos);
if (LOG.isDebugEnabled()) {
- LOG.debug("remove tablet {}-{}", clusterId,
task.pickedTablet.getId());
+ LOG.debug("remove tablet {}-{}", clusterId, task.pickedTabletId);
}
- tabletToInfightTask.remove(new
InfightTablet(task.pickedTablet.getId(), clusterId));
+ tabletToInfightTask.remove(new InfightTablet(task.pickedTabletId,
clusterId));
if (BalanceTypeEnum.SYNC_WARMUP.equals(currentBalanceType)) {
try {
// send sync cache rpc again, ignore the result, the best
effort to sync some new data
- sendPreHeatingRpc(task.pickedTablet, task.srcBe, task.destBe);
+
sendPreHeatingRpc(Collections.singletonList(task.pickedTabletId), task.srcBe,
task.destBe);
} catch (Exception e) {
LOG.warn("Failed to preheat tablet {} from {} to {}, "
+ "help msg change fe config
cloud_warm_up_for_rebalance_type to without_warmup, ",
- task.pickedTablet.getId(), task.srcBe, task.destBe, e);
+ task.pickedTabletId, task.srcBe, task.destBe, e);
}
}
}
- private void updateBeToTablets(Tablet pickedTablet, long srcBe, long
destBe,
- ConcurrentHashMap<Long, Set<Tablet>>
globalBeToTablets,
- ConcurrentHashMap<Long,
ConcurrentHashMap<Long, Set<Tablet>>> beToTabletsInTable,
+ private void updateBeToTablets(long tabletId, long srcBe, long destBe,
+ ConcurrentHashMap<Long, Set<Long>>
globalBeToTablets,
+ ConcurrentHashMap<Long,
ConcurrentHashMap<Long, Set<Long>>> beToTabletsInTable,
ConcurrentHashMap<Long,
ConcurrentHashMap<Long, ConcurrentHashMap<Long,
- Set<Tablet>>>> partToTablets) {
- CloudReplica replica = ((CloudTablet) pickedTablet).getCloudReplica();
- long tableId = replica.getTableId();
- long partId = replica.getPartitionId();
- long indexId = replica.getIndexId();
+ Set<Long>>>> partToTablets) {
+ TabletMeta tabletMeta =
Env.getCurrentEnv().getTabletInvertedIndex().getTabletMeta(tabletId);
+ if (tabletMeta == null) {
+ LOG.warn("tablet {} meta not found in inverted index, skip
updateBeToTablets", tabletId);
+ return;
+ }
+ long tableId = tabletMeta.getTableId();
+ long partId = tabletMeta.getPartitionId();
+ long indexId = tabletMeta.getIndexId();
- globalBeToTablets.get(srcBe).remove(pickedTablet);
- beToTabletsInTable.get(tableId).get(srcBe).remove(pickedTablet);
- partToTablets.get(partId).get(indexId).get(srcBe).remove(pickedTablet);
+ globalBeToTablets.get(srcBe).remove(tabletId);
+ beToTabletsInTable.get(tableId).get(srcBe).remove(tabletId);
+ partToTablets.get(partId).get(indexId).get(srcBe).remove(tabletId);
Review Comment:
`updateBeToTablets()` assumes all nested maps/sets exist
(`globalBeToTablets.get(srcBe)`, `beToTabletsInTable.get(tableId).get(srcBe)`,
etc.) and will NPE if a warmup task becomes stale (e.g., tablet/table dropped
or route info rebuilt without this tablet) or if any of these maps don’t
contain the expected keys. Please make the removals null-safe (check each level
before `remove`) and decide how to clean up/abort when the mapping is missing.
```suggestion
// Remove tablet from srcBe mappings; all lookups and removals must
be null-safe.
Set<Long> globalSrcTablets = globalBeToTablets.get(srcBe);
if (globalSrcTablets == null || !globalSrcTablets.remove(tabletId)) {
LOG.debug("skip updateBeToTablets for tablet {}: srcBe {} not
found in globalBeToTablets "
+ "or tablet not present", tabletId, srcBe);
return;
}
ConcurrentHashMap<Long, Set<Long>> tableBeToTablets =
beToTabletsInTable.get(tableId);
if (tableBeToTablets == null) {
LOG.debug("skip updateBeToTablets for tablet {}: tableId {} not
found in beToTabletsInTable",
tabletId, tableId);
return;
}
Set<Long> tableSrcTablets = tableBeToTablets.get(srcBe);
if (tableSrcTablets == null || !tableSrcTablets.remove(tabletId)) {
LOG.debug("skip updateBeToTablets for tablet {}: srcBe {} not
found in beToTabletsInTable for table {} "
+ "or tablet not present", tabletId, srcBe,
tableId);
return;
}
ConcurrentHashMap<Long, ConcurrentHashMap<Long, Set<Long>>>
indexMapByPart = partToTablets.get(partId);
if (indexMapByPart == null) {
LOG.debug("skip updateBeToTablets for tablet {}: partId {} not
found in partToTablets",
tabletId, partId);
return;
}
ConcurrentHashMap<Long, Set<Long>> beToTabletsByIndex =
indexMapByPart.get(indexId);
if (beToTabletsByIndex == null) {
LOG.debug("skip updateBeToTablets for tablet {}: indexId {} not
found in partToTablets for part {}",
tabletId, indexId, partId);
return;
}
Set<Long> partSrcTablets = beToTabletsByIndex.get(srcBe);
if (partSrcTablets == null || !partSrcTablets.remove(tabletId)) {
LOG.debug("skip updateBeToTablets for tablet {}: srcBe {} not
found in partToTablets for "
+ "part {}, index {} or tablet not present",
tabletId, srcBe, partId, indexId);
return;
}
// Only after successful removals from all src mappings do we add
the tablet to destBe mappings.
```
##########
fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudTabletRebalancer.java:
##########
@@ -2067,49 +2089,58 @@ public void addTabletMigrationTask(Long srcBe, Long
dstBe) {
* replica location info will be updated in both master and follower FEs.
*/
private void migrateTablets(Long srcBe, Long dstBe) {
- // get tablets
- Set<Tablet> tablets = beToTabletsGlobal.get(srcBe);
- if (tablets == null || tablets.isEmpty()) {
+ // get tabletIds
+ Set<Long> tabletIds = beToTabletsGlobal.get(srcBe);
+ if (tabletIds == null || tabletIds.isEmpty()) {
LOG.info("smooth upgrade srcBe={} does not have any tablets, set
inactive", srcBe);
((CloudEnv)
Env.getCurrentEnv()).getCloudUpgradeMgr().setBeStateInactive(srcBe);
return;
}
List<UpdateCloudReplicaInfo> infos = new ArrayList<>();
- for (Tablet tablet : tablets) {
- // get replica
- CloudReplica cloudReplica = ((CloudTablet)
tablet).getCloudReplica();
- Backend be = cloudSystemInfoService.getBackend(srcBe);
- if (be == null) {
- LOG.info("src backend {} not found", srcBe);
+ for (Long tabletId : tabletIds) {
+ TabletMeta tabletMeta =
Env.getCurrentEnv().getTabletInvertedIndex().getTabletMeta(tabletId);
+ if (tabletMeta == null) {
+ LOG.warn("tablet {} meta not found in inverted index, skip
migration", tabletId);
continue;
}
- // populate to followers
- Database db =
Env.getCurrentInternalCatalog().getDbNullable(cloudReplica.getDbId());
+ Database db =
Env.getCurrentInternalCatalog().getDbNullable(tabletMeta.getDbId());
if (db == null) {
- long beId;
- try {
- beId = cloudReplica.getBackendId();
- } catch (ComputeGroupException e) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("get backend failed cloudReplica {}",
cloudReplica, e);
- }
- beId = -1;
- }
- LOG.error("get null db from replica, tabletId={},
partitionId={}, beId={}",
- cloudReplica.getTableId(),
cloudReplica.getPartitionId(), beId);
+ LOG.error("get null db from tablet meta, tabletId={},
dbId={}", tabletId, tabletMeta.getDbId());
continue;
}
- OlapTable table = (OlapTable)
db.getTableNullable(cloudReplica.getTableId());
+ OlapTable table = (OlapTable)
db.getTableNullable(tabletMeta.getTableId());
if (table == null) {
continue;
}
+ Backend be = cloudSystemInfoService.getBackend(srcBe);
+ if (be == null) {
+ LOG.info("src backend {} not found", srcBe);
+ continue;
+ }
+
String clusterId = be.getCloudClusterId();
String clusterName = be.getCloudClusterName();
Review Comment:
`migrateTablets()` calls `cloudSystemInfoService.getBackend(srcBe)` inside
the per-tablet loop, so if `srcBe` is missing it will log once per tablet and
do redundant lookups even when it exists. Fetch the `Backend` (and derived
`clusterId/clusterName`) once before iterating `tabletIds`, and return early if
it’s null.
##########
fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudTabletRebalancer.java:
##########
@@ -844,7 +839,7 @@ public void checkInflightWarmUpCacheAsync() {
List<InfightTablet> toRemove = new LinkedList<>();
for (InfightTask task : entry.getValue()) {
for (InfightTablet key : tabletToInfightTask.keySet()) {
- toRemove.add(new
InfightTablet(task.pickedTablet.getId(), key.clusterId));
+ toRemove.add(new InfightTablet(task.pickedTabletId,
key.clusterId));
}
Review Comment:
`checkInflightWarmUpCacheAsync()` builds `toRemove` by pairing each task’s
tabletId with *every* clusterId in `tabletToInfightTask.keySet()`. This can
remove inflight tasks for the same tablet in other compute groups (different
clusterId) that are unrelated to the dead `destBackend`, and it’s also O(n²).
Consider retaining the `InfightTablet` keys when grouping by destBe (e.g.,
store keys in `beToInfightTasks`) or add `clusterId` into `InfightTask` so you
can remove exactly the keys corresponding to `entry.getValue()`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]