keith-turner commented on code in PR #4709:
URL: https://github.com/apache/accumulo/pull/4709#discussion_r1664682071
##########
server/manager/src/main/java/org/apache/accumulo/manager/Manager.java:
##########
@@ -958,23 +956,47 @@ private void checkForHeldServer(SortedMap<TServerInstance,TabletServerStatus> ts
     }
   }
 
-  private long balanceTablets() {
-    BalanceParamsImpl params = BalanceParamsImpl.fromThrift(tserverStatusForBalancer,
-        tserverStatus, migrationsSnapshot());
-    long wait = tabletBalancer.balance(params);
+  private long balanceTablets(boolean balanceUserTables) {
 
-    for (TabletMigration m : checkMigrationSanity(tserverStatusForBalancer.keySet(),
-        params.migrationsOut())) {
-      KeyExtent ke = KeyExtent.fromTabletId(m.getTablet());
-      if (migrations.containsKey(ke)) {
-        log.warn("balancer requested migration more than once, skipping {}", m);
+    Map<DataLevel,Set<KeyExtent>> partitionedMigrations =
+        new HashMap<>(DataLevel.values().length);
+    migrationsSnapshot().forEach(ke -> {
+      partitionedMigrations.computeIfAbsent(DataLevel.of(ke.tableId()), f -> new HashSet<>())
+          .add(ke);
+    });
+
+    BalanceParamsImpl params = null;
+    long wait = 0;
+    for (DataLevel dl : new DataLevel[] {DataLevel.ROOT, DataLevel.METADATA, DataLevel.USER}) {
+      Set<KeyExtent> migrationsForLevel = partitionedMigrations.get(dl);
+      if (migrationsForLevel == null) {
+        continue;
+      }
+      if (dl == DataLevel.USER && !balanceUserTables) {
+        log.debug("not balancing user tablets because there are {} unhosted tablets",
+            notHosted());
         continue;
       }
-      TServerInstance tserverInstance = TabletServerIdImpl.toThrift(m.getNewTabletServer());
-      migrations.put(ke, tserverInstance);
-      log.debug("migration {}", m);
+      params = BalanceParamsImpl.fromThrift(tserverStatusForBalancer, tserverStatus,
+          migrationsSnapshot());
+      do {
+        log.debug("Balancing for tables at level: {}", dl);
+        wait = tabletBalancer.balance(params);
+        for (TabletMigration m : checkMigrationSanity(tserverStatusForBalancer.keySet(),
+            params.migrationsOut())) {
+          KeyExtent ke = KeyExtent.fromTabletId(m.getTablet());
+          if (migrations.containsKey(ke)) {
+            log.warn("balancer requested migration more than once, skipping {}", m);
+            continue;
+          }
+          TServerInstance tserverInstance = TabletServerIdImpl.toThrift(m.getNewTabletServer());
+          migrations.put(ke, tserverInstance);
+          log.debug("migration {}", m);
+        }
+      } while (!params.migrationsOut().isEmpty()
Review Comment:
   > I think it will only get stuck if it's the root or metadata table flapping. In which case, we may want this.

   It would be nice to have that for the metadata table, though it could possibly cause new problems in a bug fix release. Thinking about whether we have sufficient logging to know if a thread ever does end up hanging out in this loop; that would help debug any potential problems. Made a comment elsewhere about tweaking some logging.

   So as long as we can debug when code is stuck in this loop, I think it's a nice change to make.
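   As a rough illustration of the kind of loop visibility being discussed (not the PR's actual code): a pass counter inside the do/while that escalates to warn once it crosses some threshold would give operators something concrete to grep for. The counter name and threshold below are made up, and the snippet leans on the variables already in the diff (`params`, `dl`, `wait`, `tabletBalancer`, `log`):

   ```java
   // Sketch only: count balance passes at this level so a long-running loop
   // shows up in the logs. The threshold of 10 is illustrative.
   int balancePasses = 0;
   do {
     balancePasses++;
     log.debug("Balancing for tables at level: {} (pass {})", dl, balancePasses);
     wait = tabletBalancer.balance(params);
     // ... apply the migrations returned in params.migrationsOut(), as in the diff ...
     if (balancePasses % 10 == 0) {
       // Escalate so an operator can tell the manager is still looping at this level.
       log.warn("Balancer has run {} passes at level {} without draining migrationsOut",
           balancePasses, dl);
     }
   } while (!params.migrationsOut().isEmpty() /* plus the level check from the PR */);
   ```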
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]