goiri commented on code in PR #5990:
URL: https://github.com/apache/hadoop/pull/5990#discussion_r1306244904
##########
hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java:
##########
@@ -83,14 +83,12 @@ public class MembershipNamenodeResolver
/** Cached lookup of NN for block pool. Invalidated on cache refresh. */
private Map<String, List<? extends FederationNamenodeContext>> cacheBP;
-
Review Comment:
Let's avoid these unrelated changes.
##########
hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java:
##########
@@ -478,4 +476,27 @@ private List<MembershipState> getRecentRegistrationForQuery(
public void setRouterId(String router) {
this.routerId = router;
}
+
+ /**
+ * Shuffle cache, to ensure that the current nn will not be accessed first next time.
+ *
+ *
+ * @param nsId name service id
+ * @param namenode namenode contexts
+ */
+ @Override
+ public synchronized void shuffleCache(String nsId, FederationNamenodeContext namenode) {
+ cacheNS.compute(Pair.of(nsId, false), (ns, namenodeContexts) -> {
+ if (namenodeContexts != null
+ && namenodeContexts.size() > 0
+ && !namenodeContexts.get(0).getState().equals(ACTIVE)
+ && namenodeContexts.get(0).getRpcAddress().equals(namenode.getRpcAddress())) {
+ List<FederationNamenodeContext> rotatedNnContexts = new ArrayList<>(namenodeContexts);
+ Collections.rotate(rotatedNnContexts, -1);
Review Comment:
We are not really shuffling right?
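For illustration only (not part of the PR), a minimal sketch contrasting `Collections.rotate(list, -1)` with an actual shuffle; the namenode ids here are hypothetical placeholders:

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class RotateVsShuffle {
  public static void main(String[] args) {
    List<String> contexts = new ArrayList<>(Arrays.asList("nn0", "nn1", "nn2"));

    // rotate(-1) is deterministic: it moves the head to the tail,
    // [nn0, nn1, nn2] -> [nn1, nn2, nn0]
    List<String> rotated = new ArrayList<>(contexts);
    Collections.rotate(rotated, -1);
    System.out.println(rotated);

    // shuffle() produces a random permutation, which is a different operation
    List<String> shuffled = new ArrayList<>(contexts);
    Collections.shuffle(shuffled);
    System.out.println(shuffled);
  }
}
```

So the new method rotates the head namenode to the end of the list rather than shuffling it; the name `shuffleCache` may be worth revisiting.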
##########
hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java:
##########
@@ -478,4 +476,27 @@ private List<MembershipState> getRecentRegistrationForQuery(
public void setRouterId(String router) {
this.routerId = router;
}
+
+ /**
+ * Shuffle cache, to ensure that the current nn will not be accessed first next time.
+ *
+ *
+ * @param nsId name service id
+ * @param namenode namenode contexts
+ */
+ @Override
+ public synchronized void shuffleCache(String nsId, FederationNamenodeContext namenode) {
+ cacheNS.compute(Pair.of(nsId, false), (ns, namenodeContexts) -> {
+ if (namenodeContexts != null
+ && namenodeContexts.size() > 0
Review Comment:
Use isEmpty(), and we probably want to check for > 1, since there's nothing to rotate otherwise.
We should actually do that check outside: if there's 1 entry or fewer, don't do anything.
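For illustration only, a self-contained sketch of one possible reading of this suggestion, with the size guard hoisted in front of the rest of the checks. It uses a simplified `Map<String, List<String>>` as a stand-in for the resolver cache, so the types and names here are hypothetical, not the PR's code:

```java
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

// Simplified stand-in for the resolver cache: nsId -> ordered namenode RPC addresses.
public class RotateCacheSketch {

  private final Map<String, List<String>> cacheNS = new ConcurrentHashMap<>();

  public synchronized void rotateCache(String nsId, String firstRpcAddress) {
    cacheNS.computeIfPresent(nsId, (ns, namenodes) -> {
      // Guard first: with one entry or fewer there is nothing to rotate,
      // so skip the remaining checks entirely.
      if (namenodes.size() <= 1) {
        return namenodes;
      }
      // Only rotate if the reported namenode is still at the head of the list.
      if (!namenodes.get(0).equals(firstRpcAddress)) {
        return namenodes;
      }
      List<String> rotated = new ArrayList<>(namenodes);
      Collections.rotate(rotated, -1);
      return rotated;
    });
  }
}
```

With the guard up front, the rotation branch only runs when there are at least two entries and the reported namenode is still first in the list.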
##########
hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterClientRejectOverload.java:
##########
@@ -357,6 +363,72 @@ public void testNoNamenodesAvailable() throws Exception{
assertEquals(originalRouter0Failures, rpcMetrics0.getProxyOpNoNamenodes());
}
+ /**
+ * When failover occurs, the router may record that the ns has no active namenode.
+ * Only when the router updates the cache next time can the memory status be updated,
+ * causing the router to report NoNamenodesAvailableException for a long time
+ */
+ @Test
+ public void testNoNamenodesAvailableLongTimeWhenNsFailover() throws Exception {
+ setupCluster(false, true);
+ transitionClusterNSToStandby(cluster);
+ for (RouterContext routerContext : cluster.getRouters()) {
+ // Manually trigger the heartbeat
+ Collection<NamenodeHeartbeatService> heartbeatServices = routerContext
+ .getRouter().getNamenodeHeartbeatServices();
+ for (NamenodeHeartbeatService service : heartbeatServices) {
+ service.periodicInvoke();
+ }
+ // Update service cache
+ routerContext.getRouter().getStateStore().refreshCaches(true);
+ }
+ // Record the time after the router first updated the cache
+ long firstLoadTime = Time.now();
+ List<MiniRouterDFSCluster.NamenodeContext> namenodes = cluster.getNamenodes();
+
+ // Make sure all namenodes are in standby state
+ for (MiniRouterDFSCluster.NamenodeContext namenodeContext : namenodes) {
+ assertTrue(namenodeContext.getNamenode().getNameNodeState() == STANDBY.ordinal());
+ }
+
+ Configuration conf = cluster.getRouterClientConf();
+ // Set dfs.client.failover.random.order false, to pick 1st router at first
+ conf.setBoolean("dfs.client.failover.random.order", false);
+
+ DFSClient routerClient = new DFSClient(new URI("hdfs://fed"), conf);
+
+ for (RouterContext routerContext : cluster.getRouters()) {
+ // Get the second namenode in the router cache and make it active
+ List<? extends FederationNamenodeContext> ns0 = routerContext.getRouter().getNamenodeResolver().getNamenodesForNameserviceId("ns0", false);
Review Comment:
Pretty sure this line is being flagged by checkstyle.
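For reference, one way the flagged call could be wrapped to stay within the line-length limit (names taken from the quoted test; this is just an illustrative sketch of the formatting):

```java
List<? extends FederationNamenodeContext> ns0 = routerContext
    .getRouter()
    .getNamenodeResolver()
    .getNamenodesForNameserviceId("ns0", false);
```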
##########
hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterClientRejectOverload.java:
##########
@@ -357,6 +363,72 @@ public void testNoNamenodesAvailable() throws Exception{
assertEquals(originalRouter0Failures, rpcMetrics0.getProxyOpNoNamenodes());
}
+ /**
+ * When failover occurs, the router may record that the ns has no active namenode.
+ * Only when the router updates the cache next time can the memory status be updated,
+ * causing the router to report NoNamenodesAvailableException for a long time
+ */
+ @Test
+ public void testNoNamenodesAvailableLongTimeWhenNsFailover() throws Exception {
+ setupCluster(false, true);
+ transitionClusterNSToStandby(cluster);
+ for (RouterContext routerContext : cluster.getRouters()) {
+ // Manually trigger the heartbeat
+ Collection<NamenodeHeartbeatService> heartbeatServices = routerContext
+ .getRouter().getNamenodeHeartbeatServices();
+ for (NamenodeHeartbeatService service : heartbeatServices) {
+ service.periodicInvoke();
+ }
+ // Update service cache
+ routerContext.getRouter().getStateStore().refreshCaches(true);
+ }
+ // Record the time after the router first updated the cache
+ long firstLoadTime = Time.now();
+ List<MiniRouterDFSCluster.NamenodeContext> namenodes = cluster.getNamenodes();
+
+ // Make sure all namenodes are in standby state
+ for (MiniRouterDFSCluster.NamenodeContext namenodeContext : namenodes) {
+ assertTrue(namenodeContext.getNamenode().getNameNodeState() == STANDBY.ordinal());
+ }
+
+ Configuration conf = cluster.getRouterClientConf();
+ // Set dfs.client.failover.random.order false, to pick 1st router at first
+ conf.setBoolean("dfs.client.failover.random.order", false);
+
+ DFSClient routerClient = new DFSClient(new URI("hdfs://fed"), conf);
+
+ for (RouterContext routerContext : cluster.getRouters()) {
+ // Get the second namenode in the router cache and make it active
+ List<? extends FederationNamenodeContext> ns0 = routerContext.getRouter().getNamenodeResolver().getNamenodesForNameserviceId("ns0", false);
+ String nsId = ns0.get(1).getNamenodeId();
+ cluster.switchToActive("ns0", nsId);
+ // Manually trigger the heartbeat, but the router does not manually load the cache
+ Collection<NamenodeHeartbeatService> heartbeatServices = routerContext
+ .getRouter().getNamenodeHeartbeatServices();
+ for (NamenodeHeartbeatService service : heartbeatServices) {
+ service.periodicInvoke();
+ }
+ assertTrue(cluster.getNamenode("ns0", nsId).getNamenode().getNameNodeState() == ACTIVE.ordinal());
Review Comment:
assertEquals?
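For illustration, the same check written with `assertEquals(expected, actual)` as suggested, mirroring the quoted test line:

```java
assertEquals(ACTIVE.ordinal(),
    cluster.getNamenode("ns0", nsId).getNamenode().getNameNodeState());
```

An equality assertion also gives a more useful failure message than assertTrue on a boolean comparison.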
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]