saintstack commented on a change in pull request #2584:
URL: https://github.com/apache/hbase/pull/2584#discussion_r517036330



##########
File path: 
hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncNonMetaRegionLocator.java
##########
@@ -433,9 +474,24 @@ private void locateInMeta(TableName tableName, 
LocateRequest req) {
     Scan scan = new Scan().withStartRow(metaStartKey).withStopRow(metaStopKey, 
true)
       
.addFamily(HConstants.CATALOG_FAMILY).setReversed(true).setCaching(locatePrefetchLimit)
       .setReadType(ReadType.PREAD);
-    if (useMetaReplicas) {
-      scan.setConsistency(Consistency.TIMELINE);
+
+    switch (this.metaReplicaMode) {
+      case LoadBalance:
+        int metaReplicaId = this.metaReplicaSelector.select(tableName, 
req.row, req.locateType);
+        if (metaReplicaId != RegionInfo.DEFAULT_REPLICA_ID) {
+          // If the selector gives a non-primary meta replica region, then go 
with it.
+          // Otherwise, just go to primary in non-hedgedRead mode.
+          scan.setConsistency(Consistency.TIMELINE);
+          scan.setReplicaId(metaReplicaId);
+        }
+        break;
+      case HedgedRead:
+        scan.setConsistency(Consistency.TIMELINE);
+        break;
+      default:
+        // do nothing

Review comment:
       Did you file one @huaxiangsun ?

##########
File path: 
hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaMode.java
##########
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.client;
+
+import org.apache.yetus.audience.InterfaceAudience;
+
+/**
+ * <p>There are two modes with catalog replica support. </p>
+ *
+ * <ol>
+ *   <li>HEDGED_READ  - Client sends requests to the primary region first, 
within a
+ *                 configured amount of time, if there is no response coming 
back,
+ *                 client sends requests to all replica regions and takes the 
first
+ *                 response. </li>
+ *
+ *   <li>LOAD_BALANCE - Client sends requests to replica regions in a 
round-robin mode,
+ *                 if results from replica regions are stale, next time, 
client sends requests for
+ *                 these stale locations to the primary region. In this mode, 
scan
+ *                 requests are load balanced across all replica regions.</li>
+ * </ol>
+ */
[email protected]
+enum CatalogReplicaMode {
+  NONE {
+    @Override
+    public String toString() {
+      return "None";
+    }
+  },
+  HEDGED_READ {
+    @Override
+    public String toString() {
+      return "HedgedRead";
+    }
+  },
+  LOAD_BALANCE {
+    @Override
+    public String toString() {
+      return "LoadBalance";
+    }
+  };
+
+  public static CatalogReplicaMode fromString(final String value) {
+    for(CatalogReplicaMode mode : values()) {
+      if (mode.toString().equalsIgnoreCase(value)) {

Review comment:
       Good that you ignore case.

##########
File path: 
hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSimpleSelector.java
##########
@@ -0,0 +1,302 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.client;
+
+import static org.apache.hadoop.hbase.client.ConnectionUtils.isEmptyStopRow;
+import static org.apache.hadoop.hbase.util.Bytes.BYTES_COMPARATOR;
+import static org.apache.hadoop.hbase.util.ConcurrentMapUtils.computeIfAbsent;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ConcurrentNavigableMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.function.IntSupplier;
+import org.apache.commons.lang3.builder.ToStringBuilder;
+import org.apache.hadoop.hbase.HRegionLocation;
+import org.apache.hadoop.hbase.ScheduledChore;
+import org.apache.hadoop.hbase.Stoppable;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>CatalogReplicaLoadBalanceReplicaSimpleSelector implements a simple 
catalog replica load balancing
+ * algorithm. It maintains a stale location cache for each table. Whenever 
client looks up location,
+ * it first check if the row is the stale location cache. If yes, the location 
from
+ * catalog replica is stale, it will go to the primary region to look up 
update-to-date location;
+ * otherwise, it will randomly pick up a replica region for lookup. When 
clients receive
+ * RegionNotServedException from region servers, it will add these region 
locations to the stale
+ * location cache. The stale cache will be cleaned up periodically by a 
chore.</p>
+ *
+ * It follows a simple algorithm to choose a replica to go:
+ *
+ * <ol>
+ *  <li>If there is no stale location entry for rows it looks up, it will 
randomly
+ *     pick a replica region to do lookup. </li>
+ *  <li>If the location from the replica region is stale, client gets 
RegionNotServedException
+ *     from region server, in this case, it will create 
StaleLocationCacheEntry in
+ *     CatalogReplicaLoadBalanceReplicaSimpleSelector.</li>
+ *  <li>When client tries to do location lookup, it checks StaleLocationCache 
first for rows it
+ *     tries to lookup, if entry exists, it will go with primary meta region 
to do lookup;
+ *     otherwise, it will follow step 1.</li>
+ *  <li>A chore will periodically run to clean up cache entries in the 
StaleLocationCache.</li>
+ * </ol>
+ */
+class CatalogReplicaLoadBalanceSimpleSelector implements
+  CatalogReplicaLoadBalanceSelector, Stoppable {
+  private static final Logger LOG =
+    LoggerFactory.getLogger(CatalogReplicaLoadBalanceSimpleSelector.class);
+  private final long STALE_CACHE_TIMEOUT_IN_MILLISECONDS = 3000; // 3 seconds
+  private final int STALE_CACHE_CLEAN_CHORE_INTERVAL_IN_MILLISECONDS = 1500; 
// 1.5 seconds
+  private final int REFRESH_REPLICA_COUNT_CHORE_INTERVAL_IN_MILLISECONDS = 
60000; // 1 minute
+
+  /**
+   * StaleLocationCacheEntry is the entry when a stale location is reported by 
an client.
+   */
+  private static final class StaleLocationCacheEntry {
+    // replica id where the stale location comes from.
+    private final int fromReplicaId;
+
+    // timestamp in milliseconds
+    private final long timestamp;
+
+    private final byte[] endKey;
+
+    StaleLocationCacheEntry(final int metaReplicaId, final byte[] endKey) {
+      this.fromReplicaId = metaReplicaId;
+      this.endKey = endKey;
+      timestamp = EnvironmentEdgeManager.currentTime();
+    }
+
+    public byte[] getEndKey() {
+      return this.endKey;
+    }
+
+    public int getFromReplicaId() {
+      return this.fromReplicaId;
+    }
+    public long getTimestamp() {
+      return this.timestamp;
+    }
+
+    @Override
+    public String toString() {
+      return new ToStringBuilder(this)
+        .append("endKey", endKey)
+        .append("fromReplicaId", fromReplicaId)
+        .append("timestamp", timestamp)
+        .toString();
+    }
+  }
+
+  private final ConcurrentMap<TableName, ConcurrentNavigableMap<byte[], 
StaleLocationCacheEntry>>
+    staleCache = new ConcurrentHashMap<>();
+  private volatile int numOfReplicas;
+  private final AsyncConnectionImpl conn;
+  private final TableName tableName;
+  private final IntSupplier getNumOfReplicas;
+  private volatile boolean isStopped = false;
+
+  CatalogReplicaLoadBalanceSimpleSelector(TableName tableName, 
AsyncConnectionImpl conn,
+    IntSupplier getNumOfReplicas) {
+    this.conn = conn;
+    this.tableName = tableName;
+    this.getNumOfReplicas = getNumOfReplicas;
+
+    // This numOfReplicas is going to be lazy initialized.
+    this.numOfReplicas = -1;
+    // Start chores
+    this.conn.getChoreService().scheduleChore(getCacheCleanupChore(this));
+    
this.conn.getChoreService().scheduleChore(getRefreshReplicaCountChore(this));
+  }
+
+  /**
+   * When a client runs into RegionNotServingException, it will call this 
method to
+   * update Selector's internal state.
+   * @param loc the location which causes exception.
+   * @param fromReplicaId the replica id where the stale location comes from.
+   */
+  public void onError(HRegionLocation loc, int fromReplicaId) {
+    ConcurrentNavigableMap<byte[], StaleLocationCacheEntry> tableCache =
+      computeIfAbsent(staleCache, loc.getRegion().getTable(),
+        () -> new ConcurrentSkipListMap<>(BYTES_COMPARATOR));
+    byte[] startKey = loc.getRegion().getStartKey();
+    tableCache.putIfAbsent(startKey,
+      new StaleLocationCacheEntry(fromReplicaId, loc.getRegion().getEndKey()));
+    LOG.debug("Add entry to stale cache for table {} with startKey {}, {}",
+      loc.getRegion().getTable(), startKey, loc.getRegion().getEndKey());
+  }
+
+  /**
+   * Select an random replica id. In case there is no replica region 
configured, return
+   * the primary replica id.
+   * @return Replica id
+   */
+  private int getRandomReplicaId() {
+    int cachedNumOfReplicas = this.numOfReplicas;
+    if (cachedNumOfReplicas < 0) {
+      cachedNumOfReplicas = refreshCatalogReplicaCount();
+      this.numOfReplicas = cachedNumOfReplicas;
+    }
+    // In case of no replica configured, return the primary region id.
+    if (cachedNumOfReplicas <= 1) {
+      return RegionInfo.DEFAULT_REPLICA_ID;
+    }
+    return 1 + ThreadLocalRandom.current().nextInt(cachedNumOfReplicas - 1);
+  }
+
+  /**
+   * When it looks up a location, it will call this method to find a replica 
region to go.
+   * For a normal case, > 99% of region locations from catalog/meta replica 
will be up to date.
+   * In extreme cases such as region server crashes, it will depends on how 
fast replication
+   * catches up.
+   *
+   * @param tablename  table name it looks up
+   * @param row   key it looks up.
+   * @param locateType locateType, Only BEFORE and CURRENT will be passed in.
+   * @return catalog replica id
+   */
+  public int select(final TableName tablename, final byte[] row,
+    final RegionLocateType locateType) {
+    ConcurrentNavigableMap<byte[], StaleLocationCacheEntry> tableCache = 
staleCache.get(tablename);
+
+    // If there is no entry in StaleCache, select a random replica id.
+    if (tableCache == null) {
+      return getRandomReplicaId();
+    }
+
+    Map.Entry<byte[], StaleLocationCacheEntry> entry;
+    boolean isEmptyStopRow = isEmptyStopRow(row);
+    // Only BEFORE and CURRENT are passed in.
+    if (locateType == RegionLocateType.BEFORE) {
+      entry = isEmptyStopRow ? tableCache.lastEntry() : 
tableCache.lowerEntry(row);
+    } else {
+      entry = tableCache.floorEntry(row);
+    }
+
+    // It is not in the stale cache, return a random replica id.
+    if (entry == null) {
+      return getRandomReplicaId();
+    }
+
+    // The entry here is a possible match for the location. Check if the entry 
times out first as
+    // long comparing is faster than comparing byte arrays(in most cases). It 
could remove
+    // stale entries faster. If the possible match entry does not time out, it 
will check if
+    // the entry is a match for the row passed in and select the replica id 
accordingly.
+    if ((EnvironmentEdgeManager.currentTime() - 
entry.getValue().getTimestamp()) >=
+      STALE_CACHE_TIMEOUT_IN_MILLISECONDS) {
+      LOG.debug("Entry for table {} with startKey {}, {} times out", 
tablename, entry.getKey(),
+        entry);
+      tableCache.remove(entry.getKey());
+      return getRandomReplicaId();
+    }
+
+    byte[] endKey =  entry.getValue().getEndKey();
+
+    // The following logic is borrowed from AsyncNonMetaRegionLocator.
+    if (isEmptyStopRow(endKey)) {
+      LOG.debug("Lookup {} goes to primary region", row);
+      return RegionInfo.DEFAULT_REPLICA_ID;
+    }
+
+    if (locateType == RegionLocateType.BEFORE) {
+      if (!isEmptyStopRow && Bytes.compareTo(endKey, row) >= 0) {
+        LOG.debug("Lookup {} goes to primary meta", row);
+        return RegionInfo.DEFAULT_REPLICA_ID;
+      }
+    } else {
+      if (Bytes.compareTo(row, endKey) < 0) {
+        LOG.debug("Lookup {} goes to primary meta", row);
+        return RegionInfo.DEFAULT_REPLICA_ID;
+      }
+    }
+
+    // Not in stale cache, return a random replica id.
+    return getRandomReplicaId();
+  }
+
+  @Override
+  public void stop(String why) {

Review comment:
       So, implements Stoppable rather than do what the likes of AuthUtil does 
where it does createDummyStoppable and then has an internal do-nothing 
Stoppable? Makes sense.
   
   Perhaps add comment that it is a do-nothing stop required by ScheduledChore 
impls. s/isStopped/stopped/

##########
File path: 
hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSelector.java
##########
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.client;
+
+import org.apache.hadoop.hbase.HRegionLocation;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.yetus.audience.InterfaceAudience;
+
+/**
+ * A Catalog replica selector decides which catalog replica to go for read 
requests when it is
+ * configured as CatalogReplicaMode.LoadBalance.
+ */
[email protected]
+interface CatalogReplicaLoadBalanceSelector {
+
+  /**
+   * This method is called when input location is stale, i.e, when clients run 
into
+   * org.apache.hadoop.hbase.NotServingRegionException.
+   * @param loc stale location
+   */
+  void onError(HRegionLocation loc);
+
+  /**
+   * Select a catalog replica region where client go to loop up the input row 
key.
+   *
+   * @param tablename table name
+   * @param row  key to look up
+   * @param locateType  locate type
+   * @return replica id
+   */
+  int select(TableName tablename, byte[] row, RegionLocateType locateType);
+}

Review comment:
       Good




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to