[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-28 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r776144420



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/FanOutOneBlockAsyncDFSOutput.java
##
@@ -511,6 +527,7 @@ private void flush0(CompletableFuture future, boolean 
syncBlock) {
 return future;
   }
 
+  @SuppressWarnings("FutureReturnValueIgnored")

Review comment:
   Oh, no check style issues now...




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@hbase.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-27 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r775739923



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/FanOutOneBlockAsyncDFSOutput.java
##
@@ -511,6 +527,7 @@ private void flush0(CompletableFuture future, boolean 
syncBlock) {
 return future;
   }
 
+  @SuppressWarnings("FutureReturnValueIgnored")

Review comment:
   Removed the annotation, let's see the code style check results. Thanks.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@hbase.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-27 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r775725597



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/StreamSlowMonitor.java
##
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_EXCLUDE_DATANODE_TTL;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_EXCLUDE_DATANODE_TTL_KEY;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY;
+
+import java.util.Deque;
+import java.util.concurrent.ConcurrentLinkedDeque;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheLoader;
+import org.apache.hbase.thirdparty.com.google.common.cache.LoadingCache;
+
+/**
+ * Class for monitor the wal file flush performance.
+ * Each active wal file has a StreamSlowMonitor.
+ */
+@InterfaceAudience.Private
+public class StreamSlowMonitor implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(StreamSlowMonitor.class);
+
+  /**
+   * Configure for the min count for a datanode detected slow.
+   * If a datanode is detected slow times up to this count, then it will be 
added to the exclude
+   * datanode cache by {@link 
ExcludeDatanodeManager#tryAddExcludeDN(DatanodeInfo, String)}
+   * of this regionsever.
+   */
+  private static final String WAL_SLOW_DETECT_MIN_COUNT_KEY =
+"hbase.regionserver.async.wal.min.slow.detect.count";
+  private static final int DEFAULT_WAL_SLOW_DETECT_MIN_COUNT = 3;
+
+  /**
+   * Configure for the TTL of the data that a datanode detected slow.
+   */
+  private static final String WAL_SLOW_DETECT_DATA_TTL_KEY =
+"hbase.regionserver.async.wal.slow.detect.data.ttl.ms";
+  private static final long DEFAULT_WAL_SLOW_DETECT_DATA_TTL = 10 * 60 * 1000; 
// 10min in ms
+
+  /**
+   * Configure for the speed check of packet min length.
+   * For packets whose data length smaller than this value, check slow by 
processing time.
+   * While for packets whose data length larger than this value, check slow by 
flushing speed.
+   */
+  private static final String 
DATANODE_PACKET_FLUSH_CHECK_SPEED_MIN_DATA_LENGTH_KEY =
+
"hbase.regionserver.async.wal.datanode.slow.check.packet.speed.data.length.min";
+  private static final long 
DEFAULT_DATANODE_PACKET_FLUSH_CHECK_SPEED_MIN_DATA_LENGTH =
+64 * 1024; //64KB
+
+  /**
+   * Configure for the slow packet process time, a duration from send to ACK.
+   * The processing time check is for packets that data length smaller than
+   * {@link DATANODE_PACKET_FLUSH_CHECK_SPEED_MIN_DATA_LENGTH_KEY}
+   */
+  public static final String DATANODE_SLOW_PACKET_PROCESS_TIME_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.process.time.millis";
+  private static final long DEFAULT_DATANODE_SLOW_PACKET_PROCESS_TIME = 6000; 
// 6s in ms
+
+  /**
+   * Configure for the check of large packet(which is configured by
+   * {@link DATANODE_PACKET_FLUSH_CHECK_SPEED_MIN_DATA_LENGTH_KEY}) flush 
speed.
+   * e.g. If the configured slow packet process time is smaller than 10s, then 
here 20KB/s means
+   * 64KB should be processed in less than 3.2s.
+   */
+  private static final String DATANODE_SLOW_PACKET_FLUSH_MIN_SPEED_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.speed.min.kbs";
+  private static final double DEFAULT_DATANODE_SLOW_PACKET_FLUSH_MIN_SPEED = 
20; // 20KB/s
+
+  private 

[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-27 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r775413229



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/StreamSlowMonitor.java
##
@@ -0,0 +1,207 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_EXCLUDE_DATANODE_TTL;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_EXCLUDE_DATANODE_TTL_KEY;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY;
+
+import java.util.Deque;
+import java.util.concurrent.ConcurrentLinkedDeque;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheLoader;
+import org.apache.hbase.thirdparty.com.google.common.cache.LoadingCache;
+
+/**
+ * Class for monitor the wal file flush performance.
+ * Each active wal file has a StreamSlowMonitor.
+ */
+@InterfaceAudience.Private
+public class StreamSlowMonitor implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(StreamSlowMonitor.class);
+
+  /**
+   * Configure for the min count for a datanode detected slow.
+   * If a datanode is detected slow times up to this count, then it will be 
added to the exclude
+   * datanode cache by {@link 
ExcludeDatanodeManager#tryAddExcludeDN(DatanodeInfo, String)}
+   * of this regionsever.
+   */
+  private static final String WAL_SLOW_DETECT_MIN_COUNT_KEY =
+"hbase.regionserver.async.wal.min.slow.detect.count";
+  private static final int DEFAULT_WAL_SLOW_DETECT_MIN_COUNT = 3;
+
+  /**
+   * Configure for the TTL of the data that a datanode detected slow.
+   */
+  private static final String WAL_SLOW_DETECT_DATA_TTL_KEY =
+"hbase.regionserver.async.wal.slow.detect.data.ttl.ms";
+  private static final long DEFAULT_WAL_SLOW_DETECT_DATA_TTL = 10 * 60 * 1000; 
// 10min in ms
+
+  /**
+   * Configure for the slow packet process time, a duration from send to ACK.
+   * It is preferred that this value should not be less than 1s.
+   */
+  private static final String DATANODE_SLOW_PACKET_PROCESS_TIME_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.process.time.millis";
+  private static final long DEFAULT_DATANODE_SLOW_PACKET_PROCESS_TIME = 6000; 
//6s in ms
+
+  /**
+   * Configure for the packet flush speed.
+   * e.g. If the configured slow slow packet process time is larger than 1s, 
then here 0.1kbs means
+   * 100B should be processed in less than 1s.
+   * If the configured slow slow packet process time is larger than 10s, then 
here 0.1kbs means
+   * 1KB should be processed in less than 10s.
+   */
+  private static final String DATANODE_SLOW_PACKET_FLUSH_SPEED_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.flush.speed.kbs";
+  private static final double DEFAULT_DATANODE_SLOW_PACKET_FLUSH_SPEED = 0.1;
+
+  private final String name;
+  // this is a map of datanodeInfo->queued slow PacketAckData
+  private final LoadingCache> 
datanodeSlowDataQueue;
+  private final ExcludeDatanodeManager excludeDatanodeManager;
+
+  private int minSlowDetectCount;
+  private long slowDataTtl;
+  private long slowPacketAckMs;
+  private double slowPacketAckSpeed;
+
+  public StreamSlowMonitor(Configuration conf, String name,
+  ExcludeDatanodeManager excludeDatanodeManager) {
+setConf(conf);
+this.name = name;
+this.excludeDatanodeManager = excludeDatanodeManager;
+this.datanodeSlowDataQueue = 

[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-23 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r774865512



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/StreamSlowMonitor.java
##
@@ -0,0 +1,207 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_EXCLUDE_DATANODE_TTL;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_EXCLUDE_DATANODE_TTL_KEY;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY;
+
+import java.util.Deque;
+import java.util.concurrent.ConcurrentLinkedDeque;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheLoader;
+import org.apache.hbase.thirdparty.com.google.common.cache.LoadingCache;
+
+/**
+ * Class for monitor the wal file flush performance.
+ * Each active wal file has a StreamSlowMonitor.
+ */
+@InterfaceAudience.Private
+public class StreamSlowMonitor implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(StreamSlowMonitor.class);
+
+  /**
+   * Configure for the min count for a datanode detected slow.
+   * If a datanode is detected slow times up to this count, then it will be 
added to the exclude
+   * datanode cache by {@link 
ExcludeDatanodeManager#tryAddExcludeDN(DatanodeInfo, String)}
+   * of this regionsever.
+   */
+  private static final String WAL_SLOW_DETECT_MIN_COUNT_KEY =
+"hbase.regionserver.async.wal.min.slow.detect.count";
+  private static final int DEFAULT_WAL_SLOW_DETECT_MIN_COUNT = 3;
+
+  /**
+   * Configure for the TTL of the data that a datanode detected slow.
+   */
+  private static final String WAL_SLOW_DETECT_DATA_TTL_KEY =
+"hbase.regionserver.async.wal.slow.detect.data.ttl.ms";
+  private static final long DEFAULT_WAL_SLOW_DETECT_DATA_TTL = 10 * 60 * 1000; 
// 10min in ms
+
+  /**
+   * Configure for the slow packet process time, a duration from send to ACK.
+   * It is preferred that this value should not be less than 1s.
+   */
+  private static final String DATANODE_SLOW_PACKET_PROCESS_TIME_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.process.time.millis";
+  private static final long DEFAULT_DATANODE_SLOW_PACKET_PROCESS_TIME = 6000; 
//6s in ms
+
+  /**
+   * Configure for the packet flush speed.
+   * e.g. If the configured slow slow packet process time is larger than 1s, 
then here 0.1kbs means
+   * 100B should be processed in less than 1s.
+   * If the configured slow slow packet process time is larger than 10s, then 
here 0.1kbs means
+   * 1KB should be processed in less than 10s.
+   */
+  private static final String DATANODE_SLOW_PACKET_FLUSH_SPEED_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.flush.speed.kbs";
+  private static final double DEFAULT_DATANODE_SLOW_PACKET_FLUSH_SPEED = 0.1;
+
+  private final String name;
+  // this is a map of datanodeInfo->queued slow PacketAckData
+  private final LoadingCache> 
datanodeSlowDataQueue;
+  private final ExcludeDatanodeManager excludeDatanodeManager;
+
+  private int minSlowDetectCount;
+  private long slowDataTtl;
+  private long slowPacketAckMs;
+  private double slowPacketAckSpeed;
+
+  public StreamSlowMonitor(Configuration conf, String name,
+  ExcludeDatanodeManager excludeDatanodeManager) {
+setConf(conf);
+this.name = name;
+this.excludeDatanodeManager = excludeDatanodeManager;
+this.datanodeSlowDataQueue = 

[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-23 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r774865512



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/StreamSlowMonitor.java
##
@@ -0,0 +1,207 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_EXCLUDE_DATANODE_TTL;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_EXCLUDE_DATANODE_TTL_KEY;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY;
+
+import java.util.Deque;
+import java.util.concurrent.ConcurrentLinkedDeque;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheLoader;
+import org.apache.hbase.thirdparty.com.google.common.cache.LoadingCache;
+
+/**
+ * Class for monitor the wal file flush performance.
+ * Each active wal file has a StreamSlowMonitor.
+ */
+@InterfaceAudience.Private
+public class StreamSlowMonitor implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(StreamSlowMonitor.class);
+
+  /**
+   * Configure for the min count for a datanode detected slow.
+   * If a datanode is detected slow times up to this count, then it will be 
added to the exclude
+   * datanode cache by {@link 
ExcludeDatanodeManager#tryAddExcludeDN(DatanodeInfo, String)}
+   * of this regionsever.
+   */
+  private static final String WAL_SLOW_DETECT_MIN_COUNT_KEY =
+"hbase.regionserver.async.wal.min.slow.detect.count";
+  private static final int DEFAULT_WAL_SLOW_DETECT_MIN_COUNT = 3;
+
+  /**
+   * Configure for the TTL of the data that a datanode detected slow.
+   */
+  private static final String WAL_SLOW_DETECT_DATA_TTL_KEY =
+"hbase.regionserver.async.wal.slow.detect.data.ttl.ms";
+  private static final long DEFAULT_WAL_SLOW_DETECT_DATA_TTL = 10 * 60 * 1000; 
// 10min in ms
+
+  /**
+   * Configure for the slow packet process time, a duration from send to ACK.
+   * It is preferred that this value should not be less than 1s.
+   */
+  private static final String DATANODE_SLOW_PACKET_PROCESS_TIME_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.process.time.millis";
+  private static final long DEFAULT_DATANODE_SLOW_PACKET_PROCESS_TIME = 6000; 
//6s in ms
+
+  /**
+   * Configure for the packet flush speed.
+   * e.g. If the configured slow slow packet process time is larger than 1s, 
then here 0.1kbs means
+   * 100B should be processed in less than 1s.
+   * If the configured slow slow packet process time is larger than 10s, then 
here 0.1kbs means
+   * 1KB should be processed in less than 10s.
+   */
+  private static final String DATANODE_SLOW_PACKET_FLUSH_SPEED_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.flush.speed.kbs";
+  private static final double DEFAULT_DATANODE_SLOW_PACKET_FLUSH_SPEED = 0.1;
+
+  private final String name;
+  // this is a map of datanodeInfo->queued slow PacketAckData
+  private final LoadingCache> 
datanodeSlowDataQueue;
+  private final ExcludeDatanodeManager excludeDatanodeManager;
+
+  private int minSlowDetectCount;
+  private long slowDataTtl;
+  private long slowPacketAckMs;
+  private double slowPacketAckSpeed;
+
+  public StreamSlowMonitor(Configuration conf, String name,
+  ExcludeDatanodeManager excludeDatanodeManager) {
+setConf(conf);
+this.name = name;
+this.excludeDatanodeManager = excludeDatanodeManager;
+this.datanodeSlowDataQueue = 

[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-23 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r774450839



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/StreamSlowMonitor.java
##
@@ -0,0 +1,207 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_EXCLUDE_DATANODE_TTL;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_EXCLUDE_DATANODE_TTL_KEY;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY;
+
+import java.util.Deque;
+import java.util.concurrent.ConcurrentLinkedDeque;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheLoader;
+import org.apache.hbase.thirdparty.com.google.common.cache.LoadingCache;
+
+/**
+ * Class for monitor the wal file flush performance.
+ * Each active wal file has a StreamSlowMonitor.
+ */
+@InterfaceAudience.Private
+public class StreamSlowMonitor implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(StreamSlowMonitor.class);
+
+  /**
+   * Configure for the min count for a datanode detected slow.
+   * If a datanode is detected slow times up to this count, then it will be 
added to the exclude
+   * datanode cache by {@link 
ExcludeDatanodeManager#tryAddExcludeDN(DatanodeInfo, String)}
+   * of this regionsever.
+   */
+  private static final String WAL_SLOW_DETECT_MIN_COUNT_KEY =
+"hbase.regionserver.async.wal.min.slow.detect.count";
+  private static final int DEFAULT_WAL_SLOW_DETECT_MIN_COUNT = 3;
+
+  /**
+   * Configure for the TTL of the data that a datanode detected slow.
+   */
+  private static final String WAL_SLOW_DETECT_DATA_TTL_KEY =
+"hbase.regionserver.async.wal.slow.detect.data.ttl.ms";
+  private static final long DEFAULT_WAL_SLOW_DETECT_DATA_TTL = 10 * 60 * 1000; 
// 10min in ms
+
+  /**
+   * Configure for the slow packet process time, a duration from send to ACK.
+   * It is preferred that this value should not be less than 1s.
+   */
+  private static final String DATANODE_SLOW_PACKET_PROCESS_TIME_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.process.time.millis";
+  private static final long DEFAULT_DATANODE_SLOW_PACKET_PROCESS_TIME = 6000; 
//6s in ms
+
+  /**
+   * Configure for the packet flush speed.
+   * e.g. If the configured slow slow packet process time is larger than 1s, 
then here 0.1kbs means
+   * 100B should be processed in less than 1s.
+   * If the configured slow slow packet process time is larger than 10s, then 
here 0.1kbs means
+   * 1KB should be processed in less than 10s.
+   */
+  private static final String DATANODE_SLOW_PACKET_FLUSH_SPEED_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.flush.speed.kbs";
+  private static final double DEFAULT_DATANODE_SLOW_PACKET_FLUSH_SPEED = 0.1;
+
+  private final String name;
+  // this is a map of datanodeInfo->queued slow PacketAckData
+  private final LoadingCache> 
datanodeSlowDataQueue;
+  private final ExcludeDatanodeManager excludeDatanodeManager;
+
+  private int minSlowDetectCount;
+  private long slowDataTtl;
+  private long slowPacketAckMs;
+  private double slowPacketAckSpeed;
+
+  public StreamSlowMonitor(Configuration conf, String name,
+  ExcludeDatanodeManager excludeDatanodeManager) {
+setConf(conf);
+this.name = name;
+this.excludeDatanodeManager = excludeDatanodeManager;
+this.datanodeSlowDataQueue = 

[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-20 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r772230583



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/StreamSlowMonitor.java
##
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_EXCLUDE_DATANODE_TTL;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_EXCLUDE_DATANODE_TTL_KEY;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY;
+
+import java.util.Deque;
+import java.util.concurrent.ConcurrentLinkedDeque;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hbase.thirdparty.com.google.common.cache.Cache;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+
+/**
+ * Class for monitor the wal file flush performance.
+ * Each active wal file has a StreamSlowMonitor.
+ */
+@InterfaceAudience.Private
+public class StreamSlowMonitor implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(StreamSlowMonitor.class);
+
+  /**
+   * Configure for the min count for a datanode detected slow.
+   * If a datanode is detected slow times up to this count, then it will be 
added to the exclude
+   * datanode cache by {@link 
ExcludeDatanodeManager#tryAddExcludeDN(DatanodeInfo, String)}
+   * of this regionsever.
+   */
+  private static final String WAL_SLOW_DETECT_MIN_COUNT_KEY =
+"hbase.regionserver.async.wal.min.slow.detect.count";
+  private static final int DEFAULT_WAL_SLOW_DETECT_MIN_COUNT = 3;
+
+  /**
+   * Configure for the TTL of the data that a datanode detected slow.
+   */
+  private static final String WAL_SLOW_DETECT_DATA_TTL_KEY =
+"hbase.regionserver.async.wal.slow.detect.data.ttl.ms";
+  private static final long DEFAULT_WAL_SLOW_DETECT_DATA_TTL = 10 * 60 * 1000; 
// 10min in ms
+
+  /**
+   * Configure for the slow packet process time, a duration from send to ACK.
+   */
+  private static final String DATANODE_SLOW_PACKET_PROCESS_TIME_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.process.time.millis";
+  private static final long DEFAULT_DATANODE_SLOW_PACKET_PROCESS_TIME = 6000; 
//6s in ms
+
+  /**
+   * Configure for the packet flush speed.
+   */
+  private static final String DATANODE_SLOW_PACKET_FLUSH_SPEED_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.flush.speed.kbs";
+  private static final double DEFAULT_DATANODE_SLOW_PACKET_FLUSH_SPEED = 0.1;
+
+  private final String name;
+  // this is a map of datanodeInfo->queued slow PacketAckData
+  private final Cache> 
datanodeSlowDataQueue;
+  private final ExcludeDatanodeManager excludeDatanodeManager;
+
+  private int minSlowDetectCount;
+  private long slowDataTtl;
+  private long slowPacketAckMillis;
+  private double slowPacketAckSpeed;
+
+  public StreamSlowMonitor(Configuration conf, String name,
+  ExcludeDatanodeManager excludeDatanodeManager) {
+setConf(conf);
+this.name = name;
+this.excludeDatanodeManager = excludeDatanodeManager;
+this.datanodeSlowDataQueue = CacheBuilder.newBuilder()
+  .initialCapacity(conf.getInt(WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY,
+DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT))
+  .expireAfterWrite(conf.getLong(WAL_EXCLUDE_DATANODE_TTL_KEY,
+DEFAULT_WAL_EXCLUDE_DATANODE_TTL), TimeUnit.HOURS)
+  .build();
+LOG.info("New stream slow monitor {}", this.name);
+  }
+
+  public static StreamSlowMonitor create(Configuration conf, String name) {
+return new 

[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-20 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r772227119



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/StreamSlowMonitor.java
##
@@ -0,0 +1,207 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_EXCLUDE_DATANODE_TTL;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_EXCLUDE_DATANODE_TTL_KEY;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY;
+
+import java.util.Deque;
+import java.util.concurrent.ConcurrentLinkedDeque;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheLoader;
+import org.apache.hbase.thirdparty.com.google.common.cache.LoadingCache;
+
+/**
+ * Class for monitor the wal file flush performance.
+ * Each active wal file has a StreamSlowMonitor.
+ */
+@InterfaceAudience.Private
+public class StreamSlowMonitor implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(StreamSlowMonitor.class);
+
+  /**
+   * Configure for the min count for a datanode detected slow.
+   * If a datanode is detected slow times up to this count, then it will be 
added to the exclude
+   * datanode cache by {@link 
ExcludeDatanodeManager#tryAddExcludeDN(DatanodeInfo, String)}
+   * of this regionsever.
+   */
+  private static final String WAL_SLOW_DETECT_MIN_COUNT_KEY =
+"hbase.regionserver.async.wal.min.slow.detect.count";
+  private static final int DEFAULT_WAL_SLOW_DETECT_MIN_COUNT = 3;
+
+  /**
+   * Configure for the TTL of the data that a datanode detected slow.
+   */
+  private static final String WAL_SLOW_DETECT_DATA_TTL_KEY =
+"hbase.regionserver.async.wal.slow.detect.data.ttl.ms";
+  private static final long DEFAULT_WAL_SLOW_DETECT_DATA_TTL = 10 * 60 * 1000; 
// 10min in ms
+
+  /**
+   * Configure for the slow packet process time, a duration from send to ACK.
+   * It is preferred that this value should not be less than 1s.
+   */
+  private static final String DATANODE_SLOW_PACKET_PROCESS_TIME_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.process.time.millis";
+  private static final long DEFAULT_DATANODE_SLOW_PACKET_PROCESS_TIME = 6000; 
//6s in ms
+
+  /**
+   * Configure for the packet flush speed.
+   * e.g. If the configured slow slow packet process time is larger than 1s, 
then here 0.1kbs means
+   * 100B should be processed in less than 1s.
+   * If the configured slow slow packet process time is larger than 10s, then 
here 0.1kbs means
+   * 1KB should be processed in less than 10s.
+   */
+  private static final String DATANODE_SLOW_PACKET_FLUSH_SPEED_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.flush.speed.kbs";
+  private static final double DEFAULT_DATANODE_SLOW_PACKET_FLUSH_SPEED = 0.1;
+
+  private final String name;
+  // this is a map of datanodeInfo->queued slow PacketAckData
+  private final LoadingCache> 
datanodeSlowDataQueue;
+  private final ExcludeDatanodeManager excludeDatanodeManager;
+
+  private int minSlowDetectCount;
+  private long slowDataTtl;
+  private long slowPacketAckMs;
+  private double slowPacketAckSpeed;
+
+  public StreamSlowMonitor(Configuration conf, String name,
+  ExcludeDatanodeManager excludeDatanodeManager) {
+setConf(conf);
+this.name = name;
+this.excludeDatanodeManager = excludeDatanodeManager;
+this.datanodeSlowDataQueue = 

[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-14 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r768690112



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/StreamSlowMonitor.java
##
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_EXCLUDE_DATANODE_TTL;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_EXCLUDE_DATANODE_TTL_KEY;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY;
+
+import java.util.Deque;
+import java.util.concurrent.ConcurrentLinkedDeque;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hbase.thirdparty.com.google.common.cache.Cache;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+
+/**
+ * Class for monitor the wal file flush performance.
+ * Each active wal file has a StreamSlowMonitor.
+ */
+@InterfaceAudience.Private
+public class StreamSlowMonitor implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(StreamSlowMonitor.class);
+
+  /**
+   * Configure for the min count for a datanode detected slow.
+   * If a datanode is detected slow times up to this count, then it will be 
added to the exclude
+   * datanode cache by {@link 
ExcludeDatanodeManager#tryAddExcludeDN(DatanodeInfo, String)}
+   * of this regionsever.
+   */
+  private static final String WAL_SLOW_DETECT_MIN_COUNT_KEY =
+"hbase.regionserver.async.wal.min.slow.detect.count";
+  private static final int DEFAULT_WAL_SLOW_DETECT_MIN_COUNT = 3;
+
+  /**
+   * Configure for the TTL of the data that a datanode detected slow.
+   */
+  private static final String WAL_SLOW_DETECT_DATA_TTL_KEY =
+"hbase.regionserver.async.wal.slow.detect.data.ttl.ms";
+  private static final long DEFAULT_WAL_SLOW_DETECT_DATA_TTL = 10 * 60 * 1000; 
// 10min in ms
+
+  /**
+   * Configure for the slow packet process time, a duration from send to ACK.
+   */
+  private static final String DATANODE_SLOW_PACKET_PROCESS_TIME_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.process.time.millis";
+  private static final long DEFAULT_DATANODE_SLOW_PACKET_PROCESS_TIME = 6000; 
//6s in ms
+
+  /**
+   * Configure for the packet flush speed.
+   */
+  private static final String DATANODE_SLOW_PACKET_FLUSH_SPEED_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.flush.speed.kbs";
+  private static final double DEFAULT_DATANODE_SLOW_PACKET_FLUSH_SPEED = 0.1;
+
+  private final String name;
+  // this is a map of datanodeInfo->queued slow PacketAckData
+  private final Cache> 
datanodeSlowDataQueue;
+  private final ExcludeDatanodeManager excludeDatanodeManager;
+
+  private int minSlowDetectCount;
+  private long slowDataTtl;
+  private long slowPacketAckMillis;
+  private double slowPacketAckSpeed;
+
+  public StreamSlowMonitor(Configuration conf, String name,
+  ExcludeDatanodeManager excludeDatanodeManager) {
+setConf(conf);
+this.name = name;
+this.excludeDatanodeManager = excludeDatanodeManager;
+this.datanodeSlowDataQueue = CacheBuilder.newBuilder()
+  .initialCapacity(conf.getInt(WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY,
+DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT))
+  .expireAfterWrite(conf.getLong(WAL_EXCLUDE_DATANODE_TTL_KEY,
+DEFAULT_WAL_EXCLUDE_DATANODE_TTL), TimeUnit.HOURS)
+  .build();
+LOG.info("New stream slow monitor {}", this.name);
+  }
+
+  public static StreamSlowMonitor create(Configuration conf, String name) {
+return new 

[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-14 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r768689849



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/StreamSlowMonitor.java
##
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_EXCLUDE_DATANODE_TTL;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_EXCLUDE_DATANODE_TTL_KEY;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY;
+
+import java.util.Deque;
+import java.util.concurrent.ConcurrentLinkedDeque;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hbase.thirdparty.com.google.common.cache.Cache;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+
+/**
+ * Class for monitor the wal file flush performance.
+ * Each active wal file has a StreamSlowMonitor.
+ */
+@InterfaceAudience.Private
+public class StreamSlowMonitor implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(StreamSlowMonitor.class);
+
+  /**
+   * Configure for the min count for a datanode detected slow.
+   * If a datanode is detected slow times up to this count, then it will be 
added to the exclude
+   * datanode cache by {@link 
ExcludeDatanodeManager#tryAddExcludeDN(DatanodeInfo, String)}
+   * of this regionsever.
+   */
+  private static final String WAL_SLOW_DETECT_MIN_COUNT_KEY =
+"hbase.regionserver.async.wal.min.slow.detect.count";
+  private static final int DEFAULT_WAL_SLOW_DETECT_MIN_COUNT = 3;
+
+  /**
+   * Configure for the TTL of the data that a datanode detected slow.
+   */
+  private static final String WAL_SLOW_DETECT_DATA_TTL_KEY =
+"hbase.regionserver.async.wal.slow.detect.data.ttl.ms";
+  private static final long DEFAULT_WAL_SLOW_DETECT_DATA_TTL = 10 * 60 * 1000; 
// 10min in ms
+
+  /**
+   * Configure for the slow packet process time, a duration from send to ACK.
+   */
+  private static final String DATANODE_SLOW_PACKET_PROCESS_TIME_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.process.time.millis";
+  private static final long DEFAULT_DATANODE_SLOW_PACKET_PROCESS_TIME = 6000; 
//6s in ms
+
+  /**
+   * Configure for the packet flush speed.
+   */
+  private static final String DATANODE_SLOW_PACKET_FLUSH_SPEED_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.flush.speed.kbs";
+  private static final double DEFAULT_DATANODE_SLOW_PACKET_FLUSH_SPEED = 0.1;
+
+  private final String name;
+  // this is a map of datanodeInfo->queued slow PacketAckData
+  private final Cache> 
datanodeSlowDataQueue;
+  private final ExcludeDatanodeManager excludeDatanodeManager;
+
+  private int minSlowDetectCount;
+  private long slowDataTtl;
+  private long slowPacketAckMillis;
+  private double slowPacketAckSpeed;
+
+  public StreamSlowMonitor(Configuration conf, String name,
+  ExcludeDatanodeManager excludeDatanodeManager) {
+setConf(conf);
+this.name = name;
+this.excludeDatanodeManager = excludeDatanodeManager;
+this.datanodeSlowDataQueue = CacheBuilder.newBuilder()
+  .initialCapacity(conf.getInt(WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY,
+DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT))
+  .expireAfterWrite(conf.getLong(WAL_EXCLUDE_DATANODE_TTL_KEY,
+DEFAULT_WAL_EXCLUDE_DATANODE_TTL), TimeUnit.HOURS)
+  .build();
+LOG.info("New stream slow monitor {}", this.name);
+  }
+
+  public static StreamSlowMonitor create(Configuration conf, String name) {
+return new 

[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-14 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r768688266



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/StreamSlowMonitor.java
##
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_EXCLUDE_DATANODE_TTL;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_EXCLUDE_DATANODE_TTL_KEY;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY;
+
+import java.util.Deque;
+import java.util.concurrent.ConcurrentLinkedDeque;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hbase.thirdparty.com.google.common.cache.Cache;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+
+/**
+ * Class for monitor the wal file flush performance.
+ * Each active wal file has a StreamSlowMonitor.
+ */
+@InterfaceAudience.Private
+public class StreamSlowMonitor implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(StreamSlowMonitor.class);
+
+  /**
+   * Configure for the min count for a datanode detected slow.
+   * If a datanode is detected slow times up to this count, then it will be 
added to the exclude
+   * datanode cache by {@link 
ExcludeDatanodeManager#tryAddExcludeDN(DatanodeInfo, String)}
+   * of this regionsever.
+   */
+  private static final String WAL_SLOW_DETECT_MIN_COUNT_KEY =
+"hbase.regionserver.async.wal.min.slow.detect.count";
+  private static final int DEFAULT_WAL_SLOW_DETECT_MIN_COUNT = 3;
+
+  /**
+   * Configure for the TTL of the data that a datanode detected slow.
+   */
+  private static final String WAL_SLOW_DETECT_DATA_TTL_KEY =
+"hbase.regionserver.async.wal.slow.detect.data.ttl.ms";
+  private static final long DEFAULT_WAL_SLOW_DETECT_DATA_TTL = 10 * 60 * 1000; 
// 10min in ms
+
+  /**
+   * Configure for the slow packet process time, a duration from send to ACK.
+   */
+  private static final String DATANODE_SLOW_PACKET_PROCESS_TIME_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.process.time.millis";
+  private static final long DEFAULT_DATANODE_SLOW_PACKET_PROCESS_TIME = 6000; 
//6s in ms
+
+  /**
+   * Configure for the packet flush speed.
+   */
+  private static final String DATANODE_SLOW_PACKET_FLUSH_SPEED_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.flush.speed.kbs";
+  private static final double DEFAULT_DATANODE_SLOW_PACKET_FLUSH_SPEED = 0.1;
+
+  private final String name;
+  // this is a map of datanodeInfo->queued slow PacketAckData
+  private final Cache> 
datanodeSlowDataQueue;
+  private final ExcludeDatanodeManager excludeDatanodeManager;
+
+  private int minSlowDetectCount;
+  private long slowDataTtl;
+  private long slowPacketAckMillis;
+  private double slowPacketAckSpeed;
+
+  public StreamSlowMonitor(Configuration conf, String name,
+  ExcludeDatanodeManager excludeDatanodeManager) {
+setConf(conf);
+this.name = name;
+this.excludeDatanodeManager = excludeDatanodeManager;
+this.datanodeSlowDataQueue = CacheBuilder.newBuilder()
+  .initialCapacity(conf.getInt(WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY,
+DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT))
+  .expireAfterWrite(conf.getLong(WAL_EXCLUDE_DATANODE_TTL_KEY,
+DEFAULT_WAL_EXCLUDE_DATANODE_TTL), TimeUnit.HOURS)
+  .build();
+LOG.info("New stream slow monitor {}", this.name);
+  }
+
+  public static StreamSlowMonitor create(Configuration conf, String name) {
+return new 

[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-14 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r768688266



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/StreamSlowMonitor.java
##
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_EXCLUDE_DATANODE_TTL;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_EXCLUDE_DATANODE_TTL_KEY;
+import static 
org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager.WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY;
+
+import java.util.Deque;
+import java.util.concurrent.ConcurrentLinkedDeque;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hbase.thirdparty.com.google.common.cache.Cache;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+
+/**
+ * Class for monitor the wal file flush performance.
+ * Each active wal file has a StreamSlowMonitor.
+ */
+@InterfaceAudience.Private
+public class StreamSlowMonitor implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(StreamSlowMonitor.class);
+
+  /**
+   * Configure for the min count for a datanode detected slow.
+   * If a datanode is detected slow times up to this count, then it will be 
added to the exclude
+   * datanode cache by {@link 
ExcludeDatanodeManager#tryAddExcludeDN(DatanodeInfo, String)}
+   * of this regionsever.
+   */
+  private static final String WAL_SLOW_DETECT_MIN_COUNT_KEY =
+"hbase.regionserver.async.wal.min.slow.detect.count";
+  private static final int DEFAULT_WAL_SLOW_DETECT_MIN_COUNT = 3;
+
+  /**
+   * Configure for the TTL of the data that a datanode detected slow.
+   */
+  private static final String WAL_SLOW_DETECT_DATA_TTL_KEY =
+"hbase.regionserver.async.wal.slow.detect.data.ttl.ms";
+  private static final long DEFAULT_WAL_SLOW_DETECT_DATA_TTL = 10 * 60 * 1000; 
// 10min in ms
+
+  /**
+   * Configure for the slow packet process time, a duration from send to ACK.
+   */
+  private static final String DATANODE_SLOW_PACKET_PROCESS_TIME_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.process.time.millis";
+  private static final long DEFAULT_DATANODE_SLOW_PACKET_PROCESS_TIME = 6000; 
//6s in ms
+
+  /**
+   * Configure for the packet flush speed.
+   */
+  private static final String DATANODE_SLOW_PACKET_FLUSH_SPEED_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.flush.speed.kbs";
+  private static final double DEFAULT_DATANODE_SLOW_PACKET_FLUSH_SPEED = 0.1;
+
+  private final String name;
+  // this is a map of datanodeInfo->queued slow PacketAckData
+  private final Cache> 
datanodeSlowDataQueue;
+  private final ExcludeDatanodeManager excludeDatanodeManager;
+
+  private int minSlowDetectCount;
+  private long slowDataTtl;
+  private long slowPacketAckMillis;
+  private double slowPacketAckSpeed;
+
+  public StreamSlowMonitor(Configuration conf, String name,
+  ExcludeDatanodeManager excludeDatanodeManager) {
+setConf(conf);
+this.name = name;
+this.excludeDatanodeManager = excludeDatanodeManager;
+this.datanodeSlowDataQueue = CacheBuilder.newBuilder()
+  .initialCapacity(conf.getInt(WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY,
+DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT))
+  .expireAfterWrite(conf.getLong(WAL_EXCLUDE_DATANODE_TTL_KEY,
+DEFAULT_WAL_EXCLUDE_DATANODE_TTL), TimeUnit.HOURS)
+  .build();
+LOG.info("New stream slow monitor {}", this.name);
+  }
+
+  public static StreamSlowMonitor create(Configuration conf, String name) {
+return new 

[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-14 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r768679101



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/ExcludeDatanodeManager.java
##
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import java.util.Collections;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hbase.thirdparty.com.google.common.cache.Cache;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+
+/**
+ * The class to manage the excluded datanodes of the WALs on the regionserver.
+ */
+@InterfaceAudience.Private
+public class ExcludeDatanodeManager implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(ExcludeDatanodeManager.class);
+
+  /**
+   * Configure for the max count the excluded datanodes.
+   */
+  public static final String WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY =
+"hbase.regionserver.async.wal.max.exclude.datanode.count";
+  public static final int DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT = 3;
+
+  /**
+   * Configure for the TTL time of the datanodes excluded
+   */
+  public static final String WAL_EXCLUDE_DATANODE_TTL_KEY =
+"hbase.regionserver.async.wal.exclude.datanode.info.ttl.hour";
+  public static final int DEFAULT_WAL_EXCLUDE_DATANODE_TTL = 6; // 6 hours
+
+  private volatile Cache excludeDNsCache;
+  private final int maxExcludeDNCount;
+  private final Configuration conf;
+  // This is a map of providerId->StreamSlowMonitor
+  private final Map streamSlowMonitors =
+new ConcurrentHashMap<>(1);
+
+  public ExcludeDatanodeManager(Configuration conf) {
+this.conf = conf;
+this.maxExcludeDNCount = 
conf.getInt(WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY,
+  DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT);
+this.excludeDNsCache = CacheBuilder.newBuilder()
+  .expireAfterWrite(this.conf.getLong(WAL_EXCLUDE_DATANODE_TTL_KEY,
+DEFAULT_WAL_EXCLUDE_DATANODE_TTL), TimeUnit.HOURS)
+  .maximumSize(this.maxExcludeDNCount)
+  .build();
+  }
+
+  /**
+   * Try to add a datanode to the regionserver excluding cache
+   * @param datanodeInfo the datanode to be added to the excluded cache
+   * @param cause the cause that the datanode is hope to be excluded
+   * @return True if the datanode is added to the regionserver excluding 
cache, false otherwise
+   */
+  public boolean tryAddExcludeDN(DatanodeInfo datanodeInfo, String cause) {
+boolean alreadyMarkedSlow = getExcludeDNs().containsKey(datanodeInfo);
+if (excludeDNsCache.size() >= maxExcludeDNCount) {

Review comment:
   Got it, thanks.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@hbase.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-07 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r764543482



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/ExcludeDatanodeManager.java
##
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import java.util.Collections;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hbase.thirdparty.com.google.common.cache.Cache;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+
+/**
+ * The class to manage the excluded datanodes of the WALs on the regionserver.
+ */
+@InterfaceAudience.Private
+public class ExcludeDatanodeManager implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(ExcludeDatanodeManager.class);
+
+  /**
+   * Configure for the max count the excluded datanodes.
+   */
+  private static final String WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY =
+"hbase.regionserver.async.wal.max.exclude.datanode.count";
+  private static final int DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT = 3;
+
+  /**
+   * Configure for the TTL time of the datanodes excluded
+   */
+  private static final String WAL_EXCLUDE_DATANODE_TTL_KEY =
+"hbase.regionserver.async.wal.exclude.datanode.info.ttl.hour";
+  private static final int DEFAULT_WAL_EXCLUDE_DATANODE_TTL = 6; // 6 hours
+
+  private volatile Cache excludeDNsCache;
+  private final int maxExcludeDNCount;
+  private final Configuration conf;
+  // This is a map of providerId->StreamSlowMonitor
+  private final Map streamSlowMonitors =
+new ConcurrentHashMap<>(1);
+
+  public ExcludeDatanodeManager(Configuration conf) {
+this.conf = conf;
+this.maxExcludeDNCount = 
conf.getInt(WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY,
+  DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT);
+this.excludeDNsCache = CacheBuilder.newBuilder()
+  .expireAfterWrite(this.conf.getLong(WAL_EXCLUDE_DATANODE_TTL_KEY,
+DEFAULT_WAL_EXCLUDE_DATANODE_TTL),
+TimeUnit.HOURS)
+  .maximumSize(this.maxExcludeDNCount)
+  .concurrencyLevel(10)

Review comment:
   Here we can use the default concurrencyLevel, which is 4.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@hbase.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-07 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r764543345



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/ExcludeDatanodeManager.java
##
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import java.util.Collections;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hbase.thirdparty.com.google.common.cache.Cache;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+
+/**
+ * The class to manage the excluded datanodes of the WALs on the regionserver.
+ */
+@InterfaceAudience.Private
+public class ExcludeDatanodeManager implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(ExcludeDatanodeManager.class);
+
+  /**
+   * Configure for the max count the excluded datanodes.
+   */
+  private static final String WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY =
+"hbase.regionserver.async.wal.max.exclude.datanode.count";
+  private static final int DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT = 3;
+
+  /**
+   * Configure for the TTL time of the datanodes excluded
+   */
+  private static final String WAL_EXCLUDE_DATANODE_TTL_KEY =
+"hbase.regionserver.async.wal.exclude.datanode.info.ttl.hour";
+  private static final int DEFAULT_WAL_EXCLUDE_DATANODE_TTL = 6; // 6 hours
+
+  private volatile Cache excludeDNsCache;
+  private final int maxExcludeDNCount;
+  private final Configuration conf;
+  // This is a map of providerId->StreamSlowMonitor
+  private final Map streamSlowMonitors =
+new ConcurrentHashMap<>(1);
+
+  public ExcludeDatanodeManager(Configuration conf) {
+this.conf = conf;
+this.maxExcludeDNCount = 
conf.getInt(WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY,
+  DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT);
+this.excludeDNsCache = CacheBuilder.newBuilder()
+  .expireAfterWrite(this.conf.getLong(WAL_EXCLUDE_DATANODE_TTL_KEY,
+DEFAULT_WAL_EXCLUDE_DATANODE_TTL),
+TimeUnit.HOURS)
+  .maximumSize(this.maxExcludeDNCount)
+  .concurrencyLevel(10)
+  .build();
+  }
+
+  /**
+   * Try to add a datanode to the regionserver excluding cache
+   * @param datanodeInfo the datanode to be added to the excluded cache
+   * @param cause the cause that the datanode is hope to be excluded
+   * @return True if the datanode is added to the regionserver excluding 
cache, false otherwise
+   */
+  public boolean tryAddExcludeDN(DatanodeInfo datanodeInfo, String cause) {
+boolean alreadyMarkedSlow = getExcludeDNs().containsKey(datanodeInfo);
+if (excludeDNsCache.size() < maxExcludeDNCount) {
+  if (!alreadyMarkedSlow) {
+excludeDNsCache.put(datanodeInfo, System.currentTimeMillis());

Review comment:
   Hi, @Apache9 , do you mean that in the put method of guava cache, 
concurrently put operation of segments may cause the cache size be larger than 
the configured maximumSize? And in each segment, there is lock for put 
operation, so that the table value set and the eviction method are all in the 
lock. 
   But for cache here has maximumSize=3(no matter what the concurrencyLevel 
is), the segment count is always 1, so I think that concurrently put will be 
serially executed internally through the lock.
   This is the source code of LocalCache set segment count,
   `int segmentShift = 0;
   int segmentCount = 1;
   while (segmentCount < concurrencyLevel && (!evictsBySize() || 
segmentCount * 20 <= maxWeight)) {
 ++segmentShift;
 segmentCount <<= 1;
   }`
   Here the evictsBySize() is always true, because 
maxWeight=maximumSize=3(default config)>=0, and segmentCount * 20 <= maxWeight 
is false in most circumstances, because we will not let the exclude cache 
contains more than 

[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-06 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r762831475



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/ExcludeDatanodeManager.java
##
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import java.util.Collections;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hbase.thirdparty.com.google.common.cache.Cache;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+
+/**
+ * The class to manage the excluded datanodes of the WALs on the regionserver.
+ */
+@InterfaceAudience.Private
+public class ExcludeDatanodeManager implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(ExcludeDatanodeManager.class);
+
+  /**
+   * Configure for the max count the excluded datanodes.
+   */
+  private static final String WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY =
+"hbase.regionserver.async.wal.max.exclude.datanode.count";
+  private static final int DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT = 3;
+
+  /**
+   * Configure for the TTL time of the datanodes excluded
+   */
+  private static final String WAL_EXCLUDE_DATANODE_TTL_KEY =
+"hbase.regionserver.async.wal.exclude.datanode.info.ttl.hour";
+  private static final int DEFAULT_WAL_EXCLUDE_DATANODE_TTL = 6; // 6 hours
+
+  private volatile Cache excludeDNsCache;
+  private final int maxExcludeDNCount;
+  private final Configuration conf;
+  // This is a map of providerId->StreamSlowMonitor
+  private final Map streamSlowMonitors =
+new ConcurrentHashMap<>(1);
+
+  public ExcludeDatanodeManager(Configuration conf) {
+this.conf = conf;
+this.maxExcludeDNCount = 
conf.getInt(WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY,
+  DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT);
+this.excludeDNsCache = CacheBuilder.newBuilder()
+  .expireAfterWrite(this.conf.getLong(WAL_EXCLUDE_DATANODE_TTL_KEY,
+DEFAULT_WAL_EXCLUDE_DATANODE_TTL),
+TimeUnit.HOURS)
+  .maximumSize(this.maxExcludeDNCount)
+  .concurrencyLevel(10)
+  .build();
+  }
+
+  /**
+   * Try to add a datanode to the regionserver excluding cache
+   * @param datanodeInfo the datanode to be added to the excluded cache
+   * @param cause the cause that the datanode is hope to be excluded
+   * @return True if the datanode is added to the regionserver excluding 
cache, false otherwise
+   */
+  public boolean tryAddExcludeDN(DatanodeInfo datanodeInfo, String cause) {
+boolean alreadyMarkedSlow = getExcludeDNs().containsKey(datanodeInfo);
+if (excludeDNsCache.size() < maxExcludeDNCount) {
+  if (!alreadyMarkedSlow) {
+excludeDNsCache.put(datanodeInfo, System.currentTimeMillis());

Review comment:
   The entry count of this cache will not be larger than maxExcludeDNCount, 
since in the put() method of the cache, every segment will evict the newest 
entry if the overall count larger than the maximumSize, right? Even several 
threads concurrently put entries, the granularity of concurrency is the count 
of the segments inside the cache.
   
   

##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/ExcludeDatanodeManager.java
##
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by 

[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-06 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r763706712



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/ExcludeDatanodeManager.java
##
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import java.util.Collections;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hbase.thirdparty.com.google.common.cache.Cache;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+
+/**
+ * The class to manage the excluded datanodes of the WALs on the regionserver.
+ */
+@InterfaceAudience.Private
+public class ExcludeDatanodeManager implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(ExcludeDatanodeManager.class);
+
+  /**
+   * Configure for the max count the excluded datanodes.
+   */
+  private static final String WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY =
+"hbase.regionserver.async.wal.max.exclude.datanode.count";
+  private static final int DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT = 3;
+
+  /**
+   * Configure for the TTL time of the datanodes excluded
+   */
+  private static final String WAL_EXCLUDE_DATANODE_TTL_KEY =
+"hbase.regionserver.async.wal.exclude.datanode.info.ttl.hour";
+  private static final int DEFAULT_WAL_EXCLUDE_DATANODE_TTL = 6; // 6 hours
+
+  private volatile Cache excludeDNsCache;

Review comment:
   There is assignment for this variable in the onConfigurationChange(), 
set it be volatile can make it visible to all the read threads. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@hbase.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-06 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r762812910



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/ExcludeDatanodeManager.java
##
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import java.util.Collections;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hbase.thirdparty.com.google.common.cache.Cache;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+
+/**
+ * The class to manage the excluded datanodes of the WALs on the regionserver.
+ */
+@InterfaceAudience.Private
+public class ExcludeDatanodeManager implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(ExcludeDatanodeManager.class);
+
+  /**
+   * Configure for the max count the excluded datanodes.
+   */
+  private static final String WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY =
+"hbase.regionserver.async.wal.max.exclude.datanode.count";
+  private static final int DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT = 3;
+
+  /**
+   * Configure for the TTL time of the datanodes excluded
+   */
+  private static final String WAL_EXCLUDE_DATANODE_TTL_KEY =
+"hbase.regionserver.async.wal.exclude.datanode.info.ttl.hour";
+  private static final int DEFAULT_WAL_EXCLUDE_DATANODE_TTL = 6; // 6 hours
+
+  private volatile Cache excludeDNsCache;

Review comment:
   This cache can be concurrently read and written by WALs of different 
group, so I make it be volatile to let the change visible to all threads.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@hbase.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-06 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r762840195



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/StreamSlowMonitor.java
##
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import java.util.Deque;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentLinkedDeque;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Class for monitor the wal file flush performance.
+ * Each active wal file has a StreamSlowMonitor.
+ */
+@InterfaceAudience.Private
+public class StreamSlowMonitor implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(StreamSlowMonitor.class);
+
+  /**
+   * Configure for the min count for a datanode detected slow.
+   * If a datanode is detected slow times up to this count, then it will be 
added to the exclude
+   * datanode cache by {@link 
ExcludeDatanodeManager#tryAddExcludeDN(DatanodeInfo, String)}
+   * of this regionsever.
+   */
+  private static final String WAL_SLOW_DETECT_MIN_COUNT_KEY =
+"hbase.regionserver.async.wal.min.slow.detect.count";
+  private static final int DEFAULT_WAL_SLOW_DETECT_MIN_COUNT = 3;
+
+  /**
+   * Configure for the TTL of the data that a datanode detected slow.
+   */
+  private static final String WAL_SLOW_DETECT_DATA_TTL_KEY =
+"hbase.regionserver.async.wal.slow.detect.data.ttl.ms";
+  private static final long DEFAULT_WAL_SLOW_DETECT_DATA_TTL = 10 * 60 * 1000; 
// 10min in ms
+
+  /**
+   * Configure for the slow packet process time, a duration from send to ACK.
+   */
+  private static final String DATANODE_SLOW_PACKET_PROCESS_TIME_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.process.time.millis";
+  private static final long DEFAULT_DATANODE_SLOW_PACKET_PROCESS_TIME = 6000; 
//6s in ms
+
+  /**
+   * Configure for the packet flush speed.
+   */
+  private static final String DATANODE_SLOW_PACKET_FLUSH_SPEED_KEY =
+"hbase.regionserver.async.wal.datanode.slow.packet.flush.speed.kbs";
+  private static final double DEFAULT_DATANODE_SLOW_PACKET_FLUSH_SPEED = 0.1;
+
+  private final String name;
+  // this is a map of datanodeInfo->queued slow PacketAckData
+  private final Map> datanodeSlowDataQueue =
+new ConcurrentHashMap<>();
+  private final ExcludeDatanodeManager excludeDatanodeManager;
+
+  private int minSlowDetectCount;
+  private long slowDataTtl;
+  private long slowPacketAckMillis;
+  private double slowPacketAckSpeed;
+
+  public StreamSlowMonitor(Configuration conf, String name,
+  ExcludeDatanodeManager excludeDatanodeManager) {
+setConf(conf);
+this.name = name;
+this.excludeDatanodeManager = excludeDatanodeManager;
+LOG.info("New stream slow monitor {}", this.name);
+  }
+
+  public static StreamSlowMonitor create(Configuration conf, String name) {
+return new StreamSlowMonitor(conf, name, new ExcludeDatanodeManager(conf));
+  }
+
+  /**
+   * Check if the packet process time shows that the relevant datanode is a 
slow node.
+   * @param datanodeInfo the datanode that processed the packet
+   * @param packetDataLen the data length of the packet
+   * @param processTime the process time of the packet on the datanode
+   * @param lastAckTimestamp the last acked timestamp of the packet on another 
datanode
+   * @param unfinished if the packet is unfinished flushed to the datanode 
replicas
+   */
+  public void checkProcessTimeAndSpeed(DatanodeInfo datanodeInfo, long 
packetDataLen,
+  long processTime, long lastAckTimestamp, int unfinished) {
+long current = EnvironmentEdgeManager.currentTime();
+boolean slow = processTime > slowPacketAckMillis ||
+(packetDataLen > 100 && (double) packetDataLen / processTime < 

[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-06 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r762831475



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/ExcludeDatanodeManager.java
##
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import java.util.Collections;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hbase.thirdparty.com.google.common.cache.Cache;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+
+/**
+ * The class to manage the excluded datanodes of the WALs on the regionserver.
+ */
+@InterfaceAudience.Private
+public class ExcludeDatanodeManager implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(ExcludeDatanodeManager.class);
+
+  /**
+   * Configure for the max count the excluded datanodes.
+   */
+  private static final String WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY =
+"hbase.regionserver.async.wal.max.exclude.datanode.count";
+  private static final int DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT = 3;
+
+  /**
+   * Configure for the TTL time of the datanodes excluded
+   */
+  private static final String WAL_EXCLUDE_DATANODE_TTL_KEY =
+"hbase.regionserver.async.wal.exclude.datanode.info.ttl.hour";
+  private static final int DEFAULT_WAL_EXCLUDE_DATANODE_TTL = 6; // 6 hours
+
+  private volatile Cache excludeDNsCache;
+  private final int maxExcludeDNCount;
+  private final Configuration conf;
+  // This is a map of providerId->StreamSlowMonitor
+  private final Map streamSlowMonitors =
+new ConcurrentHashMap<>(1);
+
+  public ExcludeDatanodeManager(Configuration conf) {
+this.conf = conf;
+this.maxExcludeDNCount = 
conf.getInt(WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY,
+  DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT);
+this.excludeDNsCache = CacheBuilder.newBuilder()
+  .expireAfterWrite(this.conf.getLong(WAL_EXCLUDE_DATANODE_TTL_KEY,
+DEFAULT_WAL_EXCLUDE_DATANODE_TTL),
+TimeUnit.HOURS)
+  .maximumSize(this.maxExcludeDNCount)
+  .concurrencyLevel(10)
+  .build();
+  }
+
+  /**
+   * Try to add a datanode to the regionserver excluding cache
+   * @param datanodeInfo the datanode to be added to the excluded cache
+   * @param cause the cause that the datanode is hope to be excluded
+   * @return True if the datanode is added to the regionserver excluding 
cache, false otherwise
+   */
+  public boolean tryAddExcludeDN(DatanodeInfo datanodeInfo, String cause) {
+boolean alreadyMarkedSlow = getExcludeDNs().containsKey(datanodeInfo);
+if (excludeDNsCache.size() < maxExcludeDNCount) {
+  if (!alreadyMarkedSlow) {
+excludeDNsCache.put(datanodeInfo, System.currentTimeMillis());

Review comment:
   The entry count of this cache will not be larger than maxExcludeDNCount, 
since in the put() method of the cache, every segment will evict the newest 
entry if the overall count larger than the maximumSize, right? Even several 
threads concurrently put entries, the granularity of concurrency is the count 
of the segments inside the cache.
   
   




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@hbase.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-06 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r762815636



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/ExcludeDatanodeManager.java
##
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import java.util.Collections;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hbase.thirdparty.com.google.common.cache.Cache;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+
+/**
+ * The class to manage the excluded datanodes of the WALs on the regionserver.
+ */
+@InterfaceAudience.Private
+public class ExcludeDatanodeManager implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(ExcludeDatanodeManager.class);
+
+  /**
+   * Configure for the max count the excluded datanodes.
+   */
+  private static final String WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY =
+"hbase.regionserver.async.wal.max.exclude.datanode.count";
+  private static final int DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT = 3;
+
+  /**
+   * Configure for the TTL time of the datanodes excluded
+   */
+  private static final String WAL_EXCLUDE_DATANODE_TTL_KEY =
+"hbase.regionserver.async.wal.exclude.datanode.info.ttl.hour";
+  private static final int DEFAULT_WAL_EXCLUDE_DATANODE_TTL = 6; // 6 hours
+
+  private volatile Cache excludeDNsCache;
+  private final int maxExcludeDNCount;
+  private final Configuration conf;
+  // This is a map of providerId->StreamSlowMonitor
+  private final Map streamSlowMonitors =
+new ConcurrentHashMap<>(1);
+
+  public ExcludeDatanodeManager(Configuration conf) {
+this.conf = conf;
+this.maxExcludeDNCount = 
conf.getInt(WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY,
+  DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT);
+this.excludeDNsCache = CacheBuilder.newBuilder()
+  .expireAfterWrite(this.conf.getLong(WAL_EXCLUDE_DATANODE_TTL_KEY,
+DEFAULT_WAL_EXCLUDE_DATANODE_TTL),
+TimeUnit.HOURS)
+  .maximumSize(this.maxExcludeDNCount)
+  .concurrencyLevel(10)

Review comment:
   This is an experience value for the group count of WALs, not a suitable 
value here, I can remove this attribute setting and use the default 
concurrencyLevel(4).




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@hbase.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-06 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r762815636



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/ExcludeDatanodeManager.java
##
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import java.util.Collections;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hbase.thirdparty.com.google.common.cache.Cache;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+
+/**
+ * The class to manage the excluded datanodes of the WALs on the regionserver.
+ */
+@InterfaceAudience.Private
+public class ExcludeDatanodeManager implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(ExcludeDatanodeManager.class);
+
+  /**
+   * Configure for the max count the excluded datanodes.
+   */
+  private static final String WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY =
+"hbase.regionserver.async.wal.max.exclude.datanode.count";
+  private static final int DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT = 3;
+
+  /**
+   * Configure for the TTL time of the datanodes excluded
+   */
+  private static final String WAL_EXCLUDE_DATANODE_TTL_KEY =
+"hbase.regionserver.async.wal.exclude.datanode.info.ttl.hour";
+  private static final int DEFAULT_WAL_EXCLUDE_DATANODE_TTL = 6; // 6 hours
+
+  private volatile Cache excludeDNsCache;
+  private final int maxExcludeDNCount;
+  private final Configuration conf;
+  // This is a map of providerId->StreamSlowMonitor
+  private final Map streamSlowMonitors =
+new ConcurrentHashMap<>(1);
+
+  public ExcludeDatanodeManager(Configuration conf) {
+this.conf = conf;
+this.maxExcludeDNCount = 
conf.getInt(WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY,
+  DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT);
+this.excludeDNsCache = CacheBuilder.newBuilder()
+  .expireAfterWrite(this.conf.getLong(WAL_EXCLUDE_DATANODE_TTL_KEY,
+DEFAULT_WAL_EXCLUDE_DATANODE_TTL),
+TimeUnit.HOURS)
+  .maximumSize(this.maxExcludeDNCount)
+  .concurrencyLevel(10)

Review comment:
   This is an experience value for the group count of WALs, not a proper 
value here, I can remove this attribute setting and use the default 
concurrencyLevel(4).




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@hbase.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-12-06 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r762812910



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/ExcludeDatanodeManager.java
##
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import java.util.Collections;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hbase.thirdparty.com.google.common.cache.Cache;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+
+/**
+ * The class to manage the excluded datanodes of the WALs on the regionserver.
+ */
+@InterfaceAudience.Private
+public class ExcludeDatanodeManager implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(ExcludeDatanodeManager.class);
+
+  /**
+   * Configure for the max count the excluded datanodes.
+   */
+  private static final String WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY =
+"hbase.regionserver.async.wal.max.exclude.datanode.count";
+  private static final int DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT = 3;
+
+  /**
+   * Configure for the TTL time of the datanodes excluded
+   */
+  private static final String WAL_EXCLUDE_DATANODE_TTL_KEY =
+"hbase.regionserver.async.wal.exclude.datanode.info.ttl.hour";
+  private static final int DEFAULT_WAL_EXCLUDE_DATANODE_TTL = 6; // 6 hours
+
+  private volatile Cache excludeDNsCache;

Review comment:
   This cache can be concurrently read and written by WALs of different 
group, so I make it be volatile to let the change visible to all threads.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@hbase.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hbase] sunhelly commented on a change in pull request #3800: HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL

2021-10-28 Thread GitBox


sunhelly commented on a change in pull request #3800:
URL: https://github.com/apache/hbase/pull/3800#discussion_r738426157



##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/FanOutOneBlockAsyncDFSOutput.java
##
@@ -136,17 +138,22 @@
 
 // should be backed by a thread safe collection
 private final Set unfinishedReplicas;
+private final long dataLength;
+private final long flushTimestamp;
+private long lastAckTimestamp = -1;
 
 public Callback(CompletableFuture future, long ackedLength,
-Collection replicas) {
+final Map replicas, long dataLength) {

Review comment:
   Thanks, I changed back the type of replicas, and use packetDataLen 
instead.

##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/FanOutOneBlockAsyncDFSOutputHelper.java
##
@@ -451,15 +454,29 @@ public NameNodeException(Throwable cause) {
 
   private static FanOutOneBlockAsyncDFSOutput 
createOutput(DistributedFileSystem dfs, String src,
   boolean overwrite, boolean createParent, short replication, long 
blockSize,
-  EventLoopGroup eventLoopGroup, Class channelClass) 
throws IOException {
+  EventLoopGroup eventLoopGroup, Class channelClass,
+  StreamSlowMonitor monitor) throws IOException {
 Configuration conf = dfs.getConf();
 DFSClient client = dfs.getClient();
 String clientName = client.getClientName();
 ClientProtocol namenode = client.getNamenode();
 int createMaxRetries = conf.getInt(ASYNC_DFS_OUTPUT_CREATE_MAX_RETRIES,
   DEFAULT_ASYNC_DFS_OUTPUT_CREATE_MAX_RETRIES);
-DatanodeInfo[] excludesNodes = EMPTY_DN_ARRAY;
+ExcludeDatanodeManager excludeDatanodeManager = monitor == null ? null :
+  monitor.getExcludeDatanodeManager();
+Set toExcludeNodes = new HashSet<>();
 for (int retry = 0;; retry++) {
+  if (excludeDatanodeManager != null) {
+toExcludeNodes.addAll(excludeDatanodeManager.getExcludeDNs().keySet());
+  }
+  if (excludeDatanodeManager != null && retry > 1 && retry >= 
createMaxRetries - 1) {
+// invalid the exclude cache, to avoid not enough replicas

Review comment:
   Yes. This purpose of this design is try to add block without the 
excluded datanodes for the last time, avoiding the circumstance that there are 
not enough datanodes to choose the block targets... But actually, the logic 
here is a bit strange, and the RS will abort at the scenario mentioned above, 
so after RS restart, if choose target failed by the exclude reason, it will 
recover.

##
File path: 
hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/monitor/ExcludeDatanodeManager.java
##
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.asyncfs.monitor;
+
+import java.util.Collections;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.conf.ConfigurationObserver;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hbase.thirdparty.com.google.common.cache.Cache;
+import org.apache.hbase.thirdparty.com.google.common.cache.CacheBuilder;
+
+@InterfaceAudience.Private
+public class ExcludeDatanodeManager implements ConfigurationObserver {
+  private static final Logger LOG = 
LoggerFactory.getLogger(ExcludeDatanodeManager.class);
+
+  private static final String WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT_KEY =
+"hbase.regionserver.async.wal.max.exclude.datanode.count";
+  private static final int DEFAULT_WAL_MAX_EXCLUDE_SLOW_DATANODE_COUNT = 3;
+
+  private static final String WAL_EXCLUDE_DATANODE_TTL_KEY =
+"hbase.regionserver.async.wal.exclude.datanode.info.ttl.hour";
+  private static final int DEFAULT_WAL_EXCLUDE_DATANODE_TTL = 6; // 6 hours
+
+  private Cache excludeDNsCache;
+  private final int maxExcludeDNCount;
+  private final Configuration conf;
+  private final Map