[GitHub] [hbase] Apache9 commented on a diff in pull request #4782: HBASE-27329 Introduce prefix tree index block encoding use less space

2022-09-21 Thread GitBox


Apache9 commented on code in PR #4782:
URL: https://github.com/apache/hbase/pull/4782#discussion_r977135055


##
hbase-common/src/main/java/org/apache/hadoop/hbase/io/encoding/PrefixTreeUtil.java:
##
@@ -0,0 +1,593 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.encoding;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.io.ByteArrayOutputStream;
+import org.apache.hadoop.hbase.io.util.UFIntTool;
+import org.apache.hadoop.hbase.nio.ByteBuff;
+import org.apache.hadoop.hbase.util.ByteBufferUtils;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@InterfaceAudience.Private
+public class PrefixTreeUtil {
+
+  private static final Logger LOG = 
LoggerFactory.getLogger(PrefixTreeUtil.class);
+
+  /**
+   * Build the prefix tree from the given row keys
+   * @return the tree
+   */
+  public static TokenizerNode buildPrefixTree(List<byte[]> rowKeys) {
+// root node.
+TokenizerNode node = new TokenizerNode();
+int start = 0;
+// Get max common prefix
+int common = maxCommonPrefix(rowKeys, 0, rowKeys.size() - 1, 0);
+if (common > 0) {
+  byte[] commonB = Bytes.copy(rowKeys.get(0), 0, common);
+  node.nodeData = commonB;
+  for (int i = 0; i < rowKeys.size(); i++) {
+if (rowKeys.get(i).length == common) {
+  node.numOccurrences++;
+  if (node.index == null) {
+node.index = new ArrayList<>(1);
+  }
+  node.index.add(i);
+  start = i + 1;
+} else {
+  break;
+}
+  }
+} else {
+  // Only root node data can be empty.
+  node.nodeData = new byte[0];
+}
+constructAndSplitChild(node, rowKeys, start, rowKeys.size() - 1, common);
+return node;
+  }
+
+  /**
+   * Calculate max common prefix
+   * @return the max common prefix num bytes
+   */
+  static int maxCommonPrefix(List<byte[]> rowKeys, int start, int end, int startPos) {
+// only one entry.
+if (start == end) {
+  return rowKeys.get(start).length - startPos;
+}
+int common = 0;
+for (int round = 0; round <= rowKeys.get(start).length - startPos - 1; 
round++) {
+  boolean same = true;
+  for (int i = start + 1; i <= end; i++) {
+if (startPos + common > rowKeys.get(i).length - 1) {
+  same = false;
+  break;
+}
+if (rowKeys.get(start)[startPos + common] != rowKeys.get(i)[startPos + 
common]) {
+  same = false;
+  break;
+}
+  }
+  if (same) {
+common++;
+  } else {
+break;
+  }
+}
+return common;
+  }
+
+  /**
+   * No common prefix, split it.
+   */
+  static void constructAndSplitChild(TokenizerNode node, List<byte[]> rowKeys, int start, int end,
+    int startPos) {
+int middle = start;
+for (int i = start + 1; i <= end; i++) {
+  if (startPos > rowKeys.get(i).length - 1) {
+middle = i - 1;
+break;
+  }
+  if (rowKeys.get(start)[startPos] != rowKeys.get(i)[startPos]) {
+middle = i - 1;
+break;
+  }
+}
+constructCommonNodeAndChild(node, rowKeys, start, middle, startPos);
+if (middle + 1 <= end) {
+  // right
+  constructCommonNodeAndChild(node, rowKeys, middle + 1, end, startPos);
+}
+  }
+
+  /**
+   * Get max common prefix as node and build children.
+   */
+  static TokenizerNode constructCommonNodeAndChild(TokenizerNode node, List<byte[]> rowKeys,
+    int start, int end, int startPos) {
+int common = maxCommonPrefix(rowKeys, start, end, startPos);
+if (common > 0) {
+  TokenizerNode child = new TokenizerNode();
+  child.parent = node;
+  node.children.add(child);
+  byte[] commonB = Bytes.copy(rowKeys.get(start), startPos, common);
+  child.nodeData = commonB;
+  int newStart = start;
+  for (int i = start; i <= end; i++) {
+if (rowKeys.get(i).length == (startPos + common)) {
+  child.numOccurrences++;
+  if 
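
As a hedged aside, not part of the PR: the diff above is truncated, so here is a minimal
standalone sketch of the longest-common-prefix computation that the prefix tree build relies
on, written over byte[] row keys with purely illustrative names.

import java.util.Arrays;
import java.util.List;

public class CommonPrefixSketch {

  /** Length of the longest prefix, starting at startPos, shared by every key in the list. */
  static int maxCommonPrefix(List<byte[]> rowKeys, int startPos) {
    int common = 0;
    byte[] first = rowKeys.get(0);
    outer:
    while (startPos + common < first.length) {
      byte b = first[startPos + common];
      for (byte[] key : rowKeys) {
        if (startPos + common >= key.length || key[startPos + common] != b) {
          break outer;
        }
      }
      common++;
    }
    return common;
  }

  public static void main(String[] args) {
    List<byte[]> keys =
      Arrays.asList("row-001".getBytes(), "row-002".getBytes(), "row-010".getBytes());
    System.out.println(maxCommonPrefix(keys, 0)); // prints 5, the length of "row-0"
  }
}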

[jira] [Commented] (HBASE-27304) Support using IP to expose master/rs servers for some special scenarios

2022-09-21 Thread Duo Zhang (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27304?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17608036#comment-17608036
 ] 

Duo Zhang commented on HBASE-27304:
---

I've merged the PR to master, and the code on branch-2 is a bit different.
Please provide a PR for branch-2 [~heliangjun]. Thanks.

> Support using IP to expose master/rs servers for some special scenarios
> ---
>
> Key: HBASE-27304
> URL: https://issues.apache.org/jira/browse/HBASE-27304
> Project: HBase
>  Issue Type: New Feature
>  Components: master, regionserver
>Affects Versions: 3.0.0-alpha-4
>Reporter: LiangJun He
>Assignee: LiangJun He
>Priority: Minor
> Fix For: 3.0.0-alpha-4
>
>
> In our environment, when accessing the HBase cluster from another cluster
> (such as a Flink cluster), you cannot use the hostname to access the HBase
> cluster, but you can use the IP, because the hostname of a node added by
> automatic expansion of the HBase cluster may not be resolvable from the Flink
> cluster. So we should support using IP to expose master/rs servers for these
> special scenarios.
>  





[GitHub] [hbase] Apache9 commented on pull request #4724: HBASE-27280 Add mutual authentication support to TLS

2022-09-21 Thread GitBox


Apache9 commented on PR #4724:
URL: https://github.com/apache/hbase/pull/4724#issuecomment-1254434888

   We are very close now, thanks @bbeaudreault for your patience!





[GitHub] [hbase] Apache9 commented on a diff in pull request #4724: HBASE-27280 Add mutual authentication support to TLS

2022-09-21 Thread GitBox


Apache9 commented on code in PR #4724:
URL: https://github.com/apache/hbase/pull/4724#discussion_r977130153


##
hbase-common/src/main/java/org/apache/hadoop/hbase/io/crypto/tls/HBaseTrustManager.java:
##
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.crypto.tls;
+
+import java.net.InetAddress;
+import java.net.Socket;
+import java.net.UnknownHostException;
+import java.security.cert.CertificateException;
+import java.security.cert.X509Certificate;
+import javax.net.ssl.SSLEngine;
+import javax.net.ssl.SSLException;
+import javax.net.ssl.X509ExtendedTrustManager;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A custom TrustManager that supports hostname verification. We attempt to perform verification
+ * using just the IP address first and, if that fails, will attempt to perform a reverse DNS lookup
+ * and verify using the hostname. This file has been copied from the Apache ZooKeeper project.
+ * @see <a href="https://github.com/apache/zookeeper/blob/c74658d398cdc1d207aa296cb6e20de00faec03e/zookeeper-server/src/main/java/org/apache/zookeeper/common/ZKTrustManager.java">Base
+ *      revision</a>
+ */
+@InterfaceAudience.Private
+public class HBaseTrustManager extends X509ExtendedTrustManager {
+
+  private static final Logger LOG = 
LoggerFactory.getLogger(HBaseTrustManager.class);
+
+  private final X509ExtendedTrustManager x509ExtendedTrustManager;
+  private final boolean hostnameVerificationEnabled;
+  private final boolean allowReverseDnsLookup;
+
+  private final HBaseHostnameVerifier hostnameVerifier;
+
+  /**
+   * Instantiate a new HBaseTrustManager.
+   * @param x509ExtendedTrustManager    The trustmanager to use for
+   *                                     checkClientTrusted/checkServerTrusted logic
+   * @param hostnameVerificationEnabled  If true, this TrustManager should verify hostnames of peers
+   *                                     when checking trust.
+   * @param allowReverseDnsLookup        If true, we will fall back on reverse dns if resolving of
+   *                                     host fails
+   */
+  HBaseTrustManager(X509ExtendedTrustManager x509ExtendedTrustManager,
+boolean hostnameVerificationEnabled, boolean allowReverseDnsLookup) {
+this.x509ExtendedTrustManager = x509ExtendedTrustManager;
+this.hostnameVerificationEnabled = hostnameVerificationEnabled;
+this.allowReverseDnsLookup = allowReverseDnsLookup;
+this.hostnameVerifier = new HBaseHostnameVerifier();
+  }
+
+  @Override
+  public X509Certificate[] getAcceptedIssuers() {
+return x509ExtendedTrustManager.getAcceptedIssuers();
+  }
+
+  @Override
+  public void checkClientTrusted(X509Certificate[] chain, String authType, 
Socket socket)
+throws CertificateException {
+x509ExtendedTrustManager.checkClientTrusted(chain, authType, socket);
+if (hostnameVerificationEnabled) {
+  performHostVerification(socket.getInetAddress(), chain[0]);
+}
+  }
+
+  @Override
+  public void checkServerTrusted(X509Certificate[] chain, String authType, 
Socket socket)
+throws CertificateException {
+x509ExtendedTrustManager.checkServerTrusted(chain, authType, socket);
+if (hostnameVerificationEnabled) {
+  performHostVerification(socket.getInetAddress(), chain[0]);
+}
+  }
+
+  @Override
+  public void checkClientTrusted(X509Certificate[] chain, String authType, 
SSLEngine engine)
+throws CertificateException {
+x509ExtendedTrustManager.checkClientTrusted(chain, authType, engine);
+if (hostnameVerificationEnabled) {
+  try {
+performHostVerification(InetAddress.getByName(engine.getPeerHost()), 
chain[0]);
+  } catch (UnknownHostException e) {
+throw new CertificateException("Failed to verify host", e);
+  }
+}
+  }
+
+  @Override
+  public void checkServerTrusted(X509Certificate[] chain, String authType, 
SSLEngine engine)
+throws CertificateException {
+x509ExtendedTrustManager.checkServerTrusted(chain, authType, engine);
+if (hostnameVerificationEnabled) {
+  try {
+
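
As a hedged aside, not part of the PR: below is a generic JSSE sketch of how a custom
X509ExtendedTrustManager such as the one above is typically installed into an SSLContext.
In the PR that wiring presumably lives in X509Util; the class and method names here are
illustrative only.

import java.security.SecureRandom;
import javax.net.ssl.KeyManager;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509ExtendedTrustManager;

public class TrustManagerWiringSketch {

  /** Build a TLS context that delegates all certificate checks to the given trust manager. */
  public static SSLContext newTlsContext(KeyManager[] keyManagers,
    X509ExtendedTrustManager trustManager) throws Exception {
    SSLContext context = SSLContext.getInstance("TLS");
    // The trust manager array is consulted for both checkClientTrusted and checkServerTrusted.
    context.init(keyManagers, new TrustManager[] { trustManager }, new SecureRandom());
    return context;
  }
}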

[GitHub] [hbase] Apache9 merged pull request #4713: HBASE-27304 Support using IP to expose master/rs servers for some special scenarios

2022-09-21 Thread GitBox


Apache9 merged PR #4713:
URL: https://github.com/apache/hbase/pull/4713





[GitHub] [hbase] Apache9 commented on a diff in pull request #4724: HBASE-27280 Add mutual authentication support to TLS

2022-09-21 Thread GitBox


Apache9 commented on code in PR #4724:
URL: https://github.com/apache/hbase/pull/4724#discussion_r977124782


##
hbase-common/src/main/java/org/apache/hadoop/hbase/io/crypto/tls/HBaseHostnameVerifier.java:
##
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.crypto.tls;
+
+import java.net.InetAddress;
+import java.security.cert.Certificate;
+import java.security.cert.CertificateParsingException;
+import java.security.cert.X509Certificate;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+import java.util.NoSuchElementException;
+import java.util.Objects;
+import java.util.Optional;
+import javax.naming.InvalidNameException;
+import javax.naming.NamingException;
+import javax.naming.directory.Attribute;
+import javax.naming.directory.Attributes;
+import javax.naming.ldap.LdapName;
+import javax.naming.ldap.Rdn;
+import javax.net.ssl.HostnameVerifier;
+import javax.net.ssl.SSLException;
+import javax.net.ssl.SSLPeerUnverifiedException;
+import javax.net.ssl.SSLSession;
+import javax.security.auth.x500.X500Principal;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hbase.thirdparty.com.google.common.net.InetAddresses;
+
+/**
+ * When enabled in {@link X509Util}, handles verifying that the hostname of a 
peer matches the
+ * certificate it presents.
+ * 
+ * This file has been copied from the Apache ZooKeeper project.
+ * @see <a href="https://github.com/apache/zookeeper/blob/5820d10d9dc58c8e12d2e25386fdf92acb360359/zookeeper-server/src/main/java/org/apache/zookeeper/common/ZKHostnameVerifier.java">Base
+ *      revision</a>
+ */
+@InterfaceAudience.Private
+final class HBaseHostnameVerifier implements HostnameVerifier {
+
+  private final Logger LOG = 
LoggerFactory.getLogger(HBaseHostnameVerifier.class);
+
+  /**
+   * Note: copied from Apache httpclient with some minor modifications. We 
want host verification,
+   * but depending on the httpclient jar caused unexplained performance 
regressions (even when the
+   * code was not used).
+   */
+  private static final class SubjectName {
+
+static final int DNS = 2;
+static final int IP = 7;
+
+private final String value;
+private final int type;
+
+SubjectName(final String value, final int type) {
+  if (type != DNS && type != IP) {
+throw new IllegalArgumentException("Invalid type: " + type);
+  }
+  this.value = Objects.requireNonNull(value);
+  this.type = type;
+}
+
+public int getType() {
+  return type;
+}
+
+public String getValue() {
+  return value;
+}
+
+@Override
+public String toString() {
+  return value;
+}
+
+  }
+
+  @Override
+  public boolean verify(final String host, final SSLSession session) {
+try {
+  final Certificate[] certs = session.getPeerCertificates();
+  final X509Certificate x509 = (X509Certificate) certs[0];
+  verify(host, x509);
+  return true;
+} catch (final SSLException ex) {
+  LOG.debug("Unexpected exception", ex);
+  return false;
+}
+  }
+
+  void verify(final String host, final X509Certificate cert) throws 
SSLException {
+final List<SubjectName> subjectAlts = getSubjectAltNames(cert);
+if (subjectAlts != null && !subjectAlts.isEmpty()) {
+  Optional<InetAddress> inetAddress = parseIpAddress(host);
+  if (inetAddress.isPresent()) {
+matchIPAddress(host, inetAddress.get(), subjectAlts);
+  } else {
+matchDNSName(host, subjectAlts);
+  }
+} else {
+  // CN matching has been deprecated by rfc2818 and can be used
+  // as fallback only when no subjectAlts are available
+  final X500Principal subjectPrincipal = cert.getSubjectX500Principal();
+  final String cn = 
extractCN(subjectPrincipal.getName(X500Principal.RFC2253));
+  if (cn == null) {
+throw new SSLException("Certificate subject for <" + host + "> doesn't 
contain "
+  + "a common name and does not have alternative names");
+  }
+  matchCN(host, cn);
+}
+  }
+
+  private static void 
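
As a hedged aside, not part of the PR: the DNS/IP constants above (2 and 7) are the
GeneralName type codes that the JDK reports for subject alternative names. The standalone
sketch below, with illustrative names only, shows how such entries can be read straight from
an X509Certificate, which is presumably what getSubjectAltNames does.

import java.security.cert.CertificateParsingException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

final class SanSketch {
  static final int DNS = 2; // GeneralName dNSName
  static final int IP = 7;  // GeneralName iPAddress

  /** Collect the string values of DNS and IP subject alternative names from a certificate. */
  static List<String> dnsAndIpSans(X509Certificate cert) throws CertificateParsingException {
    List<String> result = new ArrayList<>();
    Collection<List<?>> entries = cert.getSubjectAlternativeNames();
    if (entries == null) {
      return result;
    }
    for (List<?> entry : entries) {
      // Each entry is a 2-element list: [Integer type, String value] for dNSName/iPAddress.
      Integer type = (Integer) entry.get(0);
      if (type == DNS || type == IP) {
        result.add((String) entry.get(1));
      }
    }
    return result;
  }
}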

[jira] [Comment Edited] (HBASE-27384) Concurrent modification in RegionNormalizerWorkQueue

2022-09-21 Thread Aman Poonia (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27384?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17608018#comment-17608018
 ] 

Aman Poonia edited comment on HBASE-27384 at 9/22/22 1:03 AM:
--

The issue here is that we use LinkedHashSet, which throws an exception when we
modify the Set after an iterator is created.

[https://github.com/apache/hbase/blob/branch-2.4/hbase-server/src/main/java/org/apache/hadoop/hbase/master/normalizer/RegionNormalizerWorkQueue.java#L184]
{code:java}
// code placeholder
public E take() throws InterruptedException {
  E x;
  takeLock.lockInterruptibly();
  try {
while (delegate.isEmpty()) {
  notEmpty.await();
}
final Iterator<E> iter = delegate.iterator();
x = iter.next();
iter.remove();
if (!delegate.isEmpty()) {
  notEmpty.signal();
}
  } finally {
takeLock.unlock();
  }
  return x;
} {code}
[LinkedHashSet javadoc|https://docs.oracle.com/javase/7/docs/api/java/util/LinkedHashSet.html]

As we can see in the above code, while we are reading the set we don't take
putLock and only use takeLock, which leaves the Set open for modification.
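
A minimal single-threaded sketch (not the HBase code, names are illustrative) of the
fail-fast behaviour described above: once the LinkedHashSet is structurally modified after an
iterator has been created, the next call on that iterator throws
ConcurrentModificationException, which is exactly what a put() racing with take() can trigger.
{code:java}
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Set;

public class FailFastDemo {
  public static void main(String[] args) {
    Set<Integer> delegate = new LinkedHashSet<>();
    delegate.add(1);
    delegate.add(2);
    Iterator<Integer> iter = delegate.iterator();
    // Structural modification after the iterator was created, e.g. a racing put().
    delegate.add(3);
    // Throws java.util.ConcurrentModificationException.
    System.out.println(iter.next());
  }
} {code}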


was (Author: mnpoonia):
The issue here is we use LinkedHashSet which would throws exception when we 
modify the Set after a iterator is created.

[https://github.com/apache/hbase/blob/branch-2.4/hbase-server/src/main/java/org/apache/hadoop/hbase/master/normalizer/RegionNormalizerWorkQueue.java#L184]
{code:java}
// code placeholder
public E take() throws InterruptedException {
  E x;
  takeLock.lockInterruptibly();
  try {
while (delegate.isEmpty()) {
  notEmpty.await();
}
final Iterator<E> iter = delegate.iterator();
x = iter.next();
iter.remove();
if (!delegate.isEmpty()) {
  notEmpty.signal();
}
  } finally {
takeLock.unlock();
  }
  return x;
} {code}
As we can see in above code while we are reading the set, we don't take putLock 
and only use takeLock which leaves the Set open for modification.

> Concurrent modification in RegionNormalizerWorkQueue
> 
>
> Key: HBASE-27384
> URL: https://issues.apache.org/jira/browse/HBASE-27384
> Project: HBase
>  Issue Type: Bug
>  Components: Normalizer
>Affects Versions: 2.4.14
>Reporter: Aman Poonia
>Assignee: Aman Poonia
>Priority: Minor
>
> {*}Error: 
> java.util.ConcurrentModificationException{*}{{{}java.util.concurrent.ExecutionException:
>  java.util.ConcurrentModificationException at 
> java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357) 
> at java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1928) at 
> org.apache.hadoop.hbase.master.normalizer.TestRegionNormalizerWorkQueue.testTake(TestRegionNormalizerWorkQueue.java:211)
>  at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at 
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 
> at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>  at java.lang.reflect.Method.invoke(Method.java:498) at 
> org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:59)
>  at 
> org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
>  at 
> org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:56)
>  at 
> org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
>  at org.junit.rules.TestWatcher$1.evaluate(TestWatcher.java:61) at 
> org.junit.runners.ParentRunner$3.evaluate(ParentRunner.java:306) at 
> org.junit.runners.BlockJUnit4ClassRunner$1.evaluate(BlockJUnit4ClassRunner.java:100)
>  at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:366) at 
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:103)
>  at 
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:63)
>  at org.junit.runners.ParentRunner$4.run(ParentRunner.java:331) at 
> org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:79) at 
> org.junit.runners.ParentRunner.runChildren(ParentRunner.java:329) at 
> org.junit.runners.ParentRunner.access$100(ParentRunner.java:66) at 
> org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:293) at 
> org.apache.hadoop.hbase.SystemExitRule$1.evaluate(SystemExitRule.java:39) at 
> org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:288)
>  at 
> org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:282)
>  at java.util.concurrent.FutureTask.run(FutureTask.java:266) at 
> java.lang.Thread.run(Thread.java:750) Caused by: 
> java.util.ConcurrentModificationException at 
> java.util.LinkedHashMap$LinkedHashIterator.nextNode(LinkedHashMap.java:719) 
> at 

[jira] [Comment Edited] (HBASE-27384) Concurrent modification in RegionNormalizerWorkQueue

2022-09-21 Thread Aman Poonia (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27384?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17608018#comment-17608018
 ] 

Aman Poonia edited comment on HBASE-27384 at 9/22/22 1:00 AM:
--

The issue here is that we use LinkedHashSet, which throws an exception when we
modify the Set after an iterator is created.

[https://github.com/apache/hbase/blob/branch-2.4/hbase-server/src/main/java/org/apache/hadoop/hbase/master/normalizer/RegionNormalizerWorkQueue.java#L184]
{code:java}
// code placeholder
public E take() throws InterruptedException {
  E x;
  takeLock.lockInterruptibly();
  try {
while (delegate.isEmpty()) {
  notEmpty.await();
}
final Iterator<E> iter = delegate.iterator();
x = iter.next();
iter.remove();
if (!delegate.isEmpty()) {
  notEmpty.signal();
}
  } finally {
takeLock.unlock();
  }
  return x;
} {code}
As we can see in the above code, while we are reading the set we don't take
putLock and only use takeLock, which leaves the Set open for modification.


was (Author: mnpoonia):
The issue here is we use LinkedHashSet which would throws exception when we 
modify the Set after a iterator is created. 

https://github.com/apache/hbase/blob/branch-2.4/hbase-server/src/main/java/org/apache/hadoop/hbase/master/normalizer/RegionNormalizerWorkQueue.java#L184
{code:java}
// code placeholder
public E take() throws InterruptedException {
  E x;
  takeLock.lockInterruptibly();
  try {
while (delegate.isEmpty()) {
  notEmpty.await();
}
final Iterator<E> iter = delegate.iterator();
x = iter.next();
iter.remove();
if (!delegate.isEmpty()) {
  notEmpty.signal();
}
  } finally {
takeLock.unlock();
  }
  return x;
} {code}
As we can clearly see in above code while we are reading the set we don't take 
putLock and only use takeLock which leaves the Set open for modification.

> Concurrent modification in RegionNormalizerWorkQueue
> 
>
> Key: HBASE-27384
> URL: https://issues.apache.org/jira/browse/HBASE-27384
> Project: HBase
>  Issue Type: Bug
>  Components: Normalizer
>Affects Versions: 2.4.14
>Reporter: Aman Poonia
>Assignee: Aman Poonia
>Priority: Minor
>
> {*}Error: 
> java.util.ConcurrentModificationException{*}{{{}java.util.concurrent.ExecutionException:
>  java.util.ConcurrentModificationException at 
> java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357) 
> at java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1928) at 
> org.apache.hadoop.hbase.master.normalizer.TestRegionNormalizerWorkQueue.testTake(TestRegionNormalizerWorkQueue.java:211)
>  at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at 
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 
> at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>  at java.lang.reflect.Method.invoke(Method.java:498) at 
> org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:59)
>  at 
> org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
>  at 
> org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:56)
>  at 
> org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
>  at org.junit.rules.TestWatcher$1.evaluate(TestWatcher.java:61) at 
> org.junit.runners.ParentRunner$3.evaluate(ParentRunner.java:306) at 
> org.junit.runners.BlockJUnit4ClassRunner$1.evaluate(BlockJUnit4ClassRunner.java:100)
>  at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:366) at 
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:103)
>  at 
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:63)
>  at org.junit.runners.ParentRunner$4.run(ParentRunner.java:331) at 
> org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:79) at 
> org.junit.runners.ParentRunner.runChildren(ParentRunner.java:329) at 
> org.junit.runners.ParentRunner.access$100(ParentRunner.java:66) at 
> org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:293) at 
> org.apache.hadoop.hbase.SystemExitRule$1.evaluate(SystemExitRule.java:39) at 
> org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:288)
>  at 
> org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:282)
>  at java.util.concurrent.FutureTask.run(FutureTask.java:266) at 
> java.lang.Thread.run(Thread.java:750) Caused by: 
> java.util.ConcurrentModificationException at 
> java.util.LinkedHashMap$LinkedHashIterator.nextNode(LinkedHashMap.java:719) 
> at java.util.LinkedHashMap$LinkedKeyIterator.next(LinkedHashMap.java:742) at 
> 

[jira] [Commented] (HBASE-27384) Concurrent modification in RegionNormalizerWorkQueue

2022-09-21 Thread Aman Poonia (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27384?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17608018#comment-17608018
 ] 

Aman Poonia commented on HBASE-27384:
-

The issue here is that we use LinkedHashSet, which throws an exception when we
modify the Set after an iterator is created.

https://github.com/apache/hbase/blob/branch-2.4/hbase-server/src/main/java/org/apache/hadoop/hbase/master/normalizer/RegionNormalizerWorkQueue.java#L184
{code:java}
// code placeholder
public E take() throws InterruptedException {
  E x;
  takeLock.lockInterruptibly();
  try {
while (delegate.isEmpty()) {
  notEmpty.await();
}
final Iterator<E> iter = delegate.iterator();
x = iter.next();
iter.remove();
if (!delegate.isEmpty()) {
  notEmpty.signal();
}
  } finally {
takeLock.unlock();
  }
  return x;
} {code}
As we can clearly see in the above code, while we are reading the set we don't
take putLock and only use takeLock, which leaves the Set open for modification.

> Concurrent modification in RegionNormalizerWorkQueue
> 
>
> Key: HBASE-27384
> URL: https://issues.apache.org/jira/browse/HBASE-27384
> Project: HBase
>  Issue Type: Bug
>  Components: Normalizer
>Affects Versions: 2.4.14
>Reporter: Aman Poonia
>Assignee: Aman Poonia
>Priority: Minor
>
> {*}Error: 
> java.util.ConcurrentModificationException{*}{{{}java.util.concurrent.ExecutionException:
>  java.util.ConcurrentModificationException at 
> java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357) 
> at java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1928) at 
> org.apache.hadoop.hbase.master.normalizer.TestRegionNormalizerWorkQueue.testTake(TestRegionNormalizerWorkQueue.java:211)
>  at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at 
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 
> at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>  at java.lang.reflect.Method.invoke(Method.java:498) at 
> org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:59)
>  at 
> org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
>  at 
> org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:56)
>  at 
> org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
>  at org.junit.rules.TestWatcher$1.evaluate(TestWatcher.java:61) at 
> org.junit.runners.ParentRunner$3.evaluate(ParentRunner.java:306) at 
> org.junit.runners.BlockJUnit4ClassRunner$1.evaluate(BlockJUnit4ClassRunner.java:100)
>  at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:366) at 
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:103)
>  at 
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:63)
>  at org.junit.runners.ParentRunner$4.run(ParentRunner.java:331) at 
> org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:79) at 
> org.junit.runners.ParentRunner.runChildren(ParentRunner.java:329) at 
> org.junit.runners.ParentRunner.access$100(ParentRunner.java:66) at 
> org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:293) at 
> org.apache.hadoop.hbase.SystemExitRule$1.evaluate(SystemExitRule.java:39) at 
> org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:288)
>  at 
> org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:282)
>  at java.util.concurrent.FutureTask.run(FutureTask.java:266) at 
> java.lang.Thread.run(Thread.java:750) Caused by: 
> java.util.ConcurrentModificationException at 
> java.util.LinkedHashMap$LinkedHashIterator.nextNode(LinkedHashMap.java:719) 
> at java.util.LinkedHashMap$LinkedKeyIterator.next(LinkedHashMap.java:742) at 
> org.apache.hadoop.hbase.master.normalizer.RegionNormalizerWorkQueue.take(RegionNormalizerWorkQueue.java:192)
>  at 
> org.apache.hadoop.hbase.master.normalizer.TestRegionNormalizerWorkQueue.lambda$testTake$3(TestRegionNormalizerWorkQueue.java:192)
>  at 
> java.util.concurrent.CompletableFuture$AsyncRun.run(CompletableFuture.java:1640)
>  at 
> java.util.concurrent.CompletableFuture$AsyncRun.exec(CompletableFuture.java:1632)
>  at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289) at 
> java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056) 
> at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692) at 
> java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:175){}}}





[jira] [Assigned] (HBASE-27384) Concurrent modification in RegionNormalizerWorkQueue

2022-09-21 Thread Aman Poonia (Jira)


 [ 
https://issues.apache.org/jira/browse/HBASE-27384?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Aman Poonia reassigned HBASE-27384:
---

Assignee: Aman Poonia

> Concurrent modification in RegionNormalizerWorkQueue
> 
>
> Key: HBASE-27384
> URL: https://issues.apache.org/jira/browse/HBASE-27384
> Project: HBase
>  Issue Type: Bug
>  Components: Normalizer
>Affects Versions: 2.4.14
>Reporter: Aman Poonia
>Assignee: Aman Poonia
>Priority: Minor
>
> {*}Error: 
> java.util.ConcurrentModificationException{*}{{{}java.util.concurrent.ExecutionException:
>  java.util.ConcurrentModificationException at 
> java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357) 
> at java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1928) at 
> org.apache.hadoop.hbase.master.normalizer.TestRegionNormalizerWorkQueue.testTake(TestRegionNormalizerWorkQueue.java:211)
>  at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at 
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 
> at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>  at java.lang.reflect.Method.invoke(Method.java:498) at 
> org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:59)
>  at 
> org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
>  at 
> org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:56)
>  at 
> org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
>  at org.junit.rules.TestWatcher$1.evaluate(TestWatcher.java:61) at 
> org.junit.runners.ParentRunner$3.evaluate(ParentRunner.java:306) at 
> org.junit.runners.BlockJUnit4ClassRunner$1.evaluate(BlockJUnit4ClassRunner.java:100)
>  at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:366) at 
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:103)
>  at 
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:63)
>  at org.junit.runners.ParentRunner$4.run(ParentRunner.java:331) at 
> org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:79) at 
> org.junit.runners.ParentRunner.runChildren(ParentRunner.java:329) at 
> org.junit.runners.ParentRunner.access$100(ParentRunner.java:66) at 
> org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:293) at 
> org.apache.hadoop.hbase.SystemExitRule$1.evaluate(SystemExitRule.java:39) at 
> org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:288)
>  at 
> org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:282)
>  at java.util.concurrent.FutureTask.run(FutureTask.java:266) at 
> java.lang.Thread.run(Thread.java:750) Caused by: 
> java.util.ConcurrentModificationException at 
> java.util.LinkedHashMap$LinkedHashIterator.nextNode(LinkedHashMap.java:719) 
> at java.util.LinkedHashMap$LinkedKeyIterator.next(LinkedHashMap.java:742) at 
> org.apache.hadoop.hbase.master.normalizer.RegionNormalizerWorkQueue.take(RegionNormalizerWorkQueue.java:192)
>  at 
> org.apache.hadoop.hbase.master.normalizer.TestRegionNormalizerWorkQueue.lambda$testTake$3(TestRegionNormalizerWorkQueue.java:192)
>  at 
> java.util.concurrent.CompletableFuture$AsyncRun.run(CompletableFuture.java:1640)
>  at 
> java.util.concurrent.CompletableFuture$AsyncRun.exec(CompletableFuture.java:1632)
>  at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289) at 
> java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056) 
> at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692) at 
> java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:175){}}}





[jira] [Created] (HBASE-27384) Concurrent modification in RegionNormalizerWorkQueue

2022-09-21 Thread Aman Poonia (Jira)
Aman Poonia created HBASE-27384:
---

 Summary: Concurrent modification in RegionNormalizerWorkQueue
 Key: HBASE-27384
 URL: https://issues.apache.org/jira/browse/HBASE-27384
 Project: HBase
  Issue Type: Bug
  Components: Normalizer
Affects Versions: 2.4.14
Reporter: Aman Poonia


{*}Error: 
java.util.ConcurrentModificationException{*}{{{}java.util.concurrent.ExecutionException:
 java.util.ConcurrentModificationException at 
java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357) at 
java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1928) at 
org.apache.hadoop.hbase.master.normalizer.TestRegionNormalizerWorkQueue.testTake(TestRegionNormalizerWorkQueue.java:211)
 at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at 
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 
at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
 at java.lang.reflect.Method.invoke(Method.java:498) at 
org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:59)
 at 
org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
 at 
org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:56)
 at 
org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
 at org.junit.rules.TestWatcher$1.evaluate(TestWatcher.java:61) at 
org.junit.runners.ParentRunner$3.evaluate(ParentRunner.java:306) at 
org.junit.runners.BlockJUnit4ClassRunner$1.evaluate(BlockJUnit4ClassRunner.java:100)
 at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:366) at 
org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:103)
 at 
org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:63)
 at org.junit.runners.ParentRunner$4.run(ParentRunner.java:331) at 
org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:79) at 
org.junit.runners.ParentRunner.runChildren(ParentRunner.java:329) at 
org.junit.runners.ParentRunner.access$100(ParentRunner.java:66) at 
org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:293) at 
org.apache.hadoop.hbase.SystemExitRule$1.evaluate(SystemExitRule.java:39) at 
org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:288)
 at 
org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:282)
 at java.util.concurrent.FutureTask.run(FutureTask.java:266) at 
java.lang.Thread.run(Thread.java:750) Caused by: 
java.util.ConcurrentModificationException at 
java.util.LinkedHashMap$LinkedHashIterator.nextNode(LinkedHashMap.java:719) at 
java.util.LinkedHashMap$LinkedKeyIterator.next(LinkedHashMap.java:742) at 
org.apache.hadoop.hbase.master.normalizer.RegionNormalizerWorkQueue.take(RegionNormalizerWorkQueue.java:192)
 at 
org.apache.hadoop.hbase.master.normalizer.TestRegionNormalizerWorkQueue.lambda$testTake$3(TestRegionNormalizerWorkQueue.java:192)
 at 
java.util.concurrent.CompletableFuture$AsyncRun.run(CompletableFuture.java:1640)
 at 
java.util.concurrent.CompletableFuture$AsyncRun.exec(CompletableFuture.java:1632)
 at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289) at 
java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056) at 
java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692) at 
java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:175){}}}





[jira] [Comment Edited] (HBASE-27382) Cluster completely down due to WAL splitting failing for hbase:meta table.

2022-09-21 Thread Viraj Jasani (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27382?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607996#comment-17607996
 ] 

Viraj Jasani edited comment on HBASE-27382 at 9/21/22 10:57 PM:


Yes ZKSplitLogManagerCoordination is present in hbase 2.x but it is not in use 
by default. The default use is procedure v2 based WAL split, introduced by 
HBASE-21588.

(the default coordination option was changed with 2.4.0 release by HBASE-24632).

 

Some interesting discussion worth taking a look at on HBASE-24558, which seems 
to have triggered changing the default implementation to procedure based WAL 
split, resulting in HBASE-24632.


was (Author: vjasani):
Yes ZKSplitLogManagerCoordination is present in hbase 2.x but it is not in use 
by default. The default use is procedure v2 based WAL split, introduced by 
HBASE-21588.

(the default coordination option was changed with 2.4.0 release by HBASE-24632).

 

Some interesting discussion worth taking a look at on HBASE-24558.

> Cluster completely down due to WAL splitting failing for hbase:meta table.
> --
>
> Key: HBASE-27382
> URL: https://issues.apache.org/jira/browse/HBASE-27382
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 2.5.0, 1.7.2, 2.4.14
>Reporter: Rushabh Shah
>Assignee: Rushabh Shah
>Priority: Major
> Fix For: 2.6.0, 2.5.1, 3.0.0-alpha-4, 2.4.15
>
>
> We are running some version of 1.7.2 in our production environment. We 
> encountered this issue recently.
> We colocate namenode and region server holding hbase:meta table on a set of 5 
> master nodes. Co-incidentally active namenode and region server holding meta 
> table were on the same physical node and that node went down due to hardware 
> issue. We have sub optimal hdfs level timeouts configured so whenever active 
> namenode goes down, it takes around 12-15 minutes for hdfs client within 
> hbase to connect to new active namenode. So all the region servers were 
> having problems for about 15 minutes to connect to new active namenode.
> Below are the sequence of events:
> 1. Host running active namenode and hbase:meta went down at +2022-09-09 
> 16:56:56,878+
> 2. HMaster started running ServerCrashProcedure at +2022-09-09 16:59:05,696+
> {noformat}
> 2022-09-09 16:59:05,696 DEBUG [t-processor-pool2-t1] 
> procedure2.ProcedureExecutor - Procedure ServerCrashProcedure 
> serverName=,61020,1662714013670, shouldSplitWal=true, 
> carryingMeta=true id=1 owner=dummy state=RUNNABLE:SERVER_CRASH_START added to 
> the store.
> 2022-09-09 16:59:05,702 DEBUG [t-processor-pool2-t1] master.ServerManager - 
> Added=,61020,1662714013670 to dead servers, submitted shutdown 
> handler to be executed meta=true
> 2022-09-09 16:59:05,707 DEBUG [ProcedureExecutor-0] master.DeadServer - 
> Started processing ,61020,1662714013670; numProcessing=1
> 2022-09-09 16:59:05,712 INFO  [ProcedureExecutor-0] 
> procedure.ServerCrashProcedure - Start processing crashed 
> ,61020,1662714013670
> {noformat}
> 3. SplitLogManager created 2 split log tasks in zookeeper.
> {noformat}
> 2022-09-09 16:59:06,049 INFO  [ProcedureExecutor-1] master.SplitLogManager - 
> Started splitting 2 logs in 
> [hdfs:///hbase/WALs/,61020,1662714013670-splitting]
>  for [,61020,1662714013670]
> 2022-09-09 16:59:06,081 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 16:59:06,093 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662739251611.meta
> {noformat}
> 4. The first split log task is more interesting: 
> +/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta+
> 5. Since all the region servers were having problems connecting to active 
> namenode, SplitLogManager tried total of 4 times to assign this task (3 
> resubmits, configured by hbase.splitlog.max.resubmit) and then finally gave 
> up.
> {noformat}
> -- try 1 -
> 2022-09-09 16:59:06,205 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662540522069
> -- try 2 -
> 2022-09-09 17:01:06,642 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 

[jira] [Comment Edited] (HBASE-27382) Cluster completely down due to WAL splitting failing for hbase:meta table.

2022-09-21 Thread Viraj Jasani (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27382?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607996#comment-17607996
 ] 

Viraj Jasani edited comment on HBASE-27382 at 9/21/22 10:55 PM:


Yes ZKSplitLogManagerCoordination is present in hbase 2.x but it is not in use 
by default. The default use is procedure v2 based WAL split, introduced by 
HBASE-21588.

(the default coordination option was changed with 2.4.0 release by HBASE-24632).

 

Some interesting discussion worth taking a look at on HBASE-24558.


was (Author: vjasani):
Yes ZKSplitLogManagerCoordination is present in hbase 2.x but it is not in use 
by default. The default use is procedure v2 based WAL split, introduced by 
HBASE-21588.

(the default coordination option was changed with 2.4.0 release by HBASE-24632).

> Cluster completely down due to WAL splitting failing for hbase:meta table.
> --
>
> Key: HBASE-27382
> URL: https://issues.apache.org/jira/browse/HBASE-27382
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 2.5.0, 1.7.2, 2.4.14
>Reporter: Rushabh Shah
>Assignee: Rushabh Shah
>Priority: Major
> Fix For: 2.6.0, 2.5.1, 3.0.0-alpha-4, 2.4.15
>
>
> We are running some version of 1.7.2 in our production environment. We 
> encountered this issue recently.
> We colocate namenode and region server holding hbase:meta table on a set of 5 
> master nodes. Co-incidentally active namenode and region server holding meta 
> table were on the same physical node and that node went down due to hardware 
> issue. We have sub optimal hdfs level timeouts configured so whenever active 
> namenode goes down, it takes around 12-15 minutes for hdfs client within 
> hbase to connect to new active namenode. So all the region servers were 
> having problems for about 15 minutes to connect to new active namenode.
> Below are the sequence of events:
> 1. Host running active namenode and hbase:meta went down at +2022-09-09 
> 16:56:56,878+
> 2. HMaster started running ServerCrashProcedure at +2022-09-09 16:59:05,696+
> {noformat}
> 2022-09-09 16:59:05,696 DEBUG [t-processor-pool2-t1] 
> procedure2.ProcedureExecutor - Procedure ServerCrashProcedure 
> serverName=,61020,1662714013670, shouldSplitWal=true, 
> carryingMeta=true id=1 owner=dummy state=RUNNABLE:SERVER_CRASH_START added to 
> the store.
> 2022-09-09 16:59:05,702 DEBUG [t-processor-pool2-t1] master.ServerManager - 
> Added=,61020,1662714013670 to dead servers, submitted shutdown 
> handler to be executed meta=true
> 2022-09-09 16:59:05,707 DEBUG [ProcedureExecutor-0] master.DeadServer - 
> Started processing ,61020,1662714013670; numProcessing=1
> 2022-09-09 16:59:05,712 INFO  [ProcedureExecutor-0] 
> procedure.ServerCrashProcedure - Start processing crashed 
> ,61020,1662714013670
> {noformat}
> 3. SplitLogManager created 2 split log tasks in zookeeper.
> {noformat}
> 2022-09-09 16:59:06,049 INFO  [ProcedureExecutor-1] master.SplitLogManager - 
> Started splitting 2 logs in 
> [hdfs:///hbase/WALs/,61020,1662714013670-splitting]
>  for [,61020,1662714013670]
> 2022-09-09 16:59:06,081 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 16:59:06,093 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662739251611.meta
> {noformat}
> 4. The first split log task is more interesting: 
> +/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta+
> 5. Since all the region servers were having problems connecting to active 
> namenode, SplitLogManager tried total of 4 times to assign this task (3 
> resubmits, configured by hbase.splitlog.max.resubmit) and then finally gave 
> up.
> {noformat}
> -- try 1 -
> 2022-09-09 16:59:06,205 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662540522069
> -- try 2 -
> 2022-09-09 17:01:06,642 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 17:01:06,666 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - task not yet acquired 
> 

[jira] [Comment Edited] (HBASE-27382) Cluster completely down due to WAL splitting failing for hbase:meta table.

2022-09-21 Thread Viraj Jasani (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27382?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607996#comment-17607996
 ] 

Viraj Jasani edited comment on HBASE-27382 at 9/21/22 10:52 PM:


Yes ZKSplitLogManagerCoordination is present in hbase 2.x but it is not in use 
by default. The default use is procedure v2 based WAL split, introduced by 
HBASE-21588.

(the default coordination option was changed with 2.4.0 release by HBASE-24632).


was (Author: vjasani):
Yes ZKSplitLogManagerCoordination is present in hbase 2.x but it is not in use 
by default. The default use is procedure v2 based WAL split, introduced by 
HBASE-21588.

> Cluster completely down due to WAL splitting failing for hbase:meta table.
> --
>
> Key: HBASE-27382
> URL: https://issues.apache.org/jira/browse/HBASE-27382
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 2.5.0, 1.7.2, 2.4.14
>Reporter: Rushabh Shah
>Assignee: Rushabh Shah
>Priority: Major
> Fix For: 2.6.0, 2.5.1, 3.0.0-alpha-4, 2.4.15
>
>
> We are running some version of 1.7.2 in our production environment. We 
> encountered this issue recently.
> We colocate namenode and region server holding hbase:meta table on a set of 5 
> master nodes. Co-incidentally active namenode and region server holding meta 
> table were on the same physical node and that node went down due to hardware 
> issue. We have sub optimal hdfs level timeouts configured so whenever active 
> namenode goes down, it takes around 12-15 minutes for hdfs client within 
> hbase to connect to new active namenode. So all the region servers were 
> having problems for about 15 minutes to connect to new active namenode.
> Below are the sequence of events:
> 1. Host running active namenode and hbase:meta went down at +2022-09-09 
> 16:56:56,878+
> 2. HMaster started running ServerCrashProcedure at +2022-09-09 16:59:05,696+
> {noformat}
> 2022-09-09 16:59:05,696 DEBUG [t-processor-pool2-t1] 
> procedure2.ProcedureExecutor - Procedure ServerCrashProcedure 
> serverName=,61020,1662714013670, shouldSplitWal=true, 
> carryingMeta=true id=1 owner=dummy state=RUNNABLE:SERVER_CRASH_START added to 
> the store.
> 2022-09-09 16:59:05,702 DEBUG [t-processor-pool2-t1] master.ServerManager - 
> Added=,61020,1662714013670 to dead servers, submitted shutdown 
> handler to be executed meta=true
> 2022-09-09 16:59:05,707 DEBUG [ProcedureExecutor-0] master.DeadServer - 
> Started processing ,61020,1662714013670; numProcessing=1
> 2022-09-09 16:59:05,712 INFO  [ProcedureExecutor-0] 
> procedure.ServerCrashProcedure - Start processing crashed 
> ,61020,1662714013670
> {noformat}
> 3. SplitLogManager created 2 split log tasks in zookeeper.
> {noformat}
> 2022-09-09 16:59:06,049 INFO  [ProcedureExecutor-1] master.SplitLogManager - 
> Started splitting 2 logs in 
> [hdfs:///hbase/WALs/,61020,1662714013670-splitting]
>  for [,61020,1662714013670]
> 2022-09-09 16:59:06,081 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 16:59:06,093 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662739251611.meta
> {noformat}
> 4. The first split log task is more interesting: 
> +/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta+
> 5. Since all the region servers were having problems connecting to active 
> namenode, SplitLogManager tried total of 4 times to assign this task (3 
> resubmits, configured by hbase.splitlog.max.resubmit) and then finally gave 
> up.
> {noformat}
> -- try 1 -
> 2022-09-09 16:59:06,205 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662540522069
> -- try 2 -
> 2022-09-09 17:01:06,642 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 17:01:06,666 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - task not yet acquired 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  ver = 2
> 2022-09-09 17:01:06,715 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> 

[jira] [Updated] (HBASE-27383) Add dead region server to SplitLogManager#deadWorkers set as the first step.

2022-09-21 Thread Rushabh Shah (Jira)


 [ 
https://issues.apache.org/jira/browse/HBASE-27383?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Rushabh Shah updated HBASE-27383:
-
Affects Version/s: 2.5.0
   (was: 1.6.0)

> Add dead region server to SplitLogManager#deadWorkers set as the first step.
> 
>
> Key: HBASE-27383
> URL: https://issues.apache.org/jira/browse/HBASE-27383
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 2.5.0, 1.7.2, 2.4.14
>Reporter: Rushabh Shah
>Assignee: Rushabh Shah
>Priority: Major
>
> Currently we add a dead region server to +SplitLogManager#deadWorkers+ set in 
> SERVER_CRASH_SPLIT_LOGS state. 
> Consider a case where a region server is handling split log task for 
> hbase:meta table and SplitLogManager has exhausted all the retries and won't 
> try any more region server. 
> The region server which is handling split log task has died. 
> We have a check in SplitLogManager where if a region server is declared dead 
> and if that region server is responsible for split log task then we 
> forcefully resubmit split log task. See the code 
> [here|https://github.com/apache/hbase/blob/branch-1/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java#L721-L726].
> But we add a region server to SplitLogManager#deadWorkers set in 
> [SERVER_CRASH_SPLIT_LOGS|https://github.com/apache/hbase/blob/branch-1/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java#L252]
>  state. 
> Before that it runs 
> [SERVER_CRASH_GET_REGIONS|https://github.com/apache/hbase/blob/branch-1/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java#L214]
>  state  and checks if hbase:meta table is up. In this case, hbase:meta table 
> was not online, which prevented SplitLogManager from adding this RS to the
> deadWorkers list. This created a deadlock and the hbase cluster was completely
> down for an extended period of time until we failed over active hmaster. See 
> HBASE-27382 for more details.
> Improvements:
> 1.  We should add a dead region server to the +SplitLogManager#deadWorkers+ list as
> the first step.





[jira] [Updated] (HBASE-27383) Add dead region server to SplitLogManager#deadWorkers set as the first step.

2022-09-21 Thread Rushabh Shah (Jira)


 [ 
https://issues.apache.org/jira/browse/HBASE-27383?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Rushabh Shah updated HBASE-27383:
-
Affects Version/s: 2.4.14

> Add dead region server to SplitLogManager#deadWorkers set as the first step.
> 
>
> Key: HBASE-27383
> URL: https://issues.apache.org/jira/browse/HBASE-27383
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 1.6.0, 1.7.2, 2.4.14
>Reporter: Rushabh Shah
>Assignee: Rushabh Shah
>Priority: Major
>
> Currently we add a dead region server to +SplitLogManager#deadWorkers+ set in 
> SERVER_CRASH_SPLIT_LOGS state. 
> Consider a case where a region server is handling split log task for 
> hbase:meta table and SplitLogManager has exhausted all the retries and won't 
> try any more region server. 
> The region server which is handling split log task has died. 
> We have a check in SplitLogManager where if a region server is declared dead 
> and if that region server is responsible for split log task then we 
> forcefully resubmit split log task. See the code 
> [here|https://github.com/apache/hbase/blob/branch-1/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java#L721-L726].
> But we add a region server to SplitLogManager#deadWorkers set in 
> [SERVER_CRASH_SPLIT_LOGS|https://github.com/apache/hbase/blob/branch-1/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java#L252]
>  state. 
> Before that it runs 
> [SERVER_CRASH_GET_REGIONS|https://github.com/apache/hbase/blob/branch-1/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java#L214]
>  state and checks if the hbase:meta table is up. In this case, hbase:meta table 
> was not online, and that prevented SplitLogManager from adding this RS to the 
> deadWorkers list. This created a deadlock, and the hbase cluster was completely 
> down for an extended period of time until we failed over the active HMaster. See 
> HBASE-27382 for more details.
> Improvements:
> 1. We should add a dead region server to the +SplitLogManager#deadWorkers+ list as 
> the first step.



--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (HBASE-27382) Cluster completely down due to WAL splitting failing for hbase:meta table.

2022-09-21 Thread Viraj Jasani (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27382?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607996#comment-17607996
 ] 

Viraj Jasani commented on HBASE-27382:
--

Yes, ZKSplitLogManagerCoordination is present in hbase 2.x, but it is not in use 
by default. The default is the procedure v2 based WAL split, introduced by 
HBASE-21588.
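
For anyone checking which WAL-split path a branch-2 deployment is actually using, a minimal sketch is below; the property name hbase.split.wal.zk.coordinated and its default of false are assumptions taken from HBASE-21588 and should be verified against the release in use:

{noformat}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;

public class WalSplitModeCheck {
  public static void main(String[] args) {
    Configuration conf = HBaseConfiguration.create();
    // Assumed property name and default; false would mean the procedure v2
    // based WAL split is in effect, true would mean the legacy ZK-coordinated path.
    boolean zkCoordinated = conf.getBoolean("hbase.split.wal.zk.coordinated", false);
    System.out.println(zkCoordinated
        ? "WAL splitting coordinated via ZooKeeper (SplitLogManager path)"
        : "WAL splitting driven by procedure v2 (SplitWALManager path)");
  }
}
{noformat}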

> Cluster completely down due to WAL splitting failing for hbase:meta table.
> --
>
> Key: HBASE-27382
> URL: https://issues.apache.org/jira/browse/HBASE-27382
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 2.5.0, 1.7.2, 2.4.14
>Reporter: Rushabh Shah
>Assignee: Rushabh Shah
>Priority: Major
> Fix For: 2.6.0, 2.5.1, 3.0.0-alpha-4, 2.4.15
>
>
> We are running some version of 1.7.2 in our production environment. We 
> encountered this issue recently.
> We colocate namenode and region server holding hbase:meta table on a set of 5 
> master nodes. Coincidentally, the active namenode and the region server holding the meta 
> table were on the same physical node, and that node went down due to a hardware 
> issue. We have sub-optimal HDFS-level timeouts configured, so whenever the active 
> namenode goes down, it takes around 12-15 minutes for hdfs client within 
> hbase to connect to new active namenode. So all the region servers were 
> having problems for about 15 minutes to connect to new active namenode.
> Below is the sequence of events:
> 1. Host running active namenode and hbase:meta went down at +2022-09-09 
> 16:56:56,878+
> 2. HMaster started running ServerCrashProcedure at +2022-09-09 16:59:05,696+
> {noformat}
> 2022-09-09 16:59:05,696 DEBUG [t-processor-pool2-t1] 
> procedure2.ProcedureExecutor - Procedure ServerCrashProcedure 
> serverName=,61020,1662714013670, shouldSplitWal=true, 
> carryingMeta=true id=1 owner=dummy state=RUNNABLE:SERVER_CRASH_START added to 
> the store.
> 2022-09-09 16:59:05,702 DEBUG [t-processor-pool2-t1] master.ServerManager - 
> Added=,61020,1662714013670 to dead servers, submitted shutdown 
> handler to be executed meta=true
> 2022-09-09 16:59:05,707 DEBUG [ProcedureExecutor-0] master.DeadServer - 
> Started processing ,61020,1662714013670; numProcessing=1
> 2022-09-09 16:59:05,712 INFO  [ProcedureExecutor-0] 
> procedure.ServerCrashProcedure - Start processing crashed 
> ,61020,1662714013670
> {noformat}
> 3. SplitLogManager created 2 split log tasks in zookeeper.
> {noformat}
> 2022-09-09 16:59:06,049 INFO  [ProcedureExecutor-1] master.SplitLogManager - 
> Started splitting 2 logs in 
> [hdfs:///hbase/WALs/,61020,1662714013670-splitting]
>  for [,61020,1662714013670]
> 2022-09-09 16:59:06,081 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 16:59:06,093 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662739251611.meta
> {noformat}
> 4. The first split log task is more interesting: 
> +/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta+
> 5. Since all the region servers were having problems connecting to active 
> namenode, SplitLogManager tried a total of 4 times to assign this task (3 
> resubmits, configured by hbase.splitlog.max.resubmit) and then finally gave 
> up.
> {noformat}
> -- try 1 -
> 2022-09-09 16:59:06,205 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662540522069
> -- try 2 -
> 2022-09-09 17:01:06,642 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 17:01:06,666 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - task not yet acquired 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  ver = 2
> 2022-09-09 17:01:06,715 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662530684713
> -- try 3 -
> 2022-09-09 17:03:07,643 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> 

[jira] [Updated] (HBASE-27382) Cluster completely down due to WAL splitting failing for hbase:meta table.

2022-09-21 Thread Andrew Kyle Purtell (Jira)


 [ 
https://issues.apache.org/jira/browse/HBASE-27382?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Andrew Kyle Purtell updated HBASE-27382:

Fix Version/s: 2.6.0
   2.5.1
   3.0.0-alpha-4
   2.4.15

> Cluster completely down due to WAL splitting failing for hbase:meta table.
> --
>
> Key: HBASE-27382
> URL: https://issues.apache.org/jira/browse/HBASE-27382
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 2.5.0, 1.7.2, 2.4.14
>Reporter: Rushabh Shah
>Assignee: Rushabh Shah
>Priority: Major
> Fix For: 2.6.0, 2.5.1, 3.0.0-alpha-4, 2.4.15
>
>
> We are running some version of 1.7.2 in our production environment. We 
> encountered this issue recently.
> We colocate namenode and region server holding hbase:meta table on a set of 5 
> master nodes. Coincidentally, the active namenode and the region server holding the meta 
> table were on the same physical node, and that node went down due to a hardware 
> issue. We have sub-optimal HDFS-level timeouts configured, so whenever the active 
> namenode goes down, it takes around 12-15 minutes for hdfs client within 
> hbase to connect to new active namenode. So all the region servers were 
> having problems for about 15 minutes to connect to new active namenode.
> Below is the sequence of events:
> 1. Host running active namenode and hbase:meta went down at +2022-09-09 
> 16:56:56,878+
> 2. HMaster started running ServerCrashProcedure at +2022-09-09 16:59:05,696+
> {noformat}
> 2022-09-09 16:59:05,696 DEBUG [t-processor-pool2-t1] 
> procedure2.ProcedureExecutor - Procedure ServerCrashProcedure 
> serverName=,61020,1662714013670, shouldSplitWal=true, 
> carryingMeta=true id=1 owner=dummy state=RUNNABLE:SERVER_CRASH_START added to 
> the store.
> 2022-09-09 16:59:05,702 DEBUG [t-processor-pool2-t1] master.ServerManager - 
> Added=,61020,1662714013670 to dead servers, submitted shutdown 
> handler to be executed meta=true
> 2022-09-09 16:59:05,707 DEBUG [ProcedureExecutor-0] master.DeadServer - 
> Started processing ,61020,1662714013670; numProcessing=1
> 2022-09-09 16:59:05,712 INFO  [ProcedureExecutor-0] 
> procedure.ServerCrashProcedure - Start processing crashed 
> ,61020,1662714013670
> {noformat}
> 3. SplitLogManager created 2 split log tasks in zookeeper.
> {noformat}
> 2022-09-09 16:59:06,049 INFO  [ProcedureExecutor-1] master.SplitLogManager - 
> Started splitting 2 logs in 
> [hdfs:///hbase/WALs/,61020,1662714013670-splitting]
>  for [,61020,1662714013670]
> 2022-09-09 16:59:06,081 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 16:59:06,093 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662739251611.meta
> {noformat}
> 4. The first split log task is more interesting: 
> +/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta+
> 5. Since all the region servers were having problems connecting to active 
> namenode, SplitLogManager tried a total of 4 times to assign this task (3 
> resubmits, configured by hbase.splitlog.max.resubmit) and then finally gave 
> up.
> {noformat}
> -- try 1 -
> 2022-09-09 16:59:06,205 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662540522069
> -- try 2 -
> 2022-09-09 17:01:06,642 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 17:01:06,666 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - task not yet acquired 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  ver = 2
> 2022-09-09 17:01:06,715 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662530684713
> -- try 3 -
> 2022-09-09 17:03:07,643 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 

[jira] [Updated] (HBASE-27382) Cluster completely down due to WAL splitting failing for hbase:meta table.

2022-09-21 Thread Andrew Kyle Purtell (Jira)


 [ 
https://issues.apache.org/jira/browse/HBASE-27382?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Andrew Kyle Purtell updated HBASE-27382:

Affects Version/s: 2.4.14
   2.5.0
   (was: 1.6.0)
   (was: 1.7.1)

> Cluster completely down due to WAL splitting failing for hbase:meta table.
> --
>
> Key: HBASE-27382
> URL: https://issues.apache.org/jira/browse/HBASE-27382
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 2.5.0, 1.7.2, 2.4.14
>Reporter: Rushabh Shah
>Assignee: Rushabh Shah
>Priority: Major
>
> We are running some version of 1.7.2 in our production environment. We 
> encountered this issue recently.
> We colocate namenode and region server holding hbase:meta table on a set of 5 
> master nodes. Coincidentally, the active namenode and the region server holding the meta 
> table were on the same physical node, and that node went down due to a hardware 
> issue. We have sub-optimal HDFS-level timeouts configured, so whenever the active 
> namenode goes down, it takes around 12-15 minutes for hdfs client within 
> hbase to connect to new active namenode. So all the region servers were 
> having problems for about 15 minutes to connect to new active namenode.
> Below is the sequence of events:
> 1. Host running active namenode and hbase:meta went down at +2022-09-09 
> 16:56:56,878+
> 2. HMaster started running ServerCrashProcedure at +2022-09-09 16:59:05,696+
> {noformat}
> 2022-09-09 16:59:05,696 DEBUG [t-processor-pool2-t1] 
> procedure2.ProcedureExecutor - Procedure ServerCrashProcedure 
> serverName=,61020,1662714013670, shouldSplitWal=true, 
> carryingMeta=true id=1 owner=dummy state=RUNNABLE:SERVER_CRASH_START added to 
> the store.
> 2022-09-09 16:59:05,702 DEBUG [t-processor-pool2-t1] master.ServerManager - 
> Added=,61020,1662714013670 to dead servers, submitted shutdown 
> handler to be executed meta=true
> 2022-09-09 16:59:05,707 DEBUG [ProcedureExecutor-0] master.DeadServer - 
> Started processing ,61020,1662714013670; numProcessing=1
> 2022-09-09 16:59:05,712 INFO  [ProcedureExecutor-0] 
> procedure.ServerCrashProcedure - Start processing crashed 
> ,61020,1662714013670
> {noformat}
> 3. SplitLogManager created 2 split log tasks in zookeeper.
> {noformat}
> 2022-09-09 16:59:06,049 INFO  [ProcedureExecutor-1] master.SplitLogManager - 
> Started splitting 2 logs in 
> [hdfs:///hbase/WALs/,61020,1662714013670-splitting]
>  for [,61020,1662714013670]
> 2022-09-09 16:59:06,081 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 16:59:06,093 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662739251611.meta
> {noformat}
> 4. The first split log task is more interesting: 
> +/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta+
> 5. Since all the region servers were having problems connecting to active 
> namenode, SplitLogManager tried a total of 4 times to assign this task (3 
> resubmits, configured by hbase.splitlog.max.resubmit) and then finally gave 
> up.
> {noformat}
> -- try 1 -
> 2022-09-09 16:59:06,205 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662540522069
> -- try 2 -
> 2022-09-09 17:01:06,642 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 17:01:06,666 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - task not yet acquired 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  ver = 2
> 2022-09-09 17:01:06,715 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662530684713
> -- try 3 -
> 2022-09-09 17:03:07,643 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 17:03:07,687 DEBUG 

[jira] [Commented] (HBASE-27382) Cluster completely down due to WAL splitting failing for hbase:meta table.

2022-09-21 Thread Andrew Kyle Purtell (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27382?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607993#comment-17607993
 ] 

Andrew Kyle Purtell commented on HBASE-27382:
-

(y)

> Cluster completely down due to WAL splitting failing for hbase:meta table.
> --
>
> Key: HBASE-27382
> URL: https://issues.apache.org/jira/browse/HBASE-27382
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 2.5.0, 1.7.2, 2.4.14
>Reporter: Rushabh Shah
>Assignee: Rushabh Shah
>Priority: Major
> Fix For: 2.6.0, 2.5.1, 3.0.0-alpha-4, 2.4.15
>
>
> We are running some version of 1.7.2 in our production environment. We 
> encountered this issue recently.
> We colocate namenode and region server holding hbase:meta table on a set of 5 
> master nodes. Coincidentally, the active namenode and the region server holding the meta 
> table were on the same physical node, and that node went down due to a hardware 
> issue. We have sub-optimal HDFS-level timeouts configured, so whenever the active 
> namenode goes down, it takes around 12-15 minutes for hdfs client within 
> hbase to connect to new active namenode. So all the region servers were 
> having problems for about 15 minutes to connect to new active namenode.
> Below is the sequence of events:
> 1. Host running active namenode and hbase:meta went down at +2022-09-09 
> 16:56:56,878+
> 2. HMaster started running ServerCrashProcedure at +2022-09-09 16:59:05,696+
> {noformat}
> 2022-09-09 16:59:05,696 DEBUG [t-processor-pool2-t1] 
> procedure2.ProcedureExecutor - Procedure ServerCrashProcedure 
> serverName=,61020,1662714013670, shouldSplitWal=true, 
> carryingMeta=true id=1 owner=dummy state=RUNNABLE:SERVER_CRASH_START added to 
> the store.
> 2022-09-09 16:59:05,702 DEBUG [t-processor-pool2-t1] master.ServerManager - 
> Added=,61020,1662714013670 to dead servers, submitted shutdown 
> handler to be executed meta=true
> 2022-09-09 16:59:05,707 DEBUG [ProcedureExecutor-0] master.DeadServer - 
> Started processing ,61020,1662714013670; numProcessing=1
> 2022-09-09 16:59:05,712 INFO  [ProcedureExecutor-0] 
> procedure.ServerCrashProcedure - Start processing crashed 
> ,61020,1662714013670
> {noformat}
> 3. SplitLogManager created 2 split log tasks in zookeeper.
> {noformat}
> 2022-09-09 16:59:06,049 INFO  [ProcedureExecutor-1] master.SplitLogManager - 
> Started splitting 2 logs in 
> [hdfs:///hbase/WALs/,61020,1662714013670-splitting]
>  for [,61020,1662714013670]
> 2022-09-09 16:59:06,081 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 16:59:06,093 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662739251611.meta
> {noformat}
> 4. The first split log task is more interesting: 
> +/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta+
> 5. Since all the region servers were having problems connecting to active 
> namenode, SplitLogManager tried a total of 4 times to assign this task (3 
> resubmits, configured by hbase.splitlog.max.resubmit) and then finally gave 
> up.
> {noformat}
> -- try 1 -
> 2022-09-09 16:59:06,205 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662540522069
> -- try 2 -
> 2022-09-09 17:01:06,642 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 17:01:06,666 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - task not yet acquired 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  ver = 2
> 2022-09-09 17:01:06,715 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662530684713
> -- try 3 -
> 2022-09-09 17:03:07,643 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 17:03:07,687 DEBUG [main-EventThread] 
> 

[GitHub] [hbase] Apache-HBase commented on pull request #4793: HBASE-27309 Add major compact table or region operation on master web…

2022-09-21 Thread GitBox


Apache-HBase commented on PR #4793:
URL: https://github.com/apache/hbase/pull/4793#issuecomment-1254302016

   :confetti_ball: **+1 overall**
   
   
   
   
   
   
   | Vote | Subsystem | Runtime | Comment |
   |::|--:|:|:|
   | +0 :ok: |  reexec  |   1m 30s |  Docker mode activated.  |
   | -0 :warning: |  yetus  |   0m  3s |  Unprocessed flag(s): 
--brief-report-file --spotbugs-strict-precheck --whitespace-eol-ignore-list 
--whitespace-tabs-ignore-list --quick-hadoopcheck  |
   ||| _ Prechecks _ |
   ||| _ master Compile Tests _ |
   | +1 :green_heart: |  mvninstall  |   3m 27s |  master passed  |
   | +1 :green_heart: |  javadoc  |   0m 36s |  master passed  |
   ||| _ Patch Compile Tests _ |
   | +1 :green_heart: |  mvninstall  |   3m 38s |  the patch passed  |
   | +1 :green_heart: |  javadoc  |   0m 36s |  the patch passed  |
   ||| _ Other Tests _ |
   | +1 :green_heart: |  unit  | 227m 11s |  hbase-server in the patch passed.  
|
   |  |   | 238m 34s |   |
   
   
   | Subsystem | Report/Notes |
   |--:|:-|
   | Docker | ClientAPI=1.41 ServerAPI=1.41 base: 
https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-4793/1/artifact/yetus-jdk8-hadoop3-check/output/Dockerfile
 |
   | GITHUB PR | https://github.com/apache/hbase/pull/4793 |
   | Optional Tests | javac javadoc unit |
   | uname | Linux 561f1696eb87 5.4.0-1081-aws #88~18.04.1-Ubuntu SMP Thu Jun 
23 16:29:17 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux |
   | Build tool | maven |
   | Personality | dev-support/hbase-personality.sh |
   | git revision | master / de127bde84 |
   | Default Java | Temurin-1.8.0_345-b01 |
   |  Test Results | 
https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-4793/1/testReport/
 |
   | Max. process+thread count | 2645 (vs. ulimit of 3) |
   | modules | C: hbase-server U: hbase-server |
   | Console output | 
https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-4793/1/console 
|
   | versions | git=2.17.1 maven=3.6.3 |
   | Powered by | Apache Yetus 0.12.0 https://yetus.apache.org |
   
   
   This message was automatically generated.
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@hbase.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



[jira] [Commented] (HBASE-27382) Cluster completely down due to WAL splitting failing for hbase:meta table.

2022-09-21 Thread Rushabh Shah (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27382?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607987#comment-17607987
 ] 

Rushabh Shah commented on HBASE-27382:
--

[~apurtell] This is an issue in branch-2 as well. We still have 
ZKSplitLogManagerCoordination present in branch-2, even though the default is not 
to use zookeeper as the coordination engine.

> Cluster completely down due to WAL splitting failing for hbase:meta table.
> --
>
> Key: HBASE-27382
> URL: https://issues.apache.org/jira/browse/HBASE-27382
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 1.6.0, 1.7.1, 1.7.2
>Reporter: Rushabh Shah
>Assignee: Rushabh Shah
>Priority: Major
>
> We are running some version of 1.7.2 in our production environment. We 
> encountered this issue recently.
> We colocate namenode and region server holding hbase:meta table on a set of 5 
> master nodes. Coincidentally, the active namenode and the region server holding the meta 
> table were on the same physical node, and that node went down due to a hardware 
> issue. We have sub-optimal HDFS-level timeouts configured, so whenever the active 
> namenode goes down, it takes around 12-15 minutes for hdfs client within 
> hbase to connect to new active namenode. So all the region servers were 
> having problems for about 15 minutes to connect to new active namenode.
> Below is the sequence of events:
> 1. Host running active namenode and hbase:meta went down at +2022-09-09 
> 16:56:56,878+
> 2. HMaster started running ServerCrashProcedure at +2022-09-09 16:59:05,696+
> {noformat}
> 2022-09-09 16:59:05,696 DEBUG [t-processor-pool2-t1] 
> procedure2.ProcedureExecutor - Procedure ServerCrashProcedure 
> serverName=,61020,1662714013670, shouldSplitWal=true, 
> carryingMeta=true id=1 owner=dummy state=RUNNABLE:SERVER_CRASH_START added to 
> the store.
> 2022-09-09 16:59:05,702 DEBUG [t-processor-pool2-t1] master.ServerManager - 
> Added=,61020,1662714013670 to dead servers, submitted shutdown 
> handler to be executed meta=true
> 2022-09-09 16:59:05,707 DEBUG [ProcedureExecutor-0] master.DeadServer - 
> Started processing ,61020,1662714013670; numProcessing=1
> 2022-09-09 16:59:05,712 INFO  [ProcedureExecutor-0] 
> procedure.ServerCrashProcedure - Start processing crashed 
> ,61020,1662714013670
> {noformat}
> 3. SplitLogManager created 2 split log tasks in zookeeper.
> {noformat}
> 2022-09-09 16:59:06,049 INFO  [ProcedureExecutor-1] master.SplitLogManager - 
> Started splitting 2 logs in 
> [hdfs:///hbase/WALs/,61020,1662714013670-splitting]
>  for [,61020,1662714013670]
> 2022-09-09 16:59:06,081 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 16:59:06,093 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662739251611.meta
> {noformat}
> 4. The first split log task is more interesting: 
> +/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta+
> 5. Since all the region servers were having problems connecting to active 
> namenode, SplitLogManager tried a total of 4 times to assign this task (3 
> resubmits, configured by hbase.splitlog.max.resubmit) and then finally gave 
> up.
> {noformat}
> -- try 1 -
> 2022-09-09 16:59:06,205 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662540522069
> -- try 2 -
> 2022-09-09 17:01:06,642 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 17:01:06,666 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - task not yet acquired 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  ver = 2
> 2022-09-09 17:01:06,715 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662530684713
> -- try 3 -
> 2022-09-09 17:03:07,643 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> 

[GitHub] [hbase] Apache-HBase commented on pull request #4793: HBASE-27309 Add major compact table or region operation on master web…

2022-09-21 Thread GitBox


Apache-HBase commented on PR #4793:
URL: https://github.com/apache/hbase/pull/4793#issuecomment-1254274369

   :confetti_ball: **+1 overall**
   
   
   
   
   
   
   | Vote | Subsystem | Runtime | Comment |
   |::|--:|:|:|
   | +0 :ok: |  reexec  |   1m 25s |  Docker mode activated.  |
   | -0 :warning: |  yetus  |   0m  2s |  Unprocessed flag(s): 
--brief-report-file --spotbugs-strict-precheck --whitespace-eol-ignore-list 
--whitespace-tabs-ignore-list --quick-hadoopcheck  |
   ||| _ Prechecks _ |
   ||| _ master Compile Tests _ |
   | +1 :green_heart: |  mvninstall  |   3m 51s |  master passed  |
   | +1 :green_heart: |  javadoc  |   0m 41s |  master passed  |
   ||| _ Patch Compile Tests _ |
   | +1 :green_heart: |  mvninstall  |   3m 22s |  the patch passed  |
   | +1 :green_heart: |  javadoc  |   0m 32s |  the patch passed  |
   ||| _ Other Tests _ |
   | +1 :green_heart: |  unit  | 196m 29s |  hbase-server in the patch passed.  
|
   |  |   | 207m 41s |   |
   
   
   | Subsystem | Report/Notes |
   |--:|:-|
   | Docker | ClientAPI=1.41 ServerAPI=1.41 base: 
https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-4793/1/artifact/yetus-jdk11-hadoop3-check/output/Dockerfile
 |
   | GITHUB PR | https://github.com/apache/hbase/pull/4793 |
   | Optional Tests | javac javadoc unit |
   | uname | Linux cb8171f296f7 5.4.0-1071-aws #76~18.04.1-Ubuntu SMP Mon Mar 
28 17:49:57 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux |
   | Build tool | maven |
   | Personality | dev-support/hbase-personality.sh |
   | git revision | master / de127bde84 |
   | Default Java | Eclipse Adoptium-11.0.16.1+1 |
   |  Test Results | 
https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-4793/1/testReport/
 |
   | Max. process+thread count | 2437 (vs. ulimit of 3) |
   | modules | C: hbase-server U: hbase-server |
   | Console output | 
https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-4793/1/console 
|
   | versions | git=2.17.1 maven=3.6.3 |
   | Powered by | Apache Yetus 0.12.0 https://yetus.apache.org |
   
   
   This message was automatically generated.
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@hbase.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



[jira] [Commented] (HBASE-27382) Cluster completely down due to WAL splitting failing for hbase:meta table.

2022-09-21 Thread Andrew Kyle Purtell (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27382?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607970#comment-17607970
 ] 

Andrew Kyle Purtell commented on HBASE-27382:
-

Is this an issue in HBase 2 as well? 

HBase 1.x is EOM so if this is only an issue for 1.x, we won't follow up. 
EOM jira: https://issues.apache.org/jira/browse/HBASE-27286
Thread: https://lists.apache.org/thread/8qy2lqtdwzc10tqnfdhrqvo35z6vkrvk

> Cluster completely down due to WAL splitting failing for hbase:meta table.
> --
>
> Key: HBASE-27382
> URL: https://issues.apache.org/jira/browse/HBASE-27382
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 1.6.0, 1.7.1, 1.7.2
>Reporter: Rushabh Shah
>Assignee: Rushabh Shah
>Priority: Major
>
> We are running some version of 1.7.2 in our production environment. We 
> encountered this issue recently.
> We colocate namenode and region server holding hbase:meta table on a set of 5 
> master nodes. Coincidentally, the active namenode and the region server holding the meta 
> table were on the same physical node, and that node went down due to a hardware 
> issue. We have sub-optimal HDFS-level timeouts configured, so whenever the active 
> namenode goes down, it takes around 12-15 minutes for hdfs client within 
> hbase to connect to new active namenode. So all the region servers were 
> having problems for about 15 minutes to connect to new active namenode.
> Below is the sequence of events:
> 1. Host running active namenode and hbase:meta went down at +2022-09-09 
> 16:56:56,878+
> 2. HMaster started running ServerCrashProcedure at +2022-09-09 16:59:05,696+
> {noformat}
> 2022-09-09 16:59:05,696 DEBUG [t-processor-pool2-t1] 
> procedure2.ProcedureExecutor - Procedure ServerCrashProcedure 
> serverName=,61020,1662714013670, shouldSplitWal=true, 
> carryingMeta=true id=1 owner=dummy state=RUNNABLE:SERVER_CRASH_START added to 
> the store.
> 2022-09-09 16:59:05,702 DEBUG [t-processor-pool2-t1] master.ServerManager - 
> Added=,61020,1662714013670 to dead servers, submitted shutdown 
> handler to be executed meta=true
> 2022-09-09 16:59:05,707 DEBUG [ProcedureExecutor-0] master.DeadServer - 
> Started processing ,61020,1662714013670; numProcessing=1
> 2022-09-09 16:59:05,712 INFO  [ProcedureExecutor-0] 
> procedure.ServerCrashProcedure - Start processing crashed 
> ,61020,1662714013670
> {noformat}
> 3. SplitLogManager created 2 split log tasks in zookeeper.
> {noformat}
> 2022-09-09 16:59:06,049 INFO  [ProcedureExecutor-1] master.SplitLogManager - 
> Started splitting 2 logs in 
> [hdfs:///hbase/WALs/,61020,1662714013670-splitting]
>  for [,61020,1662714013670]
> 2022-09-09 16:59:06,081 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 16:59:06,093 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662739251611.meta
> {noformat}
> 4. The first split log task is more interesting: 
> +/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta+
> 5. Since all the region servers were having problems connecting to active 
> namenode, SplitLogManager tried a total of 4 times to assign this task (3 
> resubmits, configured by hbase.splitlog.max.resubmit) and then finally gave 
> up.
> {noformat}
> -- try 1 -
> 2022-09-09 16:59:06,205 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662540522069
> -- try 2 -
> 2022-09-09 17:01:06,642 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 17:01:06,666 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - task not yet acquired 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  ver = 2
> 2022-09-09 17:01:06,715 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662530684713
> -- try 3 -
> 2022-09-09 17:03:07,643 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 

[jira] [Commented] (HBASE-27383) Add dead region server to SplitLogManager#deadWorkers set as the first step.

2022-09-21 Thread Rushabh Shah (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27383?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607969#comment-17607969
 ] 

Rushabh Shah commented on HBASE-27383:
--

Cc [~dmanning]

> Add dead region server to SplitLogManager#deadWorkers set as the first step.
> 
>
> Key: HBASE-27383
> URL: https://issues.apache.org/jira/browse/HBASE-27383
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 1.6.0, 1.7.2
>Reporter: Rushabh Shah
>Assignee: Rushabh Shah
>Priority: Major
>
> Currently we add a dead region server to +SplitLogManager#deadWorkers+ set in 
> SERVER_CRASH_SPLIT_LOGS state. 
> Consider a case where a region server is handling the split log task for the 
> hbase:meta table and SplitLogManager has exhausted all the retries and won't 
> try any more region servers. 
> The region server which is handling split log task has died. 
> We have a check in SplitLogManager where if a region server is declared dead 
> and if that region server is responsible for split log task then we 
> forcefully resubmit split log task. See the code 
> [here|https://github.com/apache/hbase/blob/branch-1/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java#L721-L726].
> But we add a region server to SplitLogManager#deadWorkers set in 
> [SERVER_CRASH_SPLIT_LOGS|https://github.com/apache/hbase/blob/branch-1/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java#L252]
>  state. 
> Before that it runs 
> [SERVER_CRASH_GET_REGIONS|https://github.com/apache/hbase/blob/branch-1/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java#L214]
>  state and checks if the hbase:meta table is up. In this case, hbase:meta table 
> was not online, and that prevented SplitLogManager from adding this RS to the 
> deadWorkers list. This created a deadlock, and the hbase cluster was completely 
> down for an extended period of time until we failed over the active HMaster. See 
> HBASE-27382 for more details.
> Improvements:
> 1. We should add a dead region server to the +SplitLogManager#deadWorkers+ list as 
> the first step.



--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Created] (HBASE-27383) Add dead region server to SplitLogManager#deadWorkers set as the first step.

2022-09-21 Thread Rushabh Shah (Jira)
Rushabh Shah created HBASE-27383:


 Summary: Add dead region server to SplitLogManager#deadWorkers set 
as the first step.
 Key: HBASE-27383
 URL: https://issues.apache.org/jira/browse/HBASE-27383
 Project: HBase
  Issue Type: Bug
Affects Versions: 1.7.2, 1.6.0
Reporter: Rushabh Shah
Assignee: Rushabh Shah


Currently we add a dead region server to +SplitLogManager#deadWorkers+ set in 
SERVER_CRASH_SPLIT_LOGS state. 
Consider a case where a region server is handling the split log task for the hbase:meta 
table and SplitLogManager has exhausted all the retries and won't try any more 
region servers. 
The region server which is handling split log task has died. 
We have a check in SplitLogManager where if a region server is declared dead 
and if that region server is responsible for split log task then we forcefully 
resubmit split log task. See the code 
[here|https://github.com/apache/hbase/blob/branch-1/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java#L721-L726].

But we add a region server to SplitLogManager#deadWorkers set in 
[SERVER_CRASH_SPLIT_LOGS|https://github.com/apache/hbase/blob/branch-1/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java#L252]
 state. 
Before that it runs 
[SERVER_CRASH_GET_REGIONS|https://github.com/apache/hbase/blob/branch-1/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java#L214]
 state and checks if the hbase:meta table is up. In this case, hbase:meta table 
was not online, and that prevented SplitLogManager from adding this RS to the deadWorkers 
list. This created a deadlock, and the hbase cluster was completely down for an 
extended period of time until we failed over the active HMaster. See HBASE-27382 
for more details.

Improvements:
1. We should add a dead region server to the +SplitLogManager#deadWorkers+ list as the 
first step (see the sketch below).
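
A rough sketch of the proposed ordering, using a simplified model of the crash-handling flow; none of the names below are the real ServerCrashProcedure or SplitLogManager API:

{noformat}
import java.util.Set;
import java.util.concurrent.ConcurrentSkipListSet;

// Toy model of the ordering proposed above; names are illustrative only.
public class DeadWorkerFirstSketch {

  static final Set<String> deadWorkers = new ConcurrentSkipListSet<>();

  // Proposed flow: record the dead worker before any step that can block on
  // hbase:meta being online, so a split log task held by that worker can be
  // force-resubmitted even after hbase.splitlog.max.resubmit is exhausted.
  static void processCrashedServer(String crashedServer) {
    deadWorkers.add(crashedServer);     // proposed first step
    waitForMetaIfNeeded();              // analogue of SERVER_CRASH_GET_REGIONS
    splitLogsOf(crashedServer);         // analogue of SERVER_CRASH_SPLIT_LOGS,
                                        // where the dead worker used to be recorded
  }

  static void waitForMetaIfNeeded() { /* may block while hbase:meta is offline */ }

  static void splitLogsOf(String server) { /* split the WALs of the crashed server */ }

  public static void main(String[] args) {
    processCrashedServer("rs-a,61020,1662714013670");
    System.out.println("dead workers: " + deadWorkers);
  }
}
{noformat}

With the dead worker registered up front, the force-resubmit check in SplitLogManager can fire even while hbase:meta is still offline.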




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (HBASE-27382) Cluster completely down due to WAL splitting failing for hbase:meta table.

2022-09-21 Thread Rushabh Shah (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27382?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607960#comment-17607960
 ] 

Rushabh Shah commented on HBASE-27382:
--

I haven't looked at the proc v2 based WAL split implementation, but we still 
have config properties like hbase.splitlog.max.resubmit which dictate this 
behavior.
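
For reference, hbase.splitlog.max.resubmit can be read like any other HBase configuration value; the default of 3 used below matches the three resubmits observed above but is an assumption, not a confirmed default:

{noformat}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;

public class SplitLogRetryConfig {
  public static void main(String[] args) {
    Configuration conf = HBaseConfiguration.create();
    // 3 resubmits means 4 assignment attempts in total before SplitLogManager gives up.
    int maxResubmit = conf.getInt("hbase.splitlog.max.resubmit", 3);
    System.out.println("split log tasks are resubmitted at most " + maxResubmit + " times");
  }
}
{noformat}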

> Cluster completely down due to WAL splitting failing for hbase:meta table.
> --
>
> Key: HBASE-27382
> URL: https://issues.apache.org/jira/browse/HBASE-27382
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 1.6.0, 1.7.1, 1.7.2
>Reporter: Rushabh Shah
>Assignee: Rushabh Shah
>Priority: Major
>
> We are running some version of 1.7.2 in our production environment. We 
> encountered this issue recently.
> We colocate namenode and region server holding hbase:meta table on a set of 5 
> master nodes. Coincidentally, the active namenode and the region server holding the meta 
> table were on the same physical node, and that node went down due to a hardware 
> issue. We have sub-optimal HDFS-level timeouts configured, so whenever the active 
> namenode goes down, it takes around 12-15 minutes for hdfs client within 
> hbase to connect to new active namenode. So all the region servers were 
> having problems for about 15 minutes to connect to new active namenode.
> Below is the sequence of events:
> 1. Host running active namenode and hbase:meta went down at +2022-09-09 
> 16:56:56,878+
> 2. HMaster started running ServerCrashProcedure at +2022-09-09 16:59:05,696+
> {noformat}
> 2022-09-09 16:59:05,696 DEBUG [t-processor-pool2-t1] 
> procedure2.ProcedureExecutor - Procedure ServerCrashProcedure 
> serverName=,61020,1662714013670, shouldSplitWal=true, 
> carryingMeta=true id=1 owner=dummy state=RUNNABLE:SERVER_CRASH_START added to 
> the store.
> 2022-09-09 16:59:05,702 DEBUG [t-processor-pool2-t1] master.ServerManager - 
> Added=,61020,1662714013670 to dead servers, submitted shutdown 
> handler to be executed meta=true
> 2022-09-09 16:59:05,707 DEBUG [ProcedureExecutor-0] master.DeadServer - 
> Started processing ,61020,1662714013670; numProcessing=1
> 2022-09-09 16:59:05,712 INFO  [ProcedureExecutor-0] 
> procedure.ServerCrashProcedure - Start processing crashed 
> ,61020,1662714013670
> {noformat}
> 3. SplitLogManager created 2 split log tasks in zookeeper.
> {noformat}
> 2022-09-09 16:59:06,049 INFO  [ProcedureExecutor-1] master.SplitLogManager - 
> Started splitting 2 logs in 
> [hdfs:///hbase/WALs/,61020,1662714013670-splitting]
>  for [,61020,1662714013670]
> 2022-09-09 16:59:06,081 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 16:59:06,093 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662739251611.meta
> {noformat}
> 4. The first split log task is more interesting: 
> +/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta+
> 5. Since all the region servers were having problems connecting to active 
> namenode, SplitLogManager tried a total of 4 times to assign this task (3 
> resubmits, configured by hbase.splitlog.max.resubmit) and then finally gave 
> up.
> {noformat}
> -- try 1 -
> 2022-09-09 16:59:06,205 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662540522069
> -- try 2 -
> 2022-09-09 17:01:06,642 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 17:01:06,666 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - task not yet acquired 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  ver = 2
> 2022-09-09 17:01:06,715 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662530684713
> -- try 3 -
> 2022-09-09 17:03:07,643 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> 

[GitHub] [hbase] Apache-HBase commented on pull request #4724: HBASE-27280 Add mutual authentication support to TLS

2022-09-21 Thread GitBox


Apache-HBase commented on PR #4724:
URL: https://github.com/apache/hbase/pull/4724#issuecomment-1254189367

   :broken_heart: **-1 overall**
   
   
   
   
   
   
   | Vote | Subsystem | Runtime | Comment |
   |::|--:|:|:|
   | +0 :ok: |  reexec  |   0m 45s |  Docker mode activated.  |
   | -0 :warning: |  yetus  |   0m  3s |  Unprocessed flag(s): 
--brief-report-file --spotbugs-strict-precheck --whitespace-eol-ignore-list 
--whitespace-tabs-ignore-list --quick-hadoopcheck  |
   ||| _ Prechecks _ |
   ||| _ master Compile Tests _ |
   | +0 :ok: |  mvndep  |   0m 44s |  Maven dependency ordering for branch  |
   | +1 :green_heart: |  mvninstall  |   2m 19s |  master passed  |
   | +1 :green_heart: |  compile  |   0m 48s |  master passed  |
   | +1 :green_heart: |  shadedjars  |   4m  6s |  branch has no errors when 
building our shaded downstream artifacts.  |
   | +1 :green_heart: |  javadoc  |   0m 37s |  master passed  |
   ||| _ Patch Compile Tests _ |
   | +0 :ok: |  mvndep  |   0m 12s |  Maven dependency ordering for patch  |
   | +1 :green_heart: |  mvninstall  |   2m 10s |  the patch passed  |
   | +1 :green_heart: |  compile  |   0m 49s |  the patch passed  |
   | +1 :green_heart: |  javac  |   0m 49s |  the patch passed  |
   | +1 :green_heart: |  shadedjars  |   4m  5s |  patch has no errors when 
building our shaded downstream artifacts.  |
   | +1 :green_heart: |  javadoc  |   0m 35s |  the patch passed  |
   ||| _ Other Tests _ |
   | +1 :green_heart: |  unit  |   1m 41s |  hbase-common in the patch passed.  
|
   | -1 :x: |  unit  | 234m 24s |  hbase-server in the patch failed.  |
   |  |   | 256m 47s |   |
   
   
   | Subsystem | Report/Notes |
   |--:|:-|
   | Docker | ClientAPI=1.41 ServerAPI=1.41 base: 
https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-4724/10/artifact/yetus-jdk8-hadoop3-check/output/Dockerfile
 |
   | GITHUB PR | https://github.com/apache/hbase/pull/4724 |
   | Optional Tests | javac javadoc unit shadedjars compile |
   | uname | Linux 8764b0c3b5a3 5.4.0-1081-aws #88~18.04.1-Ubuntu SMP Thu Jun 
23 16:29:17 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux |
   | Build tool | maven |
   | Personality | dev-support/hbase-personality.sh |
   | git revision | master / de127bde84 |
   | Default Java | Temurin-1.8.0_345-b01 |
   | unit | 
https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-4724/10/artifact/yetus-jdk8-hadoop3-check/output/patch-unit-hbase-server.txt
 |
   |  Test Results | 
https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-4724/10/testReport/
 |
   | Max. process+thread count | 2627 (vs. ulimit of 3) |
   | modules | C: hbase-common hbase-server U: . |
   | Console output | 
https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-4724/10/console
 |
   | versions | git=2.17.1 maven=3.6.3 |
   | Powered by | Apache Yetus 0.12.0 https://yetus.apache.org |
   
   
   This message was automatically generated.
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@hbase.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



[jira] [Updated] (HBASE-27382) Cluster completely down due to WAL splitting failing for hbase:meta table.

2022-09-21 Thread Viraj Jasani (Jira)


 [ 
https://issues.apache.org/jira/browse/HBASE-27382?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Viraj Jasani updated HBASE-27382:
-
Affects Version/s: 1.7.1
   1.6.0

> Cluster completely down due to WAL splitting failing for hbase:meta table.
> --
>
> Key: HBASE-27382
> URL: https://issues.apache.org/jira/browse/HBASE-27382
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 1.6.0, 1.7.1, 1.7.2
>Reporter: Rushabh Shah
>Assignee: Rushabh Shah
>Priority: Major
>
> We are running some version of 1.7.2 in our production environment. We 
> encountered this issue recently.
> We colocate namenode and region server holding hbase:meta table on a set of 5 
> master nodes. Coincidentally, the active namenode and the region server holding the meta 
> table were on the same physical node, and that node went down due to a hardware 
> issue. We have sub-optimal HDFS-level timeouts configured, so whenever the active 
> namenode goes down, it takes around 12-15 minutes for hdfs client within 
> hbase to connect to new active namenode. So all the region servers were 
> having problems for about 15 minutes to connect to new active namenode.
> Below is the sequence of events:
> 1. Host running active namenode and hbase:meta went down at +2022-09-09 
> 16:56:56,878+
> 2. HMaster started running ServerCrashProcedure at +2022-09-09 16:59:05,696+
> {noformat}
> 2022-09-09 16:59:05,696 DEBUG [t-processor-pool2-t1] 
> procedure2.ProcedureExecutor - Procedure ServerCrashProcedure 
> serverName=,61020,1662714013670, shouldSplitWal=true, 
> carryingMeta=true id=1 owner=dummy state=RUNNABLE:SERVER_CRASH_START added to 
> the store.
> 2022-09-09 16:59:05,702 DEBUG [t-processor-pool2-t1] master.ServerManager - 
> Added=,61020,1662714013670 to dead servers, submitted shutdown 
> handler to be executed meta=true
> 2022-09-09 16:59:05,707 DEBUG [ProcedureExecutor-0] master.DeadServer - 
> Started processing ,61020,1662714013670; numProcessing=1
> 2022-09-09 16:59:05,712 INFO  [ProcedureExecutor-0] 
> procedure.ServerCrashProcedure - Start processing crashed 
> ,61020,1662714013670
> {noformat}
> 3. SplitLogManager created 2 split log tasks in zookeeper.
> {noformat}
> 2022-09-09 16:59:06,049 INFO  [ProcedureExecutor-1] master.SplitLogManager - 
> Started splitting 2 logs in 
> [hdfs:///hbase/WALs/,61020,1662714013670-splitting]
>  for [,61020,1662714013670]
> 2022-09-09 16:59:06,081 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 16:59:06,093 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662739251611.meta
> {noformat}
> 4. The first split log task is more interesting: 
> +/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta+
> 5. Since all the region servers were having problems connecting to active 
> namenode, SplitLogManager tried a total of 4 times to assign this task (3 
> resubmits, configured by hbase.splitlog.max.resubmit) and then finally gave 
> up.
> {noformat}
> -- try 1 -
> 2022-09-09 16:59:06,205 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662540522069
> -- try 2 -
> 2022-09-09 17:01:06,642 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 17:01:06,666 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - task not yet acquired 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  ver = 2
> 2022-09-09 17:01:06,715 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662530684713
> -- try 3 -
> 2022-09-09 17:03:07,643 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 17:03:07,687 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - task not yet acquired 
> 

[jira] [Commented] (HBASE-27382) Cluster completely down due to WAL splitting failing for hbase:meta table.

2022-09-21 Thread Viraj Jasani (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27382?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607959#comment-17607959
 ] 

Viraj Jasani commented on HBASE-27382:
--

Thanks for the detailed analysis [~shahrs87]!

[~tianjingyun] The bug mentioned on this Jira should likely not appear with 
HBASE-21588 (proc v2 based WAL split implementation), but I was wondering if 
the intention to get rid of zookeeper for WAL splitting in 2.2+ releases had 
something to do with this category of bugs (in addition to a) perf improvement 
and b) no longer relying on ZK for WAL split coordination).

 

cc [~stack] [~zhangduo]

> Cluster completely down due to WAL splitting failing for hbase:meta table.
> --
>
> Key: HBASE-27382
> URL: https://issues.apache.org/jira/browse/HBASE-27382
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 1.7.2
>Reporter: Rushabh Shah
>Assignee: Rushabh Shah
>Priority: Major
>
> We are running some version of 1.7.2 in our production environment. We 
> encountered this issue recently.
> We colocate namenode and region server holding hbase:meta table on a set of 5 
> master nodes. Coincidentally, the active namenode and the region server holding the meta 
> table were on the same physical node, and that node went down due to a hardware 
> issue. We have sub-optimal HDFS-level timeouts configured, so whenever the active 
> namenode goes down, it takes around 12-15 minutes for hdfs client within 
> hbase to connect to new active namenode. So all the region servers were 
> having problems for about 15 minutes to connect to new active namenode.
> Below is the sequence of events:
> 1. Host running active namenode and hbase:meta went down at +2022-09-09 
> 16:56:56,878+
> 2. HMaster started running ServerCrashProcedure at +2022-09-09 16:59:05,696+
> {noformat}
> 2022-09-09 16:59:05,696 DEBUG [t-processor-pool2-t1] 
> procedure2.ProcedureExecutor - Procedure ServerCrashProcedure 
> serverName=,61020,1662714013670, shouldSplitWal=true, 
> carryingMeta=true id=1 owner=dummy state=RUNNABLE:SERVER_CRASH_START added to 
> the store.
> 2022-09-09 16:59:05,702 DEBUG [t-processor-pool2-t1] master.ServerManager - 
> Added=,61020,1662714013670 to dead servers, submitted shutdown 
> handler to be executed meta=true
> 2022-09-09 16:59:05,707 DEBUG [ProcedureExecutor-0] master.DeadServer - 
> Started processing ,61020,1662714013670; numProcessing=1
> 2022-09-09 16:59:05,712 INFO  [ProcedureExecutor-0] 
> procedure.ServerCrashProcedure - Start processing crashed 
> ,61020,1662714013670
> {noformat}
> 3. SplitLogManager created 2 split log tasks in zookeeper.
> {noformat}
> 2022-09-09 16:59:06,049 INFO  [ProcedureExecutor-1] master.SplitLogManager - 
> Started splitting 2 logs in 
> [hdfs:///hbase/WALs/,61020,1662714013670-splitting]
>  for [,61020,1662714013670]
> 2022-09-09 16:59:06,081 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 16:59:06,093 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662739251611.meta
> {noformat}
> 4. The first split log task is more interesting: 
> +/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta+
> 5. Since all the region servers were having problems connecting to active 
> namenode, SplitLogManager tried a total of 4 times to assign this task (3 
> resubmits, configured by hbase.splitlog.max.resubmit) and then finally gave 
> up.
> {noformat}
> -- try 1 -
> 2022-09-09 16:59:06,205 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662540522069
> -- try 2 -
> 2022-09-09 17:01:06,642 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 17:01:06,666 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - task not yet acquired 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  ver = 2
> 2022-09-09 17:01:06,715 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by 

[jira] [Commented] (HBASE-27382) Cluster completely down due to WAL splitting failing for hbase:meta table.

2022-09-21 Thread Rushabh Shah (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27382?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607950#comment-17607950
 ] 

Rushabh Shah commented on HBASE-27382:
--

Cc [~apurt...@yahoo.com] [~vjasani] [~dmanning]

> Cluster completely down due to WAL splitting failing for hbase:meta table.
> --
>
> Key: HBASE-27382
> URL: https://issues.apache.org/jira/browse/HBASE-27382
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 1.7.2
>Reporter: Rushabh Shah
>Assignee: Rushabh Shah
>Priority: Major
>
> We are running some version of 1.7.2 in our production environment. We 
> encountered this issue recently.
> We colocate namenode and region server holding hbase:meta table on a set of 5 
> master nodes. Co-incidentally active namenode and region server holding meta 
> table were on the same physical node and that node went down due to hardware 
> issue. We have sub optimal hdfs level timeouts configured so whenever active 
> namenode goes down, it takes around 12-15 minutes for hdfs client within 
> hbase to connect to new active namenode. So all the region servers were 
> having problems for about 15 minutes to connect to new active namenode.
> Below are the sequence of events:
> 1. Host running active namenode and hbase:meta went down at +2022-09-09 
> 16:56:56,878+
> 2. HMaster started running ServerCrashProcedure at +2022-09-09 16:59:05,696+
> {noformat}
> 2022-09-09 16:59:05,696 DEBUG [t-processor-pool2-t1] 
> procedure2.ProcedureExecutor - Procedure ServerCrashProcedure 
> serverName=,61020,1662714013670, shouldSplitWal=true, 
> carryingMeta=true id=1 owner=dummy state=RUNNABLE:SERVER_CRASH_START added to 
> the store.
> 2022-09-09 16:59:05,702 DEBUG [t-processor-pool2-t1] master.ServerManager - 
> Added=,61020,1662714013670 to dead servers, submitted shutdown 
> handler to be executed meta=true
> 2022-09-09 16:59:05,707 DEBUG [ProcedureExecutor-0] master.DeadServer - 
> Started processing ,61020,1662714013670; numProcessing=1
> 2022-09-09 16:59:05,712 INFO  [ProcedureExecutor-0] 
> procedure.ServerCrashProcedure - Start processing crashed 
> ,61020,1662714013670
> {noformat}
> 3. SplitLogManager created 2 split log tasks in zookeeper.
> {noformat}
> 2022-09-09 16:59:06,049 INFO  [ProcedureExecutor-1] master.SplitLogManager - 
> Started splitting 2 logs in 
> [hdfs:///hbase/WALs/,61020,1662714013670-splitting]
>  for [,61020,1662714013670]
> 2022-09-09 16:59:06,081 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 16:59:06,093 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662739251611.meta
> {noformat}
> 4. The first split log task is more interesting: 
> +/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta+
> 5. Since all the region servers were having problems connecting to active 
> namenode, SplitLogManager tried total of 4 times to assign this task (3 
> resubmits, configured by hbase.splitlog.max.resubmit) and then finally gave 
> up.
> {noformat}
> -- try 1 -
> 2022-09-09 16:59:06,205 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662540522069
> -- try 2 -
> 2022-09-09 17:01:06,642 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 17:01:06,666 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - task not yet acquired 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  ver = 2
> 2022-09-09 17:01:06,715 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662530684713
> -- try 3 -
> 2022-09-09 17:03:07,643 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 17:03:07,687 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - task not yet 

[GitHub] [hbase] Apache-HBase commented on pull request #4724: HBASE-27280 Add mutual authentication support to TLS

2022-09-21 Thread GitBox


Apache-HBase commented on PR #4724:
URL: https://github.com/apache/hbase/pull/4724#issuecomment-1254147425

   :confetti_ball: **+1 overall**
   
   
   
   
   
   
   | Vote | Subsystem | Runtime | Comment |
   |::|--:|:|:|
   | +0 :ok: |  reexec  |   0m 41s |  Docker mode activated.  |
   | -0 :warning: |  yetus  |   0m  3s |  Unprocessed flag(s): 
--brief-report-file --spotbugs-strict-precheck --whitespace-eol-ignore-list 
--whitespace-tabs-ignore-list --quick-hadoopcheck  |
   ||| _ Prechecks _ |
   ||| _ master Compile Tests _ |
   | +0 :ok: |  mvndep  |   0m 15s |  Maven dependency ordering for branch  |
   | +1 :green_heart: |  mvninstall  |   2m 44s |  master passed  |
   | +1 :green_heart: |  compile  |   0m 58s |  master passed  |
   | +1 :green_heart: |  shadedjars  |   3m 54s |  branch has no errors when 
building our shaded downstream artifacts.  |
   | +1 :green_heart: |  javadoc  |   0m 37s |  master passed  |
   ||| _ Patch Compile Tests _ |
   | +0 :ok: |  mvndep  |   0m 11s |  Maven dependency ordering for patch  |
   | +1 :green_heart: |  mvninstall  |   2m 25s |  the patch passed  |
   | +1 :green_heart: |  compile  |   0m 56s |  the patch passed  |
   | +1 :green_heart: |  javac  |   0m 56s |  the patch passed  |
   | +1 :green_heart: |  shadedjars  |   3m 52s |  patch has no errors when 
building our shaded downstream artifacts.  |
   | +1 :green_heart: |  javadoc  |   0m 36s |  the patch passed  |
   ||| _ Other Tests _ |
   | +1 :green_heart: |  unit  |   2m  0s |  hbase-common in the patch passed.  
|
   | +1 :green_heart: |  unit  | 193m 20s |  hbase-server in the patch passed.  
|
   |  |   | 214m 57s |   |
   
   
   | Subsystem | Report/Notes |
   |--:|:-|
   | Docker | ClientAPI=1.41 ServerAPI=1.41 base: 
https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-4724/10/artifact/yetus-jdk11-hadoop3-check/output/Dockerfile
 |
   | GITHUB PR | https://github.com/apache/hbase/pull/4724 |
   | Optional Tests | javac javadoc unit shadedjars compile |
   | uname | Linux 68ee3c5cf7ad 5.4.0-1071-aws #76~18.04.1-Ubuntu SMP Mon Mar 
28 17:49:57 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux |
   | Build tool | maven |
   | Personality | dev-support/hbase-personality.sh |
   | git revision | master / de127bde84 |
   | Default Java | Eclipse Adoptium-11.0.16.1+1 |
   |  Test Results | 
https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-4724/10/testReport/
 |
   | Max. process+thread count | 2683 (vs. ulimit of 3) |
   | modules | C: hbase-common hbase-server U: . |
   | Console output | 
https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-4724/10/console
 |
   | versions | git=2.17.1 maven=3.6.3 |
   | Powered by | Apache Yetus 0.12.0 https://yetus.apache.org |
   
   
   This message was automatically generated.
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@hbase.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



[jira] [Commented] (HBASE-27310) Add disabled table is displayed red on master web tables list

2022-09-21 Thread Hudson (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27310?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607945#comment-17607945
 ] 

Hudson commented on HBASE-27310:


Results for branch master
[build #687 on 
builds.a.o|https://ci-hbase.apache.org/job/HBase%20Nightly/job/master/687/]: 
(/) *{color:green}+1 overall{color}*

details (if available):

(/) {color:green}+1 general checks{color}
-- For more information [see general 
report|https://ci-hbase.apache.org/job/HBase%20Nightly/job/master/687/General_20Nightly_20Build_20Report/]




(/) {color:green}+1 jdk8 hadoop3 checks{color}
-- For more information [see jdk8 (hadoop3) 
report|https://ci-hbase.apache.org/job/HBase%20Nightly/job/master/687/JDK8_20Nightly_20Build_20Report_20_28Hadoop3_29/]


(/) {color:green}+1 jdk11 hadoop3 checks{color}
-- For more information [see jdk11 
report|https://ci-hbase.apache.org/job/HBase%20Nightly/job/master/687/JDK11_20Nightly_20Build_20Report_20_28Hadoop3_29/]


(/) {color:green}+1 source release artifact{color}
-- See build output for details.


(/) {color:green}+1 client integration test{color}


> Add disabled table is displayed red on master web tables list
> -
>
> Key: HBASE-27310
> URL: https://issues.apache.org/jira/browse/HBASE-27310
> Project: HBase
>  Issue Type: Sub-task
>  Components: UI
>Reporter: zhengsicheng
>Assignee: zhengsicheng
>Priority: Minor
> Fix For: 2.6.0, 3.0.0-alpha-4
>
> Attachments: image-2022-09-01-14-27-56-145.png
>
>
> The table of the disabled state is displayed in red
> The effect is as follows:
> !image-2022-09-01-14-27-56-145.png!



--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Updated] (HBASE-27382) Cluster completely down due to WAL splitting failing for hbase:meta table.

2022-09-21 Thread Rushabh Shah (Jira)


 [ 
https://issues.apache.org/jira/browse/HBASE-27382?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Rushabh Shah updated HBASE-27382:
-
Description: 
We are running a version based on 1.7.2 in our production environment. We 
encountered this issue recently.
We colocate the namenode and the region server holding the hbase:meta table on 
a set of 5 master nodes. Coincidentally, the active namenode and the region 
server holding the meta table were on the same physical node, and that node 
went down due to a hardware issue. We have suboptimal HDFS-level timeouts 
configured, so whenever the active namenode goes down it takes around 12-15 
minutes for the HDFS client within HBase to connect to the new active 
namenode. So all the region servers had trouble connecting to the new active 
namenode for about 15 minutes.

Below is the sequence of events:

1. Host running active namenode and hbase:meta went down at +2022-09-09 
16:56:56,878+
2. HMaster started running ServerCrashProcedure at +2022-09-09 16:59:05,696+
{noformat}
2022-09-09 16:59:05,696 DEBUG [t-processor-pool2-t1] 
procedure2.ProcedureExecutor - Procedure ServerCrashProcedure 
serverName=,61020,1662714013670, shouldSplitWal=true, 
carryingMeta=true id=1 owner=dummy state=RUNNABLE:SERVER_CRASH_START added to 
the store.

2022-09-09 16:59:05,702 DEBUG [t-processor-pool2-t1] master.ServerManager - 
Added=,61020,1662714013670 to dead servers, submitted shutdown 
handler to be executed meta=true

2022-09-09 16:59:05,707 DEBUG [ProcedureExecutor-0] master.DeadServer - Started 
processing ,61020,1662714013670; numProcessing=1
2022-09-09 16:59:05,712 INFO  [ProcedureExecutor-0] 
procedure.ServerCrashProcedure - Start processing crashed 
,61020,1662714013670
{noformat}

3. SplitLogManager created 2 split log tasks in zookeeper.

{noformat}
2022-09-09 16:59:06,049 INFO  [ProcedureExecutor-1] master.SplitLogManager - 
Started splitting 2 logs in 
[hdfs:///hbase/WALs/,61020,1662714013670-splitting]
 for [,61020,1662714013670]

2022-09-09 16:59:06,081 DEBUG [main-EventThread] 
coordination.SplitLogManagerCoordination - put up splitlog task at znode 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta

2022-09-09 16:59:06,093 DEBUG [main-EventThread] 
coordination.SplitLogManagerCoordination - put up splitlog task at znode 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662739251611.meta
{noformat}


4. The first split log task is more interesting: 
+/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta+

5. Since all the region servers were having problems connecting to the active 
namenode, SplitLogManager tried a total of 4 times to assign this task (3 
resubmits, configured by hbase.splitlog.max.resubmit) and then finally gave up.
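
As a hedged illustration (assuming the default of 3 resubmits, which should be 
verified against the running 1.7.x code), the resubmit budget can be inspected 
or raised through the standard Configuration API. Raising it only buys more 
attempts; it does not shorten the 12-15 minute HDFS failover window. The four 
tries themselves are visible in the log excerpt below.

{code:java}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;

public class SplitLogResubmitConfig {
  public static void main(String[] args) {
    Configuration conf = HBaseConfiguration.create();
    // 3 resubmits plus the initial attempt account for the 4 tries seen below.
    int maxResubmit = conf.getInt("hbase.splitlog.max.resubmit", 3);
    System.out.println("hbase.splitlog.max.resubmit = " + maxResubmit);
    // Hypothetical tuning for clusters with slow namenode failover; in
    // practice the value would be set in hbase-site.xml on the master.
    conf.setInt("hbase.splitlog.max.resubmit", 10);
  }
}
{code}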

{noformat}
-- try 1 -
2022-09-09 16:59:06,205 INFO  [main-EventThread] 
coordination.SplitLogManagerCoordination - task 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
 acquired by ,61020,1662540522069

-- try 2 -

2022-09-09 17:01:06,642 INFO  [ager__ChoreService_1] 
coordination.SplitLogManagerCoordination - resubmitting task 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta

2022-09-09 17:01:06,666 DEBUG [main-EventThread] 
coordination.SplitLogManagerCoordination - task not yet acquired 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
 ver = 2

2022-09-09 17:01:06,715 INFO  [main-EventThread] 
coordination.SplitLogManagerCoordination - task 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
 acquired by ,61020,1662530684713

-- try 3 -

2022-09-09 17:03:07,643 INFO  [ager__ChoreService_1] 
coordination.SplitLogManagerCoordination - resubmitting task 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta

2022-09-09 17:03:07,687 DEBUG [main-EventThread] 
coordination.SplitLogManagerCoordination - task not yet acquired 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
 ver = 4

2022-09-09 17:03:07,738 INFO  [main-EventThread] 
coordination.SplitLogManagerCoordination - task 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
 acquired by ,61020,1662542355806


-- try 4 -
2022-09-09 17:05:08,684 INFO  [ager__ChoreService_1] 
coordination.SplitLogManagerCoordination - resubmitting task 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta


[jira] [Commented] (HBASE-27303) Unnecessary replication to secondary region replicas should avoid when WAL.sync throws Exception

2022-09-21 Thread Hudson (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27303?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607946#comment-17607946
 ] 

Hudson commented on HBASE-27303:


Results for branch master
[build #687 on 
builds.a.o|https://ci-hbase.apache.org/job/HBase%20Nightly/job/master/687/]: 
(/) *{color:green}+1 overall{color}*

details (if available):

(/) {color:green}+1 general checks{color}
-- For more information [see general 
report|https://ci-hbase.apache.org/job/HBase%20Nightly/job/master/687/General_20Nightly_20Build_20Report/]




(/) {color:green}+1 jdk8 hadoop3 checks{color}
-- For more information [see jdk8 (hadoop3) 
report|https://ci-hbase.apache.org/job/HBase%20Nightly/job/master/687/JDK8_20Nightly_20Build_20Report_20_28Hadoop3_29/]


(/) {color:green}+1 jdk11 hadoop3 checks{color}
-- For more information [see jdk11 
report|https://ci-hbase.apache.org/job/HBase%20Nightly/job/master/687/JDK11_20Nightly_20Build_20Report_20_28Hadoop3_29/]


(/) {color:green}+1 source release artifact{color}
-- See build output for details.


(/) {color:green}+1 client integration test{color}


> Unnecessary replication to secondary region replicas should avoid when 
> WAL.sync throws Exception
> 
>
> Key: HBASE-27303
> URL: https://issues.apache.org/jira/browse/HBASE-27303
> Project: HBase
>  Issue Type: Improvement
>  Components: read replicas
>Affects Versions: 3.0.0-alpha-4
>Reporter: chenglei
>Assignee: chenglei
>Priority: Major
> Fix For: 3.0.0-alpha-4
>
>
> As HBASE-27230 and HBASE-27223 said, if {{WAL.sync}} throws an exception, we 
> should abort the region server to avoid the data inconsistent between the 
> primary region and secondary region replicas, so we have no need to replicate 
> to secondary region replicas when  {{WAL.sync}} throws an exception.



--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Created] (HBASE-27382) Cluster completely down due to wal splitting failing for hbase:meta table.

2022-09-21 Thread Rushabh Shah (Jira)
Rushabh Shah created HBASE-27382:


 Summary: Cluster completely down due to wal splitting failing for 
hbase:meta table.
 Key: HBASE-27382
 URL: https://issues.apache.org/jira/browse/HBASE-27382
 Project: HBase
  Issue Type: Bug
Affects Versions: 1.7.2
Reporter: Rushabh Shah
Assignee: Rushabh Shah


We are running a version based on 1.7.2 in our production environment. We 
encountered this issue recently.
We colocate the namenode and the region server holding the hbase:meta table on 
a set of 5 master nodes. Coincidentally, the active namenode and the region 
server holding the meta table were on the same physical node, and that node 
went down due to a hardware issue. We have suboptimal HDFS-level timeouts 
configured, so whenever the active namenode goes down it takes around 12-15 
minutes for the HDFS client within HBase to connect to the new active 
namenode. So all the region servers had trouble connecting to the new active 
namenode for about 15 minutes.

Below is the sequence of events:

1. Host running active namenode and hbase:meta went down at +2022-09-09 
16:56:56,878+
2. HMaster started running ServerCrashProcedure at +2022-09-09 16:59:05,696+
{noformat}
2022-09-09 16:59:05,696 DEBUG [t-processor-pool2-t1] 
procedure2.ProcedureExecutor - Procedure ServerCrashProcedure 
serverName=,61020,1662714013670, shouldSplitWal=true, 
carryingMeta=true id=1 owner=dummy state=RUNNABLE:SERVER_CRASH_START added to 
the store.

2022-09-09 16:59:05,702 DEBUG [t-processor-pool2-t1] master.ServerManager - 
Added=,61020,1662714013670 to dead servers, submitted shutdown 
handler to be executed meta=true

2022-09-09 16:59:05,707 DEBUG [ProcedureExecutor-0] master.DeadServer - Started 
processing ,61020,1662714013670; numProcessing=1
2022-09-09 16:59:05,712 INFO  [ProcedureExecutor-0] 
procedure.ServerCrashProcedure - Start processing crashed 
,61020,1662714013670
{noformat}

3. SplitLogManager created 2 split log tasks in zookeeper.

{noformat}
2022-09-09 16:59:06,049 INFO  [ProcedureExecutor-1] master.SplitLogManager - 
Started splitting 2 logs in 
[hdfs:///hbase/WALs/,61020,1662714013670-splitting]
 for [,61020,1662714013670]

2022-09-09 16:59:06,081 DEBUG [main-EventThread] 
coordination.SplitLogManagerCoordination - put up splitlog task at znode 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta

2022-09-09 16:59:06,093 DEBUG [main-EventThread] 
coordination.SplitLogManagerCoordination - put up splitlog task at znode 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662739251611.meta
{noformat}


4. The first split log task is more interesting: 
+/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta+

5. Since all the region servers were having problems connecting to the active 
namenode, SplitLogManager tried a total of 4 times to assign this task (3 
resubmits, configured by hbase.splitlog.max.resubmit) and then finally gave up.

{noformat}
-- try 1 -
2022-09-09 16:59:06,205 INFO  [main-EventThread] 
coordination.SplitLogManagerCoordination - task 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
 acquired by ,61020,1662540522069

-- try 2 -

2022-09-09 17:01:06,642 INFO  [ager__ChoreService_1] 
coordination.SplitLogManagerCoordination - resubmitting task 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta

2022-09-09 17:01:06,666 DEBUG [main-EventThread] 
coordination.SplitLogManagerCoordination - task not yet acquired 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
 ver = 2

2022-09-09 17:01:06,715 INFO  [main-EventThread] 
coordination.SplitLogManagerCoordination - task 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
 acquired by ,61020,1662530684713

-- try 3 -

2022-09-09 17:03:07,643 INFO  [ager__ChoreService_1] 
coordination.SplitLogManagerCoordination - resubmitting task 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta

2022-09-09 17:03:07,687 DEBUG [main-EventThread] 
coordination.SplitLogManagerCoordination - task not yet acquired 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
 ver = 4

2022-09-09 17:03:07,738 INFO  [main-EventThread] 
coordination.SplitLogManagerCoordination - task 
/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
 acquired by ,61020,1662542355806


-- try 4 -
2022-09-09 

[jira] [Updated] (HBASE-27382) Cluster completely down due to WAL splitting failing for hbase:meta table.

2022-09-21 Thread Rushabh Shah (Jira)


 [ 
https://issues.apache.org/jira/browse/HBASE-27382?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Rushabh Shah updated HBASE-27382:
-
Summary: Cluster completely down due to WAL splitting failing for 
hbase:meta table.  (was: Cluster completely down due to wal splitting failing 
for hbase:meta table.)

> Cluster completely down due to WAL splitting failing for hbase:meta table.
> --
>
> Key: HBASE-27382
> URL: https://issues.apache.org/jira/browse/HBASE-27382
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 1.7.2
>Reporter: Rushabh Shah
>Assignee: Rushabh Shah
>Priority: Major
>
> We are running some version of 1.7.2 in our production environment. We 
> encountered this issue recently.
> We colocate namenode and region server holding hbase:meta table on a set of 5 
> master nodes. Co-incidentally active namenode and region server holding meta 
> table were on the same physical node and that node went down due to hardware 
> issue. We have sub optimal hdfs level timeouts configured so whenever active 
> namenode goes down, it takes around 12-15 minutes for hdfs client within 
> hbase to connect to new active namenode. So all the region servers were 
> having problems for about 15 minutes to connect to new active namenode.
> Below are the sequence of events:
> 1. Host running active namenode and hbase:meta went down at +2022-09-09 
> 16:56:56,878_
> 2. HMaster started running ServerCrashProcedure at +2022-09-09 16:59:05,696+
> {noformat}
> 2022-09-09 16:59:05,696 DEBUG [t-processor-pool2-t1] 
> procedure2.ProcedureExecutor - Procedure ServerCrashProcedure 
> serverName=,61020,1662714013670, shouldSplitWal=true, 
> carryingMeta=true id=1 owner=dummy state=RUNNABLE:SERVER_CRASH_START added to 
> the store.
> 2022-09-09 16:59:05,702 DEBUG [t-processor-pool2-t1] master.ServerManager - 
> Added=,61020,1662714013670 to dead servers, submitted shutdown 
> handler to be executed meta=true
> 2022-09-09 16:59:05,707 DEBUG [ProcedureExecutor-0] master.DeadServer - 
> Started processing ,61020,1662714013670; numProcessing=1
> 2022-09-09 16:59:05,712 INFO  [ProcedureExecutor-0] 
> procedure.ServerCrashProcedure - Start processing crashed 
> ,61020,1662714013670
> {noformat}
> 3. SplitLogManager created 2 split log tasks in zookeeper.
> {noformat}
> 2022-09-09 16:59:06,049 INFO  [ProcedureExecutor-1] master.SplitLogManager - 
> Started splitting 2 logs in 
> [hdfs:///hbase/WALs/,61020,1662714013670-splitting]
>  for [,61020,1662714013670]
> 2022-09-09 16:59:06,081 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 16:59:06,093 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - put up splitlog task at znode 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662739251611.meta
> {noformat}
> 4. The first split log task is more interesting: 
> +/hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta+
> 5. Since all the region servers were having problems connecting to active 
> namenode, SplitLogManager tried total of 4 times to assign this task (3 
> resubmits, configured by hbase.splitlog.max.resubmit) and then finally gave 
> up.
> {noformat}
> -- try 1 -
> 2022-09-09 16:59:06,205 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662540522069
> -- try 2 -
> 2022-09-09 17:01:06,642 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 17:01:06,666 DEBUG [main-EventThread] 
> coordination.SplitLogManagerCoordination - task not yet acquired 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  ver = 2
> 2022-09-09 17:01:06,715 INFO  [main-EventThread] 
> coordination.SplitLogManagerCoordination - task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
>  acquired by ,61020,1662530684713
> -- try 3 -
> 2022-09-09 17:03:07,643 INFO  [ager__ChoreService_1] 
> coordination.SplitLogManagerCoordination - resubmitting task 
> /hbase/splitWAL/WALs%2F%2C61020%2C1662714013670-splitting%2F%252C61020%252C1662714013670.meta.1662735651285.meta
> 2022-09-09 17:03:07,687 

[GitHub] [hbase] Apache-HBase commented on pull request #4793: HBASE-27309 Add major compact table or region operation on master web…

2022-09-21 Thread GitBox


Apache-HBase commented on PR #4793:
URL: https://github.com/apache/hbase/pull/4793#issuecomment-1254094472

   :confetti_ball: **+1 overall**
   
   
   
   
   
   
   | Vote | Subsystem | Runtime | Comment |
   |::|--:|:|:|
   | +0 :ok: |  reexec  |   1m 16s |  Docker mode activated.  |
   ||| _ Prechecks _ |
   | +1 :green_heart: |  dupname  |   0m  0s |  No case conflicting files 
found.  |
   | +1 :green_heart: |  @author  |   0m  0s |  The patch does not contain any 
@author tags.  |
   ||| _ master Compile Tests _ |
   | +1 :green_heart: |  mvninstall  |   3m 44s |  master passed  |
   | +1 :green_heart: |  spotless  |   0m 46s |  branch has no errors when 
running spotless:check.  |
   ||| _ Patch Compile Tests _ |
   | +1 :green_heart: |  mvninstall  |   3m  1s |  the patch passed  |
   | +1 :green_heart: |  whitespace  |   0m  0s |  The patch has no whitespace 
issues.  |
   | +1 :green_heart: |  spotless  |   0m 47s |  patch has no errors when 
running spotless:check.  |
   ||| _ Other Tests _ |
   | +1 :green_heart: |  asflicense  |   0m 11s |  The patch does not generate 
ASF License warnings.  |
   |  |   |  10m 55s |   |
   
   
   | Subsystem | Report/Notes |
   |--:|:-|
   | Docker | ClientAPI=1.41 ServerAPI=1.41 base: 
https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-4793/1/artifact/yetus-general-check/output/Dockerfile
 |
   | GITHUB PR | https://github.com/apache/hbase/pull/4793 |
   | Optional Tests | dupname asflicense javac spotless |
   | uname | Linux 9cc64740208a 5.4.0-1071-aws #76~18.04.1-Ubuntu SMP Mon Mar 
28 17:49:57 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux |
   | Build tool | maven |
   | Personality | dev-support/hbase-personality.sh |
   | git revision | master / de127bde84 |
   | Default Java | Temurin-1.8.0_345-b01 |
   | Max. process+thread count | 64 (vs. ulimit of 3) |
   | modules | C: hbase-server U: hbase-server |
   | Console output | 
https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-4793/1/console 
|
   | versions | git=2.17.1 maven=3.6.3 |
   | Powered by | Apache Yetus 0.12.0 https://yetus.apache.org |
   
   
   This message was automatically generated.
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@hbase.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



[jira] [Updated] (HBASE-27309) Add major compact table or region operation on master web table page

2022-09-21 Thread zhengsicheng (Jira)


 [ 
https://issues.apache.org/jira/browse/HBASE-27309?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

zhengsicheng updated HBASE-27309:
-
Description: 
 

!image-2022-09-22-02-32-36-619.png!

> Add major compact table or region operation on master web table page
> 
>
> Key: HBASE-27309
> URL: https://issues.apache.org/jira/browse/HBASE-27309
> Project: HBase
>  Issue Type: Sub-task
>Reporter: zhengsicheng
>Assignee: zhengsicheng
>Priority: Minor
> Attachments: image-2022-09-22-02-32-36-619.png
>
>
>  
> !image-2022-09-22-02-32-36-619.png!



--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Updated] (HBASE-27309) Add major compact table or region operation on master web table page

2022-09-21 Thread zhengsicheng (Jira)


 [ 
https://issues.apache.org/jira/browse/HBASE-27309?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

zhengsicheng updated HBASE-27309:
-
Description: 
Add major compact table or region operation on master web table page

!image-2022-09-22-02-32-36-619.png!

  was:
 

!image-2022-09-22-02-32-36-619.png!


> Add major compact table or region operation on master web table page
> 
>
> Key: HBASE-27309
> URL: https://issues.apache.org/jira/browse/HBASE-27309
> Project: HBase
>  Issue Type: Sub-task
>Reporter: zhengsicheng
>Assignee: zhengsicheng
>Priority: Minor
> Attachments: image-2022-09-22-02-32-36-619.png
>
>
> Add major compact table or region operation on master web table page
> !image-2022-09-22-02-32-36-619.png!



--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Updated] (HBASE-27309) Add major compact table or region operation on master web table page

2022-09-21 Thread zhengsicheng (Jira)


 [ 
https://issues.apache.org/jira/browse/HBASE-27309?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

zhengsicheng updated HBASE-27309:
-
Attachment: image-2022-09-22-02-32-36-619.png

> Add major compact table or region operation on master web table page
> 
>
> Key: HBASE-27309
> URL: https://issues.apache.org/jira/browse/HBASE-27309
> Project: HBase
>  Issue Type: Sub-task
>Reporter: zhengsicheng
>Assignee: zhengsicheng
>Priority: Minor
> Attachments: image-2022-09-22-02-32-36-619.png
>
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[GitHub] [hbase] SiCheng-Zheng opened a new pull request, #4793: HBASE-27309 Add major compact table or region operation on master web…

2022-09-21 Thread GitBox


SiCheng-Zheng opened a new pull request, #4793:
URL: https://github.com/apache/hbase/pull/4793

   … table page


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@hbase.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



[jira] [Updated] (HBASE-27309) Add major compact table or region operation on master web table page

2022-09-21 Thread zhengsicheng (Jira)


 [ 
https://issues.apache.org/jira/browse/HBASE-27309?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

zhengsicheng updated HBASE-27309:
-
Summary: Add major compact table or region operation on master web table 
page  (was: Add compact table or region on master web)

> Add major compact table or region operation on master web table page
> 
>
> Key: HBASE-27309
> URL: https://issues.apache.org/jira/browse/HBASE-27309
> Project: HBase
>  Issue Type: Sub-task
>Reporter: zhengsicheng
>Assignee: zhengsicheng
>Priority: Minor
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[GitHub] [hbase] Apache-HBase commented on pull request #4724: HBASE-27280 Add mutual authentication support to TLS

2022-09-21 Thread GitBox


Apache-HBase commented on PR #4724:
URL: https://github.com/apache/hbase/pull/4724#issuecomment-1253955830

   :confetti_ball: **+1 overall**
   
   
   
   
   
   
   | Vote | Subsystem | Runtime | Comment |
   |::|--:|:|:|
   | +0 :ok: |  reexec  |   1m  0s |  Docker mode activated.  |
   ||| _ Prechecks _ |
   | +1 :green_heart: |  dupname  |   0m  0s |  No case conflicting files 
found.  |
   | +1 :green_heart: |  hbaseanti  |   0m  0s |  Patch does not have any 
anti-patterns.  |
   | +1 :green_heart: |  @author  |   0m  0s |  The patch does not contain any 
@author tags.  |
   ||| _ master Compile Tests _ |
   | +0 :ok: |  mvndep  |   0m 13s |  Maven dependency ordering for branch  |
   | +1 :green_heart: |  mvninstall  |   2m  9s |  master passed  |
   | +1 :green_heart: |  compile  |   2m 47s |  master passed  |
   | +1 :green_heart: |  checkstyle  |   0m 44s |  master passed  |
   | +1 :green_heart: |  spotless  |   0m 39s |  branch has no errors when 
running spotless:check.  |
   | +1 :green_heart: |  spotbugs  |   1m 45s |  master passed  |
   ||| _ Patch Compile Tests _ |
   | +0 :ok: |  mvndep  |   0m 11s |  Maven dependency ordering for patch  |
   | +1 :green_heart: |  mvninstall  |   2m 11s |  the patch passed  |
   | +1 :green_heart: |  compile  |   2m 45s |  the patch passed  |
   | -0 :warning: |  javac  |   2m 13s |  hbase-server generated 1 new + 192 
unchanged - 1 fixed = 193 total (was 193)  |
   | +1 :green_heart: |  checkstyle  |   0m 44s |  the patch passed  |
   | +1 :green_heart: |  whitespace  |   0m  0s |  The patch has no whitespace 
issues.  |
   | +1 :green_heart: |  hadoopcheck  |   7m 59s |  Patch does not cause any 
errors with Hadoop 3.2.4 3.3.4.  |
   | +1 :green_heart: |  spotless  |   0m 39s |  patch has no errors when 
running spotless:check.  |
   | +1 :green_heart: |  spotbugs  |   1m 59s |  the patch passed  |
   ||| _ Other Tests _ |
   | +1 :green_heart: |  asflicense  |   0m 19s |  The patch does not generate 
ASF License warnings.  |
   |  |   |  31m 20s |   |
   
   
   | Subsystem | Report/Notes |
   |--:|:-|
   | Docker | ClientAPI=1.41 ServerAPI=1.41 base: 
https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-4724/10/artifact/yetus-general-check/output/Dockerfile
 |
   | GITHUB PR | https://github.com/apache/hbase/pull/4724 |
   | Optional Tests | dupname asflicense javac spotbugs hadoopcheck hbaseanti 
spotless checkstyle compile |
   | uname | Linux 99e1f6546893 5.4.0-124-generic #140-Ubuntu SMP Thu Aug 4 
02:23:37 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux |
   | Build tool | maven |
   | Personality | dev-support/hbase-personality.sh |
   | git revision | master / de127bde84 |
   | Default Java | Temurin-1.8.0_345-b01 |
   | javac | 
https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-4724/10/artifact/yetus-general-check/output/diff-compile-javac-hbase-server.txt
 |
   | Max. process+thread count | 64 (vs. ulimit of 3) |
   | modules | C: hbase-common hbase-server U: . |
   | Console output | 
https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-4724/10/console
 |
   | versions | git=2.17.1 maven=3.6.3 spotbugs=4.7.2 |
   | Powered by | Apache Yetus 0.12.0 https://yetus.apache.org |
   
   
   This message was automatically generated.
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@hbase.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



[jira] [Commented] (HBASE-24896) 'Stuck' in static initialization creating RegionInfo instance

2022-09-21 Thread Bryan Beaudreault (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-24896?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607780#comment-17607780
 ] 

Bryan Beaudreault commented on HBASE-24896:
---

This Jira is quite old, so I've filed 
https://issues.apache.org/jira/browse/HBASE-27381 for further 
investigation/discussion.

> 'Stuck' in static initialization creating RegionInfo instance
> -
>
> Key: HBASE-24896
> URL: https://issues.apache.org/jira/browse/HBASE-24896
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 2.3.1
>Reporter: Michael Stack
>Assignee: Michael Stack
>Priority: Major
> Fix For: 3.0.0-alpha-1, 2.4.0, 2.3.2
>
> Attachments: hbasedn192-jstack-0.webarchive, 
> hbasedn192-jstack-1.webarchive, hbasedn192-jstack-2.webarchive
>
>
> We ran into the following deadlocked server in testing. The priority handlers 
> seem stuck across multiple thread dumps. Seven of the ten total priority 
> threads have this state:
> {code:java}
> "RpcServer.priority.RWQ.Fifo.read.handler=5,queue=1,port=16020" #82 daemon 
> prio=5 os_prio=0 cpu=0.70ms elapsed=315627.86s allocated=3744B 
> defined_classes=0 tid=0x7f3da0983040 nid=0x62d9 in Object.wait()  
> [0x7f3d9bc8c000]
>java.lang.Thread.State: RUNNABLE
>   at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.getRegion(HRegionServer.java:3327)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.getRegion(RSRpcServices.java:1491)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.newRegionScanner(RSRpcServices.java:3143)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.scan(RSRpcServices.java:3478)
>   at 
> org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:44858)
>   at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:393)
>   at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:133)
>   at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:338)
>   at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:318) 
> {code}
> The anomalous three are as follows:
> h3. #1
> {code:java}
> "RpcServer.priority.RWQ.Fifo.write.handler=0,queue=0,port=16020" #77 daemon 
> prio=5 os_prio=0 cpu=175.98ms elapsed=315627.86s allocated=2153K 
> defined_classes=14 tid=0x7f3da0ae6ec0 nid=0x62d4 in Object.wait()  
> [0x7f3d9c19]
>java.lang.Thread.State: RUNNABLE
>   at 
> org.apache.hadoop.hbase.client.RegionInfo.(RegionInfo.java:72)
>   at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.getRegion(HRegionServer.java:3327)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.getRegion(RSRpcServices.java:1491)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.mutate(RSRpcServices.java:2912)
>   at 
> org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:44856)
>   at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:393)
>   at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:133)
>   at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:338)
>   at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:318){code}
> ...which is the creation of the UNDEFINED in RegionInfo here:
> {color:#808000}@InterfaceAudience.Public{color}{color:#80}public 
> interface {color}RegionInfo {color:#80}extends 
> {color}Comparable {
>  RegionInfo {color:#660e7a}UNDEFINED {color}= 
> RegionInfoBuilder.newBuilder(TableName.valueOf({color:#008000}"__UNDEFINED__"{color})).build();
>  
> h3. #2
> {code:java}
> "RpcServer.priority.RWQ.Fifo.read.handler=4,queue=1,port=16020" #81 daemon 
> prio=5 os_prio=0 cpu=53.85ms elapsed=315627.86s allocated=81984B 
> defined_classes=3 tid=0x7f3da0981590 nid=0x62d8 in Object.wait()  
> [0x7f3d9bd8c000]
>java.lang.Thread.State: RUNNABLE
>   at 
> org.apache.hadoop.hbase.client.RegionInfoBuilder.(RegionInfoBuilder.java:49)
>   at 
> org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toRegionInfo(ProtobufUtil.java:3231)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.executeOpenRegionProcedures(RSRpcServices.java:3755)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.lambda$executeProcedures$2(RSRpcServices.java:3827)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices$$Lambda$173/0x0017c0e40040.accept(Unknown
>  Source)
>   at java.util.ArrayList.forEach(java.base@11.0.6/ArrayList.java:1540)
>   at 
> java.util.Collections$UnmodifiableCollection.forEach(java.base@11.0.6/Collections.java:1085)
>   at 
> 

[jira] [Created] (HBASE-27381) Still seeing 'Stuck' in static initialization creating RegionInfo instance

2022-09-21 Thread Bryan Beaudreault (Jira)
Bryan Beaudreault created HBASE-27381:
-

 Summary: Still seeing 'Stuck' in static initialization creating 
RegionInfo instance
 Key: HBASE-27381
 URL: https://issues.apache.org/jira/browse/HBASE-27381
 Project: HBase
  Issue Type: Bug
Reporter: Bryan Beaudreault


See https://issues.apache.org/jira/browse/HBASE-24896 for the original 
description. Despite having that fix, we are seeing this issue in a 2.4.6-based 
deploy. We recently started seeing it as we were moving to centos8. I'm not 
sure why the centos version would affect this; the hbase server version and 
java versions otherwise did not change.

We're seeing this on a non-trivial number of the new centos8 servers that we 
spin up. I'm pushing a hotfix which removes RegionInfo.UNDEFINED to see if that 
resolves our issue.

As mentioned in my last comments on that jira, it could be that this field is 
still an issue because according to 
[https://stackoverflow.com/questions/28631656/runnable-thread-state-but-in-object-wait:]

> Such deadlocks may be caused by a [typical 
> bug|https://bugs.openjdk.org/browse/JDK-8037567] when a subclass is 
> referenced from a static initializer.

If that's true, in this case MutableRegionInfo is a subclass/implementer of 
RegionInfo, so that could trigger it. Granted the linked bug is marked "Not An 
Issue".



--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (HBASE-24896) 'Stuck' in static initialization creating RegionInfo instance

2022-09-21 Thread Bryan Beaudreault (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-24896?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607758#comment-17607758
 ] 

Bryan Beaudreault commented on HBASE-24896:
---

Trying some googling, and seeing stuff like 
[https://stackoverflow.com/questions/28631656/runnable-thread-state-but-in-object-wait.]
 This indicates:

> Such deadlocks may be caused by a [typical 
> bug|https://bugs.openjdk.java.net/browse/JDK-8037567] when a subclass is 
> referenced from a static initializer.

By no means conclusive, but if that's true then the UNDEFINED static constant 
in RegionInfo references a subclass of the interface and could be the reason 
this continues triggering. I'm removing this variable from our fork to see if 
it resolves the issue – we're seeing a relatively high percentage of new 
centos8 servers hit this.
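
For what it's worth, here is a hedged sketch (not the actual fork patch) of the 
usual way to remove such a reference without losing the constant: park it in a 
holder type so the interface's own initializer no longer touches an 
implementing class. Names are illustrative, not HBase's.

{code:java}
public class UndefinedHolderDemo {

  interface Info {
    // No static field on the interface itself, so initializing Info never
    // forces initialization of an implementing class.
    static Info undefined() {
      return Holder.UNDEFINED; // Holder initializes lazily, on first call
    }

    String name();
  }

  static final class Holder {
    static final Info UNDEFINED = new MutableInfo("__UNDEFINED__");

    private Holder() {
    }
  }

  static final class MutableInfo implements Info {
    private final String name;

    MutableInfo(String name) {
      this.name = name;
    }

    @Override
    public String name() {
      return name;
    }
  }

  public static void main(String[] args) {
    System.out.println(Info.undefined().name()); // prints __UNDEFINED__
  }
}
{code}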

> 'Stuck' in static initialization creating RegionInfo instance
> -
>
> Key: HBASE-24896
> URL: https://issues.apache.org/jira/browse/HBASE-24896
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 2.3.1
>Reporter: Michael Stack
>Assignee: Michael Stack
>Priority: Major
> Fix For: 3.0.0-alpha-1, 2.4.0, 2.3.2
>
> Attachments: hbasedn192-jstack-0.webarchive, 
> hbasedn192-jstack-1.webarchive, hbasedn192-jstack-2.webarchive
>
>
> We ran into the following deadlocked server in testing. The priority handlers 
> seem stuck across multiple thread dumps. Seven of the ten total priority 
> threads have this state:
> {code:java}
> "RpcServer.priority.RWQ.Fifo.read.handler=5,queue=1,port=16020" #82 daemon 
> prio=5 os_prio=0 cpu=0.70ms elapsed=315627.86s allocated=3744B 
> defined_classes=0 tid=0x7f3da0983040 nid=0x62d9 in Object.wait()  
> [0x7f3d9bc8c000]
>java.lang.Thread.State: RUNNABLE
>   at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.getRegion(HRegionServer.java:3327)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.getRegion(RSRpcServices.java:1491)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.newRegionScanner(RSRpcServices.java:3143)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.scan(RSRpcServices.java:3478)
>   at 
> org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:44858)
>   at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:393)
>   at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:133)
>   at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:338)
>   at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:318) 
> {code}
> The anomalous three are as follows:
> h3. #1
> {code:java}
> "RpcServer.priority.RWQ.Fifo.write.handler=0,queue=0,port=16020" #77 daemon 
> prio=5 os_prio=0 cpu=175.98ms elapsed=315627.86s allocated=2153K 
> defined_classes=14 tid=0x7f3da0ae6ec0 nid=0x62d4 in Object.wait()  
> [0x7f3d9c19]
>java.lang.Thread.State: RUNNABLE
>   at 
> org.apache.hadoop.hbase.client.RegionInfo.(RegionInfo.java:72)
>   at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.getRegion(HRegionServer.java:3327)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.getRegion(RSRpcServices.java:1491)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.mutate(RSRpcServices.java:2912)
>   at 
> org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:44856)
>   at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:393)
>   at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:133)
>   at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:338)
>   at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:318){code}
> ...which is the creation of the UNDEFINED in RegionInfo here:
> {color:#808000}@InterfaceAudience.Public{color}{color:#80}public 
> interface {color}RegionInfo {color:#80}extends 
> {color}Comparable {
>  RegionInfo {color:#660e7a}UNDEFINED {color}= 
> RegionInfoBuilder.newBuilder(TableName.valueOf({color:#008000}"__UNDEFINED__"{color})).build();
>  
> h3. #2
> {code:java}
> "RpcServer.priority.RWQ.Fifo.read.handler=4,queue=1,port=16020" #81 daemon 
> prio=5 os_prio=0 cpu=53.85ms elapsed=315627.86s allocated=81984B 
> defined_classes=3 tid=0x7f3da0981590 nid=0x62d8 in Object.wait()  
> [0x7f3d9bd8c000]
>java.lang.Thread.State: RUNNABLE
>   at 
> org.apache.hadoop.hbase.client.RegionInfoBuilder.(RegionInfoBuilder.java:49)
>   at 
> org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toRegionInfo(ProtobufUtil.java:3231)
>   at 
> 

[jira] [Comment Edited] (HBASE-24896) 'Stuck' in static initialization creating RegionInfo instance

2022-09-21 Thread Bryan Beaudreault (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-24896?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607735#comment-17607735
 ] 

Bryan Beaudreault edited comment on HBASE-24896 at 9/21/22 1:02 PM:


We ran into this on a few servers recently, running version 2.4.6. So it has 
the fix for this jira, but our thread dumps were pretty identical to above – 
{code:java}
org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toRegionInfo(ProtobufUtil.java:3331)
 

and

org.apache.hadoop.hbase.client.RegionInfo.(RegionInfo.java:76)

and

at 
org.apache.hadoop.hbase.client.RegionInfoBuilder.build(RegionInfoBuilder.java:110)
    at 
org.apache.hadoop.hbase.client.RegionInfoBuilder.(RegionInfoBuilder.java:34)
    at 
org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toRegionInfo(ProtobufUtil.java:3331)
 
and (new)

org.apache.hadoop.hbase.regionserver.HRegionServer.getRegion(HRegionServer.java:3404)
 {code}
 

All were "in Object.wait()" but RUNNABLE state, and the regionserver was live 
but not doing anything. Various regions got stuck in transition.

 

These servers were running temurinjdk 11.0.12+7 and centos8. We've been running 
that version of java across thousands of regionservers for a while and never 
seen the issue. But we're only recently rolling out centos8 and so far it's 
only affected a subset of those servers. 


was (Author: bbeaudreault):
We ran into this on a few servers recently, running version 2.4.6. So it has 
the fix for this jira, but our thread dumps were pretty identical to above – 
{code:java}
org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toRegionInfo(ProtobufUtil.java:3331)
 

and

org.apache.hadoop.hbase.client.RegionInfo.(RegionInfo.java:76)

and

at 
org.apache.hadoop.hbase.client.RegionInfoBuilder.build(RegionInfoBuilder.java:110)
    at 
org.apache.hadoop.hbase.client.RegionInfoBuilder.(RegionInfoBuilder.java:34)
    at 
org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toRegionInfo(ProtobufUtil.java:3331)
 
and (new)

org.apache.hadoop.hbase.regionserver.HRegionServer.getRegion(HRegionServer.java:3404)
 {code}
 

All were "in Object.wait()" but RUNNABLE state, and the regionserver was live 
but not doing anything. Various regions got stuck in transition.

> 'Stuck' in static initialization creating RegionInfo instance
> -
>
> Key: HBASE-24896
> URL: https://issues.apache.org/jira/browse/HBASE-24896
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 2.3.1
>Reporter: Michael Stack
>Assignee: Michael Stack
>Priority: Major
> Fix For: 3.0.0-alpha-1, 2.4.0, 2.3.2
>
> Attachments: hbasedn192-jstack-0.webarchive, 
> hbasedn192-jstack-1.webarchive, hbasedn192-jstack-2.webarchive
>
>
> We ran into the following deadlocked server in testing. The priority handlers 
> seem stuck across multiple thread dumps. Seven of the ten total priority 
> threads have this state:
> {code:java}
> "RpcServer.priority.RWQ.Fifo.read.handler=5,queue=1,port=16020" #82 daemon 
> prio=5 os_prio=0 cpu=0.70ms elapsed=315627.86s allocated=3744B 
> defined_classes=0 tid=0x7f3da0983040 nid=0x62d9 in Object.wait()  
> [0x7f3d9bc8c000]
>java.lang.Thread.State: RUNNABLE
>   at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.getRegion(HRegionServer.java:3327)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.getRegion(RSRpcServices.java:1491)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.newRegionScanner(RSRpcServices.java:3143)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.scan(RSRpcServices.java:3478)
>   at 
> org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:44858)
>   at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:393)
>   at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:133)
>   at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:338)
>   at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:318) 
> {code}
> The anomalous three are as follows:
> h3. #1
> {code:java}
> "RpcServer.priority.RWQ.Fifo.write.handler=0,queue=0,port=16020" #77 daemon 
> prio=5 os_prio=0 cpu=175.98ms elapsed=315627.86s allocated=2153K 
> defined_classes=14 tid=0x7f3da0ae6ec0 nid=0x62d4 in Object.wait()  
> [0x7f3d9c19]
>java.lang.Thread.State: RUNNABLE
>   at 
> org.apache.hadoop.hbase.client.RegionInfo.(RegionInfo.java:72)
>   at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.getRegion(HRegionServer.java:3327)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.getRegion(RSRpcServices.java:1491)
>   at 
> 

[jira] [Comment Edited] (HBASE-24896) 'Stuck' in static initialization creating RegionInfo instance

2022-09-21 Thread Bryan Beaudreault (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-24896?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607735#comment-17607735
 ] 

Bryan Beaudreault edited comment on HBASE-24896 at 9/21/22 12:58 PM:
-

We ran into this on a few servers recently, running version 2.4.6. So it has 
the fix for this jira, but our thread dumps were pretty identical to above – 
{code:java}
org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toRegionInfo(ProtobufUtil.java:3331)
 

and

org.apache.hadoop.hbase.client.RegionInfo.(RegionInfo.java:76)

and

at 
org.apache.hadoop.hbase.client.RegionInfoBuilder.build(RegionInfoBuilder.java:110)
    at 
org.apache.hadoop.hbase.client.RegionInfoBuilder.(RegionInfoBuilder.java:34)
    at 
org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toRegionInfo(ProtobufUtil.java:3331)
 
and (new)

org.apache.hadoop.hbase.regionserver.HRegionServer.getRegion(HRegionServer.java:3404)
 {code}
 

All were "in Object.wait()" but RUNNABLE state, and the regionserver was live 
but not doing anything. Various regions got stuck in transition.


was (Author: bbeaudreault):
We ran into this on a few servers recently, running version 2.4.6. So it has 
the fix for this jira, but our thread dumps were pretty identical to above – 
{code:java}
org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toRegionInfo(ProtobufUtil.java:3331)
 

and

org.apache.hadoop.hbase.client.RegionInfo.(RegionInfo.java:76)

and

at 
org.apache.hadoop.hbase.client.RegionInfoBuilder.build(RegionInfoBuilder.java:110)
    at 
org.apache.hadoop.hbase.client.RegionInfoBuilder.(RegionInfoBuilder.java:34)
    at 
org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toRegionInfo(ProtobufUtil.java:3331)
 
and (new)

org.apache.hadoop.hbase.regionserver.HRegionServer.getRegion(HRegionServer.java:3404)
 {code}

> 'Stuck' in static initialization creating RegionInfo instance
> -
>
> Key: HBASE-24896
> URL: https://issues.apache.org/jira/browse/HBASE-24896
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 2.3.1
>Reporter: Michael Stack
>Assignee: Michael Stack
>Priority: Major
> Fix For: 3.0.0-alpha-1, 2.4.0, 2.3.2
>
> Attachments: hbasedn192-jstack-0.webarchive, 
> hbasedn192-jstack-1.webarchive, hbasedn192-jstack-2.webarchive
>
>
> We ran into the following deadlocked server in testing. The priority handlers 
> seem stuck across multiple thread dumps. Seven of the ten total priority 
> threads have this state:
> {code:java}
> "RpcServer.priority.RWQ.Fifo.read.handler=5,queue=1,port=16020" #82 daemon 
> prio=5 os_prio=0 cpu=0.70ms elapsed=315627.86s allocated=3744B 
> defined_classes=0 tid=0x7f3da0983040 nid=0x62d9 in Object.wait()  
> [0x7f3d9bc8c000]
>java.lang.Thread.State: RUNNABLE
>   at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.getRegion(HRegionServer.java:3327)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.getRegion(RSRpcServices.java:1491)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.newRegionScanner(RSRpcServices.java:3143)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.scan(RSRpcServices.java:3478)
>   at 
> org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:44858)
>   at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:393)
>   at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:133)
>   at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:338)
>   at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:318) 
> {code}
> The anomalous three are as follows:
> h3. #1
> {code:java}
> "RpcServer.priority.RWQ.Fifo.write.handler=0,queue=0,port=16020" #77 daemon 
> prio=5 os_prio=0 cpu=175.98ms elapsed=315627.86s allocated=2153K 
> defined_classes=14 tid=0x7f3da0ae6ec0 nid=0x62d4 in Object.wait()  
> [0x7f3d9c19]
>java.lang.Thread.State: RUNNABLE
>   at 
> org.apache.hadoop.hbase.client.RegionInfo.<clinit>(RegionInfo.java:72)
>   at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.getRegion(HRegionServer.java:3327)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.getRegion(RSRpcServices.java:1491)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.mutate(RSRpcServices.java:2912)
>   at 
> org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:44856)
>   at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:393)
>   at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:133)
>   at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:338)
>   at 
> 

[jira] [Commented] (HBASE-24896) 'Stuck' in static initialization creating RegionInfo instance

2022-09-21 Thread Bryan Beaudreault (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-24896?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607735#comment-17607735
 ] 

Bryan Beaudreault commented on HBASE-24896:
-------------------------------------------

We ran into this on a few servers recently, running version 2.4.6, so it has 
the fix for this jira, but our thread dumps were nearly identical to the ones above – 
{code:java}
org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toRegionInfo(ProtobufUtil.java:3331)
 

and

org.apache.hadoop.hbase.client.RegionInfo.<clinit>(RegionInfo.java:76)

and

at 
org.apache.hadoop.hbase.client.RegionInfoBuilder.build(RegionInfoBuilder.java:110)
    at 
org.apache.hadoop.hbase.client.RegionInfoBuilder.<clinit>(RegionInfoBuilder.java:34)
    at 
org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toRegionInfo(ProtobufUtil.java:3331)
 
and (new)

org.apache.hadoop.hbase.regionserver.HRegionServer.getRegion(HRegionServer.java:3404)
 {code}

> 'Stuck' in static initialization creating RegionInfo instance
> --------------------------------------------------------------
>
> Key: HBASE-24896
> URL: https://issues.apache.org/jira/browse/HBASE-24896
> Project: HBase
>  Issue Type: Bug
>Affects Versions: 2.3.1
>Reporter: Michael Stack
>Assignee: Michael Stack
>Priority: Major
> Fix For: 3.0.0-alpha-1, 2.4.0, 2.3.2
>
> Attachments: hbasedn192-jstack-0.webarchive, 
> hbasedn192-jstack-1.webarchive, hbasedn192-jstack-2.webarchive
>
>
> We ran into the following deadlocked server in testing. The priority handlers 
> seem stuck across multiple thread dumps. Seven of the ten total priority 
> threads have this state:
> {code:java}
> "RpcServer.priority.RWQ.Fifo.read.handler=5,queue=1,port=16020" #82 daemon 
> prio=5 os_prio=0 cpu=0.70ms elapsed=315627.86s allocated=3744B 
> defined_classes=0 tid=0x7f3da0983040 nid=0x62d9 in Object.wait()  
> [0x7f3d9bc8c000]
>java.lang.Thread.State: RUNNABLE
>   at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.getRegion(HRegionServer.java:3327)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.getRegion(RSRpcServices.java:1491)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.newRegionScanner(RSRpcServices.java:3143)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.scan(RSRpcServices.java:3478)
>   at 
> org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:44858)
>   at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:393)
>   at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:133)
>   at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:338)
>   at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:318) 
> {code}
> The anomalous three are as follows:
> h3. #1
> {code:java}
> "RpcServer.priority.RWQ.Fifo.write.handler=0,queue=0,port=16020" #77 daemon 
> prio=5 os_prio=0 cpu=175.98ms elapsed=315627.86s allocated=2153K 
> defined_classes=14 tid=0x7f3da0ae6ec0 nid=0x62d4 in Object.wait()  
> [0x7f3d9c19]
>java.lang.Thread.State: RUNNABLE
>   at 
> org.apache.hadoop.hbase.client.RegionInfo.<clinit>(RegionInfo.java:72)
>   at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.getRegion(HRegionServer.java:3327)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.getRegion(RSRpcServices.java:1491)
>   at 
> org.apache.hadoop.hbase.regionserver.RSRpcServices.mutate(RSRpcServices.java:2912)
>   at 
> org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:44856)
>   at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:393)
>   at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:133)
>   at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:338)
>   at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:318){code}
> ...which is the creation of the UNDEFINED in RegionInfo here:
> {code:java}
> @InterfaceAudience.Public
> public interface RegionInfo extends Comparable<RegionInfo> {
>   RegionInfo UNDEFINED =
>     RegionInfoBuilder.newBuilder(TableName.valueOf("__UNDEFINED__")).build();
> {code}
>  
> h3. #2
> {code:java}
> "RpcServer.priority.RWQ.Fifo.read.handler=4,queue=1,port=16020" #81 daemon 
> prio=5 os_prio=0 cpu=53.85ms elapsed=315627.86s allocated=81984B 
> defined_classes=3 tid=0x7f3da0981590 nid=0x62d8 in Object.wait()  
> [0x7f3d9bd8c000]
>java.lang.Thread.State: RUNNABLE
>   at 
> org.apache.hadoop.hbase.client.RegionInfoBuilder.<clinit>(RegionInfoBuilder.java:49)
>   at 
> org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.toRegionInfo(ProtobufUtil.java:3231)
>   at 
> 

[jira] [Commented] (HBASE-27370) Avoid decompressing blocks when reading from bucket cache prefetch threads

2022-09-21 Thread Hudson (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27370?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607701#comment-17607701
 ] 

Hudson commented on HBASE-27370:


Results for branch branch-2
[build #649 on 
builds.a.o|https://ci-hbase.apache.org/job/HBase%20Nightly/job/branch-2/649/]: 
(x) *{color:red}-1 overall{color}*

details (if available):

(/) {color:green}+1 general checks{color}
-- For more information [see general 
report|https://ci-hbase.apache.org/job/HBase%20Nightly/job/branch-2/649/General_20Nightly_20Build_20Report/]


(x) {color:red}-1 jdk8 hadoop2 checks{color}
-- For more information [see jdk8 (hadoop2) 
report|https://ci-hbase.apache.org/job/HBase%20Nightly/job/branch-2/649/JDK8_20Nightly_20Build_20Report_20_28Hadoop2_29/]


(/) {color:green}+1 jdk8 hadoop3 checks{color}
-- For more information [see jdk8 (hadoop3) 
report|https://ci-hbase.apache.org/job/HBase%20Nightly/job/branch-2/649/JDK8_20Nightly_20Build_20Report_20_28Hadoop3_29/]


(/) {color:green}+1 jdk11 hadoop3 checks{color}
-- For more information [see jdk11 
report|https://ci-hbase.apache.org/job/HBase%20Nightly/job/branch-2/649/JDK11_20Nightly_20Build_20Report_20_28Hadoop3_29/]


(/) {color:green}+1 source release artifact{color}
-- See build output for details.


(/) {color:green}+1 client integration test{color}


> Avoid decompressing blocks when reading from bucket cache prefetch threads 
> ---------------------------------------------------------------------------
>
> Key: HBASE-27370
> URL: https://issues.apache.org/jira/browse/HBASE-27370
> Project: HBase
>  Issue Type: Improvement
>Affects Versions: 3.0.0-alpha-4
>Reporter: Wellington Chevreuil
>Assignee: Wellington Chevreuil
>Priority: Major
> Fix For: 2.6.0, 2.5.1, 3.0.0-alpha-4, 2.4.15
>
>
> When prefetching blocks into the bucket cache, we observed consistent CPU 
> usage of around 70% with no other workloads ongoing. For large bucket caches 
> (i.e. when using a file-based bucket cache), the prefetch can last for some time, 
> and such high CPU usage may impact use of the database by client 
> applications.
> Further analysis of the prefetch threads' stack traces showed that these threads 
> are very often executing decompression logic:
> {noformat}
> "hfile-prefetch-1654895061122" #234 daemon prio=5 os_prio=0 
> tid=0x557bb2907000 nid=0x406d runnable [0x7f294a504000]
>    java.lang.Thread.State: RUNNABLE
>         at 
> org.apache.hadoop.io.compress.snappy.SnappyDecompressor.decompressBytesDirect(Native
>  Method)
>         at 
> org.apache.hadoop.io.compress.snappy.SnappyDecompressor.decompress(SnappyDecompressor.java:235)
>         at 
> org.apache.hadoop.io.compress.BlockDecompressorStream.decompress(BlockDecompressorStream.java:88)
>         at 
> org.apache.hadoop.io.compress.DecompressorStream.read(DecompressorStream.java:105)
>         at java.io.BufferedInputStream.read1(BufferedInputStream.java:284)
>         at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
>         - locked <0x0002d24c0ae8> (a java.io.BufferedInputStream)
>         at 
> org.apache.hadoop.hbase.io.util.BlockIOUtils.readFullyWithHeapBuffer(BlockIOUtils.java:105)
>         at 
> org.apache.hadoop.hbase.io.compress.Compression.decompress(Compression.java:465)
>         at 
> org.apache.hadoop.hbase.io.encoding.HFileBlockDefaultDecodingContext.prepareDecoding(HFileBlockDefaultDecodingContext.java:90)
>         at 
> org.apache.hadoop.hbase.io.hfile.HFileBlock.unpack(HFileBlock.java:650)
>         at 
> org.apache.hadoop.hbase.io.hfile.HFileReaderImpl.readBlock(HFileReaderImpl.java:1342)
>  {noformat}
> This is because *HFileReaderImpl.readBlock* always decompresses blocks 
> even when *hbase.block.data.cachecompressed* is set to true. 
> This patch proposes an alternative flag to differentiate prefetch reads from normal 
> reads, so that prefetching does not decompress DATA blocks when 
> *hbase.block.data.cachecompressed* is set to true. 
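
The description above amounts to threading a "this is a prefetch read" flag down to the block-read path so DATA blocks can be cached in their on-disk form instead of being unpacked first. A minimal, self-contained sketch of that idea (the names and the gzip codec here are illustrative stand-ins, not the actual HBase API):
{code:java}
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

class PrefetchReadSketch {
  // Stands in for hbase.block.data.cachecompressed.
  static final boolean CACHE_COMPRESSED = true;

  static byte[] readBlock(byte[] onDiskCompressed, boolean isPrefetch) throws IOException {
    if (isPrefetch && CACHE_COMPRESSED) {
      // Prefetch only populates the cache, so keep the on-disk (compressed) form
      // and skip the CPU-heavy decompress step showing up in the stack traces.
      return onDiskCompressed;
    }
    // Normal reads still hand decoded bytes back to the caller.
    try (GZIPInputStream in = new GZIPInputStream(new ByteArrayInputStream(onDiskCompressed));
        ByteArrayOutputStream out = new ByteArrayOutputStream()) {
      in.transferTo(out);
      return out.toByteArray();
    }
  }

  public static void main(String[] args) throws IOException {
    // Round-trip a tiny "block" to show both paths.
    ByteArrayOutputStream buf = new ByteArrayOutputStream();
    try (GZIPOutputStream gz = new GZIPOutputStream(buf)) {
      gz.write("some block data".getBytes(StandardCharsets.UTF_8));
    }
    byte[] onDisk = buf.toByteArray();
    System.out.println("prefetch path bytes: " + readBlock(onDisk, true).length);  // compressed
    System.out.println("normal read bytes:   " + readBlock(onDisk, false).length); // decompressed
  }
}
{code}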



--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (HBASE-27310) Add disabled table is displayed red on master web tables list

2022-09-21 Thread Hudson (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27310?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607700#comment-17607700
 ] 

Hudson commented on HBASE-27310:


Results for branch branch-2
[build #649 on 
builds.a.o|https://ci-hbase.apache.org/job/HBase%20Nightly/job/branch-2/649/]: 
(x) *{color:red}-1 overall{color}*

details (if available):

(/) {color:green}+1 general checks{color}
-- For more information [see general 
report|https://ci-hbase.apache.org/job/HBase%20Nightly/job/branch-2/649/General_20Nightly_20Build_20Report/]


(x) {color:red}-1 jdk8 hadoop2 checks{color}
-- For more information [see jdk8 (hadoop2) 
report|https://ci-hbase.apache.org/job/HBase%20Nightly/job/branch-2/649/JDK8_20Nightly_20Build_20Report_20_28Hadoop2_29/]


(/) {color:green}+1 jdk8 hadoop3 checks{color}
-- For more information [see jdk8 (hadoop3) 
report|https://ci-hbase.apache.org/job/HBase%20Nightly/job/branch-2/649/JDK8_20Nightly_20Build_20Report_20_28Hadoop3_29/]


(/) {color:green}+1 jdk11 hadoop3 checks{color}
-- For more information [see jdk11 
report|https://ci-hbase.apache.org/job/HBase%20Nightly/job/branch-2/649/JDK11_20Nightly_20Build_20Report_20_28Hadoop3_29/]


(/) {color:green}+1 source release artifact{color}
-- See build output for details.


(/) {color:green}+1 client integration test{color}


> Add disabled table is displayed red on master web tables list
> --------------------------------------------------------------
>
> Key: HBASE-27310
> URL: https://issues.apache.org/jira/browse/HBASE-27310
> Project: HBase
>  Issue Type: Sub-task
>  Components: UI
>Reporter: zhengsicheng
>Assignee: zhengsicheng
>Priority: Minor
> Fix For: 2.6.0, 3.0.0-alpha-4
>
> Attachments: image-2022-09-01-14-27-56-145.png
>
>
> Tables in the disabled state are displayed in red.
> The effect is as follows:
> !image-2022-09-01-14-27-56-145.png!



--
This message was sent by Atlassian Jira
(v8.20.10#820010)


[jira] [Commented] (HBASE-27109) Move replication queue storage from zookeeper to a separated HBase table

2022-09-21 Thread Hudson (Jira)


[ 
https://issues.apache.org/jira/browse/HBASE-27109?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17607613#comment-17607613
 ] 

Hudson commented on HBASE-27109:


Results for branch HBASE-27109/table_based_rqs
[build #20 on 
builds.a.o|https://ci-hbase.apache.org/job/HBase%20Nightly/job/HBASE-27109%252Ftable_based_rqs/20/]:
 (x) *{color:red}-1 overall{color}*

details (if available):

(/) {color:green}+1 general checks{color}
-- For more information [see general 
report|https://ci-hbase.apache.org/job/HBase%20Nightly/job/HBASE-27109%252Ftable_based_rqs/20/General_20Nightly_20Build_20Report/]




(/) {color:green}+1 jdk8 hadoop3 checks{color}
-- For more information [see jdk8 (hadoop3) 
report|https://ci-hbase.apache.org/job/HBase%20Nightly/job/HBASE-27109%252Ftable_based_rqs/20/JDK8_20Nightly_20Build_20Report_20_28Hadoop3_29/]


(x) {color:red}-1 jdk11 hadoop3 checks{color}
-- For more information [see jdk11 
report|https://ci-hbase.apache.org/job/HBase%20Nightly/job/HBASE-27109%252Ftable_based_rqs/20/JDK11_20Nightly_20Build_20Report_20_28Hadoop3_29/]


(/) {color:green}+1 source release artifact{color}
-- See build output for details.


(/) {color:green}+1 client integration test{color}


> Move replication queue storage from zookeeper to a separated HBase table
> 
>
> Key: HBASE-27109
> URL: https://issues.apache.org/jira/browse/HBASE-27109
> Project: HBase
>  Issue Type: New Feature
>  Components: Replication
>Reporter: Duo Zhang
>Assignee: Duo Zhang
>Priority: Major
>
> This is a more specific issue based on the works which are already done in 
> HBASE-15867.



--
This message was sent by Atlassian Jira
(v8.20.10#820010)