HBASE-15019 Replication stuck when HDFS is restarted.
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/5041485a Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/5041485a Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/5041485a Branch: refs/heads/branch-1.1 Commit: 5041485aa5c1ecfaa4697b8d0b8a78d027ceaa8a Parents: aa5dfae Author: Matteo Bertozzi <matteo.berto...@cloudera.com> Authored: Thu Jan 21 00:05:57 2016 -0600 Committer: Matteo Bertozzi <matteo.berto...@cloudera.com> Committed: Thu Jan 28 09:49:38 2016 -0800 ---------------------------------------------------------------------- .../regionserver/ReplicationSource.java | 30 +++++++++++-- .../hbase/util/LeaseNotRecoveredException.java | 47 ++++++++++++++++++++ .../org/apache/hadoop/hbase/wal/WALFactory.java | 5 ++- 3 files changed, 78 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/5041485a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java index 755654a..0496f73 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java @@ -53,8 +53,11 @@ import org.apache.hadoop.hbase.replication.ReplicationQueueInfo; import org.apache.hadoop.hbase.replication.ReplicationQueues; import org.apache.hadoop.hbase.replication.SystemTableWALEntryFilter; import org.apache.hadoop.hbase.replication.WALEntryFilter; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.CancelableProgressable; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.FSUtils; +import org.apache.hadoop.hbase.util.LeaseNotRecoveredException; import org.apache.hadoop.hbase.util.Threads; import com.google.common.collect.Lists; @@ -587,6 +590,11 @@ public class ReplicationSource extends Thread // TODO What happens the log is missing in both places? } } + } catch (LeaseNotRecoveredException lnre) { + // HBASE-15019 the WAL was not closed due to some hiccup. + LOG.warn(peerClusterZnode + " Try to recover the WAL lease " + currentPath, lnre); + recoverLease(conf, currentPath); + this.reader = null; } catch (IOException ioe) { if (ioe instanceof EOFException && isCurrentLogEmpty()) return true; LOG.warn(this.peerClusterZnode + " Got: ", ioe); @@ -606,6 +614,22 @@ public class ReplicationSource extends Thread return true; } + private void recoverLease(final Configuration conf, final Path path) { + try { + final FileSystem dfs = FSUtils.getCurrentFileSystem(conf); + FSUtils fsUtils = FSUtils.getInstance(dfs, conf); + fsUtils.recoverFileLease(dfs, path, conf, new CancelableProgressable() { + @Override + public boolean progress() { + LOG.debug("recover WAL lease: " + path); + return isActive(); + } + }); + } catch (IOException e) { + LOG.warn("unable to recover lease for WAL: " + path, e); + } + } + /* * Checks whether the current log file is empty, and it is not a recovered queue. This is to * handle scenario when in an idle cluster, there is no entry in the current log and we keep on @@ -861,9 +885,9 @@ public class ReplicationSource extends Thread * @param p path to split * @return start time */ - private long getTS(Path p) { - String[] parts = p.getName().split("\\."); - return Long.parseLong(parts[parts.length-1]); + private static long getTS(Path p) { + int tsIndex = p.getName().lastIndexOf('.') + 1; + return Long.parseLong(p.getName().substring(tsIndex)); } } http://git-wip-us.apache.org/repos/asf/hbase/blob/5041485a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/LeaseNotRecoveredException.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/LeaseNotRecoveredException.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/LeaseNotRecoveredException.java new file mode 100644 index 0000000..ca769b8 --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/LeaseNotRecoveredException.java @@ -0,0 +1,47 @@ +/** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.util; + +import org.apache.hadoop.hbase.HBaseIOException; +import org.apache.hadoop.hbase.classification.InterfaceAudience; +import org.apache.hadoop.hbase.classification.InterfaceStability; + +/** + * Thrown when the lease was expected to be recovered, + * but the file can't be opened. + */ +@InterfaceAudience.Public +@InterfaceStability.Stable +public class LeaseNotRecoveredException extends HBaseIOException { + public LeaseNotRecoveredException() { + super(); + } + + public LeaseNotRecoveredException(String message) { + super(message); + } + + public LeaseNotRecoveredException(String message, Throwable cause) { + super(message, cause); + } + + public LeaseNotRecoveredException(Throwable cause) { + super(cause); + } +} http://git-wip-us.apache.org/repos/asf/hbase/blob/5041485a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALFactory.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALFactory.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALFactory.java index 4ef320a..02e8a75 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALFactory.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALFactory.java @@ -40,6 +40,7 @@ import org.apache.hadoop.hbase.wal.WAL.Reader; import org.apache.hadoop.hbase.wal.WALProvider.Writer; import org.apache.hadoop.hbase.util.CancelableProgressable; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.apache.hadoop.hbase.util.LeaseNotRecoveredException; // imports for things that haven't moved from regionserver.wal yet. import org.apache.hadoop.hbase.regionserver.wal.MetricsWAL; @@ -334,8 +335,10 @@ public class WALFactory { throw iioe; } } + throw new LeaseNotRecoveredException(e); + } else { + throw e; } - throw e; } } } catch (IOException ie) {