HBASE-15019 Replication stuck when HDFS is restarted.
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/60c6b6df Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/60c6b6df Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/60c6b6df Branch: refs/heads/0.98 Commit: 60c6b6df104030995754bb1470a0d5d3e20cf220 Parents: 444debd Author: Matteo Bertozzi <matteo.berto...@cloudera.com> Authored: Thu Jan 28 10:04:20 2016 -0800 Committer: Matteo Bertozzi <matteo.berto...@cloudera.com> Committed: Thu Jan 28 10:04:20 2016 -0800 ---------------------------------------------------------------------- .../hbase/regionserver/wal/HLogFactory.java | 11 +++-- .../regionserver/ReplicationSource.java | 30 +++++++++++-- .../hbase/util/LeaseNotRecoveredException.java | 47 ++++++++++++++++++++ 3 files changed, 81 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/60c6b6df/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLogFactory.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLogFactory.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLogFactory.java index e6107bf..6999f8e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLogFactory.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLogFactory.java @@ -37,6 +37,7 @@ import org.apache.hadoop.hbase.regionserver.wal.HLog.Reader; import org.apache.hadoop.hbase.regionserver.wal.HLog.Writer; import org.apache.hadoop.hbase.util.CancelableProgressable; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.apache.hadoop.hbase.util.LeaseNotRecoveredException; @InterfaceAudience.Private public class HLogFactory { @@ -46,12 +47,12 @@ public class HLogFactory { final Configuration conf) throws IOException { return new FSHLog(fs, root, logName, conf); } - + public static HLog createHLog(final FileSystem fs, final Path root, final String logName, final String oldLogName, final Configuration conf) throws IOException { return new FSHLog(fs, root, logName, oldLogName, conf); } - + public static HLog createHLog(final FileSystem fs, final Path root, final String logName, final Configuration conf, final List<WALActionsListener> listeners, final String prefix) throws IOException { @@ -61,7 +62,7 @@ public class HLogFactory { public static HLog createMetaHLog(final FileSystem fs, final Path root, final String logName, final Configuration conf, final List<WALActionsListener> listeners, final String prefix) throws IOException { - return new FSHLog(fs, root, logName, HConstants.HREGION_OLDLOGDIR_NAME, + return new FSHLog(fs, root, logName, HConstants.HREGION_OLDLOGDIR_NAME, conf, listeners, false, prefix, true); } @@ -162,8 +163,10 @@ public class HLogFactory { throw iioe; } } + throw new LeaseNotRecoveredException(e); + } else { + throw e; } - throw e; } } } catch (IOException ie) { http://git-wip-us.apache.org/repos/asf/hbase/blob/60c6b6df/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java index 97993bb..b7330aa 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java @@ -50,7 +50,10 @@ import org.apache.hadoop.hbase.replication.ReplicationQueueInfo; import org.apache.hadoop.hbase.replication.ReplicationQueues; import org.apache.hadoop.hbase.replication.SystemTableWALEntryFilter; import org.apache.hadoop.hbase.replication.WALEntryFilter; +import org.apache.hadoop.hbase.util.CancelableProgressable; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.apache.hadoop.hbase.util.FSUtils; +import org.apache.hadoop.hbase.util.LeaseNotRecoveredException; import org.apache.hadoop.hbase.util.Threads; import com.google.common.collect.Lists; import com.google.common.util.concurrent.ListenableFuture; @@ -577,6 +580,11 @@ public class ReplicationSource extends Thread // TODO What happens the log is missing in both places? } } + } catch (LeaseNotRecoveredException lnre) { + // HBASE-15019 the WAL was not closed due to some hiccup. + LOG.warn(peerClusterZnode + " Try to recover the WAL lease " + currentPath, lnre); + recoverLease(conf, currentPath); + this.reader = null; } catch (IOException ioe) { if (ioe instanceof EOFException && isCurrentLogEmpty()) return true; LOG.warn(this.peerClusterZnode + " Got: ", ioe); @@ -596,6 +604,22 @@ public class ReplicationSource extends Thread return true; } + private void recoverLease(final Configuration conf, final Path path) { + try { + final FileSystem dfs = FSUtils.getCurrentFileSystem(conf); + FSUtils fsUtils = FSUtils.getInstance(dfs, conf); + fsUtils.recoverFileLease(dfs, path, conf, new CancelableProgressable() { + @Override + public boolean progress() { + LOG.debug("recover WAL lease: " + path); + return isActive(); + } + }); + } catch (IOException e) { + LOG.warn("unable to recover lease for WAL: " + path, e); + } + } + /* * Checks whether the current log file is empty, and it is not a recovered queue. This is to * handle scenario when in an idle cluster, there is no entry in the current log and we keep on @@ -845,9 +869,9 @@ public class ReplicationSource extends Thread * @param p path to split * @return start time */ - private long getTS(Path p) { - String[] parts = p.getName().split("\\."); - return Long.parseLong(parts[parts.length-1]); + private static long getTS(Path p) { + int tsIndex = p.getName().lastIndexOf('.') + 1; + return Long.parseLong(p.getName().substring(tsIndex)); } } http://git-wip-us.apache.org/repos/asf/hbase/blob/60c6b6df/hbase-server/src/main/java/org/apache/hadoop/hbase/util/LeaseNotRecoveredException.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/LeaseNotRecoveredException.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/LeaseNotRecoveredException.java new file mode 100644 index 0000000..6a72e42 --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/LeaseNotRecoveredException.java @@ -0,0 +1,47 @@ +/** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.util; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.hbase.HBaseIOException; + +/** + * Thrown when the lease was expected to be recovered, + * but the file can't be opened. + */ +@InterfaceAudience.Public +@InterfaceStability.Stable +public class LeaseNotRecoveredException extends HBaseIOException { + public LeaseNotRecoveredException() { + super(); + } + + public LeaseNotRecoveredException(String message) { + super(message); + } + + public LeaseNotRecoveredException(String message, Throwable cause) { + super(message, cause); + } + + public LeaseNotRecoveredException(Throwable cause) { + super(cause); + } +}