[42/50] hbase git commit: HBASE-15984 Handle premature EOF treatment of WALs in replication.
HBASE-15984 Handle premature EOF treatment of WALs in replication. In some particular deployments, the Replication code believes it has reached EOF for a WAL prior to successfully parsing all bytes known to exist in a cleanly closed file. Consistently this failure happens due to an InvalidProtobufException after some number of seeks during our attempts to tail the in-progress RegionServer WAL. As a work-around, this patch treats cleanly closed files differently than other execution paths. If an EOF is detected due to parsing or other errors while there are still unparsed bytes before the end-of-file trailer, we now reset the WAL to the very beginning and attempt a clean read-through. In current testing, a single such reset is sufficient to work around observed data loss. However, the above change will retry a given WAL file indefinitely. On each such attempt, a log message like the below will be emitted at the WARN level: Processing end of WAL file '{}'. At position {}, which is too far away from reported file length {}. Restarting WAL reading (see HBASE-15983 for details). Additionally, this patch adds some additional log detail at the TRACE level about file offsets seen while handling recoverable errors. It also adds metrics that measure the use of this recovery mechanism. 
Conflicts: hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationGlobalSourceSource.java hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSourceImpl.java hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsSource.java hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationHLogReaderManager.java hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/cf192c96 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/cf192c96 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/cf192c96 Branch: refs/heads/0.98 Commit: cf192c96b0ef221f7e309bc2174f0d38dc44d96d Parents: 568b3f7 Author: Sean Busbey Authored: Tue Jun 7 16:00:46 2016 -0500 Committer: Andrew Purtell Committed: Tue Oct 4 11:53:36 2016 -0700 -- .../MetricsReplicationSourceSource.java | 17 + .../MetricsReplicationGlobalSourceSource.java | 50 + .../MetricsReplicationSourceSourceImpl.java | 78 +++ .../MetricsReplicationGlobalSourceSource.java | 50 + .../MetricsReplicationSourceSourceImpl.java | 79 .../regionserver/wal/ProtobufLogReader.java | 45 +-- .../replication/regionserver/MetricsSource.java | 39 +- .../ReplicationHLogReaderManager.java | 10 +++ .../regionserver/ReplicationSource.java | 40 -- src/main/asciidoc/_chapters/ops_mgt.adoc| 24 +- 10 files changed, 414 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/hbase/blob/cf192c96/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java -- diff --git 
a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java index ea0ae20..06033ae 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java @@ -37,6 +37,16 @@ public interface MetricsReplicationSourceSource extends BaseSource { public static final String SOURCE_LOG_EDITS_FILTERED = "source.logEditsFiltered"; + public static final String SOURCE_CLOSED_LOGS_WITH_UNKNOWN_LENGTH = + "source.closedLogsWithUnknownFileLength"; + public static final String SOURCE_UNCLEANLY_CLOSED_LOGS = "source.uncleanlyClosedLogs"; + public static final String SOURCE_UNCLEANLY_CLOSED_IGNORED_IN_BYTES = + "source.ignoredUncleanlyClosedLogContentsInBytes"; + public static final String SOURCE_RESTARTED_LOG_READING = "source.restartedLogReading"; + public static final String SOURCE_REPEATED_LOG_FILE_BYTES = "source.repeatedLogFileBytes"; + public static final String SOURCE_COMPLETED_LOGS = "source.completedLogs"; + public static final String SOURCE_COMPLE
[1/3] hbase git commit: HBASE-15984 Handle premature EOF treatment of WALs in replication.
Repository: hbase Updated Branches: refs/heads/branch-1.1 35e22455f -> e7ee6fa20 Updated Tags: refs/tags/1.1.7RC1 [created] 95a010cfa HBASE-15984 Handle premature EOF treatment of WALs in replication. In some particular deployments, the Replication code believes it has reached EOF for a WAL prior to successfully parsing all bytes known to exist in a cleanly closed file. Consistently this failure happens due to an InvalidProtobufException after some number of seeks during our attempts to tail the in-progress RegionServer WAL. As a work-around, this patch treats cleanly closed files differently than other execution paths. If an EOF is detected due to parsing or other errors while there are still unparsed bytes before the end-of-file trailer, we now reset the WAL to the very beginning and attempt a clean read-through. In current testing, a single such reset is sufficient to work around observed data loss. However, the above change will retry a given WAL file indefinitely. On each such attempt, a log message like the below will be emitted at the WARN level: Processing end of WAL file '{}'. At position {}, which is too far away from reported file length {}. Restarting WAL reading (see HBASE-15983 for details). Additionally, this patch adds some additional log detail at the TRACE level about file offsets seen while handling recoverable errors. It also adds metrics that measure the use of this recovery mechanism. 
Conflicts: hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationGlobalSourceSource.java hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSourceImpl.java hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsSource.java hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/07e64d96 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/07e64d96 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/07e64d96 Branch: refs/heads/branch-1.1 Commit: 07e64d9661a0e9128964faacf994d725271e39fc Parents: 35e2245 Author: Sean Busbey Authored: Tue Jun 7 16:00:46 2016 -0500 Committer: Andrew Purtell Committed: Fri Oct 7 14:08:27 2016 -0700 -- .../MetricsReplicationSourceSource.java | 17 + .../MetricsReplicationGlobalSourceSource.java | 44 +++ .../MetricsReplicationSourceSourceImpl.java | 79 .../regionserver/wal/ProtobufLogReader.java | 45 +-- .../replication/regionserver/MetricsSource.java | 35 + .../regionserver/ReplicationSource.java | 41 -- .../ReplicationWALReaderManager.java| 10 +++ src/main/asciidoc/_chapters/ops_mgt.adoc| 24 +- 8 files changed, 278 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/hbase/blob/07e64d96/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java -- diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java index fecf191..22b90dd 100644 --- 
a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java @@ -32,6 +32,16 @@ public interface MetricsReplicationSourceSource { public static final String SOURCE_LOG_EDITS_FILTERED = "source.logEditsFiltered"; + public static final String SOURCE_CLOSED_LOGS_WITH_UNKNOWN_LENGTH = + "source.closedLogsWithUnknownFileLength"; + public static final String SOURCE_UNCLEANLY_CLOSED_LOGS = "source.uncleanlyClosedLogs"; + public static final String SOURCE_UNCLEANLY_CLOSED_IGNORED_IN_BYTES = + "source.ignoredUncleanlyClosedLogContentsInBytes"; + public static final String SOURCE_RESTARTED_LOG_READING = "source.restartedLogReading"; + public static final String SOURCE_REPEATED_LOG_FILE_BYTES = "source.repeatedLogFileBytes"; + public static final String SOURCE_COMPLETED_LOGS = "source.completedLogs"; + public static final String SOURCE_COMPLETED_RECOVERY_QUEUES = "source.completedRecoverQueues"; + void setLastShippedAge(long age); void setSizeOfLogQueue(int size); void
[1/2] hbase git commit: HBASE-15984 Handle premature EOF treatment of WALs in replication.
Repository: hbase Updated Branches: refs/heads/0.98 568b3f7dd -> c95f214cc HBASE-15984 Handle premature EOF treatment of WALs in replication. In some particular deployments, the Replication code believes it has reached EOF for a WAL prior to successfully parsing all bytes known to exist in a cleanly closed file. Consistently this failure happens due to an InvalidProtobufException after some number of seeks during our attempts to tail the in-progress RegionServer WAL. As a work-around, this patch treats cleanly closed files differently than other execution paths. If an EOF is detected due to parsing or other errors while there are still unparsed bytes before the end-of-file trailer, we now reset the WAL to the very beginning and attempt a clean read-through. In current testing, a single such reset is sufficient to work around observed data loss. However, the above change will retry a given WAL file indefinitely. On each such attempt, a log message like the below will be emitted at the WARN level: Processing end of WAL file '{}'. At position {}, which is too far away from reported file length {}. Restarting WAL reading (see HBASE-15983 for details). Additionally, this patch adds some additional log detail at the TRACE level about file offsets seen while handling recoverable errors. It also adds metrics that measure the use of this recovery mechanism. 
Conflicts: hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationGlobalSourceSource.java hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSourceImpl.java hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsSource.java hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationHLogReaderManager.java hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/cf192c96 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/cf192c96 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/cf192c96 Branch: refs/heads/0.98 Commit: cf192c96b0ef221f7e309bc2174f0d38dc44d96d Parents: 568b3f7 Author: Sean Busbey Authored: Tue Jun 7 16:00:46 2016 -0500 Committer: Andrew Purtell Committed: Tue Oct 4 11:53:36 2016 -0700 -- .../MetricsReplicationSourceSource.java | 17 + .../MetricsReplicationGlobalSourceSource.java | 50 + .../MetricsReplicationSourceSourceImpl.java | 78 +++ .../MetricsReplicationGlobalSourceSource.java | 50 + .../MetricsReplicationSourceSourceImpl.java | 79 .../regionserver/wal/ProtobufLogReader.java | 45 +-- .../replication/regionserver/MetricsSource.java | 39 +- .../ReplicationHLogReaderManager.java | 10 +++ .../regionserver/ReplicationSource.java | 40 -- src/main/asciidoc/_chapters/ops_mgt.adoc| 24 +- 10 files changed, 414 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/hbase/blob/cf192c96/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java -- diff --git 
a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java index ea0ae20..06033ae 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java @@ -37,6 +37,16 @@ public interface MetricsReplicationSourceSource extends BaseSource { public static final String SOURCE_LOG_EDITS_FILTERED = "source.logEditsFiltered"; + public static final String SOURCE_CLOSED_LOGS_WITH_UNKNOWN_LENGTH = + "source.closedLogsWithUnknownFileLength"; + public static final String SOURCE_UNCLEANLY_CLOSED_LOGS = "source.uncleanlyClosedLogs"; + public static final String SOURCE_UNCLEANLY_CLOSED_IGNORED_IN_BYTES = + "source.ignoredUncleanlyClosedLogContentsInBytes"; + public static final String SOURCE_RESTARTED_LOG_READING = "source.restartedLogReading"; + public static final String SOURCE_REPEATED_LOG_FILE_BYTES = "source.repeatedLogFileBytes"; + public static final String SOURCE_COMPL
hbase git commit: HBASE-15984 Handle premature EOF treatment of WALs in replication.
Repository: hbase Updated Branches: refs/heads/branch-1.2 bfb20c0c1 -> 42dff8a58 HBASE-15984 Handle premature EOF treatment of WALs in replication. In some particular deployments, the Replication code believes it has reached EOF for a WAL prior to successfully parsing all bytes known to exist in a cleanly closed file. Consistently this failure happens due to an InvalidProtobufException after some number of seeks during our attempts to tail the in-progress RegionServer WAL. As a work-around, this patch treats cleanly closed files differently than other execution paths. If an EOF is detected due to parsing or other errors while there are still unparsed bytes before the end-of-file trailer, we now reset the WAL to the very beginning and attempt a clean read-through. In current testing, a single such reset is sufficient to work around observed data loss. However, the above change will retry a given WAL file indefinitely. On each such attempt, a log message like the below will be emitted at the WARN level: Processing end of WAL file '{}'. At position {}, which is too far away from reported file length {}. Restarting WAL reading (see HBASE-15983 for details). Additionally, this patch adds some additional log detail at the TRACE level about file offsets seen while handling recoverable errors. It also adds metrics that measure the use of this recovery mechanism. 
Conflicts: hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationGlobalSourceSource.java hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSourceImpl.java hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsSource.java Conflicts: hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationGlobalSourceSource.java hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSourceImpl.java hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsSource.java Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/42dff8a5 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/42dff8a5 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/42dff8a5 Branch: refs/heads/branch-1.2 Commit: 42dff8a58af02ec03fe97db34ab930defb79141f Parents: bfb20c0 Author: Sean Busbey Authored: Tue Jun 7 16:00:46 2016 -0500 Committer: Sean Busbey Committed: Thu Sep 29 16:22:37 2016 -0500 -- .../MetricsReplicationSourceSource.java | 17 + .../MetricsReplicationGlobalSourceSource.java | 44 +++ .../MetricsReplicationSourceSourceImpl.java | 80 .../regionserver/wal/ProtobufLogReader.java | 45 +-- .../replication/regionserver/MetricsSource.java | 35 + .../regionserver/ReplicationSource.java | 39 -- .../ReplicationWALReaderManager.java| 10 +++ src/main/asciidoc/_chapters/ops_mgt.adoc| 24 +- 8 files changed, 278 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/hbase/blob/42dff8a5/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java -- diff --git 
a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java index fecf191..22b90dd 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java @@ -32,6 +32,16 @@ public interface MetricsReplicationSourceSource { public static final String SOURCE_LOG_EDITS_FILTERED = "source.logEditsFiltered"; + public static final String SOURCE_CLOSED_LOGS_WITH_UNKNOWN_LENGTH = + "source.closedLogsWithUnknownFileLength"; + public static final String SOURCE_UNCLEANLY_CLOSED_LOGS = "source.uncleanlyClosedLogs"; + public static final String SOURCE_UNCLEANLY_CLOSED_IGNORED_IN_BYTES = + "source.ignoredUncleanlyClosedLogContentsInBytes"; + public static final String SOURCE_RESTARTED_LOG_READING = "source.restartedLogReading"; + public static final String SOURCE_REPEATED_LOG_FILE_BYTES = "source.repeatedLogFileBytes"; + public static final String SOURCE_CO
hbase git commit: HBASE-15984 Handle premature EOF treatment of WALs in replication.
Repository: hbase Updated Branches: refs/heads/branch-1.3 e0066e713 -> 39a79d50f HBASE-15984 Handle premature EOF treatment of WALs in replication. In some particular deployments, the Replication code believes it has reached EOF for a WAL prior to successfully parsing all bytes known to exist in a cleanly closed file. Consistently this failure happens due to an InvalidProtobufException after some number of seeks during our attempts to tail the in-progress RegionServer WAL. As a work-around, this patch treats cleanly closed files differently than other execution paths. If an EOF is detected due to parsing or other errors while there are still unparsed bytes before the end-of-file trailer, we now reset the WAL to the very beginning and attempt a clean read-through. In current testing, a single such reset is sufficient to work around observed data loss. However, the above change will retry a given WAL file indefinitely. On each such attempt, a log message like the below will be emitted at the WARN level: Processing end of WAL file '{}'. At position {}, which is too far away from reported file length {}. Restarting WAL reading (see HBASE-15983 for details). Additionally, this patch adds some additional log detail at the TRACE level about file offsets seen while handling recoverable errors. It also adds metrics that measure the use of this recovery mechanism. 
Conflicts: hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationGlobalSourceSource.java hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSourceImpl.java hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsSource.java Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/39a79d50 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/39a79d50 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/39a79d50 Branch: refs/heads/branch-1.3 Commit: 39a79d50f1bde8ec54e08e7c249ba07562a30f63 Parents: e0066e7 Author: Sean Busbey Authored: Tue Jun 7 16:00:46 2016 -0500 Committer: Sean Busbey Committed: Thu Sep 29 13:01:43 2016 -0500 -- .../MetricsReplicationSourceSource.java | 17 .../MetricsReplicationGlobalSourceSource.java | 45 +++ .../MetricsReplicationSourceSourceImpl.java | 81 .../regionserver/wal/ProtobufLogReader.java | 45 +-- .../replication/regionserver/MetricsSource.java | 35 + .../regionserver/ReplicationSource.java | 39 -- .../ReplicationWALReaderManager.java| 10 +++ src/main/asciidoc/_chapters/ops_mgt.adoc| 24 +- 8 files changed, 280 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/hbase/blob/39a79d50/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java -- diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java index 271f0ac..1ed5a6b 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java +++ 
b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java @@ -38,6 +38,16 @@ public interface MetricsReplicationSourceSource { public static final String SOURCE_SHIPPED_HFILES = "source.shippedHFiles"; public static final String SOURCE_SIZE_OF_HFILE_REFS_QUEUE = "source.sizeOfHFileRefsQueue"; + public static final String SOURCE_CLOSED_LOGS_WITH_UNKNOWN_LENGTH = + "source.closedLogsWithUnknownFileLength"; + public static final String SOURCE_UNCLEANLY_CLOSED_LOGS = "source.uncleanlyClosedLogs"; + public static final String SOURCE_UNCLEANLY_CLOSED_IGNORED_IN_BYTES = + "source.ignoredUncleanlyClosedLogContentsInBytes"; + public static final String SOURCE_RESTARTED_LOG_READING = "source.restartedLogReading"; + public static final String SOURCE_REPEATED_LOG_FILE_BYTES = "source.repeatedLogFileBytes"; + public static final String SOURCE_COMPLETED_LOGS = "source.completedLogs"; + public static final String SOURCE_COMPLETED_RECOVERY_QUEUES = "source.completedRecoverQueues"; + void setLastShippedAge(long age); void incrSizeOfLogQueue(int size); void decrSizeOfLogQueue(int size); @@ -53,4 +63,11 @@ public interface MetricsReplicationSourceSource { void incrHFilesShipped(long hfiles); void incrSizeOfHFileRefsQueue(long size); void decrSizeOfHFi
hbase git commit: HBASE-15984 Handle premature EOF treatment of WALs in replication.
Repository: hbase Updated Branches: refs/heads/branch-1 df5785925 -> df25ebf84 HBASE-15984 Handle premature EOF treatment of WALs in replication. In some particular deployments, the Replication code believes it has reached EOF for a WAL prior to successfully parsing all bytes known to exist in a cleanly closed file. Consistently this failure happens due to an InvalidProtobufException after some number of seeks during our attempts to tail the in-progress RegionServer WAL. As a work-around, this patch treats cleanly closed files differently than other execution paths. If an EOF is detected due to parsing or other errors while there are still unparsed bytes before the end-of-file trailer, we now reset the WAL to the very beginning and attempt a clean read-through. In current testing, a single such reset is sufficient to work around observed data loss. However, the above change will retry a given WAL file indefinitely. On each such attempt, a log message like the below will be emitted at the WARN level: Processing end of WAL file '{}'. At position {}, which is too far away from reported file length {}. Restarting WAL reading (see HBASE-15983 for details). Additionally, this patch adds some additional log detail at the TRACE level about file offsets seen while handling recoverable errors. It also adds metrics that measure the use of this recovery mechanism. 
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/df25ebf8 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/df25ebf8 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/df25ebf8 Branch: refs/heads/branch-1 Commit: df25ebf84f0be995204e9c16fc5b540893cb62bf Parents: df57859 Author: Sean Busbey Authored: Tue Jun 7 16:00:46 2016 -0500 Committer: Sean Busbey Committed: Thu Sep 29 10:47:57 2016 -0500 -- .../MetricsReplicationSourceSource.java | 17 + .../MetricsReplicationGlobalSourceSource.java | 43 +++ .../MetricsReplicationSourceSourceImpl.java | 79 .../regionserver/wal/ProtobufLogReader.java | 45 +-- .../replication/regionserver/MetricsSource.java | 35 + .../regionserver/ReplicationSource.java | 39 -- .../ReplicationWALReaderManager.java| 10 +++ src/main/asciidoc/_chapters/ops_mgt.adoc| 24 +- 8 files changed, 276 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/hbase/blob/df25ebf8/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java -- diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java index c877608..9075a68 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java @@ -40,6 +40,16 @@ public interface MetricsReplicationSourceSource extends BaseSource { public static final String SOURCE_SHIPPED_HFILES = "source.shippedHFiles"; public static final String SOURCE_SIZE_OF_HFILE_REFS_QUEUE = "source.sizeOfHFileRefsQueue"; + public static final String SOURCE_CLOSED_LOGS_WITH_UNKNOWN_LENGTH = + 
"source.closedLogsWithUnknownFileLength"; + public static final String SOURCE_UNCLEANLY_CLOSED_LOGS = "source.uncleanlyClosedLogs"; + public static final String SOURCE_UNCLEANLY_CLOSED_IGNORED_IN_BYTES = + "source.ignoredUncleanlyClosedLogContentsInBytes"; + public static final String SOURCE_RESTARTED_LOG_READING = "source.restartedLogReading"; + public static final String SOURCE_REPEATED_LOG_FILE_BYTES = "source.repeatedLogFileBytes"; + public static final String SOURCE_COMPLETED_LOGS = "source.completedLogs"; + public static final String SOURCE_COMPLETED_RECOVERY_QUEUES = "source.completedRecoverQueues"; + void setLastShippedAge(long age); void incrSizeOfLogQueue(int size); void decrSizeOfLogQueue(int size); @@ -55,4 +65,11 @@ public interface MetricsReplicationSourceSource extends BaseSource { void incrHFilesShipped(long hfiles); void incrSizeOfHFileRefsQueue(long size); void decrSizeOfHFileRefsQueue(long size); + void incrUnknownFileLengthForClosedWAL(); + void incrUncleanlyClosedWALs(); + void incrBytesSkippedInUncleanlyClosedWALs(final long bytes); + void incrRestartedWALReading(); + void incrRepeatedFileBytes(final long bytes); + void incrCompletedWAL(); + void incrCompletedRecoveryQueue(); } http://git-wip-us.apache.
[03/18] hbase git commit: HBASE-15984 Handle premature EOF treatment of WALs in replication.
HBASE-15984 Handle premature EOF treatment of WALs in replication. In some particular deployments, the Replication code believes it has reached EOF for a WAL prior to successfully parsing all bytes known to exist in a cleanly closed file. Consistently this failure happens due to an InvalidProtobufException after some number of seeks during our attempts to tail the in-progress RegionServer WAL. As a work-around, this patch treats cleanly closed files differently than other execution paths. If an EOF is detected due to parsing or other errors while there are still unparsed bytes before the end-of-file trailer, we now reset the WAL to the very beginning and attempt a clean read-through. In current testing, a single such reset is sufficient to work around observed data loss. However, the above change will retry a given WAL file indefinitely. On each such attempt, a log message like the below will be emitted at the WARN level: Processing end of WAL file '{}'. At position {}, which is too far away from reported file length {}. Restarting WAL reading (see HBASE-15983 for details). Additionally, this patch adds some additional log detail at the TRACE level about file offsets seen while handling recoverable errors. It also adds metrics that measure the use of this recovery mechanism. 
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/76396714 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/76396714 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/76396714 Branch: refs/heads/hbase-14439 Commit: 76396714e1dbb3d7c65064c807f4131d0b2f11b8 Parents: 63808a2 Author: Sean Busbey Authored: Tue Jun 7 16:00:46 2016 -0500 Committer: Sean Busbey Committed: Thu Sep 29 10:07:14 2016 -0500 -- .../MetricsReplicationSourceSource.java | 17 + .../MetricsReplicationGlobalSourceSource.java | 43 +++ .../MetricsReplicationSourceSourceImpl.java | 79 .../regionserver/wal/ProtobufLogReader.java | 45 +-- .../replication/regionserver/MetricsSource.java | 35 + .../regionserver/ReplicationSource.java | 39 -- .../ReplicationWALReaderManager.java| 10 +++ src/main/asciidoc/_chapters/ops_mgt.adoc| 24 +- 8 files changed, 276 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/hbase/blob/76396714/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java -- diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java index c877608..9075a68 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java @@ -40,6 +40,16 @@ public interface MetricsReplicationSourceSource extends BaseSource { public static final String SOURCE_SHIPPED_HFILES = "source.shippedHFiles"; public static final String SOURCE_SIZE_OF_HFILE_REFS_QUEUE = "source.sizeOfHFileRefsQueue"; + public static final String SOURCE_CLOSED_LOGS_WITH_UNKNOWN_LENGTH = + 
"source.closedLogsWithUnknownFileLength"; + public static final String SOURCE_UNCLEANLY_CLOSED_LOGS = "source.uncleanlyClosedLogs"; + public static final String SOURCE_UNCLEANLY_CLOSED_IGNORED_IN_BYTES = + "source.ignoredUncleanlyClosedLogContentsInBytes"; + public static final String SOURCE_RESTARTED_LOG_READING = "source.restartedLogReading"; + public static final String SOURCE_REPEATED_LOG_FILE_BYTES = "source.repeatedLogFileBytes"; + public static final String SOURCE_COMPLETED_LOGS = "source.completedLogs"; + public static final String SOURCE_COMPLETED_RECOVERY_QUEUES = "source.completedRecoverQueues"; + void setLastShippedAge(long age); void incrSizeOfLogQueue(int size); void decrSizeOfLogQueue(int size); @@ -55,4 +65,11 @@ public interface MetricsReplicationSourceSource extends BaseSource { void incrHFilesShipped(long hfiles); void incrSizeOfHFileRefsQueue(long size); void decrSizeOfHFileRefsQueue(long size); + void incrUnknownFileLengthForClosedWAL(); + void incrUncleanlyClosedWALs(); + void incrBytesSkippedInUncleanlyClosedWALs(final long bytes); + void incrRestartedWALReading(); + void incrRepeatedFileBytes(final long bytes); + void incrCompletedWAL(); + void incrCompletedRecoveryQueue(); } http://git-wip-us.apache.org/repos/asf/hbase/blob/76396714/hbase-hadoop2-compat/src/main/java/org/apache/
hbase git commit: HBASE-15984 Handle premature EOF treatment of WALs in replication.
Repository: hbase Updated Branches: refs/heads/master 63808a224 -> 76396714e HBASE-15984 Handle premature EOF treatment of WALs in replication. In some particular deployments, the Replication code believes it has reached EOF for a WAL prior to successfully parsing all bytes known to exist in a cleanly closed file. Consistently, this failure happens due to an InvalidProtobufException after some number of seeks during our attempts to tail the in-progress RegionServer WAL. As a work-around, this patch treats cleanly closed files differently than other execution paths. If an EOF is detected due to parsing or other errors while there are still unparsed bytes before the end-of-file trailer, we now reset the WAL to the very beginning and attempt a clean read-through. In current testing, a single such reset is sufficient to work around observed data loss. However, the above change will retry a given WAL file indefinitely. On each such attempt, a log message like the below will be emitted at the WARN level: Processing end of WAL file '{}'. At position {}, which is too far away from reported file length {}. Restarting WAL reading (see HBASE-15983 for details). Additionally, this patch adds additional log detail at the TRACE level about file offsets seen while handling recoverable errors. It also adds metrics that measure the use of this recovery mechanism. 
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/76396714 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/76396714 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/76396714 Branch: refs/heads/master Commit: 76396714e1dbb3d7c65064c807f4131d0b2f11b8 Parents: 63808a2 Author: Sean Busbey Authored: Tue Jun 7 16:00:46 2016 -0500 Committer: Sean Busbey Committed: Thu Sep 29 10:07:14 2016 -0500 -- .../MetricsReplicationSourceSource.java | 17 + .../MetricsReplicationGlobalSourceSource.java | 43 +++ .../MetricsReplicationSourceSourceImpl.java | 79 .../regionserver/wal/ProtobufLogReader.java | 45 +-- .../replication/regionserver/MetricsSource.java | 35 + .../regionserver/ReplicationSource.java | 39 -- .../ReplicationWALReaderManager.java| 10 +++ src/main/asciidoc/_chapters/ops_mgt.adoc| 24 +- 8 files changed, 276 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/hbase/blob/76396714/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java -- diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java index c877608..9075a68 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java @@ -40,6 +40,16 @@ public interface MetricsReplicationSourceSource extends BaseSource { public static final String SOURCE_SHIPPED_HFILES = "source.shippedHFiles"; public static final String SOURCE_SIZE_OF_HFILE_REFS_QUEUE = "source.sizeOfHFileRefsQueue"; + public static final String SOURCE_CLOSED_LOGS_WITH_UNKNOWN_LENGTH = + 
"source.closedLogsWithUnknownFileLength"; + public static final String SOURCE_UNCLEANLY_CLOSED_LOGS = "source.uncleanlyClosedLogs"; + public static final String SOURCE_UNCLEANLY_CLOSED_IGNORED_IN_BYTES = + "source.ignoredUncleanlyClosedLogContentsInBytes"; + public static final String SOURCE_RESTARTED_LOG_READING = "source.restartedLogReading"; + public static final String SOURCE_REPEATED_LOG_FILE_BYTES = "source.repeatedLogFileBytes"; + public static final String SOURCE_COMPLETED_LOGS = "source.completedLogs"; + public static final String SOURCE_COMPLETED_RECOVERY_QUEUES = "source.completedRecoverQueues"; + void setLastShippedAge(long age); void incrSizeOfLogQueue(int size); void decrSizeOfLogQueue(int size); @@ -55,4 +65,11 @@ public interface MetricsReplicationSourceSource extends BaseSource { void incrHFilesShipped(long hfiles); void incrSizeOfHFileRefsQueue(long size); void decrSizeOfHFileRefsQueue(long size); + void incrUnknownFileLengthForClosedWAL(); + void incrUncleanlyClosedWALs(); + void incrBytesSkippedInUncleanlyClosedWALs(final long bytes); + void incrRestartedWALReading(); + void incrRepeatedFileBytes(final long bytes); + void incrCompletedWAL(); + void incrCompletedRecoveryQueue(); } http://git-wip-us.apache.org/