This is an automated email from the ASF dual-hosted git repository.
stack pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new 9a7293c HBASE-22289 WAL-based log splitting resubmit threshold may
result in a task being stuck forever
9a7293c is described below
commit 9a7293c1740e895476166e63e2cb9a385a672a9f
Author: Sergey Soldatov <[email protected]>
AuthorDate: Fri May 17 19:31:34 2019 -0700
HBASE-22289 WAL-based log splitting resubmit threshold may result in a task
being stuck forever
---
.../coordination/ZkSplitLogWorkerCoordination.java | 4 +-
.../hadoop/hbase/regionserver/SplitLogWorker.java | 2 +
.../regionserver/handler/WALSplitterHandler.java | 45 +++++++++++++---------
3 files changed, 32 insertions(+), 19 deletions(-)
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZkSplitLogWorkerCoordination.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZkSplitLogWorkerCoordination.java
index ff555f2..7efb4a5 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZkSplitLogWorkerCoordination.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZkSplitLogWorkerCoordination.java
@@ -572,7 +572,9 @@ public class ZkSplitLogWorkerCoordination extends
ZKListener implements
try {
if (ZKUtil.setData(watcher, task, slt.toByteArray(), taskZKVersion)) {
LOG.info("successfully transitioned task " + task + " to final state "
+ slt);
- ctr.increment();
+ if (ctr != null) {
+ ctr.increment();
+ }
return;
}
LOG.warn("failed to transistion task " + task + " to end state " + slt
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java
index a1c2030..d4549de 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java
@@ -96,6 +96,8 @@ public class SplitLogWorker implements Runnable {
// TODO have to correctly figure out when log splitting has been
// interrupted or has encountered a transient error and when it has
// encountered a bad non-retry-able persistent error.
+ // Note: this can actually get the master stuck (HBASE-22289) so treat
preempted as error.
+ // splitLogFile does return false for legitimate retriable
errors.
try {
if (!WALSplitter.splitLogFile(walDir, fs.getFileStatus(new
Path(walDir, filename)),
fs, conf, p, sequenceIdChecker,
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/WALSplitterHandler.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/WALSplitterHandler.java
index 49ab574..f1bbfe7 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/WALSplitterHandler.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/WALSplitterHandler.java
@@ -62,38 +62,47 @@ public class WALSplitterHandler extends EventHandler {
this.splitTaskExecutor = splitTaskExecutor;
}
+
+
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="SF_SWITCH_FALLTHROUGH",
+ justification="Intentional")
@Override
public void process() throws IOException {
long startTime = System.currentTimeMillis();
Status status = null;
try {
status = this.splitTaskExecutor.exec(splitTaskDetails.getWALFile(),
reporter);
+ boolean wasCounterIncremented = false;
switch (status) {
case DONE:
coordination.endTask(new SplitLogTask.Done(this.serverName),
SplitLogCounters.tot_wkr_task_done, splitTaskDetails);
break;
case PREEMPTED:
- SplitLogCounters.tot_wkr_preempt_task.increment();
- LOG.warn("task execution preempted " + splitTaskDetails.getWALFile());
- break;
+ SplitLogCounters.tot_wkr_preempt_task.increment();
+ wasCounterIncremented = true;
+ // Preempted state can currently be returned either when task is
preempted, or when
+ // there's a particular kind of error (e.g. some ZK/HDFS errors, in
my observation).
+ // In the latter case, master-side split task will get stuck if we
don't update the
+ // status. Treat preemption as error to be on the safe side.
+ LOG.warn("task execution preempted; treating as error " +
splitTaskDetails.getWALFile());
+ //$FALL-THROUGH$
case ERR:
- if (server != null && !server.isStopped()) {
- coordination.endTask(new SplitLogTask.Err(this.serverName),
- SplitLogCounters.tot_wkr_task_err, splitTaskDetails);
- break;
- }
- // if the RS is exiting then there is probably a tons of stuff
- // that can go wrong. Resign instead of signaling error.
- //$FALL-THROUGH$
+ if (server != null && !server.isStopped()) {
+ coordination.endTask(new SplitLogTask.Err(this.serverName),
wasCounterIncremented
+ ? null : SplitLogCounters.tot_wkr_task_err, splitTaskDetails);
+ break;
+ }
+ // if the RS is exiting then there is probably a tons of stuff
+ // that can go wrong. Resign instead of signaling error.
+ //$FALL-THROUGH$
case RESIGNED:
- if (server != null && server.isStopped()) {
- LOG.info("task execution interrupted because worker is exiting "
- + splitTaskDetails.toString());
- }
- coordination.endTask(new SplitLogTask.Resigned(this.serverName),
- SplitLogCounters.tot_wkr_task_resigned, splitTaskDetails);
- break;
+ if (server != null && server.isStopped()) {
+ LOG.info("task execution interrupted because worker is exiting "
+ + splitTaskDetails.toString());
+ }
+ coordination.endTask(new SplitLogTask.Resigned(this.serverName),
wasCounterIncremented
+ ? null : SplitLogCounters.tot_wkr_task_resigned, splitTaskDetails);
+ break;
}
} finally {
LOG.info("Worker " + serverName + " done with task " +
splitTaskDetails.toString() + " in "