Repository: hbase Updated Branches: refs/heads/0.98 223b508ac -> 62ce0e21c
HBASE-15219 Canary tool does not return non-zero exit code when one of regions is in stuck state Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/62ce0e21 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/62ce0e21 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/62ce0e21 Branch: refs/heads/0.98 Commit: 62ce0e21c3ce68c962bb65e501944e7a58b69bbf Parents: 223b508 Author: tedyu <[email protected]> Authored: Wed Feb 10 02:46:32 2016 -0800 Committer: tedyu <[email protected]> Committed: Wed Feb 10 02:46:32 2016 -0800 ---------------------------------------------------------------------- .../org/apache/hadoop/hbase/tool/Canary.java | 54 ++++++++++++++++---- src/main/asciidoc/_chapters/ops_mgt.adoc | 11 ++++ 2 files changed, 56 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/62ce0e21/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java index 8ca4ff5..3cc3b7b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java @@ -37,6 +37,7 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.concurrent.ScheduledThreadPoolExecutor; +import java.util.concurrent.atomic.AtomicLong; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -92,9 +93,11 @@ import com.google.protobuf.ServiceException; public final class Canary implements Tool { // Sink interface used by the canary to outputs information public interface Sink { + public long getReadFailureCount(); public void 
publishReadFailure(HRegionInfo region, Exception e); public void publishReadFailure(HRegionInfo region, HColumnDescriptor column, Exception e); public void publishReadTiming(HRegionInfo region, HColumnDescriptor column, long msTime); + public long getWriteFailureCount(); public void publishWriteFailure(HRegionInfo region, Exception e); public void publishWriteFailure(HRegionInfo region, HColumnDescriptor column, Exception e); public void publishWriteTiming(HRegionInfo region, HColumnDescriptor column, long msTime); @@ -109,13 +112,23 @@ public final class Canary implements Tool { // Simple implementation of canary sink that allows to plot on // file or standard output timings or failures. public static class StdOutSink implements Sink { + protected AtomicLong readFailureCount = new AtomicLong(0), + writeFailureCount = new AtomicLong(0); + + @Override + public long getReadFailureCount() { + return readFailureCount.get(); + } + @Override public void publishReadFailure(HRegionInfo region, Exception e) { + readFailureCount.incrementAndGet(); LOG.error(String.format("read from region %s failed", region.getRegionNameAsString()), e); } @Override public void publishReadFailure(HRegionInfo region, HColumnDescriptor column, Exception e) { + readFailureCount.incrementAndGet(); LOG.error(String.format("read from region %s column family %s failed", region.getRegionNameAsString(), column.getNameAsString()), e); } @@ -127,12 +140,19 @@ public final class Canary implements Tool { } @Override + public long getWriteFailureCount() { + return writeFailureCount.get(); + } + + @Override public void publishWriteFailure(HRegionInfo region, Exception e) { + writeFailureCount.incrementAndGet(); LOG.error(String.format("write to region %s failed", region.getRegionNameAsString()), e); } @Override public void publishWriteFailure(HRegionInfo region, HColumnDescriptor column, Exception e) { + writeFailureCount.incrementAndGet(); LOG.error(String.format("write to region %s column family %s 
failed", region.getRegionNameAsString(), column.getNameAsString()), e); } @@ -148,6 +168,7 @@ public final class Canary implements Tool { @Override public void publishReadFailure(String table, String server) { + readFailureCount.incrementAndGet(); LOG.error(String.format("Read from table:%s on region server:%s", table, server)); } @@ -427,6 +448,7 @@ public final class Canary implements Tool { private boolean failOnError = true; private boolean regionServerMode = false; private boolean writeSniffing = false; + private boolean treatFailureAsError = false; private TableName writeTableName = DEFAULT_WRITE_TABLE_NAME; private ExecutorService executor; // threads to retrieve data from regionservers @@ -488,6 +510,8 @@ public final class Canary implements Tool { this.regionServerMode = true; } else if(cmd.equals("-writeSniffing")) { this.writeSniffing = true; + } else if(cmd.equals("-treatFailureAsError")) { + this.treatFailureAsError = true; } else if (cmd.equals("-e")) { this.useRegExp = true; } else if (cmd.equals("-t")) { @@ -585,7 +609,7 @@ public final class Canary implements Tool { } } - if (this.failOnError && monitor.hasError()) { + if (this.failOnError && monitor.finalCheckForErrors()) { monitorThread.interrupt(); System.exit(monitor.errorCode); } @@ -618,6 +642,7 @@ public final class Canary implements Tool { " default is true"); System.err.println(" -t <N> timeout for a check, default is 600000 (milisecs)"); System.err.println(" -writeSniffing enable the write sniffing in canary"); + System.err.println(" -treatFailureAsError treats read / write failure as error"); System.err.println(" -writeTable The table used for write sniffing." 
+ " Default is hbase:canary"); System.err @@ -645,11 +670,11 @@ public final class Canary implements Tool { if (this.regionServerMode) { monitor = new RegionServerMonitor(connection, monitorTargets, this.useRegExp, - (ExtendedSink) this.sink, this.executor); + (ExtendedSink) this.sink, this.executor, this.treatFailureAsError); } else { monitor = new RegionMonitor(connection, monitorTargets, this.useRegExp, this.sink, this.executor, - this.writeSniffing, this.writeTableName); + this.writeSniffing, this.writeTableName, this.treatFailureAsError); } return monitor; } @@ -661,6 +686,7 @@ public final class Canary implements Tool { protected HBaseAdmin admin; protected String[] targets; protected boolean useRegExp; + protected boolean treatFailureAsError; protected boolean initialized = false; protected boolean done = false; @@ -681,13 +707,22 @@ public final class Canary implements Tool { if (this.admin != null) this.admin.close(); } + public boolean finalCheckForErrors() { + if (errorCode != 0) { + return true; + } + return treatFailureAsError && + (sink.getReadFailureCount() > 0 || sink.getWriteFailureCount() > 0); + } + protected Monitor(HConnection connection, String[] monitorTargets, boolean useRegExp, Sink sink, - ExecutorService executor) { + ExecutorService executor, boolean treatFailureAsError) { if (null == connection) throw new IllegalArgumentException("connection shall not be null"); this.connection = connection; this.targets = monitorTargets; this.useRegExp = useRegExp; + this.treatFailureAsError = treatFailureAsError; this.sink = sink; this.executor = executor; } @@ -727,8 +762,9 @@ public final class Canary implements Tool { private int checkPeriod; public RegionMonitor(HConnection connection, String[] monitorTargets, boolean useRegExp, - Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName) { - super(connection, monitorTargets, useRegExp, sink, executor); + Sink sink, ExecutorService executor, boolean writeSniffing, 
TableName writeTableName, + boolean treatFailureAsError) { + super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError); Configuration conf = connection.getConfiguration(); this.writeSniffing = writeSniffing; this.writeTableName = writeTableName; @@ -1023,8 +1059,8 @@ public final class Canary implements Tool { private static class RegionServerMonitor extends Monitor { public RegionServerMonitor(HConnection connection, String[] monitorTargets, boolean useRegExp, - ExtendedSink sink, ExecutorService executor) { - super(connection, monitorTargets, useRegExp, sink, executor); + ExtendedSink sink, ExecutorService executor, boolean treatFailureAsError) { + super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError); } private ExtendedSink getSink() { @@ -1095,7 +1131,7 @@ public final class Canary implements Tool { } } catch (InterruptedException e) { this.errorCode = ERROR_EXIT_CODE; - LOG.error("Sniff regionserver failed!", e); + LOG.error("Sniff regionserver interrupted!", e); } } http://git-wip-us.apache.org/repos/asf/hbase/blob/62ce0e21/src/main/asciidoc/_chapters/ops_mgt.adoc ---------------------------------------------------------------------- diff --git a/src/main/asciidoc/_chapters/ops_mgt.adoc b/src/main/asciidoc/_chapters/ops_mgt.adoc index 7089a16..d7ac987 100644 --- a/src/main/asciidoc/_chapters/ops_mgt.adoc +++ b/src/main/asciidoc/_chapters/ops_mgt.adoc @@ -93,6 +93,7 @@ Usage: bin/hbase org.apache.hadoop.hbase.tool.Canary [opts] [table1 [table2]...] -f <B> stop whole program if first error occurs, default is true -t <N> timeout for a check, default is 600000 (milliseconds) -writeSniffing enable the write sniffing in canary + -treatFailureAsError treats read / write failure as error -writeTable The table used for write sniffing. 
Default is hbase:canary -D<configProperty>=<value> assigning or override the configuration params ---- @@ -215,6 +216,16 @@ $ ${HBASE_HOME}/bin/hbase canary -writeSniffing -writeTable ns:canary The default value size of each put is 10 bytes and you can set it by the config key: `hbase.canary.write.value.size`. +==== Treat read / write failure as error + +By default, the canary tool only logs read failure, due to e.g. RetriesExhaustedException, +while returning normal exit code. To treat read / write failure as error, you can run canary +with the `-treatFailureAsError` option. When enabled, read / write failure would result in error +exit code. +---- +$ ${HBASE_HOME}/bin/hbase canary -treatFailureAsError +---- + ==== Running Canary in a Kerberos-enabled Cluster To run Canary in a Kerberos-enabled cluster, configure the following two properties in _hbase-site.xml_:
