HBASE-20376 RowCounter and CellCounter documentations are incorrect
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/c4ebf666
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/c4ebf666
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/c4ebf666

Branch: refs/heads/HBASE-19064
Commit: c4ebf666b78f92a6d02652eece8dd95360bd0482
Parents: 5a69465
Author: Peter Somogyi <psomo...@apache.org>
Authored: Tue Apr 10 15:16:03 2018 +0200
Committer: Peter Somogyi <psomo...@apache.org>
Committed: Thu Apr 12 10:00:38 2018 +0200

----------------------------------------------------------------------
 bin/hbase                                      |  6 +++
 .../hadoop/hbase/mapreduce/CellCounter.java    | 47 +++++++++++---------
 .../hadoop/hbase/mapreduce/RowCounter.java     |  6 +--
 .../hadoop/hbase/mapreduce/TestRowCounter.java | 22 +++++----
 src/main/asciidoc/_chapters/ops_mgt.adoc       | 31 ++++++++-----
 5 files changed, 64 insertions(+), 48 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/hbase/blob/c4ebf666/bin/hbase
----------------------------------------------------------------------
diff --git a/bin/hbase b/bin/hbase
index 8e37f5f..f1e2306 100755
--- a/bin/hbase
+++ b/bin/hbase
@@ -106,6 +106,8 @@ if [ $# = 0 ]; then
   echo "  backup          Backup tables for recovery"
   echo "  restore         Restore tables from existing backup image"
   echo "  regionsplitter  Run RegionSplitter tool"
+  echo "  rowcounter      Run RowCounter tool"
+  echo "  cellcounter     Run CellCounter tool"
   echo "  CLASSNAME       Run the class named CLASSNAME"
   exit 1
 fi
@@ -465,6 +467,10 @@ elif [ "$COMMAND" = "version" ] ; then
   CLASS='org.apache.hadoop.hbase.util.VersionInfo'
 elif [ "$COMMAND" = "regionsplitter" ] ; then
   CLASS='org.apache.hadoop.hbase.util.RegionSplitter'
+elif [ "$COMMAND" = "rowcounter" ] ; then
+  CLASS='org.apache.hadoop.hbase.mapreduce.RowCounter'
+elif [ "$COMMAND" = "cellcounter" ] ; then
+  CLASS='org.apache.hadoop.hbase.mapreduce.CellCounter'
 else
   CLASS=$COMMAND
 fi
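With these two additions the tools become first-class bin/hbase commands alongside the pre-existing CLASSNAME fallback. A minimal sketch of the two equivalent invocations (the table name `mytable` is invented for illustration):

----
# New shortcut introduced by this patch
$ bin/hbase rowcounter mytable

# Equivalent invocation through the generic CLASSNAME fallback
$ bin/hbase org.apache.hadoop.hbase.mapreduce.RowCounter mytable
----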
"); - System.err.println(" Additionally, all of the SCAN properties from TableInputFormat"); - System.err.println(" can be specified to get fine grained control on what is counted.."); - System.err.println(" -D " + TableInputFormat.SCAN_ROW_START + "=<rowkey>"); - System.err.println(" -D " + TableInputFormat.SCAN_ROW_STOP + "=<rowkey>"); - System.err.println(" -D " + TableInputFormat.SCAN_COLUMNS + "=\"<col1> <col2>...\""); - System.err.println(" -D " + TableInputFormat.SCAN_COLUMN_FAMILY + "=<family1>,<family2>, ..."); - System.err.println(" -D " + TableInputFormat.SCAN_TIMESTAMP + "=<timestamp>"); - System.err.println(" -D " + TableInputFormat.SCAN_TIMERANGE_START + "=<timestamp>"); - System.err.println(" -D " + TableInputFormat.SCAN_TIMERANGE_END + "=<timestamp>"); - System.err.println(" -D " + TableInputFormat.SCAN_MAXVERSIONS + "=<count>"); - System.err.println(" -D " + TableInputFormat.SCAN_CACHEDROWS + "=<count>"); - System.err.println(" -D " + TableInputFormat.SCAN_BATCHSIZE + "=<count>"); - System.err.println(" <reportSeparator> parameter can be used to override the default report separator " + - "string : used to separate the rowId/column family name and qualifier name."); - System.err.println(" [^[regex pattern] or [Prefix] parameter can be used to limit the cell counter count " + - "operation to a limited subset of rows from the table based on regex or prefix pattern."); + printUsage(args.length); return -1; } Job job = createSubmittableJob(getConf(), args); return (job.waitForCompletion(true) ? 0 : 1); } + private void printUsage(int parameterCount) { + System.err.println("ERROR: Wrong number of parameters: " + parameterCount); + System.err.println("Usage: hbase cellcounter <tablename> <outputDir> [reportSeparator] " + + "[^[regex pattern] or [Prefix]] [--starttime=<starttime> --endtime=<endtime>]"); + System.err.println(" Note: -D properties will be applied to the conf used."); + System.err.println(" Additionally, all of the SCAN properties from TableInputFormat can be " + + "specified to get fine grained control on what is counted."); + System.err.println(" -D" + TableInputFormat.SCAN_ROW_START + "=<rowkey>"); + System.err.println(" -D" + TableInputFormat.SCAN_ROW_STOP + "=<rowkey>"); + System.err.println(" -D" + TableInputFormat.SCAN_COLUMNS + "=\"<col1> <col2>...\""); + System.err.println(" -D" + TableInputFormat.SCAN_COLUMN_FAMILY + + "=<family1>,<family2>, ..."); + System.err.println(" -D" + TableInputFormat.SCAN_TIMESTAMP + "=<timestamp>"); + System.err.println(" -D" + TableInputFormat.SCAN_TIMERANGE_START + "=<timestamp>"); + System.err.println(" -D" + TableInputFormat.SCAN_TIMERANGE_END + "=<timestamp>"); + System.err.println(" -D" + TableInputFormat.SCAN_MAXVERSIONS + "=<count>"); + System.err.println(" -D" + TableInputFormat.SCAN_CACHEDROWS + "=<count>"); + System.err.println(" -D" + TableInputFormat.SCAN_BATCHSIZE + "=<count>"); + System.err.println(" <reportSeparator> parameter can be used to override the default report " + + "separator string : used to separate the rowId/column family name and qualifier name."); + System.err.println(" [^[regex pattern] or [Prefix] parameter can be used to limit the cell " + + "counter count operation to a limited subset of rows from the table based on regex or " + + "prefix pattern."); + } + /** * Main entry point. * @param args The command line parameters. 
http://git-wip-us.apache.org/repos/asf/hbase/blob/c4ebf666/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/RowCounter.java
----------------------------------------------------------------------
diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/RowCounter.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/RowCounter.java
index 9c7b489..7fa5dec 100644
--- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/RowCounter.java
+++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/RowCounter.java
@@ -221,9 +221,9 @@ public class RowCounter extends Configured implements Tool {
    * Note that we don't document --expected-count, because it's intended for test.
    */
   private static void printUsage() {
-    System.err.println("Usage: RowCounter [options] <tablename> " +
-      "[--starttime=[start] --endtime=[end] " +
-      "[--range=[startKey],[endKey][;[startKey],[endKey]...]] [<column1> <column2>...]");
+    System.err.println("Usage: hbase rowcounter [options] <tablename> " +
+      "[--starttime=<start> --endtime=<end>] " +
+      "[--range=[startKey],[endKey][;[startKey],[endKey]...]] [<column1> <column2>...]");
     System.err.println("For performance consider the following options:\n" +
       "-Dhbase.client.scanner.caching=100\n" +
       "-Dmapreduce.map.speculative=false");
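A matching sketch for the corrected rowcounter usage with the time-range and key-range options. The table, keys, timestamps, and column are illustrative, and epoch-millisecond timestamps are an assumption of this sketch:

----
# Quote --range when it contains ';', which the shell would otherwise
# treat as a command separator.
$ bin/hbase rowcounter mytable --starttime=1522540800000 --endtime=1523836800000 \
    '--range=a,m;n,z' cf:qual
----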
assertTrue(usage.contains("-Dhbase.client.scanner.caching=100")); + assertTrue(usage.contains("-Dmapreduce.map.speculative=false")); } } http://git-wip-us.apache.org/repos/asf/hbase/blob/c4ebf666/src/main/asciidoc/_chapters/ops_mgt.adoc ---------------------------------------------------------------------- diff --git a/src/main/asciidoc/_chapters/ops_mgt.adoc b/src/main/asciidoc/_chapters/ops_mgt.adoc index 82badb4..38a7dff 100644 --- a/src/main/asciidoc/_chapters/ops_mgt.adoc +++ b/src/main/asciidoc/_chapters/ops_mgt.adoc @@ -68,8 +68,12 @@ Some commands take arguments. Pass no args or -h for usage. pe Run PerformanceEvaluation ltt Run LoadTestTool canary Run the Canary tool - regionsplitter Run the RegionSplitter tool version Print the version + backup Backup tables for recovery + restore Restore tables from existing backup image + regionsplitter Run RegionSplitter tool + rowcounter Run RowCounter tool + cellcounter Run CellCounter tool CLASSNAME Run the class named CLASSNAME ---- @@ -744,24 +748,28 @@ For performance also consider the following options: ---- [[rowcounter]] -=== RowCounter and CellCounter +=== RowCounter -link:https://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/RowCounter.html[RowCounter] is a mapreduce job to count all the rows of a table. +link:https://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/RowCounter.html[RowCounter] is a mapreduce job to count all the rows of a table. This is a good utility to use as a sanity check to ensure that HBase can read all the blocks of a table if there are any concerns of metadata inconsistency. -It will run the mapreduce all in a single process but it will run faster if you have a MapReduce cluster in place for it to exploit. It is also possible to limit -the time range of data to be scanned by using the `--starttime=[starttime]` and `--endtime=[endtime]` flags. +It will run the mapreduce all in a single process but it will run faster if you have a MapReduce cluster in place for it to exploit. +It is possible to limit the time range of data to be scanned by using the `--starttime=[starttime]` and `--endtime=[endtime]` flags. +The scanned data can be limited based on keys using the `--range=[startKey],[endKey][;[startKey],[endKey]...]` option. ---- -$ bin/hbase org.apache.hadoop.hbase.mapreduce.RowCounter <tablename> [<column1> <column2>...] +$ bin/hbase rowcounter [options] <tablename> [--starttime=<start> --endtime=<end>] [--range=[startKey],[endKey][;[startKey],[endKey]...]] [<column1> <column2>...] ---- RowCounter only counts one version per cell. -Note: caching for the input Scan is configured via `hbase.client.scanner.caching` in the job configuration. +For performance consider to use `-Dhbase.client.scanner.caching=100` and `-Dmapreduce.map.speculative=false` options. + +[[cellcounter]] +=== CellCounter HBase ships another diagnostic mapreduce job called link:https://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/CellCounter.html[CellCounter]. Like RowCounter, it gathers more fine-grained statistics about your table. -The statistics gathered by RowCounter are more fine-grained and include: +The statistics gathered by CellCounter are more fine-grained and include: * Total number of rows in the table. * Total number of CFs across all rows. @@ -772,12 +780,12 @@ The statistics gathered by RowCounter are more fine-grained and include: The program allows you to limit the scope of the run. Provide a row regex or prefix to limit the rows to analyze. 
http://git-wip-us.apache.org/repos/asf/hbase/blob/c4ebf666/src/main/asciidoc/_chapters/ops_mgt.adoc
----------------------------------------------------------------------
diff --git a/src/main/asciidoc/_chapters/ops_mgt.adoc b/src/main/asciidoc/_chapters/ops_mgt.adoc
index 82badb4..38a7dff 100644
--- a/src/main/asciidoc/_chapters/ops_mgt.adoc
+++ b/src/main/asciidoc/_chapters/ops_mgt.adoc
@@ -68,8 +68,12 @@ Some commands take arguments. Pass no args or -h for usage.
   pe              Run PerformanceEvaluation
   ltt             Run LoadTestTool
   canary          Run the Canary tool
-  regionsplitter  Run the RegionSplitter tool
   version         Print the version
+  backup          Backup tables for recovery
+  restore         Restore tables from existing backup image
+  regionsplitter  Run RegionSplitter tool
+  rowcounter      Run RowCounter tool
+  cellcounter     Run CellCounter tool
   CLASSNAME       Run the class named CLASSNAME
 ----
@@ -744,24 +748,28 @@ For performance also consider the following options:
 ----
 
 [[rowcounter]]
-=== RowCounter and CellCounter
+=== RowCounter
 
-link:https://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/RowCounter.html[RowCounter] is a mapreduce job to count all the rows of a table.
+link:https://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/RowCounter.html[RowCounter] is a mapreduce job to count all the rows of a table. This is a good utility to use
 as a sanity check to ensure that HBase can read all the blocks of a table if there are any concerns of metadata inconsistency.
-It will run the mapreduce all in a single process but it will run faster if you have a MapReduce cluster in place for it to exploit. It is also possible to limit
-the time range of data to be scanned by using the `--starttime=[starttime]` and `--endtime=[endtime]` flags.
+It will run the mapreduce all in a single process but it will run faster if you have a MapReduce cluster in place for it to exploit.
+It is possible to limit the time range of data to be scanned by using the `--starttime=[starttime]` and `--endtime=[endtime]` flags.
+The scanned data can be limited based on keys using the `--range=[startKey],[endKey][;[startKey],[endKey]...]` option.
 
 ----
-$ bin/hbase org.apache.hadoop.hbase.mapreduce.RowCounter <tablename> [<column1> <column2>...]
+$ bin/hbase rowcounter [options] <tablename> [--starttime=<start> --endtime=<end>] [--range=[startKey],[endKey][;[startKey],[endKey]...]] [<column1> <column2>...]
 ----
 
 RowCounter only counts one version per cell.
 
-Note: caching for the input Scan is configured via `hbase.client.scanner.caching` in the job configuration.
+For performance consider to use `-Dhbase.client.scanner.caching=100` and `-Dmapreduce.map.speculative=false` options.
+
+[[cellcounter]]
+=== CellCounter
 
 HBase ships another diagnostic mapreduce job called link:https://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/CellCounter.html[CellCounter].
 Like RowCounter, it gathers more fine-grained statistics about your table.
-The statistics gathered by RowCounter are more fine-grained and include:
+The statistics gathered by CellCounter are more fine-grained and include:
 
 * Total number of rows in the table.
 * Total number of CFs across all rows.
 * Total qualifiers across all rows.
 * Total occurrence of each CF.
 * Total occurrence of each qualifier.
 * Total number of versions of each qualifier.
 
 The program allows you to limit the scope of the run.
 Provide a row regex or prefix to limit the rows to analyze.
-Specify a time range to scan the table by using the `--starttime=[starttime]` and `--endtime=[endtime]` flags.
+Specify a time range to scan the table by using the `--starttime=<starttime>` and `--endtime=<endtime>` flags.
 Use `hbase.mapreduce.scan.column.family` to specify scanning a single column family.
 
 ----
-$ bin/hbase org.apache.hadoop.hbase.mapreduce.CellCounter <tablename> <outputDir> [regex or prefix]
+$ bin/hbase cellcounter <tablename> <outputDir> [reportSeparator] [regex or prefix] [--starttime=<starttime> --endtime=<endtime>]
 ----
 
 Note: just like RowCounter, caching for the input Scan is configured via `hbase.client.scanner.caching` in the job configuration.
@@ -785,8 +793,7 @@
 === mlockall
 
 It is possible to optionally pin your servers in physical memory making them less likely to be swapped out in oversubscribed environments by having the servers call link:http://linux.die.net/man/2/mlockall[mlockall] on startup.
-See link:https://issues.apache.org/jira/browse/HBASE-4391[HBASE-4391 Add ability to
- start RS as root and call mlockall] for how to build the optional library and have it run on startup.
+See link:https://issues.apache.org/jira/browse/HBASE-4391[HBASE-4391 Add ability to start RS as root and call mlockall] for how to build the optional library and have it run on startup.
 
 [[compaction.tool]]
 === Offline Compaction Tool
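Tying the refreshed documentation together, one last sketch of CellCounter restricted to a row prefix and a time window. The table name, output path, `,` separator, and `user` prefix are invented, and epoch-millisecond timestamps are an assumption:

----
$ bin/hbase cellcounter mytable /tmp/cellcounter-out ',' user \
    --starttime=1522540800000 --endtime=1523836800000
----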