http://git-wip-us.apache.org/repos/asf/hbase/blob/e80b3092/src/main/docbkx/hbase_apis.xml ---------------------------------------------------------------------- diff --git a/src/main/docbkx/hbase_apis.xml b/src/main/docbkx/hbase_apis.xml deleted file mode 100644 index bc35aba..0000000 --- a/src/main/docbkx/hbase_apis.xml +++ /dev/null @@ -1,133 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<chapter - version="5.0" - xml:id="hbase_apis" - xmlns="http://docbook.org/ns/docbook" - xmlns:xlink="http://www.w3.org/1999/xlink" - xmlns:xi="http://www.w3.org/2001/XInclude" - xmlns:svg="http://www.w3.org/2000/svg" - xmlns:m="http://www.w3.org/1998/Math/MathML" - xmlns:html="http://www.w3.org/1999/xhtml" - xmlns:db="http://docbook.org/ns/docbook"> - <!-- -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ ---> - <title>Apache HBase APIs</title> - <para>This chapter provides information about performing operations using HBase native APIs. This - information is not exhaustive, and provides a quick reference in addition to the <link - xlink:href="http://hbase.apache.org/apidocs/index.html">User API - Reference</link>. The examples here are not comprehensive or complete, and should be used for - purposes of illustration only.</para> - <para>Apache HBase also works with multiple external APIs. See <xref linkend="external_apis" /> - for more information.</para> - - <example> - <title>Create a Table Using Java</title> - <para>This example has been tested on HBase 0.96.1.1.</para> - <programlisting language="java"> -package com.example.hbase.admin; - -import java.io.IOException; - -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.HColumnDescriptor; -import org.apache.hadoop.hbase.HTableDescriptor; -import org.apache.hadoop.hbase.TableName; -import org.apache.hadoop.hbase.client.HBaseAdmin; -import org.apache.hadoop.hbase.io.compress.Compression.Algorithm; -import org.apache.hadoop.conf.Configuration; - -import static com.example.hbase.Constants.*; - -public class CreateSchema { - - public static void createOrOverwrite(HBaseAdmin admin, HTableDescriptor table) throws IOException { - if (admin.tableExists(table.getName())) { - admin.disableTable(table.getName()); - admin.deleteTable(table.getName()); - } - admin.createTable(table); - } - - public static void createSchemaTables (Configuration config) { - try { - final HBaseAdmin admin = new HBaseAdmin(config); - HTableDescriptor table = new HTableDescriptor(TableName.valueOf(TABLE_NAME)); - table.addFamily(new HColumnDescriptor(CF_DEFAULT).setCompressionType(Algorithm.SNAPPY)); - - System.out.print("Creating table. 
"); - createOrOverwrite(admin, table); - System.out.println(" Done."); - - admin.close(); - } catch (Exception e) { - e.printStackTrace(); - System.exit(-1); - } - } - - -} - - </programlisting> - </example> - <example> - <title>Add, Modify, and Delete a Table</title> - <para>This example has been tested on HBase 0.96.1.1.</para> - <programlisting language="java"> -public static void upgradeFrom0 (Configuration config) { - - try { - final HBaseAdmin admin = new HBaseAdmin(config); - TableName tableName = TableName.valueOf(TABLE_ASSETMETA); - HTableDescriptor table_assetmeta = new HTableDescriptor(tableName); - table_assetmeta.addFamily(new HColumnDescriptor(CF_DEFAULT).setCompressionType(Algorithm.SNAPPY)); - - // Create a new table. - - System.out.print("Creating table_assetmeta. "); - admin.createTable(table_assetmeta); - System.out.println(" Done."); - - // Update existing table - HColumnDescriptor newColumn = new HColumnDescriptor("NEWCF"); - newColumn.setCompactionCompressionType(Algorithm.GZ); - newColumn.setMaxVersions(HConstants.ALL_VERSIONS); - admin.addColumn(tableName, newColumn); - - // Disable an existing table - admin.disableTable(tableName); - - // Delete an existing column family - admin.deleteColumn(tableName, CF_DEFAULT); - - // Delete a table (Need to be disabled first) - admin.deleteTable(tableName); - - - admin.close(); - } catch (Exception e) { - e.printStackTrace(); - System.exit(-1); - } - } - </programlisting> - </example> - -</chapter>
http://git-wip-us.apache.org/repos/asf/hbase/blob/e80b3092/src/main/docbkx/hbase_history.xml ---------------------------------------------------------------------- diff --git a/src/main/docbkx/hbase_history.xml b/src/main/docbkx/hbase_history.xml deleted file mode 100644 index f7b9064..0000000 --- a/src/main/docbkx/hbase_history.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<appendix - xml:id="hbase.history" - version="5.0" - xmlns="http://docbook.org/ns/docbook" - xmlns:xlink="http://www.w3.org/1999/xlink" - xmlns:xi="http://www.w3.org/2001/XInclude" - xmlns:svg="http://www.w3.org/2000/svg" - xmlns:m="http://www.w3.org/1998/Math/MathML" - xmlns:html="http://www.w3.org/1999/xhtml" - xmlns:db="http://docbook.org/ns/docbook"> - <!--/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ ---> - <title>HBase History</title> - <itemizedlist> - <listitem><para>2006: <link xlink:href="http://research.google.com/archive/bigtable.html">BigTable</link> paper published by Google. - </para></listitem> - <listitem><para>2006 (end of year): HBase development starts. - </para></listitem> - <listitem><para>2008: HBase becomes Hadoop sub-project. - </para></listitem> - <listitem><para>2010: HBase becomes Apache top-level project. - </para></listitem> - </itemizedlist> -</appendix> http://git-wip-us.apache.org/repos/asf/hbase/blob/e80b3092/src/main/docbkx/hbck_in_depth.xml ---------------------------------------------------------------------- diff --git a/src/main/docbkx/hbck_in_depth.xml b/src/main/docbkx/hbck_in_depth.xml deleted file mode 100644 index e2ee34f..0000000 --- a/src/main/docbkx/hbck_in_depth.xml +++ /dev/null @@ -1,237 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<appendix - xml:id="hbck.in.depth" - version="5.0" - xmlns="http://docbook.org/ns/docbook" - xmlns:xlink="http://www.w3.org/1999/xlink" - xmlns:xi="http://www.w3.org/2001/XInclude" - xmlns:svg="http://www.w3.org/2000/svg" - xmlns:m="http://www.w3.org/1998/Math/MathML" - xmlns:html="http://www.w3.org/1999/xhtml" - xmlns:db="http://docbook.org/ns/docbook"> - <!--/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ ---> - - <title>hbck In Depth</title> - <para>HBaseFsck (hbck) is a tool for checking for region consistency and table integrity problems - and repairing a corrupted HBase. It works in two basic modes -- a read-only inconsistency - identifying mode and a multi-phase read-write repair mode. - </para> - <section> - <title>Running hbck to identify inconsistencies</title> - <para>To check to see if your HBase cluster has corruptions, run hbck against your HBase cluster:</para> - <programlisting language="bourne"> -$ ./bin/hbase hbck -</programlisting> - <para> - At the end of the command's output it prints OK or tells you the number of INCONSISTENCIES - present. You may also want to run hbck a few times because some inconsistencies can be - transient (e.g. cluster is starting up or a region is splitting). Operationally you may want to run - hbck regularly and set up an alert (e.g. via Nagios) if it repeatedly reports inconsistencies. - A run of hbck will report a list of inconsistencies along with a brief description of the regions and - tables affected. Using the <code>-details</code> option will report more details, including a representative - listing of all the splits present in all the tables. - </para> - <programlisting language="bourne"> -$ ./bin/hbase hbck -details -</programlisting> - <para>If you just want to know if some tables are corrupted, you can limit hbck to identify inconsistencies - in only specific tables. For example, the following command would only attempt to check tables - TableFoo and TableBar. The benefit is that hbck will run in less time.</para> - <programlisting language="bourne"> -$ ./bin/hbase hbck TableFoo TableBar -</programlisting> - </section> - <section><title>Inconsistencies</title> - <para> - If, after several runs, inconsistencies continue to be reported, you may have encountered a - corruption. These should be rare, but in the event they occur, newer versions of HBase include - the hbck tool enabled with automatic repair options. - </para> - <para> - There are two invariants that when violated create inconsistencies in HBase: - </para> - <itemizedlist> - <listitem><para>HBase's region consistency invariant is satisfied if every region is assigned and - deployed on exactly one region server, and all places where this state is kept are in - accordance.</para> - </listitem> - <listitem><para>HBase's table integrity invariant is satisfied if for each table, every possible row key - resolves to exactly one region.</para> - </listitem> - </itemizedlist> - <para> - Repairs generally work in three phases -- a read-only information gathering phase that identifies - inconsistencies, a table integrity repair phase that restores the table integrity invariant, and then - finally a region consistency repair phase that restores the region consistency invariant. - Starting from version 0.90.0, hbck could detect region consistency problems and report on a subset - of possible table integrity problems. It also included the ability to automatically fix the most - common inconsistency, region assignment and deployment consistency problems. This repair - could be done by using the <code>-fix</code> command line option. The repair closes regions if they are - open on the wrong server or on multiple region servers, and also assigns regions to region - servers if they are not open.
- </para> - <para> - Starting from HBase versions 0.90.7, 0.92.2 and 0.94.0, several new command line options were - introduced to aid in repairing a corrupted HBase. This hbck sometimes goes by the nickname - "uberhbck". Each particular version of uberhbck is compatible with HBase versions of the same - major version (0.90.7 uberhbck can repair a 0.90.4). However, versions <=0.90.6 and versions - <=0.92.1 may require restarting the master or failing over to a backup master. - </para> - </section> - <section><title>Localized repairs</title> - <para> - When repairing a corrupted HBase, it is best to repair the lowest risk inconsistencies first. - These are generally region consistency repairs -- localized single-region repairs that only modify - in-memory data, ephemeral ZooKeeper data, or patch holes in the META table. - Region consistency requires that the HBase instance has the state of the region's data in HDFS - (.regioninfo files), the region's row in the hbase:meta table, and the region's deployment/assignments on - region servers and the master in accordance. Options for repairing region consistency include: - <itemizedlist> - <listitem><para><code>-fixAssignments</code> (equivalent to the 0.90 <code>-fix</code> option) repairs unassigned, incorrectly - assigned or multiply assigned regions.</para> - </listitem> - <listitem><para><code>-fixMeta</code> which removes meta rows when corresponding regions are not present in - HDFS and adds new meta rows if the regions are present in HDFS but not in META.</para> - </listitem> - </itemizedlist> - To fix deployment and assignment problems you can run this command: - </para> - <programlisting language="bourne"> -$ ./bin/hbase hbck -fixAssignments -</programlisting> - <para>To fix deployment and assignment problems as well as repair incorrect meta rows, you can - run this command:</para> - <programlisting language="bourne"> -$ ./bin/hbase hbck -fixAssignments -fixMeta -</programlisting> - <para>There are a few classes of table integrity problems that are low risk repairs. The first two are - degenerate (startkey == endkey) regions and backwards regions (startkey > endkey). These are - automatically handled by sidelining the data to a temporary directory (/hbck/xxxx). - The third low-risk class is hdfs region holes. This can be repaired by using the:</para> - <itemizedlist> - <listitem><para><code>-fixHdfsHoles</code> option for fabricating new empty regions on the file system. - If holes are detected you can use -fixHdfsHoles and should include -fixMeta and -fixAssignments to make the new region consistent.</para> - </listitem> - </itemizedlist> - <programlisting language="bourne"> -$ ./bin/hbase hbck -fixAssignments -fixMeta -fixHdfsHoles -</programlisting> - <para>Since this is a common operation, we've added the <code>-repairHoles</code> flag that is equivalent to the - previous command:</para> - <programlisting language="bourne"> -$ ./bin/hbase hbck -repairHoles -</programlisting> - <para>If inconsistencies still remain after these steps, you most likely have table integrity problems - related to orphaned or overlapping regions.</para> - </section> - <section><title>Region Overlap Repairs</title> - <para>Table integrity problems can require repairs that deal with overlaps. This is a riskier operation - because it requires modifications to the file system, requires some decision making, and may - require some manual steps.
For these repairs it is best to analyze the output of a <code>hbck -details</code> - run so that you isolate repair attempts to only the problems the checks identify. Because this is - riskier, there are safeguards that should be used to limit the scope of the repairs. - WARNING: These options are relatively new and have only been tested on online but idle HBase instances - (no reads/writes). Use at your own risk in an active production environment! - The options for repairing table integrity violations include:</para> - <itemizedlist> - <listitem><para><code>-fixHdfsOrphans</code> option for "adopting" a region directory that is missing a region - metadata file (the .regioninfo file).</para> - </listitem> - <listitem><para><code>-fixHdfsOverlaps</code> option for fixing overlapping regions</para> - </listitem> - </itemizedlist> - <para>When repairing overlapping regions, a region's data can be modified on the file system in two - ways: 1) by merging regions into a larger region or 2) by sidelining regions by moving data to - a "sideline" directory where data could be restored later. Merging a large number of regions is - technically correct but could result in an extremely large region that requires a series of costly - compactions and splitting operations. In these cases, it is probably better to sideline the regions - that overlap with the most other regions (likely the largest ranges) so that merges can happen on - a more reasonable scale. Since these sidelined regions are already laid out in HBase's native - directory and HFile format, they can be restored by using HBase's bulk load mechanism. - The default safeguard thresholds are conservative. These options let you override the default - thresholds and enable the large region sidelining feature.</para> - <itemizedlist> - <listitem><para><code>-maxMerge <n></code> maximum number of overlapping regions to merge</para> - </listitem> - <listitem><para><code>-sidelineBigOverlaps</code> if more than maxMerge regions are overlapping, attempt - to sideline the regions overlapping with the most other regions.</para> - </listitem> - <listitem><para><code>-maxOverlapsToSideline <n></code> if sidelining large overlapping regions, sideline at most n - regions.</para> - </listitem> - </itemizedlist> - - <para>Since you would often just want to get the tables repaired, you can use this option to turn - on all repair options:</para> - <itemizedlist> - <listitem><para><code>-repair</code> includes all the region consistency options and only the hole-repairing table - integrity options.</para> - </listitem> - </itemizedlist> - <para>Finally, there are safeguards to limit repairs to only specific tables. For example, the following - command would only attempt to check and repair tables TableFoo and TableBar.</para> - <screen language="bourne"> -$ ./bin/hbase hbck -repair TableFoo TableBar -</screen> - <section><title>Special cases: Meta is not properly assigned</title> - <para>There are a few special cases that hbck can handle as well. - Sometimes the meta table's only region is inconsistently assigned or deployed. In this case - there is a special <code>-fixMetaOnly</code> option that can try to fix meta assignments.</para> - <screen language="bourne"> -$ ./bin/hbase hbck -fixMetaOnly -fixAssignments -</screen> - </section> - <section><title>Special cases: HBase version file is missing</title> - <para>HBase's data on the file system requires a version file in order to start.
If this file is missing, you - can use the <code>-fixVersionFile</code> option to fabricate a new HBase version file. This assumes that - the version of hbck you are running is the appropriate version for the HBase cluster.</para> - </section> - <section><title>Special case: Root and META are corrupt.</title> - <para>The most drastic corruption scenario is the case where the ROOT or META is corrupted and - HBase will not start. In this case you can use the OfflineMetaRepair tool to create new ROOT - and META regions and tables. - This tool assumes that HBase is offline. It then marches through the existing HBase home - directory and loads as much information from region metadata files (.regioninfo files) as possible - from the file system. If the region metadata has proper table integrity, it sidelines the original root - and meta table directories, and builds new ones with pointers to the region directories and their - data.</para> - <screen language="bourne"> -$ ./bin/hbase org.apache.hadoop.hbase.util.hbck.OfflineMetaRepair -</screen> - <para>NOTE: This tool is not as clever as uberhbck but can be used to bootstrap repairs that uberhbck - can complete. - If the tool succeeds you should be able to start HBase and run online repairs if necessary.</para> - </section> - <section><title>Special cases: Offline split parent</title> - <para> - Once a region is split, the offline parent will be cleaned up automatically. Sometimes, daughter regions - are split again before their parents are cleaned up. HBase can clean up parents in the right order. However, - there can sometimes be lingering offline split parents. They are in META, in HDFS, and not deployed. - But HBase can't clean them up. In this case, you can use the <code>-fixSplitParents</code> option to reset - them in META to be online and not split. Then hbck can merge them with other regions if the - option for fixing overlapping regions is used. - </para> - <para> - This option should not normally be used, and it is not in <code>-fixAll</code>. - </para> - </section> - </section> - -</appendix> http://git-wip-us.apache.org/repos/asf/hbase/blob/e80b3092/src/main/docbkx/mapreduce.xml ---------------------------------------------------------------------- diff --git a/src/main/docbkx/mapreduce.xml b/src/main/docbkx/mapreduce.xml deleted file mode 100644 index 9e9e474..0000000 --- a/src/main/docbkx/mapreduce.xml +++ /dev/null @@ -1,630 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<chapter - xml:id="mapreduce" - version="5.0" - xmlns="http://docbook.org/ns/docbook" - xmlns:xlink="http://www.w3.org/1999/xlink" - xmlns:xi="http://www.w3.org/2001/XInclude" - xmlns:svg="http://www.w3.org/2000/svg" - xmlns:m="http://www.w3.org/1998/Math/MathML" - xmlns:html="http://www.w3.org/1999/xhtml" - xmlns:db="http://docbook.org/ns/docbook"> - <!--/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and - * limitations under the License. - */ ---> - - <title>HBase and MapReduce</title> - <para>Apache MapReduce is a software framework used to analyze large amounts of data, and is - the framework used most often with <link - xlink:href="http://hadoop.apache.org/">Apache Hadoop</link>. MapReduce itself is out of the - scope of this document. A good place to get started with MapReduce is <link - xlink:href="http://hadoop.apache.org/docs/r1.2.1/mapred_tutorial.html" />. MapReduce version - 2 (MR2)is now part of <link - xlink:href="http://hadoop.apache.org/docs/r2.3.0/hadoop-yarn/hadoop-yarn-site/">YARN</link>. </para> - - <para> This chapter discusses specific configuration steps you need to take to use MapReduce on - data within HBase. In addition, it discusses other interactions and issues between HBase and - MapReduce jobs. - <note> - <title>mapred and mapreduce</title> - <para>There are two mapreduce packages in HBase as in MapReduce itself: <filename>org.apache.hadoop.hbase.mapred</filename> - and <filename>org.apache.hadoop.hbase.mapreduce</filename>. The former does old-style API and the latter - the new style. The latter has more facility though you can usually find an equivalent in the older - package. Pick the package that goes with your mapreduce deploy. When in doubt or starting over, pick the - <filename>org.apache.hadoop.hbase.mapreduce</filename>. In the notes below, we refer to - o.a.h.h.mapreduce but replace with the o.a.h.h.mapred if that is what you are using. - </para> - </note> - </para> - - <section - xml:id="hbase.mapreduce.classpath"> - <title>HBase, MapReduce, and the CLASSPATH</title> - <para>By default, MapReduce jobs deployed to a MapReduce cluster do not have access to either - the HBase configuration under <envar>$HBASE_CONF_DIR</envar> or the HBase classes.</para> - <para>To give the MapReduce jobs the access they need, you could add - <filename>hbase-site.xml</filename> to the - <filename><replaceable>$HADOOP_HOME</replaceable>/conf/</filename> directory and add the - HBase JARs to the <filename><replaceable>HADOOP_HOME</replaceable>/conf/</filename> - directory, then copy these changes across your cluster. You could add hbase-site.xml to - $HADOOP_HOME/conf and add HBase jars to the $HADOOP_HOME/lib. You would then need to copy - these changes across your cluster or edit - <filename><replaceable>$HADOOP_HOME</replaceable>conf/hadoop-env.sh</filename> and add - them to the <envar>HADOOP_CLASSPATH</envar> variable. However, this approach is not - recommended because it will pollute your Hadoop install with HBase references. It also - requires you to restart the Hadoop cluster before Hadoop can use the HBase data.</para> - <para> Since HBase 0.90.x, HBase adds its dependency JARs to the job configuration itself. The - dependencies only need to be available on the local CLASSPATH. The following example runs - the bundled HBase <link - xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/RowCounter.html">RowCounter</link> - MapReduce job against a table named <systemitem>usertable</systemitem> If you have not set - the environment variables expected in the command (the parts prefixed by a - <literal>$</literal> sign and curly braces), you can use the actual system paths instead. - Be sure to use the correct version of the HBase JAR for your system. 
The backticks - (<literal>`</literal> symbols) cause ths shell to execute the sub-commands, setting the - CLASSPATH as part of the command. This example assumes you use a BASH-compatible shell. </para> - <screen language="bourne">$ <userinput>HADOOP_CLASSPATH=`${HBASE_HOME}/bin/hbase classpath` ${HADOOP_HOME}/bin/hadoop jar ${HBASE_HOME}/hbase-server-VERSION.jar rowcounter usertable</userinput></screen> - <para>When the command runs, internally, the HBase JAR finds the dependencies it needs for - zookeeper, guava, and its other dependencies on the passed <envar>HADOOP_CLASSPATH</envar> - and adds the JARs to the MapReduce job configuration. See the source at - TableMapReduceUtil#addDependencyJars(org.apache.hadoop.mapreduce.Job) for how this is done. </para> - <note> - <para> The example may not work if you are running HBase from its build directory rather - than an installed location. You may see an error like the following:</para> - <screen>java.lang.RuntimeException: java.lang.ClassNotFoundException: org.apache.hadoop.hbase.mapreduce.RowCounter$RowCounterMapper</screen> - <para>If this occurs, try modifying the command as follows, so that it uses the HBase JARs - from the <filename>target/</filename> directory within the build environment.</para> - <screen language="bourne">$ <userinput>HADOOP_CLASSPATH=${HBASE_HOME}/hbase-server/target/hbase-server-VERSION-SNAPSHOT.jar:`${HBASE_HOME}/bin/hbase classpath` ${HADOOP_HOME}/bin/hadoop jar ${HBASE_HOME}/hbase-server/target/hbase-server-VERSION-SNAPSHOT.jar rowcounter usertable</userinput></screen> - </note> - <caution> - <title>Notice to Mapreduce users of HBase 0.96.1 and above</title> - <para>Some mapreduce jobs that use HBase fail to launch. The symptom is an exception similar - to the following:</para> - <screen> -Exception in thread "main" java.lang.IllegalAccessError: class - com.google.protobuf.ZeroCopyLiteralByteString cannot access its superclass - com.google.protobuf.LiteralByteString - at java.lang.ClassLoader.defineClass1(Native Method) - at java.lang.ClassLoader.defineClass(ClassLoader.java:792) - at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142) - at java.net.URLClassLoader.defineClass(URLClassLoader.java:449) - at java.net.URLClassLoader.access$100(URLClassLoader.java:71) - at java.net.URLClassLoader$1.run(URLClassLoader.java:361) - at java.net.URLClassLoader$1.run(URLClassLoader.java:355) - at java.security.AccessController.doPrivileged(Native Method) - at java.net.URLClassLoader.findClass(URLClassLoader.java:354) - at java.lang.ClassLoader.loadClass(ClassLoader.java:424) - at java.lang.ClassLoader.loadClass(ClassLoader.java:357) - at - org.apache.hadoop.hbase.protobuf.ProtobufUtil.toScan(ProtobufUtil.java:818) - at - org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.convertScanToString(TableMapReduceUtil.java:433) - at - org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.initTableMapperJob(TableMapReduceUtil.java:186) - at - org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.initTableMapperJob(TableMapReduceUtil.java:147) - at - org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.initTableMapperJob(TableMapReduceUtil.java:270) - at - org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.initTableMapperJob(TableMapReduceUtil.java:100) -... -</screen> - <para>This is caused by an optimization introduced in <link - xlink:href="https://issues.apache.org/jira/browse/HBASE-9867">HBASE-9867</link> that - inadvertently introduced a classloader dependency. 
</para> - <para>This affects both jobs using the <code>-libjars</code> option and "fat jar," those - which package their runtime dependencies in a nested <code>lib</code> folder.</para> - <para>In order to satisfy the new classloader requirements, hbase-protocol.jar must be - included in Hadoop's classpath. See <xref - linkend="hbase.mapreduce.classpath" /> for current recommendations for resolving - classpath errors. The following is included for historical purposes.</para> - <para>This can be resolved system-wide by including a reference to the hbase-protocol.jar in - hadoop's lib directory, via a symlink or by copying the jar into the new location.</para> - <para>This can also be achieved on a per-job launch basis by including it in the - <code>HADOOP_CLASSPATH</code> environment variable at job submission time. When - launching jobs that package their dependencies, all three of the following job launching - commands satisfy this requirement:</para> - <screen language="bourne"> -$ <userinput>HADOOP_CLASSPATH=/path/to/hbase-protocol.jar:/path/to/hbase/conf hadoop jar MyJob.jar MyJobMainClass</userinput> -$ <userinput>HADOOP_CLASSPATH=$(hbase mapredcp):/path/to/hbase/conf hadoop jar MyJob.jar MyJobMainClass</userinput> -$ <userinput>HADOOP_CLASSPATH=$(hbase classpath) hadoop jar MyJob.jar MyJobMainClass</userinput> - </screen> - <para>For jars that do not package their dependencies, the following command structure is - necessary:</para> - <screen language="bourne"> -$ <userinput>HADOOP_CLASSPATH=$(hbase mapredcp):/etc/hbase/conf hadoop jar MyApp.jar MyJobMainClass -libjars $(hbase mapredcp | tr ':' ',')</userinput> ... - </screen> - <para>See also <link - xlink:href="https://issues.apache.org/jira/browse/HBASE-10304">HBASE-10304</link> for - further discussion of this issue.</para> - </caution> - </section> - - <section> - <title>MapReduce Scan Caching</title> - <para>TableMapReduceUtil now restores the option to set scanner caching (the number of rows - which are cached before returning the result to the client) on the Scan object that is - passed in. This functionality was lost due to a bug in HBase 0.95 (<link - xlink:href="https://issues.apache.org/jira/browse/HBASE-11558">HBASE-11558</link>), which - is fixed for HBase 0.98.5 and 0.96.3. The priority order for choosing the scanner caching is - as follows:</para> - <orderedlist> - <listitem> - <para>Caching settings which are set on the scan object.</para> - </listitem> - <listitem> - <para>Caching settings which are specified via the configuration option - <option>hbase.client.scanner.caching</option>, which can either be set manually in - <filename>hbase-site.xml</filename> or via the helper method - <code>TableMapReduceUtil.setScannerCaching()</code>.</para> - </listitem> - <listitem> - <para>The default value <code>HConstants.DEFAULT_HBASE_CLIENT_SCANNER_CACHING</code>, which is set to - <literal>100</literal>.</para> - </listitem> - </orderedlist> - <para>Optimizing the caching settings is a balance between the time the client waits for a - result and the number of sets of results the client needs to receive. If the caching setting - is too large, the client could end up waiting for a long time or the request could even time - out. If the setting is too small, the scan needs to return results in several pieces. 
- If you think of the scan as a shovel, a bigger cache setting is analogous to a bigger - shovel, and a smaller cache setting is equivalent to more shoveling in order to fill the - bucket.</para> - <para>The list of priorities mentioned above allows you to set a reasonable default, and - override it for specific operations.</para> - <para>See the API documentation for <link - xlink:href="https://hbase.apache.org/apidocs/org/apache/hadoop/hbase/client/Scan.html" - >Scan</link> for more details.</para> - </section> - - <section> - <title>Bundled HBase MapReduce Jobs</title> - <para>The HBase JAR also serves as a Driver for some bundled mapreduce jobs. To learn about - the bundled MapReduce jobs, run the following command.</para> - - <screen language="bourne">$ <userinput>${HADOOP_HOME}/bin/hadoop jar ${HBASE_HOME}/hbase-server-VERSION.jar</userinput> -<computeroutput>An example program must be given as the first argument. -Valid program names are: - copytable: Export a table from local cluster to peer cluster - completebulkload: Complete a bulk data load. - export: Write table data to HDFS. - import: Import data written by Export. - importtsv: Import data in TSV format. - rowcounter: Count rows in HBase table</computeroutput> - </screen> - <para>Each of the valid program names are bundled MapReduce jobs. To run one of the jobs, - model your command after the following example.</para> - <screen language="bourne">$ <userinput>${HADOOP_HOME}/bin/hadoop jar ${HBASE_HOME}/hbase-server-VERSION.jar rowcounter myTable</userinput></screen> - </section> - - <section> - <title>HBase as a MapReduce Job Data Source and Data Sink</title> - <para>HBase can be used as a data source, <link - xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html">TableInputFormat</link>, - and data sink, <link - xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableOutputFormat.html">TableOutputFormat</link> - or <link - xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/MultiTableOutputFormat.html">MultiTableOutputFormat</link>, - for MapReduce jobs. Writing MapReduce jobs that read or write HBase, it is advisable to - subclass <link - xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableMapper.html">TableMapper</link> - and/or <link - xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableReducer.html">TableReducer</link>. - See the do-nothing pass-through classes <link - xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/IdentityTableMapper.html">IdentityTableMapper</link> - and <link - xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/IdentityTableReducer.html">IdentityTableReducer</link> - for basic usage. For a more involved example, see <link - xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/RowCounter.html">RowCounter</link> - or review the <code>org.apache.hadoop.hbase.mapreduce.TestTableMapReduce</code> unit test. </para> - <para>If you run MapReduce jobs that use HBase as source or sink, need to specify source and - sink table and column names in your configuration.</para> - - <para>When you read from HBase, the <code>TableInputFormat</code> requests the list of regions - from HBase and makes a map, which is either a <code>map-per-region</code> or - <code>mapreduce.job.maps</code> map, whichever is smaller. 
If your job only has two maps, - raise <code>mapreduce.job.maps</code> to a number greater than the number of regions. Maps - will run on the adjacent TaskTracker if you are running a TaskTracker and RegionServer per - node. When writing to HBase, it may make sense to avoid the Reduce step and write back into - HBase from within your map. This approach works when your job does not need the sort and - collation that MapReduce does on the map-emitted data. On insert, HBase 'sorts' so there is - no point double-sorting (and shuffling data around your MapReduce cluster) unless you need - to. If you do not need the Reduce, your map might emit counts of records processed for - reporting at the end of the job, or set the number of Reduces to zero and use - TableOutputFormat. If running the Reduce step makes sense in your case, you should typically - use multiple reducers so that load is spread across the HBase cluster.</para> - - <para>A new HBase partitioner, the <link - xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/HRegionPartitioner.html">HRegionPartitioner</link>, - can run as many reducers as there are existing regions. The HRegionPartitioner is suitable - when your table is large and your upload will not greatly alter the number of existing - regions upon completion. Otherwise use the default partitioner. </para> - </section> - - <section> - <title>Writing HFiles Directly During Bulk Import</title> - <para>If you are importing into a new table, you can bypass the HBase API and write your - content directly to the filesystem, formatted into HBase data files (HFiles). Your import - will run faster, perhaps an order of magnitude faster. For more on how this mechanism works, - see <xref - linkend="arch.bulk.load" />.</para> - </section> - - <section> - <title>RowCounter Example</title> - <para>The included <link - xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/RowCounter.html">RowCounter</link> - MapReduce job uses <code>TableInputFormat</code> and does a count of all rows in the specified - table. To run it, use the following command: </para> - <screen language="bourne">$ <userinput>./bin/hadoop jar hbase-X.X.X.jar</userinput></screen> - <para>This will - invoke the HBase MapReduce Driver class. Select <literal>rowcounter</literal> from the choice of jobs - offered. This will print rowcounter usage advice to standard output. Specify the table name, - column to count, and output - directory. If you have classpath errors, see <xref linkend="hbase.mapreduce.classpath" />.</para> - </section> - - <section - xml:id="splitter"> - <title>Map-Task Splitting</title> - <section - xml:id="splitter.default"> - <title>The Default HBase MapReduce Splitter</title> - <para>When <link - xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html">TableInputFormat</link> - is used to source an HBase table in a MapReduce job, its splitter will make a map task for - each region of the table. Thus, if there are 100 regions in the table, there will be 100 - map-tasks for the job - regardless of how many column families are selected in the - Scan.</para> - </section> - <section - xml:id="splitter.custom"> - <title>Custom Splitters</title> - <para>For those interested in implementing custom splitters, see the method - <code>getSplits</code> in <link - xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormatBase.html">TableInputFormatBase</link>.
- That is where the logic for map-task assignment resides. </para> - </section> - </section> - <section - xml:id="mapreduce.example"> - <title>HBase MapReduce Examples</title> - <section - xml:id="mapreduce.example.read"> - <title>HBase MapReduce Read Example</title> - <para>The following is an example of using HBase as a MapReduce source in read-only manner. - Specifically, there is a Mapper instance but no Reducer, and nothing is being emitted from - the Mapper. There job would be defined as follows...</para> - <programlisting language="java"> -Configuration config = HBaseConfiguration.create(); -Job job = new Job(config, "ExampleRead"); -job.setJarByClass(MyReadJob.class); // class that contains mapper - -Scan scan = new Scan(); -scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs -scan.setCacheBlocks(false); // don't set to true for MR jobs -// set other scan attrs -... - -TableMapReduceUtil.initTableMapperJob( - tableName, // input HBase table name - scan, // Scan instance to control CF and attribute selection - MyMapper.class, // mapper - null, // mapper output key - null, // mapper output value - job); -job.setOutputFormatClass(NullOutputFormat.class); // because we aren't emitting anything from mapper - -boolean b = job.waitForCompletion(true); -if (!b) { - throw new IOException("error with job!"); -} - </programlisting> - <para>...and the mapper instance would extend <link - xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableMapper.html">TableMapper</link>...</para> - <programlisting language="java"> -public static class MyMapper extends TableMapper<Text, Text> { - - public void map(ImmutableBytesWritable row, Result value, Context context) throws InterruptedException, IOException { - // process data for the row from the Result instance. - } -} - </programlisting> - </section> - <section - xml:id="mapreduce.example.readwrite"> - <title>HBase MapReduce Read/Write Example</title> - <para>The following is an example of using HBase both as a source and as a sink with - MapReduce. This example will simply copy data from one table to another.</para> - <programlisting language="java"> -Configuration config = HBaseConfiguration.create(); -Job job = new Job(config,"ExampleReadWrite"); -job.setJarByClass(MyReadWriteJob.class); // class that contains mapper - -Scan scan = new Scan(); -scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs -scan.setCacheBlocks(false); // don't set to true for MR jobs -// set other scan attrs - -TableMapReduceUtil.initTableMapperJob( - sourceTable, // input table - scan, // Scan instance to control CF and attribute selection - MyMapper.class, // mapper class - null, // mapper output key - null, // mapper output value - job); -TableMapReduceUtil.initTableReducerJob( - targetTable, // output table - null, // reducer class - job); -job.setNumReduceTasks(0); - -boolean b = job.waitForCompletion(true); -if (!b) { - throw new IOException("error with job!"); -} - </programlisting> - <para>An explanation is required of what <classname>TableMapReduceUtil</classname> is doing, - especially with the reducer. 
<link - xlink:href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableOutputFormat.html">TableOutputFormat</link> - is being used as the outputFormat class, and several parameters are being set on the - config (e.g., TableOutputFormat.OUTPUT_TABLE), as well as setting the reducer output key - to <classname>ImmutableBytesWritable</classname> and reducer value to - <classname>Writable</classname>. These could be set by the programmer on the job and - conf, but <classname>TableMapReduceUtil</classname> tries to make things easier.</para> - <para>The following is the example mapper, which will create a <classname>Put</classname> - and matching the input <classname>Result</classname> and emit it. Note: this is what the - CopyTable utility does. </para> - <programlisting language="java"> -public static class MyMapper extends TableMapper<ImmutableBytesWritable, Put> { - - public void map(ImmutableBytesWritable row, Result value, Context context) throws IOException, InterruptedException { - // this example is just copying the data from the source table... - context.write(row, resultToPut(row,value)); - } - - private static Put resultToPut(ImmutableBytesWritable key, Result result) throws IOException { - Put put = new Put(key.get()); - for (KeyValue kv : result.raw()) { - put.add(kv); - } - return put; - } -} - </programlisting> - <para>There isn't actually a reducer step, so <classname>TableOutputFormat</classname> takes - care of sending the <classname>Put</classname> to the target table. </para> - <para>This is just an example, developers could choose not to use - <classname>TableOutputFormat</classname> and connect to the target table themselves. - </para> - </section> - <section - xml:id="mapreduce.example.readwrite.multi"> - <title>HBase MapReduce Read/Write Example With Multi-Table Output</title> - <para>TODO: example for <classname>MultiTableOutputFormat</classname>. </para> - </section> - <section - xml:id="mapreduce.example.summary"> - <title>HBase MapReduce Summary to HBase Example</title> - <para>The following example uses HBase as a MapReduce source and sink with a summarization - step. This example will count the number of distinct instances of a value in a table and - write those summarized counts in another table. - <programlisting language="java"> -Configuration config = HBaseConfiguration.create(); -Job job = new Job(config,"ExampleSummary"); -job.setJarByClass(MySummaryJob.class); // class that contains mapper and reducer - -Scan scan = new Scan(); -scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs -scan.setCacheBlocks(false); // don't set to true for MR jobs -// set other scan attrs - -TableMapReduceUtil.initTableMapperJob( - sourceTable, // input table - scan, // Scan instance to control CF and attribute selection - MyMapper.class, // mapper class - Text.class, // mapper output key - IntWritable.class, // mapper output value - job); -TableMapReduceUtil.initTableReducerJob( - targetTable, // output table - MyTableReducer.class, // reducer class - job); -job.setNumReduceTasks(1); // at least one, adjust as required - -boolean b = job.waitForCompletion(true); -if (!b) { - throw new IOException("error with job!"); -} - </programlisting> - In this example mapper a column with a String-value is chosen as the value to summarize - upon. This value is used as the key to emit from the mapper, and an - <classname>IntWritable</classname> represents an instance counter. 
- <programlisting language="java"> -public static class MyMapper extends TableMapper<Text, IntWritable> { - public static final byte[] CF = "cf".getBytes(); - public static final byte[] ATTR1 = "attr1".getBytes(); - - private final IntWritable ONE = new IntWritable(1); - private Text text = new Text(); - - public void map(ImmutableBytesWritable row, Result value, Context context) throws IOException, InterruptedException { - String val = new String(value.getValue(CF, ATTR1)); - text.set(val); // we can only emit Writables... - - context.write(text, ONE); - } -} - </programlisting> - In the reducer, the "ones" are counted (just like any other MR example that does this), - and then emits a <classname>Put</classname>. - <programlisting language="java"> -public static class MyTableReducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable> { - public static final byte[] CF = "cf".getBytes(); - public static final byte[] COUNT = "count".getBytes(); - - public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { - int i = 0; - for (IntWritable val : values) { - i += val.get(); - } - Put put = new Put(Bytes.toBytes(key.toString())); - put.add(CF, COUNT, Bytes.toBytes(i)); - - context.write(null, put); - } -} - </programlisting> - </para> - </section> - <section - xml:id="mapreduce.example.summary.file"> - <title>HBase MapReduce Summary to File Example</title> - <para>This very similar to the summary example above, with exception that this is using - HBase as a MapReduce source but HDFS as the sink. The differences are in the job setup and - in the reducer. The mapper remains the same. </para> - <programlisting language="java"> -Configuration config = HBaseConfiguration.create(); -Job job = new Job(config,"ExampleSummaryToFile"); -job.setJarByClass(MySummaryFileJob.class); // class that contains mapper and reducer - -Scan scan = new Scan(); -scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs -scan.setCacheBlocks(false); // don't set to true for MR jobs -// set other scan attrs - -TableMapReduceUtil.initTableMapperJob( - sourceTable, // input table - scan, // Scan instance to control CF and attribute selection - MyMapper.class, // mapper class - Text.class, // mapper output key - IntWritable.class, // mapper output value - job); -job.setReducerClass(MyReducer.class); // reducer class -job.setNumReduceTasks(1); // at least one, adjust as required -FileOutputFormat.setOutputPath(job, new Path("/tmp/mr/mySummaryFile")); // adjust directories as required - -boolean b = job.waitForCompletion(true); -if (!b) { - throw new IOException("error with job!"); -} - </programlisting> - <para>As stated above, the previous Mapper can run unchanged with this example. As for the - Reducer, it is a "generic" Reducer instead of extending TableMapper and emitting - Puts.</para> - <programlisting language="java"> - public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> { - - public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { - int i = 0; - for (IntWritable val : values) { - i += val.get(); - } - context.write(key, new IntWritable(i)); - } -} - </programlisting> - </section> - <section - xml:id="mapreduce.example.summary.noreducer"> - <title>HBase MapReduce Summary to HBase Without Reducer</title> - <para>It is also possible to perform summaries without a reducer - if you use HBase as the - reducer. 
</para> - <para>An HBase target table would need to exist for the job summary. The Table method - <code>incrementColumnValue</code> would be used to atomically increment values. From a - performance perspective, it might make sense to keep a Map of values with the amounts to - be incremented for each map-task, and make one update per key during the <code> - cleanup</code> method of the mapper. However, your mileage may vary depending on the - number of rows to be processed and the number of unique keys. </para> - <para>In the end, the summary results are in HBase. </para> - </section> - <section - xml:id="mapreduce.example.summary.rdbms"> - <title>HBase MapReduce Summary to RDBMS</title> - <para>Sometimes it is more appropriate to generate summaries to an RDBMS. For these cases, - it is possible to generate summaries directly to an RDBMS via a custom reducer. The - <code>setup</code> method can connect to an RDBMS (the connection information can be - passed via custom parameters in the context) and the cleanup method can close the - connection. </para> - <para>It is critical to understand that the number of reducers for the job affects the - summarization implementation, and you'll have to design this into your reducer. - Specifically, whether it is designed to run as a singleton (one reducer) or multiple - reducers. Neither is right or wrong, it depends on your use-case. Recognize that the more - reducers that are assigned to the job, the more simultaneous connections to the RDBMS will - be created - this will scale, but only to a point. </para> - <programlisting language="java"> - public static class MyRdbmsReducer extends Reducer<Text, IntWritable, Text, IntWritable> { - - private Connection c = null; - - public void setup(Context context) { - // create DB connection... - } - - public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { - // do summarization - // in this example the keys are Text, but this is just an example - } - - public void cleanup(Context context) { - // close db connection - } - -} - </programlisting> - <para>In the end, the summary results are written to your RDBMS table(s). </para> - </section> - - </section> - <!-- mr examples --> - <section - xml:id="mapreduce.htable.access"> - <title>Accessing Other HBase Tables in a MapReduce Job</title> - <para>Although the framework currently allows one HBase table as input to a MapReduce job, - other HBase tables can be accessed as lookup tables, etc., in a MapReduce job by creating - a Table instance in the setup method of the Mapper. - <programlisting language="java">public class MyMapper extends TableMapper<Text, LongWritable> { - private Table myOtherTable; - - public void setup(Context context) { - // In here create a Connection to the cluster and save it or use the Connection - // from the existing table - myOtherTable = connection.getTable(TableName.valueOf("myOtherTable")); - } - - public void map(ImmutableBytesWritable row, Result value, Context context) throws IOException, InterruptedException { - // process Result... - // use 'myOtherTable' for lookups - } -} - </programlisting> - </para> - </section> - <section - xml:id="mapreduce.specex"> - <title>Speculative Execution</title> - <para>It is generally advisable to turn off speculative execution for MapReduce jobs that use - HBase as a source. This can either be done on a per-Job basis through properties, or on the - entire cluster.
Especially for longer running jobs, speculative execution will create - duplicate map-tasks which will double-write your data to HBase; this is probably not what - you want. </para> - <para>See <xref - linkend="spec.ex" /> for more information. </para> - </section> - -</chapter>
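To make that last point concrete: disabling speculative execution per job is typically a matter of setting the standard Hadoop speculative-execution properties on the job configuration before the job is built. A minimal sketch, assuming the MRv2 property names (mapreduce.map.speculative and mapreduce.reduce.speculative); older MRv1 deployments use mapred.map.tasks.speculative.execution and mapred.reduce.tasks.speculative.execution instead.
<programlisting language="java">
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.mapreduce.Job;

Configuration config = HBaseConfiguration.create();
// Turn speculative execution off for this job only, so map tasks scanning
// HBase are not launched twice and data is not double-written.
config.setBoolean("mapreduce.map.speculative", false);
config.setBoolean("mapreduce.reduce.speculative", false);
Job job = new Job(config, "ExampleNoSpeculation");
// ... configure the job with TableMapReduceUtil as in the examples above ...
</programlisting>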
