Author: gates
Date: Tue Apr 24 00:59:36 2012
New Revision: 1329516

URL: http://svn.apache.org/viewvc?rev=1329516&view=rev
Log:
HCATALOG-372 Add filter information to Load/Store and Input/Output docs

Modified:
    incubator/hcatalog/branches/branch-0.4/CHANGES.txt
    incubator/hcatalog/branches/branch-0.4/src/docs/src/documentation/content/xdocs/inputoutput.xml
    incubator/hcatalog/branches/branch-0.4/src/docs/src/documentation/content/xdocs/loadstore.xml

Modified: incubator/hcatalog/branches/branch-0.4/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/hcatalog/branches/branch-0.4/CHANGES.txt?rev=1329516&r1=1329515&r2=1329516&view=diff
==============================================================================
--- incubator/hcatalog/branches/branch-0.4/CHANGES.txt (original)
+++ incubator/hcatalog/branches/branch-0.4/CHANGES.txt Tue Apr 24 00:59:36 2012
@@ -79,6 +79,8 @@ Release 0.4.0 - Unreleased
  HCAT-2 Support nested schema conversion between Hive and Pig (julienledem via hashutosh)
 
   IMPROVEMENTS
+  HCAT-372 Add filter information to Load/Store and Input/Output docs (lefty via gates)
+
   HCAT-379 Fix mapred.out.dir hack in FileOutputCommitterContainer (toffer)
 
   HCAT-69 Fix token usage from HCat (toffer)

Modified: incubator/hcatalog/branches/branch-0.4/src/docs/src/documentation/content/xdocs/inputoutput.xml
URL: http://svn.apache.org/viewvc/incubator/hcatalog/branches/branch-0.4/src/docs/src/documentation/content/xdocs/inputoutput.xml?rev=1329516&r1=1329515&r2=1329516&view=diff
==============================================================================
--- incubator/hcatalog/branches/branch-0.4/src/docs/src/documentation/content/xdocs/inputoutput.xml (original)
+++ incubator/hcatalog/branches/branch-0.4/src/docs/src/documentation/content/xdocs/inputoutput.xml Tue Apr 24 00:59:36 2012
@@ -148,8 +148,29 @@ pass the Hive and HCatalog jars MapReduc
 <source>
 export HADOOP_HOME=&lt;path_to_hadoop_install&gt;
 export HCAT_HOME=&lt;path_to_hcat_install&gt;
-export LIB_JARS=$HCAT_HOME/share/hcatalog/hcatalog-0.4.0.jar,$HCAT_HOME/share/hcatalog/lib/hive-metastore-0.8.1.jar,$HCAT_HOME/share/hcatalog/lib/libthrift-0.7.0.jar,$HCAT_HOME/share/hcatalog/lib/hive-exec-0.8.1.jar,$HCAT_HOME/share/hcatalog/lib/libfb303-0.7.0.jar,$HCAT_HOME/share/hcatalog/lib/jdo2-api-2.3-ec.jar,$HCAT_HOME/share/hcatalog/lib/slf4j-api-1.6.1.jar,$HCAT_HOME/share/hcatalog/lib/antlr-runtime-3.0.1.jar,$HCAT_HOME/share/hcatalog/lib/datanucleus-connectionpool-2.0.3.jar,$HCAT_HOME/share/hcatalog/lib/datanucleus-core-2.0.3.jar,$HCAT_HOME/share/hcatalog/lib/datanucleus-enhancer-2.0.3.jar,$HCAT_HOME/share/hcatalog/lib/datanucleus-rdbms-2.0.3.jar,$HCAT_HOME/share/hcatalog/lib/commons-dbcp-1.4.jar,$HCAT_HOME/share/hcatalog/lib/commons-pool-1.5.4.jar
-export HADOOP_CLASSPATH=$HCAT_HOME/share/hcatalog/hcatalog-0.4.0.jar:$HCAT_HOME/share/hcatalog/lib/hive-metastore-0.8.1.jar:$HCAT_HOME/share/hcatalog/lib/libthrift-0.7.0.jar:$HCAT_HOME/share/hcatalog/lib/hive-exec-0.8.1.jar:$HCAT_HOME/share/hcatalog/lib/libfb303-0.7.0.jar:$HCAT_HOME/share/hcatalog/lib/jdo2-api-2.3-ec.jar:$HCAT_HOME/share/hcatalog/lib/slf4j-api-1.6.1.jar:$HCAT_HOME/share/hcatalog/lib/antlr-runtime-3.0.1.jar:$HCAT_HOME/share/hcatalog/lib/datanucleus-connectionpool-2.0.3.jar:$HCAT_HOME/share/hcatalog/lib/datanucleus-core-2.0.3.jar:$HCAT_HOME/share/hcatalog/lib/datanucleus-enhancer-2.0.3.jar:$HCAT_HOME/share/hcatalog/lib/datanucleus-rdbms-2.0.3.jar:$HCAT_HOME/share/hcatalog/lib/commons-dbcp-1.4.jar:$HCAT_HOME/share/hcatalog/lib/commons-pool-1.5.4.jar:$HCAT_HOME/etc/hcatalog
+export LIB_JARS=$HCAT_HOME/share/hcatalog/hcatalog-0.4.0.jar,
+$HCAT_HOME/share/hcatalog/lib/hive-metastore-0.8.1.jar,$HCAT_HOME/share/hcatalog/lib/libthrift-0.7.0.jar,
+$HCAT_HOME/share/hcatalog/lib/hive-exec-0.8.1.jar,$HCAT_HOME/share/hcatalog/lib/libfb303-0.7.0.jar,
+$HCAT_HOME/share/hcatalog/lib/jdo2-api-2.3-ec.jar,$HCAT_HOME/share/hcatalog/lib/slf4j-api-1.6.1.jar,
+$HCAT_HOME/share/hcatalog/lib/antlr-runtime-3.0.1.jar,
+$HCAT_HOME/share/hcatalog/lib/datanucleus-connectionpool-2.0.3.jar,
+$HCAT_HOME/share/hcatalog/lib/datanucleus-core-2.0.3.jar,
+$HCAT_HOME/share/hcatalog/lib/datanucleus-enhancer-2.0.3.jar,
+$HCAT_HOME/share/hcatalog/lib/datanucleus-rdbms-2.0.3.jar,
+$HCAT_HOME/share/hcatalog/lib/commons-dbcp-1.4.jar,
+$HCAT_HOME/share/hcatalog/lib/commons-pool-1.5.4.jar
+export HADOOP_CLASSPATH=$HCAT_HOME/share/hcatalog/hcatalog-0.4.0.jar:
+$HCAT_HOME/share/hcatalog/lib/hive-metastore-0.8.1.jar:$HCAT_HOME/share/hcatalog/lib/libthrift-0.7.0.jar:
+$HCAT_HOME/share/hcatalog/lib/hive-exec-0.8.1.jar:$HCAT_HOME/share/hcatalog/lib/libfb303-0.7.0.jar:
+$HCAT_HOME/share/hcatalog/lib/jdo2-api-2.3-ec.jar:$HCAT_HOME/share/hcatalog/lib/slf4j-api-1.6.1.jar:
+$HCAT_HOME/share/hcatalog/lib/antlr-runtime-3.0.1.jar:
+$HCAT_HOME/share/hcatalog/lib/datanucleus-connectionpool-2.0.3.jar:
+$HCAT_HOME/share/hcatalog/lib/datanucleus-core-2.0.3.jar:
+$HCAT_HOME/share/hcatalog/lib/datanucleus-enhancer-2.0.3.jar:
+$HCAT_HOME/share/hcatalog/lib/datanucleus-rdbms-2.0.3.jar:
+$HCAT_HOME/share/hcatalog/lib/commons-dbcp-1.4.jar:
+$HCAT_HOME/share/hcatalog/lib/commons-pool-1.5.4.jar:
+$HCAT_HOME/etc/hcatalog
 
 $HADOOP_HOME/bin/hadoop --config $HADOOP_HOME/conf jar &lt;path_to_jar&gt;
 &lt;main_class&gt; -libjars $LIB_JARS &lt;program_arguments&gt;
@@ -162,7 +183,7 @@ $HADOOP_HOME/bin/hadoop --config $HADOOP
        </tr>
 </table>
 
-<p><strong>Examples</strong></p>
+<p><strong>Read Example</strong></p>
 
 <p>
 The following very simple MapReduce program reads data from one table which it 
assumes to have an integer in the
@@ -182,7 +203,8 @@ public class GroupByAge extends Configur
         protected void map(
                 WritableComparable key,
                 HCatRecord value,
-                org.apache.hadoop.mapreduce.Mapper&lt;WritableComparable, 
HCatRecord, IntWritable, IntWritable&gt;.Context context)
+                org.apache.hadoop.mapreduce.Mapper&lt;WritableComparable, 
HCatRecord, 
+                        IntWritable, IntWritable&gt;.Context context)
                 throws IOException, InterruptedException {
             age = (Integer) value.get(1);
             context.write(new IntWritable(age), new IntWritable(1));
@@ -194,9 +216,12 @@ public class GroupByAge extends Configur
 
 
       @Override 
-      protected void reduce(IntWritable key, 
java.lang.Iterable&lt;IntWritable&gt;
-        values, 
org.apache.hadoop.mapreduce.Reducer&lt;IntWritable,IntWritable,WritableComparable,HCatRecord&gt;.Context
 context)
-        throws IOException ,InterruptedException {
+      protected void reduce(
+              IntWritable key,
+              java.lang.Iterable&lt;IntWritable&gt; values, 
+              org.apache.hadoop.mapreduce.Reducer&lt;IntWritable, IntWritable,
+                      WritableComparable, HCatRecord&gt;.Context context)
+              throws IOException, InterruptedException {
           int sum = 0;
           Iterator&lt;IntWritable&gt; iter = values.iterator();
           while (iter.hasNext()) {
@@ -249,26 +274,36 @@ public class GroupByAge extends Configur
 }
 </source>
 
-<p>Notice a number of important points about this program:
-<br></br><br></br>
-1) The implementation of Map takes HCatRecord as an input and the 
implementation of Reduce produces it as an output.
-<br></br>
-2) This example program assumes the schema of the input, but it could also 
retrieve the schema via
-HCatOutputFormat.getOutputSchema() and retrieve fields based on the results of 
that call.
-<br></br>
-3) The input descriptor for the table to be read is created by calling 
InputJobInfo.create.  It requires the database name,
+<p>Notice a number of important points about this program:</p>
+<ol>
+<li>The implementation of Map takes HCatRecord as an input and the 
implementation of Reduce produces it as an output.</li>
+<li>This example program assumes the schema of the input, but it could also 
retrieve the schema via
+HCatOutputFormat.getOutputSchema() and retrieve fields based on the results of 
that call.</li>
+<li>The input descriptor for the table to be read is created by calling 
InputJobInfo.create.  It requires the database name,
 table name, and partition filter.  In this example the partition filter is 
null, so all partitions of the table
-will be read.
-<br></br>
-4) The output descriptor for the table to be written is created by calling 
OutputJobInfo.create.  It requires the
+will be read.</li>
+<li>The output descriptor for the table to be written is created by calling 
OutputJobInfo.create.  It requires the
 database name, the table name, and a Map of partition keys and values that 
describe the partition being written.
-In this example it is assumed the table is unpartitioned, so this Map is null.
-</p>
+In this example it is assumed the table is unpartitioned, so this Map is null
+(both descriptor calls are sketched just after this list).</li>
+</ol>
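+
+<p>For reference, here is a minimal sketch of the two descriptor calls from points 3 and 4.
+The job, dbName, inputTableName, and outputTableName variables are assumed from the program above;
+the classes come from the org.apache.hcatalog.mapreduce package.</p>
+
+<source>
+// Point 3: input descriptor; the null filter means all partitions are read
+HCatInputFormat.setInput(job,
+        InputJobInfo.create(dbName, inputTableName, null));
+
+// Point 4: output descriptor; the null Map is used because the table is unpartitioned
+HCatOutputFormat.setOutput(job,
+        OutputJobInfo.create(dbName, outputTableName, null));
+</source>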
 
 <p>To scan just selected partitions of a table, a filter describing the 
desired partitions can be passed to
-InputJobInfo.create.  This filter can contain the operators '=', '&lt;', 
'&gt;', '&lt;=',
-'&gt;=', '&lt;&gt;', 'and', 'or', and 'like'.  Assume for example you have a 
web_logs
-table that is partitioned by the column datestamp.  You could select one 
partition of the table by changing</p>
+InputJobInfo.create.  To scan a single partition, the filter string should look like:
+"datestamp=20120401", where datestamp is the partition column name and 20120401 is the value you want to read.</p>
+
+<p><strong>Filter Operators</strong></p>
+
+<p>A filter can contain the operators 'and', 'or', 'like', '()', '=', 
'&lt;&gt;' (not equal), '&lt;', '&gt;', '&lt;='
+and '&gt;='.  For example: </p>
+<ul>
+<li><code>datestamp &gt; "20110924"</code></li>
+<li><code>datestamp &lt; "20110925"</code></li>
+<li><code>datestamp &lt;= "20110925" and datestamp &gt;= "20110924"</code></li>
+</ul>
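+
+<p>For illustration, one of these filter strings could be passed as the third argument to
+InputJobInfo.create (a minimal sketch; the job and dbName variables and a web_logs table
+partitioned by datestamp are assumed):</p>
+
+<source>
+// Read only the partitions whose datestamp falls inside the given range
+HCatInputFormat.setInput(job,
+        InputJobInfo.create(dbName, "web_logs",
+                "datestamp &gt;= \"20110924\" and datestamp &lt;= \"20110925\""));
+</source>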
+
+<p><strong>Scan Filter</strong></p>
+
+<p>Assume for example you have a web_logs table that is partitioned by the 
column datestamp.  You could select one partition of the table by changing</p>
 <source>
 HCatInputFormat.setInput(job, InputJobInfo.create(dbName, inputTableName, 
null));
 </source>
@@ -281,6 +316,8 @@ HCatInputFormat.setInput(job,
   </source>
 <p>
 This filter must reference only partition columns.  Values from other columns 
will cause the job to fail.</p>
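+
+<p>For example, assuming a web_logs table that is partitioned only by datestamp and also has an
+ordinary user column:</p>
+
+<source>
+// OK: datestamp is a partition column of the (assumed) web_logs table
+String validFilter = "datestamp = \"20120401\"";
+
+// Causes the job to fail: user is a regular, non-partition column
+String invalidFilter = "user = \"fred\"";
+</source>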
+
+<p><strong>Write Filter</strong></p>
 <p>
 To write to a single partition, you can change the above example to have a Map of key-value pairs that describe all
 of the partition keys and values for that partition.  In our example web_logs table, there is only one partition

Modified: incubator/hcatalog/branches/branch-0.4/src/docs/src/documentation/content/xdocs/loadstore.xml
URL: http://svn.apache.org/viewvc/incubator/hcatalog/branches/branch-0.4/src/docs/src/documentation/content/xdocs/loadstore.xml?rev=1329516&r1=1329515&r2=1329516&view=diff
==============================================================================
--- incubator/hcatalog/branches/branch-0.4/src/docs/src/documentation/content/xdocs/loadstore.xml (original)
+++ incubator/hcatalog/branches/branch-0.4/src/docs/src/documentation/content/xdocs/loadstore.xml Tue Apr 24 00:59:36 2012
@@ -28,7 +28,7 @@
   <title>Set Up</title>
   
 <p>The HCatLoader and HCatStorer interfaces are used with Pig scripts to read and write data in HCatalog-managed tables.</p>
-<p><strong>Authentication</strong></p>
+
 </section>
   
       
@@ -115,12 +115,22 @@ variable. In the case where you have ins
 <source>
 export HADOOP_HOME=&lt;path_to_hadoop_install&gt;
 export HCAT_HOME=&lt;path_to_hcat_install&gt;
-PIG_CLASSPATH=$HCAT_HOME/share/hcatalog/hcatalog-0.4.0.jar:$HCAT_HOME/share/hcatalog/lib/hive-metastore-0.8.1.jar:$HCAT_HOME/share/hcatalog/lib/libthrift-0.7.0.jar:$HCAT_HOME/share/hcatalog/lib/hive-exec-0.8.1.jar:$HCAT_HOME/share/hcatalog/lib/libfb303-0.7.0.jar:$HCAT_HOME/share/hcatalog/lib/jdo2-api-2.3-ec.jar:$HCAT_HOME/etc/hcatalog:$HADOOP_HOME/conf:$HCAT_HOME/share/hcatalog/lib/slf4j-api-1.6.1.jar
+PIG_CLASSPATH=$HCAT_HOME/share/hcatalog/hcatalog-0.4.0.jar:$HCAT_HOME/share/hcatalog/lib/
+hive-metastore-0.8.1.jar:$HCAT_HOME/share/hcatalog/lib/libthrift-0.7.0.jar:$HCAT_HOME/
+share/hcatalog/lib/hive-exec-0.8.1.jar:$HCAT_HOME/share/hcatalog/lib/libfb303-0.7.0.jar:
+$HCAT_HOME/share/hcatalog/lib/jdo2-api-2.3-ec.jar:$HCAT_HOME/etc/hcatalog:$HADOOP_HOME/
+conf:$HCAT_HOME/share/hcatalog/lib/slf4j-api-1.6.1.jar
 export PIG_OPTS=-Dhive.metastore.uris=thrift://&lt;hostname&gt;:&lt;port&gt;
 
-&lt;path_to_pig_install&gt;/bin/pig -Dpig.additional.jars=$HCAT_HOME/share/hcatalog/hcatalog-0.4.0.jar:$HCAT_HOME/share/hcatalog/lib/hive-metastore-0.8.1.jar:$HCAT_HOME/share/hcatalog/lib/libthrift-0.7.0.jar:$HCAT_HOME/share/hcatalog/lib/hive-exec-0.8.1.jar:$HCAT_HOME/share/hcatalog/lib/libfb303-0.7.0.jar:$HCAT_HOME/share/hcatalog/lib/jdo2-api-2.3-ec.jar:$HCAT_HOME/etc/hcatalog:$HCAT_HOME/share/hcatalog/lib/slf4j-api-1.6.1.jar &lt;script.pig&gt;
+&lt;path_to_pig_install&gt;/bin/pig -Dpig.additional.jars=$HCAT_HOME/share/hcatalog/
+hcatalog-0.4.0.jar:$HCAT_HOME/share/hcatalog/lib/hive-metastore-0.8.1.jar:$HCAT_HOME/
+share/hcatalog/lib/libthrift-0.7.0.jar:$HCAT_HOME/share/hcatalog/lib/hive-exec-0.8.1.jar:
+$HCAT_HOME/share/hcatalog/lib/libfb303-0.7.0.jar:$HCAT_HOME/share/hcatalog/lib/jdo2-
+api-2.3-ec.jar:$HCAT_HOME/etc/hcatalog:$HCAT_HOME/share/hcatalog/lib/slf4j-api-1.6.1.jar
+ &lt;script.pig&gt;
 </source>
 
+<p><strong>Authentication</strong></p>
 <table>
        <tr>
        <td><p>If you are using a secure cluster and a failure results in a 
message like "2010-11-03 16:17:28,225 WARN hive.metastore ... - Unable to 
connect metastore with URI thrift://..." in /tmp/&lt;username&gt;/hive.log, 
then make sure you have run "kinit &lt;username&gt;@FOO.COM" to get a Kerberos 
ticket and to be able to authenticate to the HCatalog server. </p></td>
@@ -148,17 +158,15 @@ A = LOAD 'tablename' USING org.apache.hc
 A = LOAD 'tablename' USING  org.apache.hcatalog.pig.HCatLoader();
 
 -- date is a partition column; age is not
-
 B = filter A by date == '20100819' and age &lt; 30; 
 
 -- both date and country are partition columns
-
 C = filter A by date == '20100819' and country == 'US'; 
 ...
 ...
 </source>
 
-<p>To scan a whole table:</p>
+<p>For example, to scan a whole table:</p>
 
 <source>
 a = load 'student_data' using org.apache.hcatalog.pig.HCatLoader();
@@ -169,14 +177,14 @@ b = foreach a generate name, age;
 <p>Notice that the schema is automatically provided to Pig; there's no need to declare name and age
 as fields, as you would if you were loading from a file.</p>
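+
+<p>For comparison, a minimal sketch of loading the same data from a plain file instead, where the
+schema is not known to Pig unless you declare it yourself (the file path, delimiter, and field types
+here are assumptions for illustration):</p>
+
+<source>
+-- Loading from a file: the schema must be spelled out in the load statement
+a = load '/data/student_data.txt' using PigStorage('\t') as (name:chararray, age:int);
+b = foreach a generate name, age;
+</source>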
 
-<p>Example of scanning a single partition. Assume the table web_logs is 
partitioned by the column datestamp:</p>
+<p>For example, to scan a single partition of the web_logs table, which is partitioned
+by the column datestamp:</p>
 
 <source>
 a = load 'web_logs' using org.apache.hcatalog.pig.HCatLoader();
 b = filter a by datestamp == '20110924';
 </source>
 
-<p>Pig will push the datestamp filter shown here to HCatalog, so that HCat 
knows to just scan the partition where
+<p>Pig will push the datestamp filter shown here to HCatalog, so that HCatalog 
knows to just scan the partition where
 datestamp = '20110924'. You can combine this filter with others via 'and':</p>
 
 <source>
@@ -184,14 +192,40 @@ a = load 'web_logs' using org.apache.hca
 b = filter a by datestamp == '20110924' and user is not null;
 </source>
 
-<p>Pig will split the above filter, pushing the datestamp portion to HCatalog 
and retaining the user is not null part
-to apply itself. You can also give a more complex filter to retrieve a set of 
partitions:</p>
+<p>Pig will split the above filter, pushing the datestamp portion to HCatalog 
and retaining the <code>user is not null</code> part
+to apply itself. You can also give a more complex filter to retrieve a set of 
partitions.</p>
+
+<p><strong>Filter Operators</strong></p>
+
+<p>A filter can contain the operators 'and', 'or', '()', '==', '!=', '&lt;', 
'&gt;', '&lt;='
+and '&gt;='.</p>
+
+<p>For example:</p>
+
+<source>
+a = load 'web_logs' using org.apache.hcatalog.pig.HCatLoader();
+b = filter a by datestamp &gt; '20110924';
+</source>
+
+<p>A complex filter can have various combinations of operators, such as:</p>
+
+<source>
+a = load 'web_logs' using org.apache.hcatalog.pig.HCatLoader();
+b = filter a by datestamp == '20110924' or datestamp == '20110925';
+</source>
+
+<p>These two examples have the same effect:</p>
 
 <source>
 a = load 'web_logs' using org.apache.hcatalog.pig.HCatLoader();
 b = filter a by datestamp &gt;= '20110924' and datestamp &lt;= '20110925';
 </source>
 
+<source>
+a = load 'web_logs' using org.apache.hcatalog.pig.HCatLoader();
+b = filter a by datestamp &lt;= '20110925' and datestamp &gt;= '20110924';
+</source>
+
 </section> 
  
 </section> 
@@ -247,8 +281,8 @@ attention to the quoting, as the whole s
 <p>To write into multiple partitions at once, make sure that the partition column is present in your data, then call
 HCatStorer with no argument:</p>
 
-<source>store z into 'web_data' using org.apache.hcatalog.pig.HCatStorer(); -- 
datestamp
-must be a field in the relation z</source>
+<source>store z into 'web_data' using org.apache.hcatalog.pig.HCatStorer(); 
+  -- datestamp must be a field in the relation z</source>
 
 
        </section>

