Author: olga
Date: Sat Dec 19 00:01:11 2009
New Revision: 892408
URL: http://svn.apache.org/viewvc?rev=892408&view=rev
Log:
PIG-1163: Pig/Zebra 0.6.0 release (chandec via olgan)
Modified:
hadoop/pig/trunk/CHANGES.txt
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/site.xml
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_mapreduce.xml
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_overview.xml
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_pig.xml
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_reference.xml
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_stream.xml
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_users.xml
Modified: hadoop/pig/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=892408&r1=892407&r2=892408&view=diff
==============================================================================
--- hadoop/pig/trunk/CHANGES.txt (original)
+++ hadoop/pig/trunk/CHANGES.txt Sat Dec 19 00:01:11 2009
@@ -24,6 +24,8 @@
IMPROVEMENTS
+PIG-1163: Pig/Zebra 0.6.0 release (chandec via olgan)
+
PIG-1156: Add aliases to ExecJobs and PhysicalOperators (dvryaboy via gates)
PIG-1161: add missing license headers (dvryaboy via olgan)
Modified: hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/site.xml
URL:
http://svn.apache.org/viewvc/hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/site.xml?rev=892408&r1=892407&r2=892408&view=diff
==============================================================================
--- hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/site.xml
(original)
+++ hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/site.xml Sat Dec
19 00:01:11 2009
@@ -45,24 +45,24 @@
<tutorial label="Tutorial"
href="tutorial.html" />
</docs>
<docs label="Guides">
- <piglatin label="Pig Latin Users " href="piglatin_users.html" />
- <piglatin label="Pig Latin Reference" href="piglatin_reference.html"
/>
+ <plusers label="Pig Latin Users " href="piglatin_users.html" />
+ <plref label="Pig Latin Reference" href="piglatin_reference.html" />
<cookbook label="Cookbook" href="cookbook.html" />
<udf label="UDFs" href="udf.html" />
</docs>
<docs label="Zebra">
- <piglatin label="Zebra Overview " href="zebra_overview.html" />
- <piglatin label="Zebra Users " href="zebra_users.html" />
- <piglatin label="Zebra Reference " href="zebra_reference.html" />
- <piglatin label="Zebra MapReduce " href="zebra_mapreduce.html" />
- <piglatin label="Zebra Pig " href="zebra_pig.html" />
- <piglatin label="Zebra Streaming " href="zebra_stream.html" />
+ <zover label="Zebra Overview " href="zebra_overview.html" />
+ <zusers label="Zebra Users " href="zebra_users.html" />
+ <zref label="Zebra Reference " href="zebra_reference.html" />
+ <zmr label="Zebra MapReduce " href="zebra_mapreduce.html" />
+ <zpig label="Zebra Pig " href="zebra_pig.html" />
+ <zstream label="Zebra Streaming " href="zebra_stream.html" />
</docs>
<docs label="Miscellaneous">
<api label="API Docs"
href="ext:api"/>
- <wiki label="Wiki" href="ext:wiki" />
- <faq label="FAQ" href="ext:faq" />
- <relnotes label="Release Notes" href="ext:relnotes" />
+ <wiki label="Wiki" href="ext:wiki"
/>
+ <faq label="FAQ" href="ext:faq" />
+ <relnotes label="Release Notes" href="ext:relnotes" />
</docs>
<external-refs>
Modified:
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_mapreduce.xml
URL:
http://svn.apache.org/viewvc/hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_mapreduce.xml?rev=892408&r1=892407&r2=892408&view=diff
==============================================================================
---
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_mapreduce.xml
(original)
+++
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_mapreduce.xml
Sat Dec 19 00:01:11 2009
@@ -149,6 +149,7 @@
throw new RuntimeException(e);
}
}
+
}
static class ProjectionMap extends MapReduceBase implements
Mapper<BytesWritable, Tuple, Text, IntWritable> {
@@ -209,21 +210,22 @@
jobConf.setOutputFormat(BasicTableOutputFormat.class);
BasicTableOutputFormat.setOutputPath(jobConf, new Path(args[1]));
- // set the output info:
- // ZebraSchema - 2 logical columns
- // ZebraStorageHint - 2 physical column groups (one column each)
- // ZebraSortInfo - unsorted table (null)
+ // set the storage info of logical schema with 2 columns;
+ // and create 2 physical column groups;
+ // unsorted table
BasicTableOutputFormat.setStorageInfo(jobConf,
ZebraSchema.createZebraSchema("word:string, count:int"),
- ZebraStorageHint.createZebraStorageHint("[word];[count]"),
- null);
+ ZebraStorageHint.createZebraStorageHint("[word];[count]"), null);
// set map-only job.
jobConf.setNumReduceTasks(0);
// Run Job
JobClient.runJob(jobConf);
+
+ // Need to close Zebra output streams
+ BasicTableOutputFormat.close(jobConf);
/*
Second MR Job for Table Projection of count column
@@ -264,7 +266,13 @@
<!-- ZEBRA OUTPUT EXAMPLE-->
<section>
<title>Table Input/Output Formats</title>
-<p>This MapReduce example demonstrates the Zebra table input/output formats.
</p>
+<p>
+This MapReduce example demonstrates how to perform a simple union.
+To run this program, we need two basic tables that contain
+the data as in the example above (word, count). In this example they are:
+/user/mapredu/t1 and /user/mapredu/t2. The resulting table is
/user/mapredu2/t.
+</p>
+
<source>
package org.apache.hadoop.zebra.mapred;
@@ -286,13 +294,7 @@
import org.apache.hadoop.zebra.types.TypesUtils;
import org.apache.pig.data.Tuple;
-/**
- * This is a sample to show using zebra table to do a simple basic union in
- * map/reduce * To run this, we need have two basic tables ready. They contain
- * the data as in Sample 1, i.e., (word, count). In this example, they are at:
- * /user/mapredu/t1 /user/mapredu/t2 The resulting table is put at:
/user/mapredu2/t1
- *
- */
+
public class TableMRSample2 {
static class MapClass implements
Mapper<BytesWritable, Tuple, BytesWritable, Tuple> {
Modified:
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_overview.xml
URL:
http://svn.apache.org/viewvc/hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_overview.xml?rev=892408&r1=892407&r2=892408&view=diff
==============================================================================
---
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_overview.xml
(original)
+++
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_overview.xml
Sat Dec 19 00:01:11 2009
@@ -42,8 +42,8 @@
<title>Prerequisites</title>
<p>Zebra requires:</p>
<ul>
- <li>Pig 0.6.0 </li>
- <li>Hadoop 0.20.1</li>
+ <li>Pig 0.6.0 or later</li>
+ <li>Hadoop 0.20.1 or later</li>
</ul>
<p></p>
<p>Also, make sure the following software is installed on your system:</p>
Modified:
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_pig.xml
URL:
http://svn.apache.org/viewvc/hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_pig.xml?rev=892408&r1=892407&r2=892408&view=diff
==============================================================================
--- hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_pig.xml
(original)
+++ hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_pig.xml Sat
Dec 19 00:01:11 2009
@@ -73,7 +73,7 @@
A = LOAD 'studenttab, votertab' USING
org.apache.hadoop.zebra.pig.TableLoader();
</source>
-<p>TableLoader supports efficient column selection; projections are
automatically push down to the loader. This example tells the loader to only
return two columns, name and age.</p>
+<p>TableLoader supports efficient column selection; projections are
automatically pushed down to the loader. This example tells the loader to only
return two columns, name and age.</p>
<source>
A = LOAD 'studenttab' USING org.apache.hadoop.zebra.pig.TableLoader('name,
age');
</source>
@@ -84,7 +84,7 @@
<section>
<title>Map-Side Group and Merge Join</title>
-<p>If the input data is globally sorted, map-side group or merge join can be
used. Please, notice the "sorted" argument passed to the loader. This lets
the loader know that the data is expected to be globally sorted and that a
single key must be given to the same map.</p>
+<p>If the input data is globally sorted, merge join and map-side group can be
used. Please note the "sorted" argument that is passed to the loader. This
lets the loader know that the data is expected to be globally sorted and that a
single key must be given to the same map.</p>
<p>Here is an example of the merge join. Note that the first argument to the
loader is left empty to indicate that all columns are requested.</p>
<source>
@@ -93,7 +93,7 @@
G = JOIN A BY $0, B By $0 USING "merge";
</source>
-<p>Here is an example of a map-side group. Note that multiple sorted files are
passed to the loader and that the loader will perform sort preserving merge to
make sure that the data is globally sorted.</p>
+<p>Here is an example of a map-side group. Note that multiple sorted files are
passed to the loader and that the loader will perform sort-preserving merge to
make sure that the data is globally sorted.</p>
<source>
A = LOAD 'studentsortedtab, studentnullsortedtab' using
org.apache.hadoop.zebra.pig.TableLoader('name, age, gpa, source_table',
'sorted');
B = GROUP A BY $0 USING "collected";
Modified:
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_reference.xml
URL:
http://svn.apache.org/viewvc/hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_reference.xml?rev=892408&r1=892407&r2=892408&view=diff
==============================================================================
---
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_reference.xml
(original)
+++
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_reference.xml
Sat Dec 19 00:01:11 2009
@@ -150,7 +150,7 @@
(<em>In a future release, the schema will also support type compatibility
between Zebra/Pig-SQL and will guide the underlying serialization formats
provided by Avro for projection, filtering, and so on. </em>)
</p>
- <p>The basic format for the the store schema is shown here.
+ <p>The basic format for the store schema is shown here.
The type name is optional; if not specified, the column defaults to type
bytes.</p>
<p>
<code>
@@ -453,7 +453,7 @@
<p>The Zebra load schema is used to load or read table columns. </p>
<section>
<title>Schema</title>
- <p>The basic format for the the Zebra load (read) schema is shown here. The
column name can be any valid Zebra type.
+ <p>The basic format for the Zebra load (read) schema is shown here. The
column name can be any valid Zebra type.
If no columns are specified, the entire Zebra table is loaded.</p>
<p>
<code>
Modified:
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_stream.xml
URL:
http://svn.apache.org/viewvc/hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_stream.xml?rev=892408&r1=892407&r2=892408&view=diff
==============================================================================
--- hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_stream.xml
(original)
+++ hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_stream.xml
Sat Dec 19 00:01:11 2009
@@ -105,7 +105,7 @@
<section>
<title>Locating Frequently Visited Pages</title>
- <p>This perl script sorts the pages on number of page view counts. The script
outputs space padded count
+ <p>This Perl script sorts the pages on number of page view counts. The script
outputs space padded count
so that string sorting results in correct output. The first TAB separates the
key and value for Hadoop streaming.</p>
<source>
Modified:
hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_users.xml
URL:
http://svn.apache.org/viewvc/hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_users.xml?rev=892408&r1=892407&r2=892408&view=diff
==============================================================================
--- hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_users.xml
(original)
+++ hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_users.xml
Sat Dec 19 00:01:11 2009
@@ -32,31 +32,24 @@
<title>Column Security</title>
<p><strong>NOTE: THIS FEATURE IS EXPERIMENTAL AND SUBJECT TO CHANGE IN THE
FUTURE</strong></p>
- <p>Since Zebra provides columnar storage of user data, we intend to separate
secure and non-secure data into separate columns. We can then have access
control based on HDFS file systesm based security. This would be achieved by an
administrator setting appropriate permissions on the HDFS files contaning
secure data. </p>
+ <p>Since Zebra supports reading and writing data in a column-oriented
fashion, you can store secure and non-secure data in separate columns.
+ Then, using the HDFS file system, you can enable access control by setting
the appropriate permissions on the columns containing secure data.
+ </p>
- <section>
- <title>Design Issues</title>
-<p>Roles:</p>
-<ul>
-<li>Publishers of secure data </li>
-<li>Consumers of secure data </li>
-<li>Administrators of secure data </li>
+ <p>About the data:</p>
+ <ul>
+<li>All the files and directories containing secure data will have the same
permissions and groups within a table. </li>
+<li>If no security information is provided, then the HDFS file system default
behavior applies.</li>
</ul>
-
- <p>How it will work:</p>
+<p></p>
+ <p>About the users:</p>
<ul>
+<li>The user who creates the data will set the file permissions.</li>
+<li>If a permissions-related error happens, it will be communicated to the
user as a normal IO exception. </li>
+<li>A user running a client application needs to have chgrp permissions to
execute the "secure by group" operations on a table. </li>
+<li>If a user running a client application does not have read permissions for
a secure column group, an IO exception is issued.</li>
-<li>Before any data is written in tables, all the corresponding files and
directories need to have right set of ownership and permissions.
-This is necessary because if all the data is written and then the operation to
make it secure is executed, that can leave security holes and paranoid will not
allow that.</li>
-
-<li>All the files and direcories containing secure data will have same
permissions and groups within a table </li>
-<li>User of the MR Job/Pig Script is required to have permissions to execute
chgrp operations on a table. </li>
-<li>If no security information provided, then default behaviour.</li>
-<li>If permissions related error happens, it will be communicated to user as
normal IOException </li>
-<li>If reader does not have read permissions for a column (CG), an IOException
will be thrown </li>
-<li>The publisher/creator of the files will set these permissions when
creating data.</li>
</ul>
- </section>
<p></p>
<p>One simple Pig example:</p>
@@ -70,7 +63,7 @@
zStorageHint = ZebraStorageHint.createZebraStorageHint("[a, b] secure by
group:secure perm:640");
zSchema = …;
zSortInfo = …;
-setStorageInfo(jobConf, zSchema, zStorageHint, zSortInf);
+setStorageInfo(jobConf, zSchema, zStorageHint, zSortInfo);
</source>
</section>
<!-- END COLUMN SECURITY -->
@@ -80,20 +73,20 @@
<title>Drop Column Groups</title>
<p><strong>NOTE: THIS FEATURE IS EXPERIMENTAL AND SUBJECT TO CHANGE IN THE
FUTURE</strong></p>
- <p>Zebra allows you to delete a column group using the column group name.
+ <p>Zebra allows you to delete a column group (CG) using the column group
name.
For examples, see <a href="zebra_mapreduce.html#Drop+Column+Groups">Drop
Column Groups</a>. </p>
<p>Please note the following:</p>
<ul>
<li>Any failures during a drop will leave the table in a consistent state
(either with or without the column group).
-While success of a CG removal guarantees a column removal, a failure does not
imply CG is not removed.
+While success of a column group removal guarantees a column removal, a failure
does not imply the column group is not removed.
In rare cases, you might receive an error but the column could still be
deleted. </li>
<li>MapReduce jobs and other clients that are currently accessing the table
might fail with exceptions.
-It is recommended that the column groups are dropped when there are no
accesses to a table.
+It is recommended that column groups be dropped when there are no accesses to
a table.
It might not be feasible to ensure that there are no readers for a table; in
these cases the readers should handle the exception. </li>
-<li>Once a column group is dropped, the column gruop data is deleted from the
underlying filesystem.
-In the case of HDFS, it may not imply that physical data is actually removed
because of earlier snapshot of the filesystem; handling this is out side the
scope of Zebra. Legal requirements might require an admin finalize HDFS (if it
is not already finalized) before or after performing a deletion. </li>
-<li>Concurrent deletions are supported and their access is serialized. </li>
+<li>Once a column group is dropped, the column group data is deleted from the
underlying file system.
+In the case of the HDFS file system, it may not imply that physical data is
actually removed because of an earlier snapshot of the file system; handling
this is outside the scope of Zebra.</li>
+<li>Concurrent column group deletions are supported and their access is
serialized. </li>
<li>Deleting a non-existent column group or a column group that is already
deleted is not allowed.</li>
<li>If you delete all the remaining columns in a table, it logically leaves an
empty null table. The difference between a non-existent table and a table with
zero columns is that opening a non-existent table causes an error. </li>
</ul>
@@ -104,19 +97,13 @@
<section>
<title>Order-Preserving Sorted Table Union</title>
<p>
-This Zebra functionality is only available on underlying sorted Zebra tables.
-</p>
-
-<section>
-<title>Output Records</title>
-<p>
-This feature groups all records from all "delta tables" on some sort key to
form an output set of records while preserving the sorted ordering of the
records in the origional tables. For instance, if the client application wants
to fetch records from a union of tables of T1, T2 on a column "c1", then all
records from T1 with a particular value of column "c1" and all records from T2
with that value of column "c1" will be output. The ordering of the rows of the
output set of the same value of column "c1" is undefined. As a prerequisite,
both T1 and T2 must be sorted on column "c1". More specifically the input and
results could be as follows:
+With Zebra you can group all records from all "delta tables" on some sort key
to form an output set of records while preserving the sorted ordering of the
records in the original tables. For instance, if the client application wants
to fetch records from a union of tables of T1, T2 on a column "C1", then all
records from T1 with a particular value of column "C1" and all records from T2
with that value of column "C1" will be output. The ordering of the rows of the
output set of the same value of column "C1" is undefined. As a prerequisite,
both T1 and T2 must be sorted on column "C1". More specifically the input and
results could be as follows:
</p>
<p>Table T1: </p>
<source>
C1 C2
---------------
+-------------
A 11
A 12
B 21
@@ -137,7 +124,7 @@
<p>T1 Sort-Unioned with T2: </p>
<source>
source_table C1 C2
----------------------------------------
+------------------------------
0 A 11
1 A 101
0 A 12
@@ -148,27 +135,27 @@
1 D 401
0 D 41
</source>
-</section>
+
<p>
-Note that the sortness is guaranteed per mapper and among all mappers arranged
with certain ordering, but not among mappers arranged in any ordering. For
instance, the outputs generated by 4 mappers, m1, m2, m3 and m4, could be in
total ordering between m1, m3, m2 and m4, but not in any other arrangements.
+Note that the sortedness is guaranteed per mapper and among all mappers arranged
with certain ordering, but not among mappers arranged in any ordering. For
instance, the outputs generated by four mappers, m1, m2, m3 and m4, could be in
total ordering between m1, m3, m2 and m4, but not in any other arrangements.
</p>
<section>
-<title>Grouping and Indexing of Sort-Unioned Output Rows </title>
+<title>Indexing Sort-Unioned Results</title>
<p>
-The order preserving sort-unioned results above could be further indexed by
the component tables if the projection contains column(s) named "source_table".
If so specified, the component table index will be output at the position(s) as
specified in the projection list. If the underlying table is not a union of
sorted tables, use of the special column name in projection will cause an
exception thrown.
-</p>
+The order-preserving sort-unioned results above can be further indexed by the
component tables if the projection contains column(s) named "source_table". If
so specified, the component table index is output at the position(s) as
specified in the projection list.
-<p>
-If an attempt is made to create a table of a column named "source_table", an
excpetion will be thrown as the name is reserved by zebra for the virtual name.
+If the underlying table is not a union of sorted tables, the use of the
special column name in a projection will cause an exception.
+
+If an attempt is made to create a table with a column named "source_table", an
exception will be thrown as the name is reserved by Zebra for the virtual name.
</p>
</section>
<section>
-<title>MapReduce Interface </title>
+<title>MapReduce Jobs</title>
<p>
-TableInputFormat will have a static method, requireSortedTable, that allows
the caller to specify the behavior of a single sorted table or an order
preserving sorted table union as described above. The method will ensure all
tables in a union are sorted. For more information, see <a
href="zebra_reference.html#TableInputFormat">TableInputFormat</a>.
+TableInputFormat has a static method, requireSortedTable, that allows the caller
to specify the behavior of a single sorted table or an order-preserving sorted
table union as described above. The method ensures all tables in a union are
sorted. For more information, see <a
href="zebra_reference.html#TableInputFormat">TableInputFormat</a>.
</p>
<p>One simple example: A order-preserving sorted union B. A and B are sorted
tables. </p>
@@ -182,8 +169,8 @@
</section>
<section>
-<title>Pig Interface </title>
-<p>Pig will take an extra string argument of "sorted" indicating the desire to
load from a sorted table or an order preserving sorted table union.
+<title>Pig Scripts</title>
+<p>Pig takes an extra string argument of "sorted" indicating the desire to
load from a sorted table or an order-preserving sorted table union.
For more information, see <a href="zebra_pig.html#Zebra+Pig+Examples">Zebra
Pig Examples</a>.</p>
<p>One simple example:</p>
@@ -193,8 +180,7 @@
...
</source>
</section>
-
-</section>
+ </section>
<!-- END ORDER PRESERVE SORT-->
<!--MERGE JOIN-->
@@ -206,23 +192,22 @@
<p>One simple example:</p>
<source>
-Class myMapper {
-…
-Object keyGenerator;
-…
-
-public void map(…) {
- bytesKey = BasicTableOutputFormat.getSortKey(keyGenerator, userKey);
- …
- output.collect(bytesKey, valueTuple);
- …
+class myMapper extends Mapper<…> {
+ …
+ Object keyGenerator;
+ …
+ public void map(…) {
+ bytesKey = BasicTableOutputFormat.getSortKey(keyGenerator, userKey);
+ …
+ output.collect(bytesKey, valueTuple);
+ …
+ }
+ public void configure(JobConf job) {
+ keyGenerator = BasicTableOutputFormat.getSortKeyGenerator(job);
+ …
+ }
}
-public void configure(JobConf job)
-{
- keyGenerator = BasicTableOutputFormat.getSortKeyGenerator(job);
-…
-}
</source>
</section>