svn commit: r963952 - in /hadoop/pig/branches/branch-0.7: ./ src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/ src/org/apache/pig/data/ src/org/apache/pig/tools/pigstats/

2010-07-14 Thread dvryaboy
Author: dvryaboy
Date: Wed Jul 14 06:14:33 2010
New Revision: 963952

URL: http://svn.apache.org/viewvc?rev=963952&view=rev
Log:
PIG-1428: Make a StatusReporter singleton available for incrementing counters 
(dvryaboy)

Added:

hadoop/pig/branches/branch-0.7/src/org/apache/pig/tools/pigstats/PigStatusReporter.java
Modified:
hadoop/pig/branches/branch-0.7/CHANGES.txt

hadoop/pig/branches/branch-0.7/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/MapReducePOStoreImpl.java

hadoop/pig/branches/branch-0.7/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/PigCombiner.java

hadoop/pig/branches/branch-0.7/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/PigHadoopLogger.java

hadoop/pig/branches/branch-0.7/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/PigMapBase.java

hadoop/pig/branches/branch-0.7/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/PigMapReduce.java

hadoop/pig/branches/branch-0.7/src/org/apache/pig/data/DefaultAbstractBag.java

Modified: hadoop/pig/branches/branch-0.7/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.7/CHANGES.txt?rev=963952&r1=963951&r2=963952&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.7/CHANGES.txt (original)
+++ hadoop/pig/branches/branch-0.7/CHANGES.txt Wed Jul 14 06:14:33 2010
@@ -22,9 +22,6 @@ Release 0.7.0 - 2010-05-03
 
 INCOMPATIBLE CHANGES
 
-PIG-1438: [Performance] MultiQueryOptimizer should also merge DISTINCT jobs
-(rding)
-
 PIG-1292: Interface Refinements (hashutosh)
 
 PIG-1259: ResourceFieldSchema.setSchema should not allow a bag field without a
@@ -71,6 +68,11 @@ manner (rding via pradeepkth)
 
 IMPROVEMENTS
 
+PIG-1428: Make a StatusReporter singleton available for incrementing counters 
+(dvryaboy)
+
+PIG-1438: [Performance] MultiQueryOptimizer should also merge DISTINCT jobs
+(rding)
+
 PIG-1309: Map-side Cogroup (hashutosh)
 
 PIG-1441: new test targets (olgan)

Modified: 
hadoop/pig/branches/branch-0.7/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/MapReducePOStoreImpl.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.7/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/MapReducePOStoreImpl.java?rev=963952&r1=963951&r2=963952&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.7/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/MapReducePOStoreImpl.java (original)
+++ hadoop/pig/branches/branch-0.7/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/MapReducePOStoreImpl.java Wed Jul 14 06:14:33 2010
@@ -22,9 +22,12 @@ import org.apache.hadoop.conf.Configurat
 import org.apache.hadoop.mapreduce.RecordWriter;
 import org.apache.hadoop.mapreduce.OutputFormat;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
 import org.apache.pig.StoreFuncInterface;
 import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore;
 import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStoreImpl;
+import org.apache.pig.tools.pigstats.PigStatusReporter;
+
 /**
 * This class is used to have a POStore write to DFS via an output
  * collector/record writer. It sets up a modified job configuration to
@@ -36,21 +39,27 @@ public class MapReducePOStoreImpl extend
 
 private TaskAttemptContext context;
 
+private PigStatusReporter reporter;
+
 @SuppressWarnings("unchecked")
 private RecordWriter writer;
 
-public MapReducePOStoreImpl(TaskAttemptContext context) {
+public MapReducePOStoreImpl(TaskInputOutputContext context) {
 // get a copy of the Configuration so that changes to the
 // configuration below (like setting the output location) do
 // not affect the caller's copy
 Configuration outputConf = new Configuration(context.getConfiguration());
 
+PigStatusReporter.setContext(context);
+   reporter = PigStatusReporter.getInstance();
+
 // make a copy of the Context to use here - since in the same
 // task (map or reduce) we could have multiple stores, we should
 // make this copy so that the same context does not get over-written
 // by the different stores.
 this.context = new TaskAttemptContext(outputConf, 
 context.getTaskAttemptID());
+
 }
 
 @SuppressWarnings("unchecked")
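
For reference, here is a minimal sketch of what the PigStatusReporter
singleton added by this commit could look like. It is reconstructed only from
the calls visible in these diffs (setContext, getInstance, getCounter); the
actual file committed as PigStatusReporter.java may differ in detail.

// Hypothetical reconstruction, not the committed source.
package org.apache.pig.tools.pigstats;

import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.StatusReporter;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;

public class PigStatusReporter extends StatusReporter {

    private static PigStatusReporter reporter = null;

    private final TaskInputOutputContext<?, ?, ?, ?> context;

    private PigStatusReporter(TaskInputOutputContext<?, ?, ?, ?> context) {
        this.context = context;
    }

    // Bind the singleton to the current task's context (called once per
    // map or reduce task, e.g. from MapReducePOStoreImpl above).
    public static void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
        reporter = new PigStatusReporter(context);
    }

    // May return null when called outside a running task (e.g. unit tests).
    public static PigStatusReporter getInstance() {
        return reporter;
    }

    @Override
    public Counter getCounter(Enum<?> name) {
        return (context == null) ? null : context.getCounter(name);
    }

    @Override
    public Counter getCounter(String group, String name) {
        return (context == null) ? null : context.getCounter(group, name);
    }

    @Override
    public void progress() {
        if (context != null) {
            context.progress();
        }
    }

    @Override
    public void setStatus(String status) {
        if (context != null) {
            context.setStatus(status);
        }
    }
}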

Modified: 
hadoop/pig/branches/branch-0.7/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/PigCombiner.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.7/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/PigCombiner.java?rev=963952&r1=963951&r2=963952&view=diff

[Pig Wiki] Update of NativeMapReduce by Aniket Mokashi

2010-07-14 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on Pig Wiki for change 
notification.

The NativeMapReduce page has been changed by Aniket Mokashi.
http://wiki.apache.org/pig/NativeMapReduce?action=diff&rev1=4&rev2=5

------------------------------------------------------------------------------

 With native job support, pig can support native map reduce jobs written in 
the java language that can convert a data set into a different data set after 
applying custom map reduce functions of any complexity.
  
  == Native Mapreduce job specification ==
- A native Mapreduce job needs to conform to a specification defined by Pig. 
This is required as Pig specifies the input and output directory in the script 
for this job and is responsible for managing the coordination of the native 
job with the remaining pig mapreduce jobs. Pig might also need to provide some 
extra configuration, like job name, input/output formats, or parallelism, to 
the native job. For communicating such parameters to the native job, it should 
provide some way of communication.
+ A native Mapreduce job needs to conform to a specification defined by Pig. 
This is required because Pig specifies the input and output directory in the 
script for this job and is responsible for managing the coordination of the 
native job with the remaining pig mapreduce jobs. Pig might also need to 
provide some extra configuration, like job name, input/output formats, or 
parallelism, to the native job. Such parameters must be communicated to the 
native job according to the specification provided by Pig.
  
 Following are some of the approaches to achieving this-
  1. Ordered inputLoc/outputLoc parameters- This is a simplistic approach 
wherein native programs follow a convention so that their first and second 
parameters are treated as input and output respectively. Pig ''native'' 
command takes the parameters required by the native mapreduce job and passes 
them to the native job as command line arguments. It is up to the native 
program to use these parameters for the operations it performs.
@@ -51, +51 @@

  FileInputFormat.setInputPaths(conf, new Path(args[0]));  
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));
  }}}
+  2. getJobConf Function- Native jobs implement a '''getJobConf''' method 
which returns an org.apache.hadoop.mapred.JobConf object so that pig can 
schedule the job. This also provides a way to add more pig-specific 
parameters.
- 
-  2. getJobConf Function-
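
As an illustration of the two approaches above, the following hypothetical
driver (all class and method names here are made up for the example) honors
the ordered input/output argument convention and also exposes a
'''getJobConf''' method:

{{{
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class MyNativeJob {

    // Approach 2: expose the configuration so that pig can schedule the
    // job itself and inject extra parameters (job name, input/output
    // formats, parallelism).
    public JobConf getJobConf() {
        JobConf conf = new JobConf(MyNativeJob.class);
        conf.setJobName("my-native-job");
        // mapper/reducer classes, input/output formats, etc. go here
        return conf;
    }

    // Approach 1: the first and second command-line arguments are treated
    // as the input and output locations supplied by the Pig script.
    public static void main(String[] args) throws Exception {
        JobConf conf = new MyNativeJob().getJobConf();
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        JobClient.runJob(conf);
    }
}
}}}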
  
  
  


svn commit: r964127 - /hadoop/pig/branches/branch-0.7/src/org/apache/pig/data/DefaultAbstractBag.java

2010-07-14 Thread daijy
Author: daijy
Date: Wed Jul 14 18:38:52 2010
New Revision: 964127

URL: http://svn.apache.org/viewvc?rev=964127&view=rev
Log:
Fix NPE in TestDataBag introduced by PIG-1428

Modified:

hadoop/pig/branches/branch-0.7/src/org/apache/pig/data/DefaultAbstractBag.java

Modified: 
hadoop/pig/branches/branch-0.7/src/org/apache/pig/data/DefaultAbstractBag.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.7/src/org/apache/pig/data/DefaultAbstractBag.java?rev=964127&r1=964126&r2=964127&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.7/src/org/apache/pig/data/DefaultAbstractBag.java (original)
+++ hadoop/pig/branches/branch-0.7/src/org/apache/pig/data/DefaultAbstractBag.java Wed Jul 14 18:38:52 2010
@@ -375,7 +375,7 @@ public abstract class DefaultAbstractBag
 
 protected void incSpillCount(Enum counter) {
 PigStatusReporter reporter = PigStatusReporter.getInstance();
-if (reporter != null) {
+if (reporter != null && reporter.getCounter(counter)!=null) {
 reporter.getCounter(counter).increment(1);
 } else {
 PigHadoopLogger.getInstance().warn(this, "Spill counter incremented", counter);
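
As an illustration of how PIG-1428 is meant to be used from a UDF, and why the
guard above matters (outside a real task, e.g. in TestDataBag, the reporter or
the counter can be null), a hypothetical UDF could increment a custom counter
like this; the class and counter names are illustrative only:

import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.pig.tools.pigstats.PigStatusReporter;

public class UpperWithCounter extends EvalFunc<String> {

    public enum Counters { NULL_INPUT }

    @Override
    public String exec(Tuple input) throws IOException {
        if (input == null || input.size() == 0 || input.get(0) == null) {
            // Same null guards as the incSpillCount fix above.
            PigStatusReporter reporter = PigStatusReporter.getInstance();
            if (reporter != null
                    && reporter.getCounter(Counters.NULL_INPUT) != null) {
                reporter.getCounter(Counters.NULL_INPUT).increment(1);
            }
            return null;
        }
        return ((String) input.get(0)).toUpperCase();
    }
}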




svn commit: r964177 - in /hadoop/pig/trunk: ./ src/org/apache/pig/builtin/

2010-07-14 Thread gates
Author: gates
Date: Wed Jul 14 20:21:26 2010
New Revision: 964177

URL: http://svn.apache.org/viewvc?rev=964177&view=rev
Log:
Javadoc improvements for org.apache.pig.builtin package.

Modified:
hadoop/pig/trunk/CHANGES.txt
hadoop/pig/trunk/src/org/apache/pig/builtin/ARITY.java
hadoop/pig/trunk/src/org/apache/pig/builtin/AVG.java
hadoop/pig/trunk/src/org/apache/pig/builtin/BagSize.java
hadoop/pig/trunk/src/org/apache/pig/builtin/BinStorage.java
hadoop/pig/trunk/src/org/apache/pig/builtin/CONCAT.java
hadoop/pig/trunk/src/org/apache/pig/builtin/COUNT.java
hadoop/pig/trunk/src/org/apache/pig/builtin/COUNT_STAR.java
hadoop/pig/trunk/src/org/apache/pig/builtin/ConstantSize.java
hadoop/pig/trunk/src/org/apache/pig/builtin/DIFF.java
hadoop/pig/trunk/src/org/apache/pig/builtin/Distinct.java
hadoop/pig/trunk/src/org/apache/pig/builtin/DoubleAvg.java
hadoop/pig/trunk/src/org/apache/pig/builtin/DoubleMax.java
hadoop/pig/trunk/src/org/apache/pig/builtin/DoubleMin.java
hadoop/pig/trunk/src/org/apache/pig/builtin/DoubleSum.java
hadoop/pig/trunk/src/org/apache/pig/builtin/FloatAvg.java
hadoop/pig/trunk/src/org/apache/pig/builtin/FloatMax.java
hadoop/pig/trunk/src/org/apache/pig/builtin/FloatMin.java
hadoop/pig/trunk/src/org/apache/pig/builtin/FloatSum.java
hadoop/pig/trunk/src/org/apache/pig/builtin/IntAvg.java
hadoop/pig/trunk/src/org/apache/pig/builtin/IntMax.java
hadoop/pig/trunk/src/org/apache/pig/builtin/IntMin.java
hadoop/pig/trunk/src/org/apache/pig/builtin/IntSum.java
hadoop/pig/trunk/src/org/apache/pig/builtin/IsEmpty.java
hadoop/pig/trunk/src/org/apache/pig/builtin/LongAvg.java
hadoop/pig/trunk/src/org/apache/pig/builtin/LongMax.java
hadoop/pig/trunk/src/org/apache/pig/builtin/LongMin.java
hadoop/pig/trunk/src/org/apache/pig/builtin/LongSum.java
hadoop/pig/trunk/src/org/apache/pig/builtin/MAX.java
hadoop/pig/trunk/src/org/apache/pig/builtin/MIN.java
hadoop/pig/trunk/src/org/apache/pig/builtin/MapSize.java
hadoop/pig/trunk/src/org/apache/pig/builtin/PigStorage.java
hadoop/pig/trunk/src/org/apache/pig/builtin/RANDOM.java
hadoop/pig/trunk/src/org/apache/pig/builtin/SIZE.java
hadoop/pig/trunk/src/org/apache/pig/builtin/SUM.java
hadoop/pig/trunk/src/org/apache/pig/builtin/StringConcat.java
hadoop/pig/trunk/src/org/apache/pig/builtin/StringMax.java
hadoop/pig/trunk/src/org/apache/pig/builtin/StringMin.java
hadoop/pig/trunk/src/org/apache/pig/builtin/StringSize.java
hadoop/pig/trunk/src/org/apache/pig/builtin/TOKENIZE.java
hadoop/pig/trunk/src/org/apache/pig/builtin/TextLoader.java
hadoop/pig/trunk/src/org/apache/pig/builtin/TupleSize.java
hadoop/pig/trunk/src/org/apache/pig/builtin/Utf8StorageConverter.java

Modified: hadoop/pig/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=964177&r1=964176&r2=964177&view=diff
==============================================================================
--- hadoop/pig/trunk/CHANGES.txt (original)
+++ hadoop/pig/trunk/CHANGES.txt Wed Jul 14 20:21:26 2010
@@ -100,6 +100,8 @@ PIG-1309: Map-side Cogroup (ashutoshc)
 
 BUG FIXES
 
+PIG-1409: Fix up javadocs for org.apache.pig.builtin (gates)
+
 PIG-1490: Make Pig storers work with remote HDFS in secure mode (rding)
 
 PIG-1469: DefaultDataBag assumes ArrayList as default List type (azaroth via 
dvryaboy)

Modified: hadoop/pig/trunk/src/org/apache/pig/builtin/ARITY.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/ARITY.java?rev=964177&r1=964176&r2=964177&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/ARITY.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/ARITY.java Wed Jul 14 20:21:26 2010
@@ -26,6 +26,11 @@ import org.apache.pig.data.DataType;
 import org.apache.pig.data.Tuple;
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 
+/**
+ * Find the number of fields in a tuple.  Expected input is a tuple,
+ * output is an integer.
+ * @deprecated Use {@link SIZE} instead.
+ */
 public class ARITY extends EvalFunc<Integer> {
 
 @Override

Modified: hadoop/pig/trunk/src/org/apache/pig/builtin/AVG.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/AVG.java?rev=964177&r1=964176&r2=964177&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/AVG.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/AVG.java Wed Jul 14 20:21:26 2010
@@ -38,8 +38,21 @@ import org.apache.pig.backend.executione
 
 
 /**
- * Generates the average of the values of the first field of a tuple. This 
class is Algebraic in
- * implemenation, so if possible the execution will be split into a local and 
global application
+ * Generates the average of a set of values. This class implements
+ * {@link 
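
(The "local and global application" wording in the replaced javadoc refers to
Pig's Algebraic interface, which lets evaluation be split across the map,
combine, and reduce phases. A toy COUNT-like sketch of that contract follows;
it is illustrative only, not the actual AVG implementation.)

import java.io.IOException;

import org.apache.pig.Algebraic;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class SimpleCount extends EvalFunc<Long> implements Algebraic {

    private static final TupleFactory mTupleFactory = TupleFactory.getInstance();

    @Override
    public Long exec(Tuple input) throws IOException {
        return count(input);
    }

    // Each stage is named by class, so Pig can instantiate it on the
    // appropriate side of the shuffle.
    public String getInitial() { return Initial.class.getName(); }
    public String getIntermed() { return Intermediate.class.getName(); }
    public String getFinal() { return Final.class.getName(); }

    // Map side (local): count the tuples in one input bag.
    public static class Initial extends EvalFunc<Tuple> {
        @Override
        public Tuple exec(Tuple input) throws IOException {
            return mTupleFactory.newTuple(count(input));
        }
    }

    // Combiner: sum the partial counts.
    public static class Intermediate extends EvalFunc<Tuple> {
        @Override
        public Tuple exec(Tuple input) throws IOException {
            return mTupleFactory.newTuple(sum(input));
        }
    }

    // Reduce side (global): produce the final total.
    public static class Final extends EvalFunc<Long> {
        @Override
        public Long exec(Tuple input) throws IOException {
            return sum(input);
        }
    }

    private static long count(Tuple input) throws IOException {
        DataBag bag = (DataBag) input.get(0);
        return bag.size();
    }

    private static long sum(Tuple input) throws IOException {
        DataBag bag = (DataBag) input.get(0);
        long total = 0;
        for (Tuple t : bag) {
            total += (Long) t.get(0);
        }
        return total;
    }
}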

svn commit: r964182 - /hadoop/pig/trunk/src/org/apache/pig/builtin/package.html

2010-07-14 Thread gates
Author: gates
Date: Wed Jul 14 20:23:57 2010
New Revision: 964182

URL: http://svn.apache.org/viewvc?rev=964182&view=rev
Log:
PIG-1409: File I forgot to add in the last checkin.

Added:
hadoop/pig/trunk/src/org/apache/pig/builtin/package.html

Added: hadoop/pig/trunk/src/org/apache/pig/builtin/package.html
URL: 
http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/package.html?rev=964182&view=auto
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/package.html (added)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/package.html Wed Jul 14 20:23:57 2010
@@ -0,0 +1,11 @@
+<html>
+<body>
+
+<p>
+This package contains builtin Pig UDFs.  This includes
+{@link org.apache.pig.EvalFunc}s,
+{@link org.apache.pig.LoadFunc}s and
+{@link org.apache.pig.StoreFunc}s.
+
+</body>
+</html>




[Pig Wiki] Update of NativeMapReduce by Aniket Mokashi

2010-07-14 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on Pig Wiki for change 
notification.

The NativeMapReduce page has been changed by Aniket Mokashi.
http://wiki.apache.org/pig/NativeMapReduce?action=diff&rev1=5&rev2=6

------------------------------------------------------------------------------

- = Page under construction =
- 
  #format wiki
  #language en
  
@@ -18, +16 @@

  
  == Syntax ==
 To support native mapreduce jobs, pig will support the following syntax-
- 
  {{{
  X = ... ;
  Y = NATIVE ('mymr.jar' [, 'other.jar' ...]) STORE X INTO 'storeLocation' 
USING storeFunc LOAD 'loadLocation' USING loadFunc [params, ... ];
@@ -35, +32 @@

 Purpose of [[#ref2|pig streaming]] is to send data through an external script 
or program to transform a dataset into a different dataset based on a custom 
script written in any programming/scripting language. Pig streaming uses the 
support of hadoop streaming to achieve this. Pig can register custom programs 
in a script, inline in the stream clause or using a define clause. Pig also 
provides a level of data guarantees on the data processing, features for job 
management, and the ability to use the distributed cache for the scripts 
(configurable). Streaming applications run locally on individual mapper and 
reducer nodes to transform the data.
  
  === Hive Transforms ===
- With [[#ref3|hive transforms]], users can also plug in their own custom 
mappers and reducers in the data stream. Basically, it is also an application 
of custom streaming supported by hadoop. Thus, these mappers and reducers can 
be written in any scripting language and can be registered in the distributed 
cache to improve performance. To support custom map reduce programs written 
in java ([[#ref4|bezo's blog]]), we can use our custom mappers and reducers 
as data streaming functions and use them to transform the data using 'java 
-cp mymr.jar'. This will not invoke a map reduce task but will attempt to 
transform the data during the map or the reduce task (locally).
+ With [[#ref3|hive transforms]], users can also plug in their own custom 
mappers and reducers in the data stream. Basically, it is also an application 
of custom streaming supported by hadoop. Thus, these mappers and reducers can 
be written in any scripting language and can be registered in the distributed 
cache to improve performance. To support custom map reduce programs written 
in java ([[#ref4|bizo's blog]]), we can use our custom mappers and reducers 
as data streaming functions and use them to transform the data using 'java 
-cp mymr.jar'. This will not invoke a map reduce task but will attempt to 
transform the data during the map or the reduce task (locally).
  
 Thus, both these features can transform data submitted to a map reduce job 
(mapper) into a different data set and/or transform data produced by a 
mapreduce job (reducer) into a different data set. But we should note that 
the data transformation takes place on a single machine and does not take 
advantage of the map reduce framework itself. Also, these blocks only allow 
custom transformations inside the data pipeline and do not break the pipeline.
  
@@ -45, +42 @@

 A native Mapreduce job needs to conform to a specification defined by Pig. 
This is required because Pig specifies the input and output directory in the 
script for this job and is responsible for managing the coordination of the 
native job with the remaining pig mapreduce jobs. Pig might also need to 
provide some extra configuration, like job name, input/output formats, or 
parallelism, to the native job. Such parameters must be communicated to the 
native job according to the specification provided by Pig.
  
 Following are some of the approaches to achieving this-
-  1. Ordered inputLoc/outputLoc parameters- This is a simplistic approach 
wherein native programs follow a convention so that their first and second 
parameters are treated as input and output respectively. Pig ''native'' 
command takes the parameters required by the native mapreduce job and passes 
them to the native job as command line arguments. It is up to the native 
program to use these parameters for the operations it performs.
+  1. '''Ordered inputLoc/outputLoc parameters'''- This is a simplistic approach 
wherein native programs follow a convention so that their first and second 
parameters are treated as input and output respectively. Pig ''native'' 
command takes the parameters required by the native mapreduce job and passes 
them to the native job as command line arguments. It is up to the native 
program to use these parameters for the operations it performs.
 Thus, only the following lines of code are mandatory inside the native program.
  {{{
  FileInputFormat.setInputPaths(conf, new Path(args[0]));  
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));
  }}}
-  2. getJobConf Function- Native jobs implement a '''getJobConf''' method 
which returns an org.apache.hadoop.mapred.JobConf object so that pig can 
schedule the job. This also provides a way to add more pig-specific 
parameters.