svn commit: r925971 - in /hadoop/pig/trunk/contrib/zebra: ./ src/java/org/apache/hadoop/zebra/io/ src/java/org/apache/hadoop/zebra/mapred/ src/java/org/apache/hadoop/zebra/mapreduce/ src/java/org/apac

2010-03-22 Thread yanz
Author: yanz
Date: Mon Mar 22 06:26:41 2010
New Revision: 925971

URL: http://svn.apache.org/viewvc?rev=925971&view=rev
Log:
PIG-1258 Number of sorted input splits is unusually high (yanz)

Modified:
hadoop/pig/trunk/contrib/zebra/CHANGES.txt

hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/io/BasicTable.java

hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/io/BlockDistribution.java

hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/io/ColumnGroup.java

hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/io/KeyDistribution.java

hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/mapred/TableInputFormat.java

hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/mapred/TableRecordReader.java

hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/mapreduce/TableInputFormat.java

hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/tfile/TFile.java

hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestBasicTable.java

hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroup.java

hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroupWithWorkPath.java

Modified: hadoop/pig/trunk/contrib/zebra/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/zebra/CHANGES.txt?rev=925971&r1=925970&r2=925971&view=diff
==============================================================================
--- hadoop/pig/trunk/contrib/zebra/CHANGES.txt (original)
+++ hadoop/pig/trunk/contrib/zebra/CHANGES.txt Mon Mar 22 06:26:41 2010
@@ -66,6 +66,8 @@ Trunk (unreleased changes)
 
   BUG FIXES
 
+PIG-1258 Number of sorted input splits is unusually high (yanz)
+
 PIG-1269 Restrict schema definition for collection (xuefuz via yanz)
 
 PIG-1253: make map/reduce test cases run on real cluster (chaow via yanz)

Modified: 
hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/io/BasicTable.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/io/BasicTable.java?rev=925971&r1=925970&r2=925971&view=diff
==============================================================================
--- 
hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/io/BasicTable.java
 (original)
+++ 
hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/io/BasicTable.java
 Mon Mar 22 06:26:41 2010
@@ -44,7 +44,6 @@ import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableUtils;
-import org.apache.hadoop.zebra.tfile.RawComparable;
 import org.apache.hadoop.zebra.tfile.TFile;
 import org.apache.hadoop.zebra.tfile.Utils;
 import org.apache.hadoop.zebra.tfile.MetaBlockAlreadyExists;
@@ -474,21 +473,18 @@ public class BasicTable {
  * 
  * @param n
  *  Targeted size of the sampling.
+ * @param nTables
+ *  Number of tables in union
  * @return KeyDistribution object.
  * @throws IOException
  */
-public KeyDistribution getKeyDistribution(int n) throws IOException {
-  KeyDistribution kd =
-  new 
KeyDistribution(TFile.makeComparator(schemaFile.getComparator()));
-  for (int nx = 0; nx < colGroups.length; nx++) {
-if (!isCGDeleted(nx)) {
-   kd.add(colGroups[nx].getKeyDistribution(n));
-}
-  }
-  if (n >= 0 && kd.size() > (int) (n * 1.5)) {
-kd.resize(n);
+public KeyDistribution getKeyDistribution(int n, int nTables, 
BlockDistribution lastBd) throws IOException {
+  if (firstValidCG >= 0)
+  {
+// pick the largest CG as in the row split case
+return colGroups[getRowSplitCGIndex()].getKeyDistribution(n, nTables, 
lastBd);
   }
-  return kd;
+  return null;
 }
 
 /**
@@ -650,7 +646,8 @@ public class BasicTable {
  * construct a TableScanner later. 
  * 
  */
-public List<RowSplit> rowSplit(long[] starts, long[] lengths, Path[] 
paths, int splitCGIndex, int[] batchSizes, int numBatches) throws IOException {
+public List<RowSplit> rowSplit(long[] starts, long[] lengths, Path[] paths,
+int splitCGIndex, int[] batchSizes, int numBatches) throws IOException 
{
   List<RowSplit> ret;  
   List<CGRowSplit> cgSplits = colGroups[splitCGIndex].rowSplit(starts, 
lengths, paths, batchSizes, numBatches);
   int numSlices = cgSplits.size();
@@ -679,6 +676,7 @@ public class BasicTable {
  */
 public int getRowSplitCGIndex() throws IOException {
   // Try to find the largest non-deleted and used column group by 
projection;
+  // Try to find the largest non-deleted and used column group by 
projection;
   if (rowSplitCGIndex == -1)
   {
 int largestCGIndex = -1;
@@ -702,7 +700,7 @@ public class BasicTable {
   rowSplitCGIndex = 

svn commit: r925988 [8/8] - in /hadoop/pig/trunk/contrib/zebra: ./ src/test/org/apache/hadoop/zebra/ src/test/org/apache/hadoop/zebra/mapred/ src/test/org/apache/hadoop/zebra/mapreduce/ src/test/org/a

2010-03-22 Thread yanz
Modified: 
hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestTableMergeJoin.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestTableMergeJoin.java?rev=925988&r1=925987&r2=925988&view=diff
==============================================================================
--- 
hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestTableMergeJoin.java
 (original)
+++ 
hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestTableMergeJoin.java
 Mon Mar 22 07:54:51 2010
@@ -40,6 +40,7 @@ import org.apache.hadoop.zebra.types.Typ
 import org.apache.pig.ExecType;
 import org.apache.pig.PigServer;
 import org.apache.pig.backend.executionengine.ExecException;
+import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
 import org.apache.pig.data.Tuple;
 import org.apache.pig.test.MiniCluster;
 import org.junit.After;
@@ -47,6 +48,8 @@ import org.junit.AfterClass;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
+import org.apache.hadoop.zebra.BaseTestCase;
+
 
 /**
  * Note:
@@ -55,32 +58,17 @@ import org.junit.Test;
  * app/debug configuration, when run this from inside the Eclipse.
  * 
  */
-public class TestTableMergeJoin {
-  protected static ExecType execType = ExecType.MAPREDUCE;
-  private static MiniCluster cluster;
-  protected static PigServer pigServer;
+public class TestTableMergeJoin extends BaseTestCase {
   private static Path pathTable;
 
+
   @BeforeClass
   public static void setUp() throws Exception {
-if (System.getProperty("hadoop.log.dir") == null) {
-  String base = new File(".").getPath(); // getAbsolutePath();
-  System
-  .setProperty("hadoop.log.dir", new Path(base).toString() + "./logs");
-}
-
-if (execType == ExecType.MAPREDUCE) {
-  cluster = MiniCluster.buildCluster();
-  pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
-} else {
-  pigServer = new PigServer(ExecType.LOCAL);
-}
-
-Configuration conf = new Configuration();
-FileSystem fs = cluster.getFileSystem();
-Path pathWorking = fs.getWorkingDirectory();
-pathTable = new Path(pathWorking, "TestTableStorer");
-System.out.println("pathTable =" + pathTable);
+
+init();
+pathTable = getTableFullPath("TestTableMergeJoin");
+removeDir(pathTable);
+
 BasicTable.Writer writer = new BasicTable.Writer(pathTable,
 "SF_a:string,SF_b:string,SF_c,SF_d,SF_e,SF_f,SF_g",
 "[SF_a, SF_b, SF_c]; [SF_e, SF_f, SF_g]", conf);

Modified: 
hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestTableMergeJoinAfterFilter.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestTableMergeJoinAfterFilter.java?rev=925988&r1=925987&r2=925988&view=diff
==============================================================================
--- 
hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestTableMergeJoinAfterFilter.java
 (original)
+++ 
hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestTableMergeJoinAfterFilter.java
 Mon Mar 22 07:54:51 2010
@@ -40,6 +40,7 @@ import org.apache.hadoop.zebra.types.Typ
 import org.apache.pig.ExecType;
 import org.apache.pig.PigServer;
 import org.apache.pig.backend.executionengine.ExecException;
+import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
 import org.apache.pig.data.Tuple;
 import org.apache.pig.test.MiniCluster;
 import org.junit.After;
@@ -47,6 +48,8 @@ import org.junit.AfterClass;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
+import org.apache.hadoop.zebra.BaseTestCase;
+
 
 /**
  * Note:
@@ -55,32 +58,17 @@ import org.junit.Test;
  * app/debug configuration, when run this from inside the Eclipse.
  * 
  */
-public class TestTableMergeJoinAfterFilter {
-  protected static ExecType execType = ExecType.MAPREDUCE;
-  private static MiniCluster cluster;
-  protected static PigServer pigServer;
+public class TestTableMergeJoinAfterFilter extends BaseTestCase {
   private static Path pathTable;
 
   @BeforeClass
   public static void setUp() throws Exception {
-if (System.getProperty("hadoop.log.dir") == null) {
-  String base = new File(".").getPath(); // getAbsolutePath();
-  System
-  .setProperty("hadoop.log.dir", new Path(base).toString() + "./logs");
-}
-
-if (execType == ExecType.MAPREDUCE) {
-  cluster = MiniCluster.buildCluster();
-  pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
-} else {
-  pigServer = new PigServer(ExecType.LOCAL);
-}
-
-Configuration conf = new Configuration();
-FileSystem fs = cluster.getFileSystem();
-Path pathWorking = fs.getWorkingDirectory();
-pathTable = new Path(pathWorking, "TestTableStorer");
-System.out.println("pathTable =" + pathTable);
+
+init();
+pathTable = 

svn commit: r925990 - in /hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra: pig/TestRealCluster.java types/TestColumnSecurity.java

2010-03-22 Thread yanz
Author: yanz
Date: Mon Mar 22 08:00:27 2010
New Revision: 925990

URL: http://svn.apache.org/viewvc?rev=925990&view=rev
Log:
PIG-1282 make Zebra's pig test cases run on real cluster (chaow via yanz)

Removed:

hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestRealCluster.java

hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/types/TestColumnSecurity.java



svn commit: r926228 - in /hadoop/pig/trunk: ./ src/org/apache/pig/impl/logicalLayer/optimizer/ src/org/apache/pig/impl/plan/optimizer/ test/org/apache/pig/test/

2010-03-22 Thread pradeepkth
Author: pradeepkth
Date: Mon Mar 22 18:02:34 2010
New Revision: 926228

URL: http://svn.apache.org/viewvc?rev=926228&view=rev
Log:
PIG-1308: Inifinite loop in JobClient when reading from BinStorage Message: 
[org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to 
process : 2] (pradeepkth)

Modified:
hadoop/pig/trunk/CHANGES.txt

hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/LogicalOptimizer.java

hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/OpLimitOptimizer.java
hadoop/pig/trunk/src/org/apache/pig/impl/plan/optimizer/PlanOptimizer.java
hadoop/pig/trunk/test/org/apache/pig/test/TestLogicalOptimizer.java

Modified: hadoop/pig/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=926228&r1=926227&r2=926228&view=diff
==============================================================================
--- hadoop/pig/trunk/CHANGES.txt (original)
+++ hadoop/pig/trunk/CHANGES.txt Mon Mar 22 18:02:34 2010
@@ -68,6 +68,10 @@ manner (rding via pradeepkth)
 
 IMPROVEMENTS
 
+PIG-1308: Inifinite loop in JobClient when reading from BinStorage Message:
+[org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to
+process : 2] (pradeepkth)
+
 PIG-1285: Allow SingleTupleBag to be serialized (dvryaboy)
 
 PIG-1117: Pig reading hive columnar rc tables (gerritjvv via dvryaboy)

Modified: 
hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/LogicalOptimizer.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/LogicalOptimizer.java?rev=926228&r1=926227&r2=926228&view=diff
==============================================================================
--- 
hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/LogicalOptimizer.java
 (original)
+++ 
hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/LogicalOptimizer.java
 Mon Mar 22 18:02:34 2010
@@ -185,7 +185,7 @@ public class LogicalOptimizer extends
 }
 
 @Override
-public final void optimize() throws OptimizerException {
+public final int optimize() throws OptimizerException {
 //the code that follows is a copy of the code in the
 //base class. see the todo note in the base class
 boolean sawMatch = false;
@@ -240,5 +240,6 @@ public class LogicalOptimizer extends
 ((PruneColumns)pruneRule.getTransformer()).prune();
 }
 }
+return numIterations;
 }
 }
\ No newline at end of file

Modified: 
hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/OpLimitOptimizer.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/OpLimitOptimizer.java?rev=926228&r1=926227&r2=926228&view=diff
==============================================================================
--- 
hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/OpLimitOptimizer.java
 (original)
+++ 
hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/OpLimitOptimizer.java
 Mon Mar 22 18:02:34 2010
@@ -87,6 +87,34 @@ public class OpLimitOptimizer extends Lo
 + (lo == null ? lo : lo.getClass().getSimpleName());
 throw new OptimizerException(msg, errCode, PigException.BUG);
 }
+List<LogicalOperator> predecessors = mPlan.getPredecessors(lo);
+if (predecessors.size()!=1) {
+int errCode = 2008;
+String msg = "Limit cannot have more than one input. Found " + predecessors.size() + " inputs.";
+throw new OptimizerException(msg, errCode, PigException.BUG);
+}
+LogicalOperator predecessor = predecessors.get(0);
+
+// Limit cannot be pushed up
+if (predecessor instanceof LOCogroup || predecessor instanceof 
LOFilter ||
+predecessor instanceof LOLoad || predecessor instanceof 
LOSplit ||
+predecessor instanceof LODistinct || predecessor 
instanceof LOJoin)
+{
+return false;
+}
+// Limit cannot be pushed in front of ForEach if it has a flatten
+if (predecessor instanceof LOForEach)
+{
+LOForEach loForEach = (LOForEach)predecessor;
+List<Boolean> mFlatten = loForEach.getFlatten();
+boolean hasFlatten = false;
+for (Boolean b:mFlatten)
+if (b.equals(true)) hasFlatten = true;
+
+if (hasFlatten) {
+return false;
+}
+}
 } catch (Exception e) {
 int errCode = 2049;
String msg = "Error while performing checks to optimize limit operator.";

Modified: 
hadoop/pig/trunk/src/org/apache/pig/impl/plan/optimizer/PlanOptimizer.java
URL: 

svn commit: r926270 - in /hadoop/pig/trunk: CHANGES.txt src/org/apache/pig/impl/builtin/DefaultIndexableLoader.java

2010-03-22 Thread daijy
Author: daijy
Date: Mon Mar 22 19:40:36 2010
New Revision: 926270

URL: http://svn.apache.org/viewvc?rev=926270&view=rev
Log:
PIG-1312: Make Pig work with hadoop security

Modified:
hadoop/pig/trunk/CHANGES.txt
hadoop/pig/trunk/src/org/apache/pig/impl/builtin/DefaultIndexableLoader.java

Modified: hadoop/pig/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=926270&r1=926269&r2=926270&view=diff
==============================================================================
--- hadoop/pig/trunk/CHANGES.txt (original)
+++ hadoop/pig/trunk/CHANGES.txt Mon Mar 22 19:40:36 2010
@@ -68,6 +68,8 @@ manner (rding via pradeepkth)
 
 IMPROVEMENTS
 
+PIG-1312: Make Pig work with hadoop security (daijy)
+
 PIG-1308: Inifinite loop in JobClient when reading from BinStorage Message:
 [org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to
 process : 2] (pradeepkth)

Modified: 
hadoop/pig/trunk/src/org/apache/pig/impl/builtin/DefaultIndexableLoader.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/impl/builtin/DefaultIndexableLoader.java?rev=926270&r1=926269&r2=926270&view=diff
==============================================================================
--- 
hadoop/pig/trunk/src/org/apache/pig/impl/builtin/DefaultIndexableLoader.java 
(original)
+++ 
hadoop/pig/trunk/src/org/apache/pig/impl/builtin/DefaultIndexableLoader.java 
Mon Mar 22 19:40:36 2010
@@ -194,13 +194,18 @@ public class DefaultIndexableLoader exte
 private void initRightLoader(int [] splitsToBeRead) throws IOException{
 PigContext pc = (PigContext) ObjectSerializer
.deserialize(PigMapReduce.sJobConf.get("pig.pigContext"));
+
+Configuration conf = 
ConfigurationUtil.toConfiguration(pc.getProperties());
+
+// Hadoop security need this property to be set
+if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
+conf.set("mapreduce.job.credentials.binary", 
+System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
+}
+
 //create ReadToEndLoader that will read the given splits in order
-loader = new ReadToEndLoader(
-
(LoadFunc)PigContext.instantiateFuncFromSpec(rightLoaderFuncSpec),
-ConfigurationUtil.toConfiguration(pc.getProperties()),
-inpLocation,
-splitsToBeRead
-);
+loader = new 
ReadToEndLoader((LoadFunc)PigContext.instantiateFuncFromSpec(rightLoaderFuncSpec),
+conf, inpLocation, splitsToBeRead);
 }
 
 private Object extractKeysFromIdxTuple(Tuple idxTuple) throws 
ExecException{
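
For reference, the credential-forwarding pattern used in the change above, as a 
standalone sketch (only the HADOOP_TOKEN_FILE_LOCATION environment variable and 
the mapreduce.job.credentials.binary property come from the commit; the helper 
class name is an illustrative assumption):

import org.apache.hadoop.conf.Configuration;

public final class CredentialForwarder {
    private CredentialForwarder() {}

    // Under Hadoop security, code that launches further HDFS/MapReduce work from
    // inside a task should point the job configuration at the delegation-token
    // file that the framework handed to the task.
    public static Configuration forwardCredentials(Configuration conf) {
        String tokenFile = System.getenv("HADOOP_TOKEN_FILE_LOCATION");
        if (tokenFile != null) {
            conf.set("mapreduce.job.credentials.binary", tokenFile);
        }
        return conf;
    }
}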




[Pig Wiki] Update of Pig070LoadStoreHowTo by PradeepKamath

2010-03-22 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on Pig Wiki for change 
notification.

The Pig070LoadStoreHowTo page has been changed by PradeepKamath.
http://wiki.apache.org/pig/Pig070LoadStoreHowTo?action=diff&rev1=12&rev2=13

--

   * 
[[http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/LoadCaster.java?view=markup
 | LoadCaster]] has methods to convert byte arrays to specific types. A loader 
implementation should implement this if casts (implicit or explicit) from 
!DataByteArray fields to other types need to be supported.
  
 The !LoadFunc abstract class is the main class to extend for implementing a 
loader. The methods which need to be overridden are explained below (a minimal 
skeleton follows the list):
-  * getInputFormat() :This method will be called by Pig to get the 
!InputFormat used by the loader. The methods in the !InputFormat (and 
underlying !RecordReader) will be called by pig in the same manner (and in the 
same context) as by Hadoop in a map-reduce java program. If the !InputFormat is 
a hadoop packaged one, the implementation should use the new API based one 
under org.apache.hadoop.mapreduce. If it is a custom !InputFormat, it should be 
implemented using the new API in org.apache.hadoop.mapreduce.
+  * getInputFormat() :This method will be called by Pig to get the 
!InputFormat used by the loader. The methods in the !InputFormat (and 
underlying !RecordReader) will be called by pig in the same manner (and in the 
same context) as by Hadoop in a map-reduce java program. If the !InputFormat is 
a hadoop packaged one, the implementation should use the new API based one 
under org.apache.hadoop.mapreduce. If it is a custom !InputFormat, it should be 
implemented using the new API in org.apache.hadoop.mapreduce.  If a custom 
loader using a text-based !InputFormat or a file based !InputFormat would like 
to read files in all subdirectories under a given input directory recursively, 
then it should use the !PigFileInputFormat and !PigTextInputFormat classes 
provided in org.apache.pig.backend.hadoop.executionengine.mapReduceLayer. This 
is to work around the current limitation in Hadoop's !TextInputFormat and 
!FileInputFormat which only read one level down from provided input directory. 
So for example if the input in the load statement is 'dir1' and there are 
subdirs 'dir2' and 'dir2/dir3' underneath dir1, using Hadoop's !TextInputFormat 
or !FileInputFormat only files under 'dir1' can be read. Using 
!PigFileInputFormat or !PigTextInputFormat (or by extending them), files in all 
the directories can be read.
+ 
+ Changes to custom Load Functions (impact: Low to medium).
+ This is to get around the problem of MAPREDUCE-1577. 
   * setLocation() :This method is called by Pig to communicate the load 
location to the loader. The loader should use this method to communicate the 
same information to the underlying !InputFormat. This method is called multiple 
times by pig - implementations should bear this in mind and should ensure there 
are no inconsistent side effects due to the multiple calls.
   * prepareToRead() : Through this method the !RecordReader associated with 
the !InputFormat provided by the !LoadFunc is passed to the !LoadFunc. The 
!RecordReader can then be used by the implementation in getNext() to return a 
tuple representing a record of data back to pig.
  * getNext() :The meaning of getNext() has not changed and is called by Pig 
runtime to get the next tuple in the data - in this method the implementation 
should use the underlying !RecordReader and construct the tuple to return.
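
A minimal skeleton that puts the four methods above together (a sketch, not part 
of the wiki page itself: the class name SimpleTextLoader and the tab-separated 
field handling are illustrative assumptions; PigTextInputFormat is used as 
described above so that subdirectories are read recursively):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigTextInputFormat;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class SimpleTextLoader extends LoadFunc {
    private RecordReader reader;
    private final TupleFactory tupleFactory = TupleFactory.getInstance();

    @Override
    public InputFormat getInputFormat() throws IOException {
        // New-API InputFormat; PigTextInputFormat also descends into subdirectories.
        return new PigTextInputFormat();
    }

    @Override
    public void setLocation(String location, Job job) throws IOException {
        // Called multiple times by Pig; setInputPaths() overwrites, so repeated calls are harmless.
        FileInputFormat.setInputPaths(job, location);
    }

    @Override
    public void prepareToRead(RecordReader reader, PigSplit split) throws IOException {
        // Keep the RecordReader handed over by Pig for use in getNext().
        this.reader = reader;
    }

    @Override
    public Tuple getNext() throws IOException {
        try {
            if (!reader.nextKeyValue()) {
                return null;                       // no more records
            }
            Text line = (Text) reader.getCurrentValue();
            List<Object> fields = new ArrayList<Object>();
            for (String field : line.toString().split("\t")) {
                fields.add(field);
            }
            return tupleFactory.newTuple(fields);
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }
}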


[Pig Wiki] Update of LoadStoreMigrationGuide by PradeepKamath

2010-03-22 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on Pig Wiki for change 
notification.

The LoadStoreMigrationGuide page has been changed by PradeepKamath.
http://wiki.apache.org/pig/LoadStoreMigrationGuide?action=diff&rev1=36&rev2=37

--

  ||No equivalent method ||relativeToAbsolutePath() ||!LoadFunc ||Pig runtime 
will call this method to allow the Loader to convert a relative load location 
to an absolute location. The default implementation provided in !LoadFunc 
handles this for !FileSystem locations. If the load source is something else, 
loader implementation may choose to override this. ||
  ||determineSchema() ||getSchema() ||!LoadMetadata ||determineSchema() was 
used by old code to ask the loader to provide a schema for the data returned by 
it - the same semantics are now achieved through getSchema() of the 
!LoadMetadata interface. !LoadMetadata is an optional interface for loaders to 
implement - if a loader does not implement it, this will indicate to the pig 
runtime that the loader cannot return a schema for the data ||
  ||fieldsToRead() ||pushProjection() ||!LoadPushDown ||fieldsToRead() was used 
by old code to convey to the loader the exact fields required by the pig script 
-the same semantics are now achieved through pushProjection() of the !LoadPushDown 
interface. !LoadPushDown is an optional interface for loaders to implement - if 
a loader does not implement it, this will indicate to the pig runtime that the 
loader is not capable of returning just the required fields and will return all 
fields in the data. If a loader implementation is able to efficiently return 
only required fields, it should implement !LoadPushDown to improve query 
performance ||
- ||No equivalent method ||getInputFormat() ||!LoadFunc ||This method will be 
called by Pig to get the !InputFormat used by the loader. The methods in the 
!InputFormat (and underlying !RecordReader) will be called by pig in the same 
manner (and in the same context) as by Hadoop in a map-reduce java program. 
'''If the !InputFormat is a hadoop packaged one, the implementation should use 
the new API based one under org.apache.hadoop.mapreduce. If it is a custom 
!InputFormat, it should be implemented using the new API in 
org.apache.hadoop.mapreduce'''||
+ ||No equivalent method ||getInputFormat() ||!LoadFunc ||This method will be 
called by Pig to get the !InputFormat used by the loader. The methods in the 
!InputFormat (and underlying !RecordReader) will be called by pig in the same 
manner (and in the same context) as by Hadoop in a map-reduce java program. 
'''If the !InputFormat is a hadoop packaged one, the implementation should use 
the new API based one under org.apache.hadoop.mapreduce. If it is a custom 
!InputFormat, it should be implemented using the new API in 
org.apache.hadoop.mapreduce'''.  If a custom loader using a text-based 
InputFormat or a file based InputFormat would like to read files in all 
subdirectories under a given input directory recursively, then it should use 
the PigFileInputFormat and PigTextInputFormat classes provided in 
org.apache.pig.backend.hadoop.executionengine.mapReduceLayer. This is to work 
around the current limitation in Hadoop's TextInputFormat and FileInputFormat 
which only read one level down from provided input directory. So for example if 
the input in the load statement is 'dir1' and there are subdirs 'dir2' and 
'dir2/dir3' underneath dir1, using Hadoop's TextInputFormat or FileInputFormat 
only files under 'dir1' can be read. Using PigFileInputFormat or 
PigTextInputFormat (or by extending them), files in all the directories can be 
read.||
  ||No equivalent method ||setLocation() ||!LoadFunc ||This method is called by 
Pig to communicate the load location to the loader. The loader should use this 
method to communicate the same information to the underlying !InputFormat. This 
method is called multiple times by pig - implementations should bear in mind 
that this method is called multiple times and should ensure there are no 
inconsistent side effects due to the multiple calls. ||
  ||bindTo() ||prepareToRead() ||!LoadFunc ||bindTo() was the old method which 
would provide an !InputStream among other things to the !LoadFunc. The 
!LoadFunc implementation would then read from the !InputStream in getNext(). In 
the new API, reading of the data is through the !InputFormat provided by the 
!LoadFunc. So the equivalent call is prepareToRead() wherein the !RecordReader 
associated with the !InputFormat provided by the !LoadFunc is passed to the 
!LoadFunc. The !RecordReader can then be used by the implementation in 
getNext() to return a tuple representing a record of data back to pig. ||
  ||getNext() ||getNext() ||!LoadFunc ||The meaning of getNext() has not 
changed and is called by Pig runtime to get the next tuple in the data - in the 
new API, this is the method wherein the implementation will use the 

[Pig Wiki] Update of owl by jaytang

2010-03-22 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on Pig Wiki for change 
notification.

The owl page has been changed by jaytang.
http://wiki.apache.org/pig/owl?action=diff&rev1=3&rev2=4

--

  
  = Apache Owl Wiki =
  
- The goal of Owl (a.k.a Hadoop metadata system) is to allow users and 
applications to register data stored on HDFS, search for the data available on 
HDFS, and associate metadata such as schema, statistics, etc. with a particular 
data unit or a data set stored on HDFS. The initial goal is to provide a fairly 
generic, low level abstraction that any user or application on HDFS can use to 
store an retrieve metadata. Over time a higher level abstractions closely tied 
to particular applications or tools can be developed.
+ The goal of Owl is to provide a higher level data management abstraction than 
that provided by HDFS directories and files.  Applications written in MapReduce 
and Pig scripts must deal with low level data management issues such as storage 
format, serialization/compression schemes, data layout, and efficient data 
access paths, often with different solutions. Owl attempts to provide a 
standard way to address this issue.
  
- Please refer to this document for more detailed 
[[http://wiki.apache.org/pig/Metadata|use case, architecture, data model]]
+ Owl supports the notion of Owl Tables, a basic unit of data management.  An 
Owl Table has these characteristics:
+ 
+* lives in an Owl database name space and could contain multiple partitions
+* has columns and rows and supports a unified table level schema
+* supports MapReduce and Pig Latin and potentially other languages
+* designed for batch read/write operations
+* supports external tables (data already exists on file system)
+* pluggable architecture for different storage format such as Zebra
+* presents a logically partitioned view of data organization
+* efficient data access mechanisms via partition and projection pruning
+ 
+ 
+ Owl supports two major public APIs.  Owl Driver provides management APIs 
against Owl Table, Owl Database, and Partition.  This API is backed up by 
an internal Owl metadata store that runs on Tomcat and a relational database.  
OwlInputFormat provides a data access API and is modeled after the 
traditional Hadoop InputFormat.  In the future, we plan to support 
OwlOutputFormat and thus the notion of Owl Managed Table where Owl controls 
the data flow into and out of Owl Tables.  Owl supports Pig integration with 
OwlPigLoader/Storer module.
  
  
  == Prerequisite ==
  
- Owl high no dependency on the release of Hadoop and Pig
+ Owl depends on Pig for its tuple classes as a basic unit of data container, 
and Hadoop 20 for OwlInputFormat.  Owl supports Zebra integration out of the 
box.
  
  == Getting Owl ==
  
@@ -26, +38 @@

 * JDK 1.6
 * Ant 1.7.1
 * download [[http://dev.mysql.com/downloads/connector/j/5.1.html|MySQL 5.1 
JDBC driver]]
+* Oracle
  
  How to compile
  
 * check out latest PIG trunk
+* compile Pig
 * cd contrib/owl
 * copy MySQL JDBC driver to contrib/owl/java/lib directory
 * ant war (build owl web application)
@@ -40, +54 @@

  For development environment, Owl supports jetty 7.0 (with jetty-runner) and 
derby 10.5.  For production deployment, Owl supports:
  
 * Tomcat 6.0
-* MySQL 5.1
+* MySQL 5.1 or Oracle 11g
  
  After installing Tomcat and MySQL, you will need these files:
  
-* owl.war - owl web application
+* owl-0.x.x.war - owl web application
+* owl-0.x.x.jar - owl client library OwlInputFormat and OwlDriver with 
all their dependent 3rd party libs
 * mysql_schema.sql - owl database schema file at contrib/owl/setup/mysql
 * owlServerConfig.xml - owl server configuration file at 
contrib/owl/setup/mysql
  
@@ -58, +73 @@

  
  == Sample Code == 
  
- Owl comes with a Java-based client.  Client API Javadoc is at: 
+ Owl comes with a Java-based client.  Client API Javadoc is at...  These two 
key packages contain the public APIs for Owl's main features: 
org.apache.hadoop.owl.client and org.apache.hadoop.owl.mapreduce
  
  Sample code is attached to write a client application against owl:
  


svn commit: r926404 - in /hadoop/pig/trunk: CHANGES.txt build.xml

2010-03-22 Thread pradeepkth
Author: pradeepkth
Date: Mon Mar 22 23:50:48 2010
New Revision: 926404

URL: http://svn.apache.org/viewvc?rev=926404&view=rev
Log:
Provide a way to exclude a testcase when running ant test (pradeepkth)

Modified:
hadoop/pig/trunk/CHANGES.txt
hadoop/pig/trunk/build.xml

Modified: hadoop/pig/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=926404&r1=926403&r2=926404&view=diff
==============================================================================
--- hadoop/pig/trunk/CHANGES.txt (original)
+++ hadoop/pig/trunk/CHANGES.txt Mon Mar 22 23:50:48 2010
@@ -68,6 +68,9 @@ manner (rding via pradeepkth)
 
 IMPROVEMENTS
 
+PIG-1325: Provide a way to exclude a testcase when running ant test
+(pradeepkth)
+
 PIG-1312: Make Pig work with hadoop security (daijy)
 
 PIG-1308: Inifinite loop in JobClient when reading from BinStorage Message:

Modified: hadoop/pig/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/hadoop/pig/trunk/build.xml?rev=926404&r1=926403&r2=926404&view=diff
==============================================================================
--- hadoop/pig/trunk/build.xml (original)
+++ hadoop/pig/trunk/build.xml Mon Mar 22 23:50:48 2010
@@ -543,6 +543,7 @@
 <exclude name="**/TestOrderBy2.java" />
 <exclude name="**/TestPi.java" />
 <exclude name="**/nightly/**" />
+<exclude name="**/${exclude.testcase}.java" if="exclude.testcase" />
 </fileset>
 </batchtest>
 <batchtest fork="yes" todir="${test.log.dir}" if="testcase">
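
Usage note (an assumption based on the property wired in above; the test class 
name is only an example): with this exclude in place a single test class can be 
skipped from the command line, e.g. ant test -Dexclude.testcase=TestStore, while 
builds that do not set the property are unaffected because of the 
if="exclude.testcase" guard.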