svn commit: r925971 - in /hadoop/pig/trunk/contrib/zebra: ./ src/java/org/apache/hadoop/zebra/io/ src/java/org/apache/hadoop/zebra/mapred/ src/java/org/apache/hadoop/zebra/mapreduce/ src/java/org/apac
Author: yanz Date: Mon Mar 22 06:26:41 2010 New Revision: 925971 URL: http://svn.apache.org/viewvc?rev=925971&view=rev Log: PIG-1258 Number of sorted input splits is unusually high (yanz) Modified: hadoop/pig/trunk/contrib/zebra/CHANGES.txt hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/io/BasicTable.java hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/io/BlockDistribution.java hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/io/ColumnGroup.java hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/io/KeyDistribution.java hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/mapred/TableInputFormat.java hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/mapred/TableRecordReader.java hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/mapreduce/TableInputFormat.java hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/tfile/TFile.java hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestBasicTable.java hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroup.java hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroupWithWorkPath.java Modified: hadoop/pig/trunk/contrib/zebra/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/zebra/CHANGES.txt?rev=925971&r1=925970&r2=925971&view=diff == --- hadoop/pig/trunk/contrib/zebra/CHANGES.txt (original) +++ hadoop/pig/trunk/contrib/zebra/CHANGES.txt Mon Mar 22 06:26:41 2010 @@ -66,6 +66,8 @@ Trunk (unreleased changes) BUG FIXES +PIG-1258 Number of sorted input splits is unusually high (yanz) + PIG-1269 Restrict schema definition for collection (xuefuz via yanz) PIG-1253: make map/reduce test cases run on real cluster (chaow via yanz) Modified: hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/io/BasicTable.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/io/BasicTable.java?rev=925971&r1=925970&r2=925971&view=diff == --- hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/io/BasicTable.java (original) +++ hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/io/BasicTable.java Mon Mar 22 06:26:41 2010 @@ -44,7 +44,6 @@ import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableUtils; -import org.apache.hadoop.zebra.tfile.RawComparable; import org.apache.hadoop.zebra.tfile.TFile; import org.apache.hadoop.zebra.tfile.Utils; import org.apache.hadoop.zebra.tfile.MetaBlockAlreadyExists; @@ -474,21 +473,18 @@ public class BasicTable { * * @param n * Targeted size of the sampling. + * @param nTables + * Number of tables in union * @return KeyDistribution object.
* @throws IOException */ -public KeyDistribution getKeyDistribution(int n) throws IOException { - KeyDistribution kd = - new KeyDistribution(TFile.makeComparator(schemaFile.getComparator())); - for (int nx = 0; nx < colGroups.length; nx++) { -if (!isCGDeleted(nx)) { - kd.add(colGroups[nx].getKeyDistribution(n)); -} - } - if (n >= 0 && kd.size() > (int) (n * 1.5)) { -kd.resize(n); +public KeyDistribution getKeyDistribution(int n, int nTables, BlockDistribution lastBd) throws IOException { + if (firstValidCG >= 0) + { +// pick the largest CG as in the row split case +return colGroups[getRowSplitCGIndex()].getKeyDistribution(n, nTables, lastBd); } - return kd; + return null; } /** @@ -650,7 +646,8 @@ public class BasicTable { * construct a TableScanner later. * */ -public List<RowSplit> rowSplit(long[] starts, long[] lengths, Path[] paths, int splitCGIndex, int[] batchSizes, int numBatches) throws IOException { +public List<RowSplit> rowSplit(long[] starts, long[] lengths, Path[] paths, +int splitCGIndex, int[] batchSizes, int numBatches) throws IOException { List<RowSplit> ret; List<CGRowSplit> cgSplits = colGroups[splitCGIndex].rowSplit(starts, lengths, paths, batchSizes, numBatches); int numSlices = cgSplits.size(); @@ -679,6 +676,7 @@ public class BasicTable { */ public int getRowSplitCGIndex() throws IOException { // Try to find the largest non-deleted and used column group by projection; + // Try to find the largest non-deleted and used column group by projection; if (rowSplitCGIndex == -1) { int largestCGIndex = -1; @@ -702,7 +700,7 @@ rowSplitCGIndex =
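The fix replaces the old per-column-group sample merge with a single sample of the largest column group (the same CG already used for row splits). A toy, self-contained Java sketch of why the old merge inflated the split count; the class and the numbers are ours, not Zebra's:

    import java.util.TreeSet;

    // Toy model: k column groups each contribute n sample keys. Merging the
    // per-CG samples (old behavior) can yield up to k*n distinct split
    // boundaries, while sampling only the largest CG (new behavior) caps
    // the boundary count at n.
    public class SplitCountDemo {
      public static void main(String[] args) {
        int n = 10, k = 5;
        TreeSet<Integer> merged = new TreeSet<Integer>();
        for (int cg = 0; cg < k; cg++) {
          for (int i = 0; i < n; i++) {
            merged.add(cg + i * k); // per-CG sample keys rarely coincide
          }
        }
        System.out.println("merged boundaries:    " + merged.size()); // 50
        System.out.println("single-CG boundaries: " + n);             // 10
      }
    }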
svn commit: r925988 [8/8] - in /hadoop/pig/trunk/contrib/zebra: ./ src/test/org/apache/hadoop/zebra/ src/test/org/apache/hadoop/zebra/mapred/ src/test/org/apache/hadoop/zebra/mapreduce/ src/test/org/a
Modified: hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestTableMergeJoin.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestTableMergeJoin.java?rev=925988&r1=925987&r2=925988&view=diff == --- hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestTableMergeJoin.java (original) +++ hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestTableMergeJoin.java Mon Mar 22 07:54:51 2010 @@ -40,6 +40,7 @@ import org.apache.hadoop.zebra.types.Typ import org.apache.pig.ExecType; import org.apache.pig.PigServer; import org.apache.pig.backend.executionengine.ExecException; +import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil; import org.apache.pig.data.Tuple; import org.apache.pig.test.MiniCluster; import org.junit.After; @@ -47,6 +48,8 @@ import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; +import org.apache.hadoop.zebra.BaseTestCase; + /** * Note: @@ -55,32 +58,17 @@ import org.junit.Test; * app/debug configuration, when run this from inside the Eclipse. * */ -public class TestTableMergeJoin { - protected static ExecType execType = ExecType.MAPREDUCE; - private static MiniCluster cluster; - protected static PigServer pigServer; +public class TestTableMergeJoin extends BaseTestCase { private static Path pathTable; + @BeforeClass public static void setUp() throws Exception { -if (System.getProperty("hadoop.log.dir") == null) { - String base = new File(".").getPath(); // getAbsolutePath(); - System - .setProperty("hadoop.log.dir", new Path(base).toString() + "./logs"); -} - -if (execType == ExecType.MAPREDUCE) { - cluster = MiniCluster.buildCluster(); - pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); -} else { - pigServer = new PigServer(ExecType.LOCAL); -} - -Configuration conf = new Configuration(); -FileSystem fs = cluster.getFileSystem(); -Path pathWorking = fs.getWorkingDirectory(); -pathTable = new Path(pathWorking, "TestTableStorer"); -System.out.println("pathTable =" + pathTable); + +init(); +pathTable = getTableFullPath("TestTableMergeJoin"); +removeDir(pathTable); + BasicTable.Writer writer = new BasicTable.Writer(pathTable, "SF_a:string,SF_b:string,SF_c,SF_d,SF_e,SF_f,SF_g", "[SF_a, SF_b, SF_c]; [SF_e, SF_f, SF_g]", conf); Modified: hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestTableMergeJoinAfterFilter.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestTableMergeJoinAfterFilter.java?rev=925988&r1=925987&r2=925988&view=diff == --- hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestTableMergeJoinAfterFilter.java (original) +++ hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestTableMergeJoinAfterFilter.java Mon Mar 22 07:54:51 2010 @@ -40,6 +40,7 @@ import org.apache.hadoop.zebra.types.Typ import org.apache.pig.ExecType; import org.apache.pig.PigServer; import org.apache.pig.backend.executionengine.ExecException; +import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil; import org.apache.pig.data.Tuple; import org.apache.pig.test.MiniCluster; import org.junit.After; @@ -47,6 +48,8 @@ import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; +import org.apache.hadoop.zebra.BaseTestCase; + /** * Note: @@ -55,32 +58,17 @@ import org.junit.Test; * app/debug configuration, when run this from inside the Eclipse.
* */ -public class TestTableMergeJoinAfterFilter { - protected static ExecType execType = ExecType.MAPREDUCE; - private static MiniCluster cluster; - protected static PigServer pigServer; +public class TestTableMergeJoinAfterFilter extends BaseTestCase { private static Path pathTable; @BeforeClass public static void setUp() throws Exception { -if (System.getProperty("hadoop.log.dir") == null) { - String base = new File(".").getPath(); // getAbsolutePath(); - System - .setProperty("hadoop.log.dir", new Path(base).toString() + "./logs"); -} - -if (execType == ExecType.MAPREDUCE) { - cluster = MiniCluster.buildCluster(); - pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); -} else { - pigServer = new PigServer(ExecType.LOCAL); -} - -Configuration conf = new Configuration(); -FileSystem fs = cluster.getFileSystem(); -Path pathWorking = fs.getWorkingDirectory(); -pathTable = new Path(pathWorking, "TestTableStorer"); -System.out.println("pathTable =" + pathTable); + +init(); +pathTable =
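Both tests now inherit their environment setup from BaseTestCase instead of repeating the MiniCluster boilerplate. A minimal sketch of what such a base class could look like; only the method names (init, getTableFullPath, removeDir) come from the diffs above, the bodies are our assumptions:

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    // Hypothetical shared base class for the zebra tests; a real version
    // would also choose local vs. real-cluster execution and create the
    // PigServer, which is omitted here.
    public abstract class BaseTestCase {
      protected static Configuration conf;
      protected static FileSystem fs;

      protected static void init() throws IOException {
        conf = new Configuration(); // picks up cluster settings when present
        fs = FileSystem.get(conf);
      }

      protected static Path getTableFullPath(String name) throws IOException {
        return new Path(fs.getWorkingDirectory(), name);
      }

      protected static void removeDir(Path dir) throws IOException {
        fs.delete(dir, true); // recursive; ignore the returned flag
      }
    }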
svn commit: r925990 - in /hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra: pig/TestRealCluster.java types/TestColumnSecurity.java
Author: yanz Date: Mon Mar 22 08:00:27 2010 New Revision: 925990 URL: http://svn.apache.org/viewvc?rev=925990&view=rev Log: PIG-1282 make Zebra's pig test cases run on real cluster (chaow via yanz) Removed: hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestRealCluster.java hadoop/pig/trunk/contrib/zebra/src/test/org/apache/hadoop/zebra/types/TestColumnSecurity.java
svn commit: r926228 - in /hadoop/pig/trunk: ./ src/org/apache/pig/impl/logicalLayer/optimizer/ src/org/apache/pig/impl/plan/optimizer/ test/org/apache/pig/test/
Author: pradeepkth Date: Mon Mar 22 18:02:34 2010 New Revision: 926228 URL: http://svn.apache.org/viewvc?rev=926228&view=rev Log: PIG-1308: Infinite loop in JobClient when reading from BinStorage Message: [org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 2] (pradeepkth) Modified: hadoop/pig/trunk/CHANGES.txt hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/LogicalOptimizer.java hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/OpLimitOptimizer.java hadoop/pig/trunk/src/org/apache/pig/impl/plan/optimizer/PlanOptimizer.java hadoop/pig/trunk/test/org/apache/pig/test/TestLogicalOptimizer.java Modified: hadoop/pig/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=926228&r1=926227&r2=926228&view=diff == --- hadoop/pig/trunk/CHANGES.txt (original) +++ hadoop/pig/trunk/CHANGES.txt Mon Mar 22 18:02:34 2010 @@ -68,6 +68,10 @@ manner (rding via pradeepkth) IMPROVEMENTS +PIG-1308: Infinite loop in JobClient when reading from BinStorage Message: +[org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to +process : 2] (pradeepkth) + PIG-1285: Allow SingleTupleBag to be serialized (dvryaboy) PIG-1117: Pig reading hive columnar rc tables (gerritjvv via dvryaboy) Modified: hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/LogicalOptimizer.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/LogicalOptimizer.java?rev=926228&r1=926227&r2=926228&view=diff == --- hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/LogicalOptimizer.java (original) +++ hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/LogicalOptimizer.java Mon Mar 22 18:02:34 2010 @@ -185,7 +185,7 @@ public class LogicalOptimizer extends } @Override -public final void optimize() throws OptimizerException { +public final int optimize() throws OptimizerException { //the code that follows is a copy of the code in the //base class. see the todo note in the base class boolean sawMatch = false; @@ -240,5 +240,6 @@ ((PruneColumns)pruneRule.getTransformer()).prune(); } } +return numIterations; } } \ No newline at end of file Modified: hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/OpLimitOptimizer.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/OpLimitOptimizer.java?rev=926228&r1=926227&r2=926228&view=diff == --- hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/OpLimitOptimizer.java (original) +++ hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/optimizer/OpLimitOptimizer.java Mon Mar 22 18:02:34 2010 @@ -87,6 +87,34 @@ public class OpLimitOptimizer extends Lo + (lo == null ? lo : lo.getClass().getSimpleName()); throw new OptimizerException(msg, errCode, PigException.BUG); } +List<LogicalOperator> predecessors = mPlan.getPredecessors(lo); +if (predecessors.size()!=1) { +int errCode = 2008; +String msg = "Limit cannot have more than one input.
Found " + predecessors.size() + " inputs."; +throw new OptimizerException(msg, errCode, PigException.BUG); +} +LogicalOperator predecessor = predecessors.get(0); + +// Limit cannot be pushed up +if (predecessor instanceof LOCogroup || predecessor instanceof LOFilter || +predecessor instanceof LOLoad || predecessor instanceof LOSplit || +predecessor instanceof LODistinct || predecessor instanceof LOJoin) +{ +return false; +} +// Limit cannot be pushed in front of ForEach if it has a flatten +if (predecessor instanceof LOForEach) +{ +LOForEach loForEach = (LOForEach)predecessor; +List<Boolean> mFlatten = loForEach.getFlatten(); +boolean hasFlatten = false; +for (Boolean b : mFlatten) +if (b.equals(true)) hasFlatten = true; + +if (hasFlatten) { +return false; +} +} } catch (Exception e) { int errCode = 2049; String msg = "Error while performing checks to optimize limit operator."; Modified: hadoop/pig/trunk/src/org/apache/pig/impl/plan/optimizer/PlanOptimizer.java URL:
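Alongside the new Limit checks, optimize() now returns the iteration count (the return numIterations; hunk above). A hedged caller-side sketch, not part of the patch, of how that value can be used to notice a rule set that hit its iteration cap instead of converging:

    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;

    // Hypothetical guard around an optimizer pass; the class and method
    // names are ours.
    public class OptimizerGuard {
      private static final Log LOG = LogFactory.getLog(OptimizerGuard.class);

      public static void checkConverged(int iterations, int maxIterations) {
        if (iterations >= maxIterations) {
          LOG.warn("optimizer stopped after " + iterations
              + " iterations without reaching a fixed point");
        }
      }
    }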
svn commit: r926270 - in /hadoop/pig/trunk: CHANGES.txt src/org/apache/pig/impl/builtin/DefaultIndexableLoader.java
Author: daijy Date: Mon Mar 22 19:40:36 2010 New Revision: 926270 URL: http://svn.apache.org/viewvc?rev=926270&view=rev Log: PIG-1312: Make Pig work with hadoop security Modified: hadoop/pig/trunk/CHANGES.txt hadoop/pig/trunk/src/org/apache/pig/impl/builtin/DefaultIndexableLoader.java Modified: hadoop/pig/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=926270&r1=926269&r2=926270&view=diff == --- hadoop/pig/trunk/CHANGES.txt (original) +++ hadoop/pig/trunk/CHANGES.txt Mon Mar 22 19:40:36 2010 @@ -68,6 +68,8 @@ manner (rding via pradeepkth) IMPROVEMENTS +PIG-1312: Make Pig work with hadoop security (daijy) + PIG-1308: Infinite loop in JobClient when reading from BinStorage Message: [org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 2] (pradeepkth) Modified: hadoop/pig/trunk/src/org/apache/pig/impl/builtin/DefaultIndexableLoader.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/impl/builtin/DefaultIndexableLoader.java?rev=926270&r1=926269&r2=926270&view=diff == --- hadoop/pig/trunk/src/org/apache/pig/impl/builtin/DefaultIndexableLoader.java (original) +++ hadoop/pig/trunk/src/org/apache/pig/impl/builtin/DefaultIndexableLoader.java Mon Mar 22 19:40:36 2010 @@ -194,13 +194,18 @@ public class DefaultIndexableLoader exte private void initRightLoader(int [] splitsToBeRead) throws IOException{ PigContext pc = (PigContext) ObjectSerializer .deserialize(PigMapReduce.sJobConf.get("pig.pigContext")); + +Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties()); + +// Hadoop security needs this property to be set +if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) { +conf.set("mapreduce.job.credentials.binary", +System.getenv("HADOOP_TOKEN_FILE_LOCATION")); +} + //create ReadToEndLoader that will read the given splits in order -loader = new ReadToEndLoader( - (LoadFunc)PigContext.instantiateFuncFromSpec(rightLoaderFuncSpec), -ConfigurationUtil.toConfiguration(pc.getProperties()), -inpLocation, -splitsToBeRead -); +loader = new ReadToEndLoader((LoadFunc)PigContext.instantiateFuncFromSpec(rightLoaderFuncSpec), +conf, inpLocation, splitsToBeRead); } private Object extractKeysFromIdxTuple(Tuple idxTuple) throws ExecException{
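The credentials hunk generalizes to any Configuration that is built inside a running task and then used to contact HDFS or the JobTracker: under Hadoop security the delegation-token file advertised in HADOOP_TOKEN_FILE_LOCATION must be propagated. A small helper sketch; the class name is ours, the env var and property name come straight from the diff:

    import org.apache.hadoop.conf.Configuration;

    // Hypothetical helper extracting the pattern used in
    // DefaultIndexableLoader.initRightLoader() above.
    public final class CredentialsUtil {
      private CredentialsUtil() {}

      public static Configuration withJobCredentials(Configuration conf) {
        String tokenFile = System.getenv("HADOOP_TOKEN_FILE_LOCATION");
        if (tokenFile != null) {
          conf.set("mapreduce.job.credentials.binary", tokenFile);
        }
        return conf;
      }
    }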
[Pig Wiki] Update of Pig070LoadStoreHowTo by PradeepKamath
Dear Wiki user, You have subscribed to a wiki page or wiki category on Pig Wiki for change notification. The Pig070LoadStoreHowTo page has been changed by PradeepKamath. http://wiki.apache.org/pig/Pig070LoadStoreHowTo?action=diff&rev1=12&rev2=13 -- * [[http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/LoadCaster.java?view=markup | LoadCaster]] has methods to convert byte arrays to specific types. A loader implementation should implement this if casts (implicit or explicit) from !DataByteArray fields to other types need to be supported. The !LoadFunc abstract class is the main class to extend for implementing a loader. The methods which need to be overridden are explained below: - * getInputFormat() :This method will be called by Pig to get the !InputFormat used by the loader. The methods in the !InputFormat (and underlying !RecordReader) will be called by pig in the same manner (and in the same context) as by Hadoop in a map-reduce java program. If the !InputFormat is a hadoop packaged one, the implementation should use the new API based one under org.apache.hadoop.mapreduce. If it is a custom !InputFormat, it should be implemented using the new API in org.apache.hadoop.mapreduce. + * getInputFormat() :This method will be called by Pig to get the !InputFormat used by the loader. The methods in the !InputFormat (and underlying !RecordReader) will be called by pig in the same manner (and in the same context) as by Hadoop in a map-reduce java program. If the !InputFormat is a hadoop packaged one, the implementation should use the new API based one under org.apache.hadoop.mapreduce. If it is a custom !InputFormat, it should be implemented using the new API in org.apache.hadoop.mapreduce. If a custom loader using a text-based !InputFormat or a file based !InputFormat would like to read files in all subdirectories under a given input directory recursively, then it should use the !PigFileInputFormat and !PigTextInputFormat classes provided in org.apache.pig.backend.hadoop.executionengine.mapReduceLayer. This is to work around the current limitation in Hadoop's !TextInputFormat and !FileInputFormat which only read one level down from provided input directory. So for example if the input in the load statement is 'dir1' and there are subdirs 'dir2' and 'dir2/dir3' underneath dir1, using Hadoop's !TextInputFormat or !FileInputFormat only files under 'dir1' can be read. Using !PigFileInputFormat or !PigTextInputFormat (or by extending them), files in all the directories can be read. + Changes to custom Load Functions: Low to medium. This is to get around the problem of MAPREDUCE-1577. * setLocation() :This method is called by Pig to communicate the load location to the loader. The loader should use this method to communicate the same information to the underlying !InputFormat. This method is called multiple times by pig - implementations should bear this in mind and should ensure there are no inconsistent side effects due to the multiple calls. * prepareToRead() : Through this method the !RecordReader associated with the !InputFormat provided by the !LoadFunc is passed to the !LoadFunc. The !RecordReader can then be used by the implementation in getNext() to return a tuple representing a record of data back to pig. * getNext() :The meaning of getNext() has not changed and is called by Pig runtime to get the next tuple in the data - in this method the implementation should use the underlying !RecordReader and construct the tuple to return.
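The methods above map directly onto a small loader. A minimal sketch (ours, not from the wiki page) that wraps Hadoop's new-API TextInputFormat and returns one single-field tuple per input line; error handling and byte-array casting are omitted:

    import java.io.IOException;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.InputFormat;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.RecordReader;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.pig.LoadFunc;
    import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
    import org.apache.pig.data.Tuple;
    import org.apache.pig.data.TupleFactory;

    public class SimpleTextLoader extends LoadFunc {
      private RecordReader<LongWritable, Text> reader;
      private final TupleFactory tupleFactory = TupleFactory.getInstance();

      @Override
      public InputFormat getInputFormat() throws IOException {
        return new TextInputFormat(); // a Hadoop-packaged, new-API InputFormat
      }

      @Override
      public void setLocation(String location, Job job) throws IOException {
        // Forward the load location to the InputFormat; setInputPaths is
        // safe to call on each of the multiple setLocation() invocations.
        FileInputFormat.setInputPaths(job, location);
      }

      @SuppressWarnings("unchecked")
      @Override
      public void prepareToRead(RecordReader reader, PigSplit split) {
        this.reader = reader;
      }

      @Override
      public Tuple getNext() throws IOException {
        try {
          if (!reader.nextKeyValue()) return null; // end of this split
          Tuple t = tupleFactory.newTuple(1);
          t.set(0, reader.getCurrentValue().toString());
          return t;
        } catch (InterruptedException e) {
          throw new IOException(e);
        }
      }
    }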
[Pig Wiki] Update of LoadStoreMigrationGuide by PradeepKamath
Dear Wiki user, You have subscribed to a wiki page or wiki category on Pig Wiki for change notification. The LoadStoreMigrationGuide page has been changed by PradeepKamath. http://wiki.apache.org/pig/LoadStoreMigrationGuide?action=diff&rev1=36&rev2=37 -- ||No equivalent method ||relativeToAbsolutePath() ||!LoadFunc ||Pig runtime will call this method to allow the Loader to convert a relative load location to an absolute location. The default implementation provided in !LoadFunc handles this for !FileSystem locations. If the load source is something else, loader implementation may choose to override this. || ||determineSchema() ||getSchema() ||!LoadMetadata ||determineSchema() was used by old code to ask the loader to provide a schema for the data returned by it - the same semantics are now achieved through getSchema() of the !LoadMetadata interface. !LoadMetadata is an optional interface for loaders to implement - if a loader does not implement it, this will indicate to the pig runtime that the loader cannot return a schema for the data || ||fieldsToRead() ||pushProjection() ||!LoadPushDown ||fieldsToRead() was used by old code to convey to the loader the exact fields required by the pig script - the same semantics are now achieved through pushProjection() of the !LoadPushDown interface. !LoadPushDown is an optional interface for loaders to implement - if a loader does not implement it, this will indicate to the pig runtime that the loader is not capable of returning just the required fields and will return all fields in the data. If a loader implementation is able to efficiently return only required fields, it should implement !LoadPushDown to improve query performance || - ||No equivalent method ||getInputFormat() ||!LoadFunc ||This method will be called by Pig to get the !InputFormat used by the loader. The methods in the !InputFormat (and underlying !RecordReader) will be called by pig in the same manner (and in the same context) as by Hadoop in a map-reduce java program. '''If the !InputFormat is a hadoop packaged one, the implementation should use the new API based one under org.apache.hadoop.mapreduce. If it is a custom !InputFormat, it should be implemented using the new API in org.apache.hadoop.mapreduce'''|| + ||No equivalent method ||getInputFormat() ||!LoadFunc ||This method will be called by Pig to get the !InputFormat used by the loader. The methods in the !InputFormat (and underlying !RecordReader) will be called by pig in the same manner (and in the same context) as by Hadoop in a map-reduce java program. '''If the !InputFormat is a hadoop packaged one, the implementation should use the new API based one under org.apache.hadoop.mapreduce. If it is a custom !InputFormat, it should be implemented using the new API in org.apache.hadoop.mapreduce'''. If a custom loader using a text-based InputFormat or a file based InputFormat would like to read files in all subdirectories under a given input directory recursively, then it should use the PigFileInputFormat and PigTextInputFormat classes provided in org.apache.pig.backend.hadoop.executionengine.mapReduceLayer. This is to work around the current limitation in Hadoop's TextInputFormat and FileInputFormat which only read one level down from provided input directory. So for example if the input in the load statement is 'dir1' and there are subdirs 'dir2' and 'dir2/dir3' underneath dir1, using Hadoop's TextInputFormat or FileInputFormat only files under 'dir1' can be read.
Using PigFileInputFormat or PigTextInputFormat (or by extending them), files in all the directories can be read.|| ||No equivalent method ||setLocation() ||!LoadFunc ||This method is called by Pig to communicate the load location to the loader. The loader should use this method to communicate the same information to the underlying !InputFormat. This method is called multiple times by pig - implementations should bear this in mind and should ensure there are no inconsistent side effects due to the multiple calls. || ||bindTo() ||prepareToRead() ||!LoadFunc ||bindTo() was the old method which would provide an !InputStream among other things to the !LoadFunc. The !LoadFunc implementation would then read from the !InputStream in getNext(). In the new API, reading of the data is through the !InputFormat provided by the !LoadFunc. So the equivalent call is prepareToRead() wherein the !RecordReader associated with the !InputFormat provided by the !LoadFunc is passed to the !LoadFunc. The !RecordReader can then be used by the implementation in getNext() to return a tuple representing a record of data back to pig. || ||getNext() ||getNext() ||!LoadFunc ||The meaning of getNext() has not changed and is called by Pig runtime to get the next tuple in the data - in the new API, this is the method wherein the implementation will use the
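For the optional LoadPushDown row above, a minimal new-API sketch; the class name is ours, the interface members are as the table describes, and a real loader would implement the LoadFunc methods as well:

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;
    import org.apache.pig.LoadPushDown;
    import org.apache.pig.impl.logicalLayer.FrontendException;

    // Records which field indexes the script needs so that getNext()
    // (omitted here) can emit only those fields.
    public class ProjectionAwareLoader implements LoadPushDown {
      private List<Integer> requiredIndexes; // null means "all fields"

      @Override
      public List<OperatorSet> getFeatures() {
        return Arrays.asList(OperatorSet.PROJECTION);
      }

      @Override
      public RequiredFieldResponse pushProjection(RequiredFieldList requiredFields)
          throws FrontendException {
        requiredIndexes = new ArrayList<Integer>();
        for (RequiredField f : requiredFields.getFields()) {
          requiredIndexes.add(f.getIndex());
        }
        // true = request honoured; pig will expect only these fields back
        return new RequiredFieldResponse(true);
      }
    }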
[Pig Wiki] Update of owl by jaytang
Dear Wiki user, You have subscribed to a wiki page or wiki category on Pig Wiki for change notification. The owl page has been changed by jaytang. http://wiki.apache.org/pig/owl?action=diff&rev1=3&rev2=4 -- = Apache Owl Wiki = - The goal of Owl (a.k.a. Hadoop metadata system) is to allow users and applications to register data stored on HDFS, search for the data available on HDFS, and associate metadata such as schema, statistics, etc. with a particular data unit or a data set stored on HDFS. The initial goal is to provide a fairly generic, low level abstraction that any user or application on HDFS can use to store and retrieve metadata. Over time higher level abstractions closely tied to particular applications or tools can be developed. + The goal of Owl is to provide a higher level data management abstraction than that provided by HDFS directories and files. Applications written in MapReduce and Pig scripts must deal with low level data management issues such as storage format, serialization/compression schemes, data layout, and efficient data access paths, often with different solutions. Owl attempts to provide a standard way to address this issue. - Please refer to this document for more detailed [[http://wiki.apache.org/pig/Metadata|use case, architecture, data model]] + Owl supports the notion of Owl Tables, a basic unit of data management. An Owl Table has these characteristics: + +* lives in an Owl database name space and could contain multiple partitions +* has columns and rows and supports a unified table level schema +* supports MapReduce and Pig Latin and potentially other languages +* designed for batch read/write operations +* supports external tables (data already exists on file system) +* pluggable architecture for different storage formats such as Zebra +* presents a logically partitioned view of data organization +* efficient data access mechanisms via partition and projection pruning + + + Owl supports two major public APIs. Owl Driver provides management APIs against Owl Table, Owl Database, and Partition. This API is backed by an internal Owl metadata store that runs on Tomcat and a relational database. OwlInputFormat provides a data access API and is modeled after the traditional Hadoop InputFormat. In the future, we plan to support OwlOutputFormat and thus the notion of Owl Managed Table where Owl controls the data flow into and out of Owl Tables. Owl supports Pig integration with OwlPigLoader/Storer module. == Prerequisite == - Owl has no dependency on the release of Hadoop and Pig + Owl depends on Pig for its tuple classes as a basic unit of data container, and Hadoop 20 for OwlInputFormat. Owl supports Zebra integration out of the box. == Getting Owl == @@ -26, +38 @@ * JDK 1.6 * Ant 1.7.1 * download [[http://dev.mysql.com/downloads/connector/j/5.1.html|MySQL 5.1 JDBC driver]] +* Oracle How to compile * check out latest PIG trunk +* compile Pig * cd contrib/owl * copy MySQL JDBC driver to contrib/owl/java/lib directory * ant war (build owl web application) @@ -40, +54 @@ For development environment, Owl supports jetty 7.0 (with jetty-runner) and derby 10.5.
For production deployment, Owl supports: * Tomcat 6.0 -* MySQL 5.1 +* MySQL 5.1 or Oracle 11g After installing Tomcat and MySQL, you will need these files: -* owl.war - owl web application +* owl-0.x.x.war - owl web application +* owl-0.x.x.jar - owl client library (OwlInputFormat and OwlDriver) with all their dependent 3rd party libs * mysql_schema.sql - owl database schema file at contrib/owl/setup/mysql * owlServerConfig.xml - owl server configuration file at contrib/owl/setup/mysql @@ -58, +73 @@ == Sample Code == - Owl comes with a Java-based client. Client API Javadoc is at: + Owl comes with a Java-based client. Client API Javadoc is at... These two key packages contain the public APIs for Owl's main features: org.apache.hadoop.owl.client and org.apache.hadoop.owl.mapreduce Sample code for writing a client application against owl is attached:
svn commit: r926404 - in /hadoop/pig/trunk: CHANGES.txt build.xml
Author: pradeepkth Date: Mon Mar 22 23:50:48 2010 New Revision: 926404 URL: http://svn.apache.org/viewvc?rev=926404&view=rev Log: Provide a way to exclude a testcase when running ant test (pradeepkth) Modified: hadoop/pig/trunk/CHANGES.txt hadoop/pig/trunk/build.xml Modified: hadoop/pig/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=926404&r1=926403&r2=926404&view=diff == --- hadoop/pig/trunk/CHANGES.txt (original) +++ hadoop/pig/trunk/CHANGES.txt Mon Mar 22 23:50:48 2010 @@ -68,6 +68,9 @@ manner (rding via pradeepkth) IMPROVEMENTS +PIG-1325: Provide a way to exclude a testcase when running ant test +(pradeepkth) + PIG-1312: Make Pig work with hadoop security (daijy) PIG-1308: Infinite loop in JobClient when reading from BinStorage Message: Modified: hadoop/pig/trunk/build.xml URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/build.xml?rev=926404&r1=926403&r2=926404&view=diff == --- hadoop/pig/trunk/build.xml (original) +++ hadoop/pig/trunk/build.xml Mon Mar 22 23:50:48 2010 @@ -543,6 +543,7 @@ <exclude name="**/TestOrderBy2.java" /> <exclude name="**/TestPi.java" /> <exclude name="**/nightly/**" /> +<exclude name="**/${exclude.testcase}.java" if="exclude.testcase" /> </fileset> </batchtest> <batchtest fork="yes" todir="${test.log.dir}" if="testcase"
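With this change a single test class can be skipped from the command line by setting the property the new exclude references, e.g. ant test -Dexclude.testcase=TestFoo (the class name here is only a placeholder); when exclude.testcase is unset, the if= guard leaves the fileset unchanged.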