svn commit: r901317 - in /hadoop/pig/branches/branch-0.6: CHANGES.txt src/org/apache/pig/impl/logicalLayer/LOCast.java src/org/apache/pig/impl/logicalLayer/optimizer/SchemaRemover.java test/org/apache
Author: gates Date: Wed Jan 20 18:34:41 2010 New Revision: 901317 URL: http://svn.apache.org/viewvc?rev=901317view=rev Log: PIG-1191: POCast throws exception for certain sequences of LOAD, FILTER, FORACH. Checking in for Pradeep since he is out. Modified: hadoop/pig/branches/branch-0.6/CHANGES.txt hadoop/pig/branches/branch-0.6/src/org/apache/pig/impl/logicalLayer/LOCast.java hadoop/pig/branches/branch-0.6/src/org/apache/pig/impl/logicalLayer/optimizer/SchemaRemover.java hadoop/pig/branches/branch-0.6/test/org/apache/pig/test/TestTypeCheckingValidator.java Modified: hadoop/pig/branches/branch-0.6/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/CHANGES.txt?rev=901317r1=901316r2=901317view=diff == --- hadoop/pig/branches/branch-0.6/CHANGES.txt (original) +++ hadoop/pig/branches/branch-0.6/CHANGES.txt Wed Jan 20 18:34:41 2010 @@ -145,6 +145,9 @@ BUG FIXES +PIG-1191: POCast throws exception for certain sequences of LOAD, FILTER, +FORACH (pradeepkth via gates) + PIG-1143: Poisson Sample Loader should compute the number of samples required only once (sriranjan via olgan) Modified: hadoop/pig/branches/branch-0.6/src/org/apache/pig/impl/logicalLayer/LOCast.java URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/src/org/apache/pig/impl/logicalLayer/LOCast.java?rev=901317r1=901316r2=901317view=diff == --- hadoop/pig/branches/branch-0.6/src/org/apache/pig/impl/logicalLayer/LOCast.java (original) +++ hadoop/pig/branches/branch-0.6/src/org/apache/pig/impl/logicalLayer/LOCast.java Wed Jan 20 18:34:41 2010 @@ -26,6 +26,7 @@ import org.apache.pig.impl.plan.PlanVisitor; import org.apache.pig.impl.plan.VisitorException; import org.apache.pig.impl.logicalLayer.schema.Schema; +import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; import org.apache.pig.data.DataType; public class LOCast extends ExpressionOperator { @@ -34,6 +35,11 @@ private static final long serialVersionUID = 2L; private FuncSpec mLoadFuncSpec = null; +// store 
field schema representing the schema +// in user specified casts -this is so that if +// field schema is unset and then getFieldSchema is called we still +// rebuild the fieldschema correctly as specified by the user in the script +private FieldSchema userSpecifiedFieldSchema; /** * @@ -65,11 +71,22 @@ public Schema getSchema() { return mSchema; } + + +@Override +public void setFieldSchema(FieldSchema fs) throws FrontendException { +super.setFieldSchema(fs); +userSpecifiedFieldSchema = new Schema.FieldSchema(fs); +} @Override public Schema.FieldSchema getFieldSchema() throws FrontendException { if(!mIsFieldSchemaComputed) { -mFieldSchema = new Schema.FieldSchema(null, mType); +if(userSpecifiedFieldSchema != null) { +mFieldSchema = userSpecifiedFieldSchema; +} else { +mFieldSchema = new Schema.FieldSchema(null, mType); +} Schema.FieldSchema parFs = getExpression().getFieldSchema(); String canonicalName = (parFs != null ? parFs.canonicalName : null); mFieldSchema.setParent(canonicalName, getExpression()); Modified: hadoop/pig/branches/branch-0.6/src/org/apache/pig/impl/logicalLayer/optimizer/SchemaRemover.java URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/src/org/apache/pig/impl/logicalLayer/optimizer/SchemaRemover.java?rev=901317r1=901316r2=901317view=diff == --- hadoop/pig/branches/branch-0.6/src/org/apache/pig/impl/logicalLayer/optimizer/SchemaRemover.java (original) +++ hadoop/pig/branches/branch-0.6/src/org/apache/pig/impl/logicalLayer/optimizer/SchemaRemover.java Wed Jan 20 18:34:41 2010 @@ -37,6 +37,7 @@ *the logical binary expression operator that has to be visited * @throws VisitorException */ +@Override protected void visit(BinaryExpressionOperator binOp) throws VisitorException { binOp.unsetFieldSchema(); @@ -49,6 +50,7 @@ *the logical unary operator that has to be visited * @throws VisitorException */ +@Override protected void visit(UnaryExpressionOperator uniOp) throws VisitorException { uniOp.unsetFieldSchema(); super.visit(uniOp); @@ 
-60,6 +62,7 @@ *the logical cogroup operator that has to be visited * @throws VisitorException */ +@Override protected void visit(LOCogroup cg) throws VisitorException { cg.unsetSchema(); super.visit(cg); @@ -71,6 +74,7 @@ *the logical sort operator that has to be visited
svn commit: r901333 - in /hadoop/pig/branches/branch-0.6: ./ src/docs/src/documentation/content/xdocs/
Author: olga Date: Wed Jan 20 19:03:57 2010 New Revision: 901333 URL: http://svn.apache.org/viewvc?rev=901333&view=rev Log: PIG-1192: Pig 0.6 Docs fixes (chandec via olgan) Modified: hadoop/pig/branches/branch-0.6/CHANGES.txt hadoop/pig/branches/branch-0.6/src/docs/src/documentation/content/xdocs/cookbook.xml hadoop/pig/branches/branch-0.6/src/docs/src/documentation/content/xdocs/index.xml hadoop/pig/branches/branch-0.6/src/docs/src/documentation/content/xdocs/setup.xml hadoop/pig/branches/branch-0.6/src/docs/src/documentation/content/xdocs/site.xml hadoop/pig/branches/branch-0.6/src/docs/src/documentation/content/xdocs/zebra_pig.xml hadoop/pig/branches/branch-0.6/src/docs/src/documentation/content/xdocs/zebra_users.xml Modified: hadoop/pig/branches/branch-0.6/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/CHANGES.txt?rev=901333&r1=901332&r2=901333&view=diff == --- hadoop/pig/branches/branch-0.6/CHANGES.txt (original) +++ hadoop/pig/branches/branch-0.6/CHANGES.txt Wed Jan 20 19:03:57 2010 @@ -26,6 +26,8 @@ IMPROVEMENTS +PIG-1192: Pig 0.6 Docs fixes (chandec via olgan) + PIG-1177: Pig 0.6 Docs - Zebra docs (chandec via olgan) PIG-1175: Pig 0.6 Docs - Store v. Dump (chandec via olgan) Modified: hadoop/pig/branches/branch-0.6/src/docs/src/documentation/content/xdocs/cookbook.xml URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/src/docs/src/documentation/content/xdocs/cookbook.xml?rev=901333&r1=901332&r2=901333&view=diff == --- hadoop/pig/branches/branch-0.6/src/docs/src/documentation/content/xdocs/cookbook.xml (original) +++ hadoop/pig/branches/branch-0.6/src/docs/src/documentation/content/xdocs/cookbook.xml Wed Jan 20 19:03:57 2010 @@ -36,7 +36,7 @@ section titleUse Optimization/title -pPig supports various a href=piglatin_users.html#Optimization+Rulesoptimization rules/a which are turned on by default. +pPig supports various a href=piglatin_ref1.html#Optimization+Rulesoptimization rules/a which are turned on by default. 
Become familiar with these rules./p /section @@ -220,29 +220,34 @@ section titleSpecialized Join Optimizations/title pOptimization can also be achieved using fragment replicate joins, skewed joins, and merge joins. -For more information see a href=piglatin_users.html#Specialized+JoinsSpecialized Joins/a./p +For more information see a href=piglatin_ref1.html#Specialized+JoinsSpecialized Joins/a./p /section /section section -titleUse the PARALLEL Keyword/title +titleUse the PARALLEL Clause/title -pPARALLEL controls the number of reducers invoked by Hadoop. The default value is 1. However, the number of reducers you need for a particular construct in Pig that forms a MapReduce boundary depends entirely on (1) your data and the number of intermediate keys you are generating in your mappers and (2) the partitioner and distribution of map (combiner) output keys. In the best cases we have seen that a reducer processing about 500 MB of data behaves efficiently./p +pUse the PARALLEL clause to increase the parallelism of a job:/p +ul +liPARALLEL sets the number of reduce tasks for the MapReduce jobs generated by Pig. The default value is 1 (one reduce task)./li +liPARALLEL only affects the number of reduce tasks. Map parallelism is determined by the input file, one map for each HDFS block. /li +liIf you don't specify PARALLEL, you still get the same map parallelism but only one reduce task./li +/ul +p/p +pAs noted, the default value for PARALLEL is 1 (one reduce task). However, the number of reducers you need for a particular construct in Pig that forms a MapReduce boundary depends entirely on (1) your data and the number of intermediate keys you are generating in your mappers and (2) the partitioner and distribution of map (combiner) output keys. In the best cases we have seen that a reducer processing about 500 MB of data behaves efficiently./p -pThe keyword makes sense with any operator that starts a reduce phase. 
This includes -a href=piglatin_reference.html#COGROUPCOGROUP/a, -a href=piglatin_reference.html#CROSSCROSS/a, -a href=piglatin_reference.html#DISTINCTDISTINCT/a, -a href=piglatin_reference.html#GROUPGROUP/a, -a href=piglatin_reference.html#JOINJOIN/a, -a href=piglatin_reference.html#ORDERORDER/a, and -a href=piglatin_reference.html#JOIN%2C+OUTEROUTER JOIN/a. - -/p - -pYou can set the value of PARALLEL in your scripts in conjunction with the operator (see the example below). You can also set the value of PARALLEL for all scripts using the a href=piglatin_reference.html#setset/a command./p +pYou can include the PARALLEL clause with any operator that starts a reduce phase (see the example below). This includes +a href=piglatin_ref2.html#COGROUPCOGROUP/a, +a
svn commit: r901360 [2/2] - in /hadoop/pig/branches/load-store-redesign: contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/math/ contrib/piggybank/java/src/main/java/org/apache/
Modified: hadoop/pig/branches/load-store-redesign/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/PigStorageSchema.java URL: http://svn.apache.org/viewvc/hadoop/pig/branches/load-store-redesign/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/PigStorageSchema.java?rev=901360r1=901359r2=901360view=diff == --- hadoop/pig/branches/load-store-redesign/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/PigStorageSchema.java (original) +++ hadoop/pig/branches/load-store-redesign/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/PigStorageSchema.java Wed Jan 20 20:08:28 2010 @@ -20,27 +20,13 @@ import java.io.IOException; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.mapred.JobConf; -import org.apache.pig.ExecType; -import org.apache.pig.experimental.JsonMetadata; -import org.apache.pig.experimental.LoadMetadata; -import org.apache.pig.experimental.StoreMetadata; -import org.apache.pig.experimental.ResourceSchema; -import org.apache.pig.experimental.ResourceStatistics; -import org.apache.pig.StoreConfig; -import org.apache.pig.backend.datastorage.DataStorage; -import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil; -import org.apache.pig.backend.hadoop.datastorage.HDataStorage; -import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigMapReduce; -import org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil; +import org.apache.pig.Expression; +import org.apache.pig.LoadMetadata; +import org.apache.pig.ResourceSchema; +import org.apache.pig.ResourceStatistics; +import org.apache.pig.StoreMetadata; import org.apache.pig.builtin.PigStorage; -import org.apache.pig.data.DataType; -import org.apache.pig.impl.io.FileLocalizer; -import org.apache.pig.impl.logicalLayer.schema.Schema; -import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; /** * 
This Load/Store Func reads/writes metafiles that allow the schema and @@ -54,9 +40,7 @@ * Due to StoreFunc limitations, you can only write the metafiles in MapReduce * mode. You can read them in Local or MapReduce mode. */ -public class PigStorageSchema extends PigStorage implements StoreMetadata { - -private static final Log log = LogFactory.getLog(PigStorageSchema.class); +public class PigStorageSchema extends PigStorage implements LoadMetadata, StoreMetadata { public PigStorageSchema() { super(); @@ -65,61 +49,50 @@ public PigStorageSchema(String delim) { super(delim); } + +// +// Implementation of LoadMetaData interface @Override -public Schema determineSchema(String fileName, ExecType execType, -DataStorage storage) throws IOException { +public ResourceSchema getSchema(String location, +Configuration conf) throws IOException { +return (new JsonMetadata()).getSchema(location, conf); +} + +@Override +public ResourceStatistics getStatistics(String location, +Configuration conf) throws IOException { +return null; +} -// TODO fullPath should be retrieved ia relativeToAbsolutePath once PIG-966 is complete -String fullPath = FileLocalizer.fullPath(fileName, storage); -LoadMetadata metadataLoader = new JsonMetadata(fullPath, storage); -ResourceSchema resourceSchema = metadataLoader.getSchema(fullPath, null); -if (resourceSchema == null) { -return null; -} -Schema pigSchema = new Schema(); -for (ResourceSchema.ResourceFieldSchema field : resourceSchema.getFields()) { -FieldSchema pigFieldSchema = DataType.determineFieldSchema(field); -// determineFieldSchema only sets the types. we also want the aliases. 
-// TODO this doesn't work properly for complex types -pigFieldSchema.alias = field.getName(); -pigSchema.add(pigFieldSchema); -} -log.info(Loaded Schema: +pigSchema); -return pigSchema; +@Override +public void setPartitionFilter(Expression partitionFilter) +throws IOException { +} + +@Override +public String[] getPartitionKeys(String location, Configuration conf) +throws IOException { +return null; } +// +// Implementation of StoreMetadata + @Override -public void finish() throws IOException { -super.finish(); -JobConf jobConf = PigMapReduce.sJobConf; -if(jobConf != null){ -StoreConfig storeConfig = MapRedUtil.getStoreConfig(jobConf); -DataStorage store = new
svn commit: r901380 - /hadoop/pig/branches/load-store-redesign/src/org/apache/pig/StoreMetadata.java
Author: rding Date: Wed Jan 20 20:59:46 2010 New Revision: 901380 URL: http://svn.apache.org/viewvc?rev=901380&view=rev Log: PIG-1090: Update sources to reflect recent changes in load-store interfaces Added: hadoop/pig/branches/load-store-redesign/src/org/apache/pig/StoreMetadata.java Added: hadoop/pig/branches/load-store-redesign/src/org/apache/pig/StoreMetadata.java URL: http://svn.apache.org/viewvc/hadoop/pig/branches/load-store-redesign/src/org/apache/pig/StoreMetadata.java?rev=901380&view=auto == --- hadoop/pig/branches/load-store-redesign/src/org/apache/pig/StoreMetadata.java (added) +++ hadoop/pig/branches/load-store-redesign/src/org/apache/pig/StoreMetadata.java Wed Jan 20 20:59:46 2010 @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.pig; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.pig.ResourceSchema; +import org.apache.pig.ResourceStatistics; + +/** + * This interface defines how to write metadata related to data to be loaded. + * If a given store function does not implement this interface, it will be assumed that it + * is unable to record metadata about the associated data. 
+ */ + +public interface StoreMetadata { + +/** + * Store statistics about the data being written. + * + * @throws IOException + */ +void storeStatistics(ResourceStatistics stats, String location, Configuration conf) throws IOException; + +/** + * Store schema of the data being written + * + * @throws IOException + */ +void storeSchema(ResourceSchema schema, String location, Configuration conf) throws IOException; +}