Author: daijy Date: Mon Apr 12 21:06:19 2010 New Revision: 933415 URL: http://svn.apache.org/viewvc?rev=933415&view=rev Log: PIG-1361: [Zebra] Zebra TableLoader.getSchema() should return the projectionSchema specified in the constructor of TableLoader instead of pruned projection by pig
Modified: hadoop/pig/trunk/contrib/zebra/CHANGES.txt hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/pig/TableLoader.java Modified: hadoop/pig/trunk/contrib/zebra/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/zebra/CHANGES.txt?rev=933415&r1=933414&r2=933415&view=diff ============================================================================== --- hadoop/pig/trunk/contrib/zebra/CHANGES.txt (original) +++ hadoop/pig/trunk/contrib/zebra/CHANGES.txt Mon Apr 12 21:06:19 2010 @@ -18,6 +18,8 @@ Trunk (unreleased changes) IMPROVEMENTS + PIG-1361 Zebra TableLoader.getSchema() should return the projectionSchema specified in the constructor of TableLoader instead of pruned projection by pig (gauravj via daijy) + PIG-1291 Support of virtual column "source_table" on unsorted table (yanz) PIG-1315 Implementing OrderedLoadFunc interface for Zebra TableLoader (xuefux via yanz) Modified: hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/pig/TableLoader.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/pig/TableLoader.java?rev=933415&r1=933414&r2=933415&view=diff ============================================================================== --- hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/pig/TableLoader.java (original) +++ hadoop/pig/trunk/contrib/zebra/src/java/org/apache/hadoop/zebra/pig/TableLoader.java Mon Apr 12 21:06:19 2010 @@ -158,6 +158,7 @@ public class TableLoader extends LoadFun schema = TableInputFormat.getSchema( job ); sorted = true; + setSortOrder( job ); setProjection( job ); try { @@ -169,36 +170,48 @@ public class TableLoader extends LoadFun } } + /** - * This method does more than set projection. For instance, it also try to grab sorting info if required. + * it processes sortedness of table . 
+ * + * @param job + * @throws IOException + */ + private void setSortOrder(Job job) throws IOException { + Properties properties = UDFContext.getUDFContext().getUDFProperties( + this.getClass(), new String[]{ udfContextSignature } ); + boolean requireGlobalOrder = "true".equals(properties.getProperty( UDFCONTEXT_GLOBAL_SORTING)); + if (requireGlobalOrder && !sorted) + throw new IOException("Global sorting can be only asked on table loaded as sorted"); + if( sorted ) { + SplitMode splitMode = + requireGlobalOrder ? SplitMode.GLOBALLY_SORTED : SplitMode.LOCALLY_SORTED; + TableInputFormat.setSplitMode(job, splitMode, null); + sortInfo = TableInputFormat.getSortInfo( job ); + } + } + + + /** + * This method sets projection. * * @param job * @throws IOException */ private void setProjection(Job job) throws IOException { Properties properties = UDFContext.getUDFContext().getUDFProperties( - this.getClass(), new String[]{ udfContextSignature } ); - boolean requireGlobalOrder = "true".equals(properties.getProperty( UDFCONTEXT_GLOBAL_SORTING)); - if (requireGlobalOrder && !sorted) - throw new IOException("Global sorting can be only asked on table loaded as sorted"); - if( sorted ) { - SplitMode splitMode = - requireGlobalOrder ? 
SplitMode.GLOBALLY_SORTED : SplitMode.LOCALLY_SORTED; - TableInputFormat.setSplitMode(job, splitMode, null); - sortInfo = TableInputFormat.getSortInfo( job ); - } - - try { - String prunedProjStr = properties.getProperty( UDFCONTEXT_PROJ_STRING ); - - if( prunedProjStr != null ) { - TableInputFormat.setProjection( job, prunedProjStr ); - } else if( projectionString != null ) { - TableInputFormat.setProjection( job, projectionString ); - } - } catch (ParseException ex) { - throw new IOException( "Schema parsing failed : " + ex.getMessage() ); - } + this.getClass(), new String[]{ udfContextSignature } ); + try { + String prunedProjStr = properties.getProperty( UDFCONTEXT_PROJ_STRING ); + + if( prunedProjStr != null ) { + TableInputFormat.setProjection( job, prunedProjStr ); + } else if( projectionString != null ) { + TableInputFormat.setProjection( job, projectionString ); + } + } catch (ParseException ex) { + throw new IOException( "Schema parsing failed : " + ex.getMessage() ); + } } private KeyGenerator makeKeyBuilder(byte[] elems) { @@ -283,6 +296,7 @@ public class TableLoader extends LoadFun // The following obviously goes beyond of set location, but this is the only place that we // can do and it's suggested by Pig team. + setSortOrder( job ); setProjection( job ); } @@ -318,10 +332,25 @@ public class TableLoader extends LoadFun } } - setProjection( job ); + // This is needed as it does a check if an unsorted table is loaded as sorted + // It fails if unsorted table is loaded as sorted + setSortOrder( job ); + + /* + As per pig team any changes to this job object will be thrown away. + getSchema is needed to return the projectionSchema for the projection + string specified in TableLoader constructor. So, projectionString is used + here. However, setLocation() calls setProjection() because that is called after + projectionPruning and needs to read projection string from UDFCONTEXT + That also sets/calls TableInputFormat.setProjection(job, $prunedProj). 
But the + job object here in getSchema() is a different copy from setLocation() and hence + the changes will not be overridden as per PIG TEAM. + */ projectionSchema = tableSchema; try { + if(projectionString != null) + TableInputFormat.setProjection(job, projectionString); Projection projection = new org.apache.hadoop.zebra.types.Projection( tableSchema, TableInputFormat.getProjection( job ) ); projectionSchema = projection.getProjectionSchema();