Author: pradeepkth Date: Tue Jul 7 17:26:13 2009 New Revision: 791916 URL: http://svn.apache.org/viewvc?rev=791916&view=rev Log: PIG-820: additional fixes to original patch (Ashutosh Chauhan via pradeepkth)
Modified: hadoop/pig/trunk/src/org/apache/pig/SamplableLoader.java hadoop/pig/trunk/src/org/apache/pig/builtin/BinStorage.java hadoop/pig/trunk/src/org/apache/pig/builtin/PigStorage.java hadoop/pig/trunk/src/org/apache/pig/impl/builtin/RandomSampleLoader.java Modified: hadoop/pig/trunk/src/org/apache/pig/SamplableLoader.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/SamplableLoader.java?rev=791916&r1=791915&r2=791916&view=diff ============================================================================== --- hadoop/pig/trunk/src/org/apache/pig/SamplableLoader.java (original) +++ hadoop/pig/trunk/src/org/apache/pig/SamplableLoader.java Tue Jul 7 17:26:13 2009 @@ -20,6 +20,8 @@ import java.io.IOException; +import org.apache.pig.data.Tuple; + /** * Implementing this interface indicates to Pig that a given loader can be * used by a sampling loader. The requirement for this is that the loader @@ -46,4 +48,16 @@ * @return position in the stream. */ public long getPosition() throws IOException; + + /** + * Get the next tuple from the stream starting from the current + * read position. + * The loader implementation should not assume that current read position + * in the stream is at the beginning of a record since this method is called + * for sampling and the current read position in the stream could be anywhere + * in the stream. + * @return the next tuple from underlying input stream or null if there are no more tuples + * to be processed. + */ + public Tuple getSampledTuple() throws IOException; } Modified: hadoop/pig/trunk/src/org/apache/pig/builtin/BinStorage.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/BinStorage.java?rev=791916&r1=791915&r2=791916&view=diff ============================================================================== --- hadoop/pig/trunk/src/org/apache/pig/builtin/BinStorage.java (original) +++ hadoop/pig/trunk/src/org/apache/pig/builtin/BinStorage.java Tue Jul 7 17:26:13 2009 @@ -414,4 +414,9 @@ // TODO Auto-generated method stub return null; } + + @Override + public Tuple getSampledTuple() throws IOException { + return this.getNext(); + } } Modified: hadoop/pig/trunk/src/org/apache/pig/builtin/PigStorage.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/PigStorage.java?rev=791916&r1=791915&r2=791916&view=diff ============================================================================== --- hadoop/pig/trunk/src/org/apache/pig/builtin/PigStorage.java (original) +++ hadoop/pig/trunk/src/org/apache/pig/builtin/PigStorage.java Tue Jul 7 17:26:13 2009 @@ -59,6 +59,7 @@ private static final int OS_UNIX = 0; private static final int OS_WINDOWS = 1; private static final String UTF8 = "UTF-8"; + private Byte prevByte = null; public PigStorage() { os = OS_UNIX; @@ -104,7 +105,13 @@ @Override public long skip(long n) throws IOException { - return in.skip(n); + + long skipped = in.skip(n-1); + prevByte = (byte)in.read(); + if(prevByte == -1) // End of stream. + return skipped; + else + return skipped+1; } public Tuple getNext() throws IOException { @@ -136,6 +143,20 @@ } } + @Override + public Tuple getSampledTuple() throws IOException { + + if(prevByte == null || prevByte == recordDel) + // prevByte = null when this is called for the first time, in that case bindTo would have already + // called getNext() if it was required. + return getNext(); + + else{ // We are in middle of record. So, we skip this and return the next one. + getNext(); + return getNext(); + } + } + public void bindTo(String fileName, BufferedPositionedInputStream in, long offset, long end) throws IOException { this.in = in; this.end = end; @@ -341,5 +362,4 @@ // TODO Auto-generated method stub return null; } - -} + } Modified: hadoop/pig/trunk/src/org/apache/pig/impl/builtin/RandomSampleLoader.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/impl/builtin/RandomSampleLoader.java?rev=791916&r1=791915&r2=791916&view=diff ============================================================================== --- hadoop/pig/trunk/src/org/apache/pig/impl/builtin/RandomSampleLoader.java (original) +++ hadoop/pig/trunk/src/org/apache/pig/impl/builtin/RandomSampleLoader.java Tue Jul 7 17:26:13 2009 @@ -64,7 +64,7 @@ @Override public Tuple getNext() throws IOException { long initialPos = loader.getPosition(); - Tuple t = loader.getNext(); + Tuple t = loader.getSampledTuple(); long finalPos = loader.getPosition(); long toSkip = skipInterval - (finalPos - initialPos);