Author: pradeepkth Date: Mon Dec 21 20:05:34 2009 New Revision: 892954 URL: http://svn.apache.org/viewvc?rev=892954&view=rev Log: PIG-1149: Allow instantiation of SampleLoaders with parametrized LoadFuncs (dvryaboy via pradeepkth)
Modified: hadoop/pig/branches/load-store-redesign/CHANGES.txt hadoop/pig/branches/load-store-redesign/src/org/apache/pig/impl/builtin/SampleLoader.java hadoop/pig/branches/load-store-redesign/test/org/apache/pig/test/TestPoissonSampleLoader.java Modified: hadoop/pig/branches/load-store-redesign/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/pig/branches/load-store-redesign/CHANGES.txt?rev=892954&r1=892953&r2=892954&view=diff ============================================================================== --- hadoop/pig/branches/load-store-redesign/CHANGES.txt (original) +++ hadoop/pig/branches/load-store-redesign/CHANGES.txt Mon Dec 21 20:05:34 2009 @@ -45,6 +45,9 @@ IMPROVEMENTS +PIG-1149: Allow instantiation of SampleLoaders with parametrized LoadFuncs +(dvryaboy via pradeepkth) + PIG-1162: Pig 0.6.0 - UDF doc (chandec via olgan) PIG-1163: Pig/Zebra 0.6.0 release (chandec via olgan) Modified: hadoop/pig/branches/load-store-redesign/src/org/apache/pig/impl/builtin/SampleLoader.java URL: http://svn.apache.org/viewvc/hadoop/pig/branches/load-store-redesign/src/org/apache/pig/impl/builtin/SampleLoader.java?rev=892954&r1=892953&r2=892954&view=diff ============================================================================== --- hadoop/pig/branches/load-store-redesign/src/org/apache/pig/impl/builtin/SampleLoader.java (original) +++ hadoop/pig/branches/load-store-redesign/src/org/apache/pig/impl/builtin/SampleLoader.java Mon Dec 21 20:05:34 2009 @@ -40,24 +40,25 @@ // number of samples to be sampled protected int numSamples; - + protected LoadFunc loader; - + // RecordReader used by the underlying loader private RecordReader<?, ?> recordReader= null; - + public SampleLoader(String funcSpec) { - loader = (LoadFunc)PigContext.instantiateFuncFromSpec(funcSpec); + funcSpec = funcSpec.replaceAll("\\\\'", "'"); + loader = (LoadFunc)PigContext.instantiateFuncFromSpec(funcSpec); } - + public void setNumSamples(int n) { - numSamples = n; + numSamples = n; } - + public int getNumSamples() { - return numSamples; + return numSamples; } - + @Override public InputFormat<?,?> getInputFormat() throws IOException { return loader.getInputFormat(); @@ -70,22 +71,22 @@ throw new IOException("Error getting input",e); } } - + public void computeSamples(ArrayList<Pair<FileSpec, Boolean>> inputs, PigContext pc) throws ExecException { } - + @Override public LoadCaster getLoadCaster() throws IOException { return loader.getLoadCaster(); } - + @Override public String relativeToAbsolutePath(String location, Path curDir) - throws IOException { + throws IOException { return loader.relativeToAbsolutePath(location, curDir); } - + @Override public void prepareToRead(RecordReader reader, PigSplit split) throws IOException { loader.prepareToRead(reader, split); Modified: hadoop/pig/branches/load-store-redesign/test/org/apache/pig/test/TestPoissonSampleLoader.java URL: http://svn.apache.org/viewvc/hadoop/pig/branches/load-store-redesign/test/org/apache/pig/test/TestPoissonSampleLoader.java?rev=892954&r1=892953&r2=892954&view=diff ============================================================================== --- hadoop/pig/branches/load-store-redesign/test/org/apache/pig/test/TestPoissonSampleLoader.java (original) +++ hadoop/pig/branches/load-store-redesign/test/org/apache/pig/test/TestPoissonSampleLoader.java Mon Dec 21 20:05:34 2009 @@ -19,27 +19,22 @@ import java.io.*; -import java.util.Iterator; import java.util.ArrayList; +import java.util.Iterator; -import junit.framework.Assert; import junit.framework.TestCase; -import org.apache.pig.EvalFunc; import org.apache.pig.ExecType; import org.apache.pig.FuncSpec; import org.apache.pig.PigServer; import org.apache.pig.backend.executionengine.ExecException; -import org.apache.pig.builtin.BinStorage; import org.apache.pig.builtin.PigStorage; -import org.apache.pig.data.BagFactory; -import org.apache.pig.data.DataBag; import org.apache.pig.data.Tuple; import org.apache.pig.impl.builtin.PoissonSampleLoader; import org.apache.pig.impl.util.Pair; -import org.apache.pig.test.utils.TestHelper; import org.junit.After; import org.junit.Before; +import org.junit.Test; import org.apache.pig.impl.io.FileSpec; @@ -48,64 +43,85 @@ private PigServer pigServer; private MiniCluster cluster = MiniCluster.buildCluster(); - + public TestPoissonSampleLoader() throws ExecException, IOException{ - pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); + pigServer = new PigServer(ExecType.LOCAL, cluster.getProperties()); pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "5"); pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.memusage", "0.0001"); pigServer.getPigContext().getProperties().setProperty("mapred.child.java.opts", "-Xmx512m"); pigServer.getPigContext().getProperties().setProperty("pig.mapsplits.count", "5"); } - - + + @Before public void setUp() throws Exception { createFiles(); } private void createFiles() throws IOException { - PrintWriter w = new PrintWriter(new FileWriter(INPUT_FILE1)); - - int k = 0; - for(int j=0; j<100; j++) { - w.println("100\tapple1\taaa" + k); - k++; - w.println("200\torange1\tbbb" + k); - k++; - w.println("300\tstrawberry\tccc" + k); - k++; - } - - w.close(); - - Util.copyFromLocalToCluster(cluster, INPUT_FILE1, INPUT_FILE1); + PrintWriter w = new PrintWriter(new FileWriter(INPUT_FILE1)); + + int k = 0; + for(int j=0; j<100; j++) { + w.println("100:apple1:aaa" + k); + k++; + w.println("200:orange1:bbb" + k); + k++; + w.println("300:strawberry:ccc" + k); + k++; + } + + w.close(); + + Util.copyFromLocalToCluster(cluster, INPUT_FILE1, INPUT_FILE1); } - - + + @After public void tearDown() throws Exception { - new File(INPUT_FILE1).delete(); - + new File(INPUT_FILE1).delete(); + Util.deleteFile(cluster, INPUT_FILE1); } - - + + @Test public void testComputeSamples() throws IOException{ - FileSpec fs = new FileSpec(INPUT_FILE1, new FuncSpec(PigStorage.class.getName())); - - ArrayList<Pair<FileSpec, Boolean>> inputs = new ArrayList<Pair<FileSpec, Boolean> >(); - inputs.add(new Pair<FileSpec, Boolean>(fs, true)); - + FileSpec fs = new FileSpec(INPUT_FILE1, new FuncSpec(PigStorage.class.getName())); + + ArrayList<Pair<FileSpec, Boolean>> inputs = new ArrayList<Pair<FileSpec, Boolean> >(); + inputs.add(new Pair<FileSpec, Boolean>(fs, true)); + // Use 100 as a default value; PoissonSampleLoader ps = new PoissonSampleLoader((new FuncSpec(PigStorage.class.getName())).toString(), "100"); // Get the number of samples for the file ps.computeSamples(inputs, pigServer.getPigContext()); - + if (ps.getNumSamples() != 3) { - fail("Compute samples returned the wrong number of samples"); + fail("Compute samples returned the wrong number of samples"); } } - + + /* + * FIXME This currently tests for 5 elements because PoissonSampleLoader + * only produces a single sample for the test data, and the last sample has + * extra information appended in PoissonSampleLoader. + * + * This is incorrect. The proper number of samples should be > 1, and therefore + * the first sample should only have 3 elements. + * + * See PIG-1062 and PIG-1149 for more information. + * + */ + @Test + public void testInstantiation() throws IOException { + pigServer.registerQuery("A = Load '"+INPUT_FILE1+"' Using PoissonSampleLoader('PigStorage(\\\\\\':\\\\\\')', '100');"); + Iterator<Tuple> iter = pigServer.openIterator("A"); + assertTrue(iter.hasNext()); + assertEquals(5, iter.next().size()); + } + + + } \ No newline at end of file