Author: pradeepkth Date: Tue Apr 13 18:16:22 2010 New Revision: 933728 URL: http://svn.apache.org/viewvc?rev=933728&view=rev Log: PIG-1369: POProject does not handle null tuples and non existent fields in some cases (pradeepkth)
Modified: hadoop/pig/trunk/CHANGES.txt hadoop/pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POProject.java hadoop/pig/trunk/test/org/apache/pig/test/TestProject.java Modified: hadoop/pig/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=933728&r1=933727&r2=933728&view=diff ============================================================================== --- hadoop/pig/trunk/CHANGES.txt (original) +++ hadoop/pig/trunk/CHANGES.txt Tue Apr 13 18:16:22 2010 @@ -39,6 +39,9 @@ PIG-1309: Map-side Cogroup (ashutoshc) BUG FIXES +PIG-1369: POProject does not handle null tuples and non existent fields in +some cases (pradeepkth) + PIG-1364: Public javadoc on apache site still on 0.2, needs to be updated for each version release (gates) PIG-1338: Pig should exclude hadoop conf in local mode (daijy) Modified: hadoop/pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POProject.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POProject.java?rev=933728&r1=933727&r2=933728&view=diff ============================================================================== --- hadoop/pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POProject.java (original) +++ hadoop/pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POProject.java Tue Apr 13 18:16:22 2010 @@ -147,18 +147,19 @@ public class POProject extends Expressio } else if(columns.size() == 1) { try { ret = inpValue.get(columns.get(0)); - } catch (ExecException ee) { + } catch (IndexOutOfBoundsException ie) { if(pigLogger != null) { pigLogger.warn(this,"Attempt to access field " + "which was not found in the input", PigWarning.ACCESSING_NON_EXISTENT_FIELD); } res.returnStatus = POStatus.STATUS_OK; ret = null; - } catch (IndexOutOfBoundsException ie) { - if(pigLogger != null) { - pigLogger.warn(this,"Attempt to access field " + - "which was not found in the input", PigWarning.ACCESSING_NON_EXISTENT_FIELD); - } + } catch (NullPointerException npe) { + // the tuple is null, so a dereference should also produce a null + // there is a slight danger here that the Tuple implementation + // may have given the exception for a different reason but if we + // don't catch it, we will die and the most common case for the + // exception would be because the tuple is null res.returnStatus = POStatus.STATUS_OK; ret = null; } @@ -168,13 +169,21 @@ public class POProject extends Expressio for(int i: columns) { try { objList.add(inpValue.get(i)); - } catch (ExecException ee) { + } catch (IndexOutOfBoundsException ie) { if(pigLogger != null) { - pigLogger.warn(this,"Attempt to access field " + i + + pigLogger.warn(this,"Attempt to access field " + i + " which was not found in the input", PigWarning.ACCESSING_NON_EXISTENT_FIELD); } objList.add(null); } + catch (NullPointerException npe) { + // the tuple is null, so a dereference should also produce a null + // there is a slight danger here that the Tuple implementation + // may have given the exception for a different reason but if we + // don't catch it, we will die and the most common case for the + // exception would be because the tuple is null + objList.add(null); + } } ret = tupleFactory.newTuple(objList); } @@ -212,15 +221,48 @@ public class POProject extends Expressio // appropriately from the input bag Tuple tuple = inpBag.iterator().next(); Tuple tmpTuple = tupleFactory.newTuple(columns.size()); - for (int i = 0; i < columns.size(); i++) - tmpTuple.set(i, tuple.get(columns.get(i))); + + for (int i = 0; i < columns.size(); i++) { + try { + tmpTuple.set(i, tuple.get(columns.get(i))); + } catch (IndexOutOfBoundsException ie) { + if(pigLogger != null) { + pigLogger.warn(this,"Attempt to access field " + + "which was not found in the input", PigWarning.ACCESSING_NON_EXISTENT_FIELD); + } + tmpTuple.set(i, null); + } catch (NullPointerException npe) { + // the tuple is null, so a dereference should also produce a null + // there is a slight danger here that the Tuple implementation + // may have given the exception for a different reason but if we + // don't catch it, we will die and the most common case for the + // exception would be because the tuple is null + tmpTuple.set(i, null); + } + } outBag = new SingleTupleBag(tmpTuple); } else { outBag = bagFactory.newDefaultBag(); for (Tuple tuple : inpBag) { Tuple tmpTuple = tupleFactory.newTuple(columns.size()); - for (int i = 0; i < columns.size(); i++) - tmpTuple.set(i, tuple.get(columns.get(i))); + for (int i = 0; i < columns.size(); i++) { + try { + tmpTuple.set(i, tuple.get(columns.get(i))); + } catch (IndexOutOfBoundsException ie) { + if(pigLogger != null) { + pigLogger.warn(this,"Attempt to access field " + + "which was not found in the input", PigWarning.ACCESSING_NON_EXISTENT_FIELD); + } + tmpTuple.set(i, null); + } catch (NullPointerException npe) { + // the tuple is null, so a dereference should also produce a null + // there is a slight danger here that the Tuple implementation + // may have given the exception for a different reason but if we + // don't catch it, we will die and the most common case for the + // exception would be because the tuple is null + tmpTuple.set(i, null); + } + } outBag.add(tmpTuple); } } @@ -293,12 +335,42 @@ public class POProject extends Expressio Object ret; if(columns.size() == 1) { - ret = inpValue.get(columns.get(0)); + try{ + ret = inpValue.get(columns.get(0)); + } catch (IndexOutOfBoundsException ie) { + if(pigLogger != null) { + pigLogger.warn(this,"Attempt to access field " + + "which was not found in the input", PigWarning.ACCESSING_NON_EXISTENT_FIELD); + } + ret = null; + } catch (NullPointerException npe) { + // the tuple is null, so a dereference should also produce a null + // there is a slight danger here that the Tuple implementation + // may have given the exception for a different reason but if we + // don't catch it, we will die and the most common case for the + // exception would be because the tuple is null + ret = null; + } } else { ArrayList<Object> objList = new ArrayList<Object>(columns.size()); for(int i: columns) { - objList.add(inpValue.get(i)); + try { + objList.add(inpValue.get(i)); + } catch (IndexOutOfBoundsException ie) { + if(pigLogger != null) { + pigLogger.warn(this,"Attempt to access field " + + "which was not found in the input", PigWarning.ACCESSING_NON_EXISTENT_FIELD); + } + objList.add(null); + } catch (NullPointerException npe) { + // the tuple is null, so a dereference should also produce a null + // there is a slight danger here that the Tuple implementation + // may have given the exception for a different reason but if we + // don't catch it, we will die and the most common case for the + // exception would be because the tuple is null + objList.add(null); + } } ret = tupleFactory.newTuple(objList); res.result = (Tuple)ret; Modified: hadoop/pig/trunk/test/org/apache/pig/test/TestProject.java URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/test/org/apache/pig/test/TestProject.java?rev=933728&r1=933727&r2=933728&view=diff ============================================================================== --- hadoop/pig/trunk/test/org/apache/pig/test/TestProject.java (original) +++ hadoop/pig/trunk/test/org/apache/pig/test/TestProject.java Tue Apr 13 18:16:22 2010 @@ -50,6 +50,7 @@ public class TestProject extends junit. POProject proj; + @Override @Before public void setUp() throws Exception { r = new Random(); @@ -59,6 +60,7 @@ public class TestProject extends junit. proj = GenPhyOp.exprProject(); } + @Override @After public void tearDown() throws Exception { } @@ -281,14 +283,14 @@ public class TestProject extends junit. } @Test - public void testMissingCols() throws Exception { - MiniCluster cluster = MiniCluster.buildCluster(); + public void testMissingCols1() throws Exception { + String inputFileName = "TestProject-testMissingCols1-input.txt"; String input[] = { "hello\tworld", "good\tbye" }; - Util.createInputFile(cluster, "input.txt", input); - String query = "a = load 'input.txt' as (s1:chararray, s2:chararray, extra:chararray);" + + Util.createLocalInputFile(inputFileName, input); + String query = "a = load '" + inputFileName + "' as (s1:chararray, s2:chararray, extra:chararray);" + "b = foreach a generate s1, s2, extra;"; - PigServer ps = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); + PigServer ps = new PigServer(ExecType.LOCAL); Util.registerMultiLineQuery(ps, query); Iterator<Tuple> it = ps.openIterator("b"); Tuple[] expectedResults = new Tuple[] { @@ -300,5 +302,78 @@ public class TestProject extends junit. assertEquals(expectedResults[i++], it.next()); } } + + + @Test + public void testMissingCols2() throws Exception { + String inputFileName = "TestProject-testMissingCols2-input.txt"; + String input[] = { "1\t(hello,world)", "2\t(good,bye)" }; + Util.createLocalInputFile(inputFileName, input); + // in the script, PigStorage will return a null for the tuple field + // since it does not comply with the schema + String query = "a = load '" + inputFileName + "' as (i:int, " + + "t:tuple(s1:chararray, s2:chararray, s3:chararray));" + + "b = foreach a generate t.(s2,s3);"; + + PigServer ps = new PigServer(ExecType.LOCAL); + Util.registerMultiLineQuery(ps, query); + Iterator<Tuple> it = ps.openIterator("b"); + Tuple[] expectedResults = new Tuple[] { + (Tuple) Util.getPigConstant("((null, null))"), + (Tuple) Util.getPigConstant("((null, null))") + }; + int i = 0; + while(it.hasNext()) { + assertEquals(expectedResults[i++], it.next()); + } + } + + @Test + public void testMissingCols3() throws Exception { + String inputFileName = "TestProject-testMissingCols3-input.txt"; + String input[] = { "hello\tworld", "good\tbye" }; + Util.createLocalInputFile(inputFileName, input); + String query = "a = load '" + inputFileName + "';" + + "b = group a all;" + + "c = foreach b generate flatten(a.($1, $2)),a.$2;"; + + PigServer ps = new PigServer(ExecType.LOCAL); + Util.registerMultiLineQuery(ps, query); + Iterator<Tuple> it = ps.openIterator("c"); + Tuple[] expectedResults = new Tuple[] { + (Tuple) Util.getPigConstant("('world', null, {(null),(null)})"), + (Tuple) Util.getPigConstant("('bye', null, {(null),(null)})") + }; + int i = 0; + while(it.hasNext()) { + assertEquals(expectedResults[i++].toString(), it.next().toString()); + } + } + + @Test + public void testNullTupleCols() throws Exception { + String inputFileName = "TestProject-testNullTupleCols-input.txt"; + String input[] = { "1\t(hello,world)", "2\t(good)", "3" }; + Util.createLocalInputFile(inputFileName, input); + // PigStorage will return null as the value for the tuple field in the + // second record since it does not comply with the schema and in the + // third record since the field is absent + String query = "a = load '" + inputFileName + "' as (i:int, " + + "t:tuple(s1:chararray, s2:chararray));" + + "b = foreach a generate t.s1, t.s2;"; + + PigServer ps = new PigServer(ExecType.LOCAL); + Util.registerMultiLineQuery(ps, query); + Iterator<Tuple> it = ps.openIterator("b"); + Tuple[] expectedResults = new Tuple[] { + (Tuple) Util.getPigConstant("('hello', 'world')"), + (Tuple) Util.getPigConstant("(null, null)"), + (Tuple) Util.getPigConstant("(null, null)") + }; + int i = 0; + while(it.hasNext()) { + assertEquals(expectedResults[i++], it.next()); + } + } }