Author: pradeepkth
Date: Tue Apr 13 18:34:53 2010
New Revision: 933738
URL: http://svn.apache.org/viewvc?rev=933738&view=rev
Log:
PIG-1369: POProject does not handle null tuples and non existent fields in some
cases (pradeepkth)
Modified:
hadoop/pig/branches/branch-0.7/CHANGES.txt
hadoop/pig/branches/branch-0.7/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POProject.java
hadoop/pig/branches/branch-0.7/test/org/apache/pig/test/TestProject.java
Modified: hadoop/pig/branches/branch-0.7/CHANGES.txt
URL:
http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.7/CHANGES.txt?rev=933738&r1=933737&r2=933738&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.7/CHANGES.txt (original)
+++ hadoop/pig/branches/branch-0.7/CHANGES.txt Tue Apr 13 18:34:53 2010
@@ -183,6 +183,9 @@ OPTIMIZATIONS
BUG FIXES
+PIG-1369: POProject does not handle null tuples and non existent fields in
+some cases (pradeepkth)
+
PIG-1364: Public javadoc on apache site still on 0.2, needs to be updated for
each version release (gates)
PIG-1366: PigStorage's pushProjection implementation results in NPE under
Modified:
hadoop/pig/branches/branch-0.7/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POProject.java
URL:
http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.7/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POProject.java?rev=933738&r1=933737&r2=933738&view=diff
==============================================================================
---
hadoop/pig/branches/branch-0.7/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POProject.java
(original)
+++
hadoop/pig/branches/branch-0.7/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POProject.java
Tue Apr 13 18:34:53 2010
@@ -147,18 +147,19 @@ public class POProject extends Expressio
} else if(columns.size() == 1) {
try {
ret = inpValue.get(columns.get(0));
- } catch (ExecException ee) {
+ } catch (IndexOutOfBoundsException ie) {
if(pigLogger != null) {
pigLogger.warn(this,"Attempt to access field " +
"which was not found in the input",
PigWarning.ACCESSING_NON_EXISTENT_FIELD);
}
res.returnStatus = POStatus.STATUS_OK;
ret = null;
- } catch (IndexOutOfBoundsException ie) {
- if(pigLogger != null) {
- pigLogger.warn(this,"Attempt to access field " +
- "which was not found in the input",
PigWarning.ACCESSING_NON_EXISTENT_FIELD);
- }
+ } catch (NullPointerException npe) {
+ // the tuple is null, so a dereference should also produce a
null
+ // there is a slight danger here that the Tuple implementation
+ // may have given the exception for a different reason but if
we
+ // don't catch it, we will die and the most common case for the
+ // exception would be because the tuple is null
res.returnStatus = POStatus.STATUS_OK;
ret = null;
}
@@ -168,13 +169,21 @@ public class POProject extends Expressio
for(int i: columns) {
try {
objList.add(inpValue.get(i));
- } catch (ExecException ee) {
+ } catch (IndexOutOfBoundsException ie) {
if(pigLogger != null) {
- pigLogger.warn(this,"Attempt to access field " + i +
+ pigLogger.warn(this,"Attempt to access field " + i +
" which was not found in the input",
PigWarning.ACCESSING_NON_EXISTENT_FIELD);
}
objList.add(null);
}
+ catch (NullPointerException npe) {
+ // the tuple is null, so a dereference should also produce
a null
+ // there is a slight danger here that the Tuple
implementation
+ // may have given the exception for a different reason but
if we
+ // don't catch it, we will die and the most common case
for the
+ // exception would be because the tuple is null
+ objList.add(null);
+ }
}
ret = tupleFactory.newTuple(objList);
}
@@ -212,15 +221,48 @@ public class POProject extends Expressio
// appropriately from the input bag
Tuple tuple = inpBag.iterator().next();
Tuple tmpTuple = tupleFactory.newTuple(columns.size());
- for (int i = 0; i < columns.size(); i++)
- tmpTuple.set(i, tuple.get(columns.get(i)));
+
+ for (int i = 0; i < columns.size(); i++) {
+ try {
+ tmpTuple.set(i, tuple.get(columns.get(i)));
+ } catch (IndexOutOfBoundsException ie) {
+ if(pigLogger != null) {
+ pigLogger.warn(this,"Attempt to access field " +
+ "which was not found in the input",
PigWarning.ACCESSING_NON_EXISTENT_FIELD);
+ }
+ tmpTuple.set(i, null);
+ } catch (NullPointerException npe) {
+ // the tuple is null, so a dereference should also produce
a null
+ // there is a slight danger here that the Tuple
implementation
+ // may have given the exception for a different reason but
if we
+ // don't catch it, we will die and the most common case
for the
+ // exception would be because the tuple is null
+ tmpTuple.set(i, null);
+ }
+ }
outBag = new SingleTupleBag(tmpTuple);
} else {
outBag = bagFactory.newDefaultBag();
for (Tuple tuple : inpBag) {
Tuple tmpTuple = tupleFactory.newTuple(columns.size());
- for (int i = 0; i < columns.size(); i++)
- tmpTuple.set(i, tuple.get(columns.get(i)));
+ for (int i = 0; i < columns.size(); i++) {
+ try {
+ tmpTuple.set(i, tuple.get(columns.get(i)));
+ } catch (IndexOutOfBoundsException ie) {
+ if(pigLogger != null) {
+ pigLogger.warn(this,"Attempt to access field " +
+ "which was not found in the input",
PigWarning.ACCESSING_NON_EXISTENT_FIELD);
+ }
+ tmpTuple.set(i, null);
+ } catch (NullPointerException npe) {
+ // the tuple is null, so a dereference should also
produce a null
+ // there is a slight danger here that the Tuple
implementation
+ // may have given the exception for a different reason
but if we
+ // don't catch it, we will die and the most common
case for the
+ // exception would be because the tuple is null
+ tmpTuple.set(i, null);
+ }
+ }
outBag.add(tmpTuple);
}
}
@@ -293,12 +335,42 @@ public class POProject extends Expressio
Object ret;
if(columns.size() == 1) {
- ret = inpValue.get(columns.get(0));
+ try{
+ ret = inpValue.get(columns.get(0));
+ } catch (IndexOutOfBoundsException ie) {
+ if(pigLogger != null) {
+ pigLogger.warn(this,"Attempt to access field " +
+ "which was not found in the input",
PigWarning.ACCESSING_NON_EXISTENT_FIELD);
+ }
+ ret = null;
+ } catch (NullPointerException npe) {
+ // the tuple is null, so a dereference should also produce
a null
+ // there is a slight danger here that the Tuple
implementation
+ // may have given the exception for a different reason but
if we
+ // don't catch it, we will die and the most common case
for the
+ // exception would be because the tuple is null
+ ret = null;
+ }
} else {
ArrayList<Object> objList = new
ArrayList<Object>(columns.size());
for(int i: columns) {
- objList.add(inpValue.get(i));
+ try {
+ objList.add(inpValue.get(i));
+ } catch (IndexOutOfBoundsException ie) {
+ if(pigLogger != null) {
+ pigLogger.warn(this,"Attempt to access field " +
+ "which was not found in the input",
PigWarning.ACCESSING_NON_EXISTENT_FIELD);
+ }
+ objList.add(null);
+ } catch (NullPointerException npe) {
+ // the tuple is null, so a dereference should also
produce a null
+ // there is a slight danger here that the Tuple
implementation
+ // may have given the exception for a different reason
but if we
+ // don't catch it, we will die and the most common
case for the
+ // exception would be because the tuple is null
+ objList.add(null);
+ }
}
ret = tupleFactory.newTuple(objList);
res.result = (Tuple)ret;
Modified:
hadoop/pig/branches/branch-0.7/test/org/apache/pig/test/TestProject.java
URL:
http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.7/test/org/apache/pig/test/TestProject.java?rev=933738&r1=933737&r2=933738&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.7/test/org/apache/pig/test/TestProject.java
(original)
+++ hadoop/pig/branches/branch-0.7/test/org/apache/pig/test/TestProject.java
Tue Apr 13 18:34:53 2010
@@ -50,6 +50,7 @@ public class TestProject extends junit.
POProject proj;
+ @Override
@Before
public void setUp() throws Exception {
r = new Random();
@@ -59,6 +60,7 @@ public class TestProject extends junit.
proj = GenPhyOp.exprProject();
}
+ @Override
@After
public void tearDown() throws Exception {
}
@@ -281,14 +283,14 @@ public class TestProject extends junit.
}
@Test
- public void testMissingCols() throws Exception {
- MiniCluster cluster = MiniCluster.buildCluster();
+ public void testMissingCols1() throws Exception {
+ String inputFileName = "TestProject-testMissingCols1-input.txt";
String input[] = { "hello\tworld", "good\tbye" };
- Util.createInputFile(cluster, "input.txt", input);
- String query = "a = load 'input.txt' as (s1:chararray, s2:chararray,
extra:chararray);" +
+ Util.createLocalInputFile(inputFileName, input);
+ String query = "a = load '" + inputFileName + "' as (s1:chararray,
s2:chararray, extra:chararray);" +
"b = foreach a generate s1, s2, extra;";
- PigServer ps = new PigServer(ExecType.MAPREDUCE,
cluster.getProperties());
+ PigServer ps = new PigServer(ExecType.LOCAL);
Util.registerMultiLineQuery(ps, query);
Iterator<Tuple> it = ps.openIterator("b");
Tuple[] expectedResults = new Tuple[] {
@@ -300,5 +302,78 @@ public class TestProject extends junit.
assertEquals(expectedResults[i++], it.next());
}
}
+
+
+ @Test
+ public void testMissingCols2() throws Exception {
+ String inputFileName = "TestProject-testMissingCols2-input.txt";
+ String input[] = { "1\t(hello,world)", "2\t(good,bye)" };
+ Util.createLocalInputFile(inputFileName, input);
+ // in the script, PigStorage will return a null for the tuple field
+ // since it does not comply with the schema
+ String query = "a = load '" + inputFileName + "' as (i:int, " +
+ "t:tuple(s1:chararray, s2:chararray, s3:chararray));" +
+ "b = foreach a generate t.(s2,s3);";
+
+ PigServer ps = new PigServer(ExecType.LOCAL);
+ Util.registerMultiLineQuery(ps, query);
+ Iterator<Tuple> it = ps.openIterator("b");
+ Tuple[] expectedResults = new Tuple[] {
+ (Tuple) Util.getPigConstant("((null, null))"),
+ (Tuple) Util.getPigConstant("((null, null))")
+ };
+ int i = 0;
+ while(it.hasNext()) {
+ assertEquals(expectedResults[i++], it.next());
+ }
+ }
+
+ @Test
+ public void testMissingCols3() throws Exception {
+ String inputFileName = "TestProject-testMissingCols3-input.txt";
+ String input[] = { "hello\tworld", "good\tbye" };
+ Util.createLocalInputFile(inputFileName, input);
+ String query = "a = load '" + inputFileName + "';" +
+ "b = group a all;" +
+ "c = foreach b generate flatten(a.($1, $2)),a.$2;";
+
+ PigServer ps = new PigServer(ExecType.LOCAL);
+ Util.registerMultiLineQuery(ps, query);
+ Iterator<Tuple> it = ps.openIterator("c");
+ Tuple[] expectedResults = new Tuple[] {
+ (Tuple) Util.getPigConstant("('world', null,
{(null),(null)})"),
+ (Tuple) Util.getPigConstant("('bye', null, {(null),(null)})")
+ };
+ int i = 0;
+ while(it.hasNext()) {
+ assertEquals(expectedResults[i++].toString(),
it.next().toString());
+ }
+ }
+
+ @Test
+ public void testNullTupleCols() throws Exception {
+ String inputFileName = "TestProject-testNullTupleCols-input.txt";
+ String input[] = { "1\t(hello,world)", "2\t(good)", "3" };
+ Util.createLocalInputFile(inputFileName, input);
+ // PigStorage will return null as the value for the tuple field in the
+ // second record since it does not comply with the schema and in the
+ // third record since the field is absent
+ String query = "a = load '" + inputFileName + "' as (i:int, " +
+ "t:tuple(s1:chararray, s2:chararray));" +
+ "b = foreach a generate t.s1, t.s2;";
+
+ PigServer ps = new PigServer(ExecType.LOCAL);
+ Util.registerMultiLineQuery(ps, query);
+ Iterator<Tuple> it = ps.openIterator("b");
+ Tuple[] expectedResults = new Tuple[] {
+ (Tuple) Util.getPigConstant("('hello', 'world')"),
+ (Tuple) Util.getPigConstant("(null, null)"),
+ (Tuple) Util.getPigConstant("(null, null)")
+ };
+ int i = 0;
+ while(it.hasNext()) {
+ assertEquals(expectedResults[i++], it.next());
+ }
+ }
}