[
https://issues.apache.org/jira/browse/PIG-4276?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14502535#comment-14502535
]
liyunzhang_intel commented on PIG-4276:
---------------------------------------
[~mohitsabharwal], I saw that PIG-4276.patch uses
Util.checkQueryOutputsAfterSort, which is OK. I think that in both spark and
non-spark mode we can use Util.checkQueryOutputsAfterSort to check the
results.
{code}
if (Util.isSparkExecType(cluster.getExecType())) {
String[] expectedResults =
new String[] {"(2,{(2,2)},{(2,5,2)})",
"(1,{(1,1)},{(1,2,3)})" };
Util.checkQueryOutputsAfterSortRecursive(iter,
expectedResults,
org.apache.pig.newplan.logical.Util.translateSchema(pigServer.dumpSchema("D")));
} else {
.....
}
{code}
can be
{code}
String[] expectedResults =
new String[] {"(2,{(2,2)},{(2,5,2)})",
"(1,{(1,1)},{(1,2,3)})" };
Util.checkQueryOutputsAfterSortRecursive(iter,
expectedResults,
org.apache.pig.newplan.logical.Util.translateSchema(pigServer.dumpSchema("D")));
{code}
In other places, you introduce an ORDER BY command to sort the GROUP BY or
DISTINCT output. I don't think it is a good idea to change the original
script. I think the following is better:
{code}
@Test
public void testNestedPlan() throws Exception{
int LOOP_COUNT = 10;
File tmpFile = Util.createTempFileDelOnExit("test", "txt");
PrintStream ps = new PrintStream(new FileOutputStream(tmpFile));
for(int i = 0; i < LOOP_COUNT; i++) {
for(int j=0;j<LOOP_COUNT;j+=2){
ps.println(i+"\t"+j);
ps.println(i+"\t"+j);
}
}
ps.close();
pigServer.registerQuery("A = LOAD '"
+ Util.generateURI(tmpFile.toString(), pigContext) + "';");
pigServer.registerQuery("B = group A by $0;");
String query = "C = foreach B {"
+ "C1 = filter A by $0 > -1;"
+ "C2 = distinct C1;"
+ "C3 = distinct A;"
+ "generate (int)group," + Identity.class.getName() +"(*), COUNT(C2),
SUM(C2.$1)," + TitleNGrams.class.getName() + "(C3), MAX(C3.$1), C2;"
+ "};";
pigServer.registerQuery(query);
Iterator<Tuple> iter = pigServer.openIterator("C");
if(!iter.hasNext()) Assert.fail("No output found");
int numIdentity = 0;
while(iter.hasNext()){
Tuple t = iter.next();
Assert.assertEquals((Integer)numIdentity, (Integer)t.get(0));
Assert.assertEquals((Long)5L, (Long)t.get(2));
Assert.assertEquals(LOOP_COUNT*2.0, (Double)t.get(3), 0.01);
Assert.assertEquals(8.0, (Double)t.get(5), 0.01);
Assert.assertEquals(5L, ((DataBag)t.get(6)).size());
Assert.assertEquals(7, t.size());
++numIdentity;
}
Assert.assertEquals(LOOP_COUNT, numIdentity);
}
{code}
can be
{code}
@Test
public void testNestedPlan() throws Exception{
int LOOP_COUNT = 10;
File tmpFile = Util.createTempFileDelOnExit("test", "txt");
PrintStream ps = new PrintStream(new FileOutputStream(tmpFile));
for(int i = 0; i < LOOP_COUNT; i++) {
for(int j=0;j<LOOP_COUNT;j+=2){
ps.println(i+"\t"+j);
ps.println(i+"\t"+j);
}
}
ps.close();
pigServer.registerQuery("A = LOAD '"
+ Util.generateURI(tmpFile.toString(), pigContext) + "';");
pigServer.registerQuery("B = group A by $0;");
String query = "C = foreach B {"
+ "C1 = filter A by $0 > -1;"
+ "C2 = distinct C1;"
+ "C3 = distinct A;"
+ "generate (int)group," + Identity.class.getName() +"(*), COUNT(C2),
SUM(C2.$1)," + TitleNGrams.class.getName() + "(C3), MAX(C3.$1), C2;"
+ "};";
pigServer.registerQuery(query);
Iterator<Tuple> iter = pigServer.openIterator("C");
if(!iter.hasNext()) Assert.fail("No output found");
int numIdentity = 0;
List<String> expectedStrResults = new ArrayList<String>();
for(int i=0;i<LOOP_COUNT;i++){
StringBuilder sb = new StringBuilder();
sb.append("(").append(numIdentity).append(",");
sb.append("5L").append(",");
sb.append(Double.toString(LOOP_COUNT*2.0)).append(",");
sb.append("8.0").append(",");
sb.append("5L").append(",");
sb.append("7").append(")");
expectedStrResults.add(sb.toString());
++numIdentity;
}
List<Tuple> expectedResults =
Util.getTuplesFromConstantTupleStrings(expectedStrResults.toArray(new
String[0]));
List<String> actualStrResults = new ArrayList<String>();
while(iter.hasNext()){
Tuple t = iter.next();
StringBuilder sb = new StringBuilder();
sb.append("(").append(t.get(0)).append(",");
sb.append(t.get(2)).append(",");
sb.append(t.get(3)).append(",");
sb.append(t.get(5)).append(",");
sb.append(((DataBag)t.get(6)).size()).append(",");
sb.append(t.size()).append(")");
actualStrResults.add(sb.toString());
++numIdentity;
}
List<Tuple> actualResults =
Util.getTuplesFromConstantTupleStrings(actualStrResults.toArray(new String[0]));
Util.checkQueryOutputsAfterSort(actualResults.iterator(),
expectedResults);
Assert.assertEquals(LOOP_COUNT, numIdentity);
}
{code}
but it fails with the failure message="Comparing actual and expected results. " +
"expected: java.util.ArrayList<[(0,5,20.0,8.0,5,7)," +
" (1,5,20.0,8.0,5,7), (2,5,20.0,8.0,5,7), (3,5,20.0,
8.0,5,7), " +
"(4,5,20.0,8.0,5,7), (5,5,20.0,8.0,5,7), (6,5,20.0,8.0,5,7),
(7,5,20.0,8.0,5,7), " +
"(8,5,20.0,8.0,5,7), (9,5,20.0,8.0,5,7)]> but was:
java.util.ArrayList< " +
" [(0,5,20.0,8.0,5,7), (1,5,20.0,8.0,5,7), (2,5,20.0,8.0,5,7),
" +
"(3,5,20.0,8.0,5,7), (4,5,20.0,8.0,5,7), (5,5,20.0,8.0,5,7),
(6,5,20.0,8.0,5,7)," +
" (7,5,20.0,8.0,5,7), (8,5, 20.0,8.0,5,7),
(9,5,20.0,8.0,5,7)]>"
This error occurs because the expected list is
(0(Int),5(Long),20.0(Double),8.0(Double),5(Double),7(Int)) while the actual
list is (0(Int),5(Int),20.0(Double),8.0(Double),5(Int),7(Int)). I don't
know how to fix it.
> Fix ordering related failures in TestEvalPipeline for Spark
> -----------------------------------------------------------
>
> Key: PIG-4276
> URL: https://issues.apache.org/jira/browse/PIG-4276
> Project: Pig
> Issue Type: Sub-task
> Components: spark
> Reporter: liyunzhang_intel
> Assignee: Mohit Sabharwal
> Fix For: spark-branch
>
> Attachments: PIG-4276.patch,
> TEST-org.apache.pig.test.TestEvalPipeline.txt
>
>
> error log is attached
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)