Repository: incubator-datafu Updated Branches: refs/heads/master 16a82e8b4 -> d4a5c5d43
DATAFU-31 DistinctBy works incorrectly on string containing minuses https://issues.apache.org/jira/browse/DATAFU-31 Signed-off-by: Matt Hayes <mha...@linkedin.com> Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/d4a5c5d4 Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/d4a5c5d4 Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/d4a5c5d4 Branch: refs/heads/master Commit: d4a5c5d434c33c6b614ae08ee4179661fda0d358 Parents: 16a82e8 Author: Jian J. Wang <wj...@dogfavorshot-lm.peking.corp.yahoo.com> Authored: Sun Feb 16 20:58:02 2014 +0800 Committer: Matt Hayes <mha...@linkedin.com> Committed: Tue Feb 18 13:18:06 2014 -0800 ---------------------------------------------------------------------- src/java/datafu/pig/bags/DistinctBy.java | 25 +++++----- test/pig/datafu/test/pig/bags/BagTests.java | 62 ++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/d4a5c5d4/src/java/datafu/pig/bags/DistinctBy.java ---------------------------------------------------------------------- diff --git a/src/java/datafu/pig/bags/DistinctBy.java b/src/java/datafu/pig/bags/DistinctBy.java index 0bbb0e6..a79e4de 100644 --- a/src/java/datafu/pig/bags/DistinctBy.java +++ b/src/java/datafu/pig/bags/DistinctBy.java @@ -28,6 +28,7 @@ import org.apache.pig.data.BagFactory; import org.apache.pig.data.DataBag; import org.apache.pig.data.DataType; import org.apache.pig.data.Tuple; +import org.apache.pig.data.TupleFactory; import org.apache.pig.impl.logicalLayer.FrontendException; import org.apache.pig.impl.logicalLayer.schema.Schema; @@ -60,9 +61,8 @@ import org.apache.pig.impl.logicalLayer.schema.Schema; */ public class DistinctBy extends AccumulatorEvalFunc<DataBag> { - private final static String delimiter = "-"; private HashSet<Integer> fields = new HashSet<Integer>(); - private HashSet<String> seen = new HashSet<String>(); + private HashSet<Tuple> seen = new HashSet<Tuple>(); private DataBag outputBag; public DistinctBy(String... fields) @@ -85,10 +85,10 @@ public class DistinctBy extends AccumulatorEvalFunc<DataBag> DataBag inputBag = (DataBag)input.get(0); for (Tuple t : inputBag) { - String distinctString = getDistinctString(t, this.fields); - if (!seen.contains(distinctString)) { + Tuple distinctFieldTuple = getDistinctFieldTuple(t, this.fields); + if (!seen.contains(distinctFieldTuple)) { outputBag.add(t); - seen.add(distinctString); + seen.add(distinctFieldTuple); } } } @@ -147,17 +147,16 @@ public class DistinctBy extends AccumulatorEvalFunc<DataBag> } } - private String getDistinctString(Tuple t, HashSet<Integer> distinctFieldPositions) throws ExecException { - String[] tokens = t.toDelimitedString(delimiter).split(delimiter); - StringBuffer buffer = new StringBuffer(); - for(int i=0; i<tokens.length; i++) { + private Tuple getDistinctFieldTuple(Tuple t, HashSet<Integer> distinctFieldPositions) throws ExecException { + Tuple fieldTuple = TupleFactory.getInstance().newTuple(distinctFieldPositions.size()); + int idx = 0; + for(int i=0; i<t.size(); i++) { if (distinctFieldPositions.contains(i)) { - buffer.append(tokens[i]); - buffer.append(delimiter); + fieldTuple.set(idx, t.get(i)); + idx++; } } - buffer.substring(0, buffer.length() - delimiter.length()); - return buffer.toString(); + return fieldTuple; } } http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/d4a5c5d4/test/pig/datafu/test/pig/bags/BagTests.java ---------------------------------------------------------------------- diff --git a/test/pig/datafu/test/pig/bags/BagTests.java b/test/pig/datafu/test/pig/bags/BagTests.java index c9e3f63..80bb0cc 100644 --- a/test/pig/datafu/test/pig/bags/BagTests.java +++ b/test/pig/datafu/test/pig/bags/BagTests.java @@ -708,6 +708,68 @@ public class BagTests extends PigTests "({(Z,1,0),(A,1,0),(B,2,0),(C,3,0),(D,4,0),(E,5,0)})", "({(A,10,2),(M,50,3),(Z,49,22),(B,1,1)})"); } + + /** + register $JAR_PATH + + define DistinctBy datafu.pig.bags.DistinctBy('1', '2'); + + data = LOAD 'input' AS (data: bag {T: tuple(a:CHARARRAY, b:map[INT], c:bag{t: tuple(c0:CHARARRAY, c1:INT)})}); + + data2 = FOREACH data GENERATE DistinctBy(data); + + --describe data2; + + STORE data2 INTO 'output'; + + */ + @Multiline + private String distinctByMultiComplexFieldTest; + + @Test + public void distinctByMultiComplexFieldTest() throws Exception + { + PigTest test = createPigTestFromString(distinctByMultiComplexFieldTest); + + writeLinesToFile("input", + "({(a-b,[a#0,b#1],{(a-b,0),(a-b,1)}),(a-c,[b#1,a#0],{(a-b,0),(a-b,1)}),(a-d,[a#1,b#0],{(a-b,1),(a-b,2)})})"); + + test.runScript(); + + assertOutput(test, "data2", + "({(a-b,[b#1,a#0],{(a-b,0),(a-b,1)}),(a-d,[b#0,a#1],{(a-b,1),(a-b,2)})})"); + } + + /** + register $JAR_PATH + + define DistinctBy datafu.pig.bags.DistinctBy('1'); + + data = LOAD 'input' AS (data: bag {T: tuple(a:CHARARRAY, b:CHARARRAY)}); + + data2 = FOREACH data GENERATE DistinctBy(data); + + --describe data2; + + STORE data2 INTO 'output'; + + */ + @Multiline + private String distinctByDelimTest; + + @Test + public void distinctByDelimTest() throws Exception + { + PigTest test = createPigTestFromString(distinctByDelimTest); + + writeLinesToFile("input", + "({(a-b,c),(a-b,d)})"); + + test.runScript(); + + assertOutput(test, "data2", + "({(a-b,c),(a-b,d)})"); + } @Test public void distinctByExecTest() throws Exception