Repository: tajo Updated Branches: refs/heads/master 329e508ca -> 95292d29d
TAJO-934: Multiple DISTINCT returns null grouping key value. (Hyoungjun Kim via hyunsik) Closes #63 Project: http://git-wip-us.apache.org/repos/asf/tajo/repo Commit: http://git-wip-us.apache.org/repos/asf/tajo/commit/95292d29 Tree: http://git-wip-us.apache.org/repos/asf/tajo/tree/95292d29 Diff: http://git-wip-us.apache.org/repos/asf/tajo/diff/95292d29 Branch: refs/heads/master Commit: 95292d29d96963c769ce2fd17a3350375145683e Parents: 329e508 Author: Hyunsik Choi <[email protected]> Authored: Fri Jul 11 15:52:59 2014 +0900 Committer: Hyunsik Choi <[email protected]> Committed: Fri Jul 11 15:52:59 2014 +0900 ---------------------------------------------------------------------- CHANGES | 3 + .../tajo/engine/planner/LogicalPlanner.java | 3 +- .../DistinctGroupbyHashAggregationExec.java | 30 ++++++--- .../tajo/engine/query/TestGroupByQuery.java | 68 +++++++++++++++++++- .../java/org/apache/tajo/storage/v2/RCFile.java | 5 -- 5 files changed, 93 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tajo/blob/95292d29/CHANGES ---------------------------------------------------------------------- diff --git a/CHANGES b/CHANGES index 921183e..43075f0 100644 --- a/CHANGES +++ b/CHANGES @@ -82,6 +82,9 @@ Release 0.9.0 - unreleased BUG FIXES + TAJO-934: Multiple DISTINCT returns null grouping key value. + (Hyoungjun Kim via hyunsik) + TAJO-929: Broadcast join with empty outer join table returns empty result. (Hyoungjun Kim via hyunsik) http://git-wip-us.apache.org/repos/asf/tajo/blob/95292d29/tajo-core/src/main/java/org/apache/tajo/engine/planner/LogicalPlanner.java ---------------------------------------------------------------------- diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/LogicalPlanner.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/LogicalPlanner.java index 80390d3..ea517c0 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/LogicalPlanner.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/LogicalPlanner.java @@ -676,7 +676,8 @@ public class LogicalPlanner extends BaseAlgebraVisitor<LogicalPlanner.PlanContex for (Iterator<NamedExpr> it = block.namedExprsMgr.getIteratorForUnevaluatedExprs(); it.hasNext();) { NamedExpr rawTarget = it.next(); try { - includeDistinctFunction = PlannerUtil.existsDistinctAggregationFunction(rawTarget.getExpr()); + // check if at least distinct aggregation function + includeDistinctFunction |= PlannerUtil.existsDistinctAggregationFunction(rawTarget.getExpr()); EvalNode evalNode = exprAnnotator.createEvalNode(context.plan, context.queryBlock, rawTarget.getExpr()); if (evalNode.getType() == EvalType.AGG_FUNCTION) { aggEvalNames.add(rawTarget.getAlias()); http://git-wip-us.apache.org/repos/asf/tajo/blob/95292d29/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/DistinctGroupbyHashAggregationExec.java ---------------------------------------------------------------------- diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/DistinctGroupbyHashAggregationExec.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/DistinctGroupbyHashAggregationExec.java index 1a4b706..3fac509 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/DistinctGroupbyHashAggregationExec.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/DistinctGroupbyHashAggregationExec.java @@ -114,6 +114,7 @@ public class DistinctGroupbyHashAggregationExec extends PhysicalExec { } if (first) { loadChildHashTable(); + progress = 0.5f; first = false; } @@ -141,9 +142,12 @@ public class DistinctGroupbyHashAggregationExec extends PhysicalExec { //-------------------------------------------------------------------------------------- List<List<Tuple>> tupleSlots = new ArrayList<List<Tuple>>(); + + // aggregation with single grouping key for (int i = 0; i < hashAggregators.length; i++) { if (!hashAggregators[i].iterator.hasNext()) { nullCount++; + tupleSlots.add(new ArrayList<Tuple>()); continue; } Entry<Tuple, Map<Tuple, FunctionContext[]>> entry = hashAggregators[i].iterator.next(); @@ -158,10 +162,10 @@ public class DistinctGroupbyHashAggregationExec extends PhysicalExec { finished = true; progress = 1.0f; - // If DistinctGroupbyHashAggregationExec didn't has any rows, + // If DistinctGroupbyHashAggregationExec does not have any rows, // it should return NullDatum. if (totalNumRows == 0 && groupbyNodeNum == 0) { - Tuple tuple = new VTuple(hashAggregators.length); + Tuple tuple = new VTuple(outputColumnNum); for (int i = 0; i < tuple.size(); i++) { tuple.put(i, DatumFactory.createNullDatum()); } @@ -199,9 +203,11 @@ public class DistinctGroupbyHashAggregationExec extends PhysicalExec { */ + // currentAggregatedTuples has tuples which has same group key. currentAggregatedTuples = new ArrayList<Tuple>(); int listIndex = 0; while (true) { + // Each item in tuples is VTuple. So the tuples variable is two dimensions(tuple[aggregator][datum]). Tuple[] tuples = new Tuple[hashAggregators.length]; for (int i = 0; i < hashAggregators.length; i++) { List<Tuple> aggregatedTuples = tupleSlots.get(i); @@ -212,7 +218,7 @@ public class DistinctGroupbyHashAggregationExec extends PhysicalExec { //merge Tuple mergedTuple = new VTuple(outputColumnNum); - int mergeTupleIndex = 0; + int resultColumnIdx = 0; boolean allNull = true; for (int i = 0; i < hashAggregators.length; i++) { @@ -222,14 +228,22 @@ public class DistinctGroupbyHashAggregationExec extends PhysicalExec { int tupleSize = hashAggregators[i].getTupleSize(); for (int j = 0; j < tupleSize; j++) { - if (resultColumnIdIndexes[mergeTupleIndex] >= 0) { - if (tuples[i] != null) { - mergedTuple.put(resultColumnIdIndexes[mergeTupleIndex], tuples[i].get(j)); + int mergeTupleIndex = resultColumnIdIndexes[resultColumnIdx]; + if (mergeTupleIndex >= 0) { + if (mergeTupleIndex < distinctGroupingKey.size()) { + // set group key tuple + // Because each hashAggregator has different number of tuples, + // sometimes getting group key from each hashAggregator will be null value. + mergedTuple.put(mergeTupleIndex, distinctGroupingKey.get(mergeTupleIndex)); } else { - mergedTuple.put(resultColumnIdIndexes[mergeTupleIndex], NullDatum.get()); + if (tuples[i] != null) { + mergedTuple.put(mergeTupleIndex, tuples[i].get(j)); + } else { + mergedTuple.put(mergeTupleIndex, NullDatum.get()); + } } } - mergeTupleIndex++; + resultColumnIdx++; } } http://git-wip-us.apache.org/repos/asf/tajo/blob/95292d29/tajo-core/src/test/java/org/apache/tajo/engine/query/TestGroupByQuery.java ---------------------------------------------------------------------- diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestGroupByQuery.java b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestGroupByQuery.java index 40ee54e..935e520 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestGroupByQuery.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestGroupByQuery.java @@ -288,20 +288,84 @@ public class TestGroupByQuery extends QueryTestCaseBase { schema.addColumn("code", Type.TEXT); schema.addColumn("qty", Type.INT4); schema.addColumn("qty2", Type.FLOAT8); - String[] data = new String[]{ "1|a|3|3.0", "1|a|4|4.0", "1|b|5|5.0", "2|a|1|6.0", "2|c|2|7.0", "2|d|3|8.0" }; + String[] data = new String[]{"1|a|3|3.0", "1|a|4|4.0", "1|b|5|5.0", "2|a|1|6.0", "2|c|2|7.0", "2|d|3|8.0"}; TajoTestingCluster.createTable("table10", schema, tableOptions, data); res = executeString("select id, count(distinct code), " + "avg(qty), min(qty), max(qty), sum(qty), " + "cast(avg(qty2) as INT8), cast(min(qty2) as INT8), cast(max(qty2) as INT8), cast(sum(qty2) as INT8) " + "from table10 group by id"); - String result = resultSetToString(res); String expected = "id,?count_4,?avg_5,?min_6,?max_7,?sum_8,?cast_9,?cast_10,?cast_11,?cast_12\n" + "-------------------------------\n" + "1,2,4.0,0,5,12,4,0,5,12\n" + "2,3,2.0,0,3,6,7,0,8,21\n"; + assertEquals(expected, resultSetToString(res)); + + // multiple distinct with expression + res = executeString( + "select count(distinct code) + count(distinct qty) from table10" + ); + + expected = "?plus_2\n" + + "-------------------------------\n" + + "9\n"; + + assertEquals(expected, resultSetToString(res)); + res.close(); + + res = executeString( + "select id, count(distinct code) + count(distinct qty) from table10 group by id" + ); + + expected = "id,?plus_2\n" + + "-------------------------------\n" + + "1,5\n" + + "2,6\n"; + + assertEquals(expected, resultSetToString(res)); + res.close(); + + executeString("DROP TABLE table10 PURGE").close(); + } + + @Test + public final void testDistinctAggregationCasebyCase2() throws Exception { + // first distinct is smaller than second distinct. + KeyValueSet tableOptions = new KeyValueSet(); + tableOptions.put(StorageConstants.CSVFILE_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER); + tableOptions.put(StorageConstants.CSVFILE_NULL, "\\\\N"); + + Schema schema = new Schema(); + schema.addColumn("col1", Type.TEXT); + schema.addColumn("col2", Type.TEXT); + schema.addColumn("col3", Type.TEXT); + + String[] data = new String[]{ + "a|b-1|\\N", + "a|b-2|\\N", + "a|b-2|\\N", + "a|b-3|\\N", + "a|b-3|\\N", + "a|b-3|\\N" + }; + + TajoTestingCluster.createTable("table10", schema, tableOptions, data); + + ResultSet res = executeString( + "select col1 \n" + + ",count(distinct col2) as cnt1\n" + + ",count(distinct case when col3 is not null then col2 else null end) as cnt2\n" + + "from table10 \n" + + "group by col1" + ); + String result = resultSetToString(res); + + String expected = "col1,cnt1,cnt2\n" + + "-------------------------------\n" + + "a,3,1\n"; + assertEquals(expected, result); executeString("DROP TABLE table10 PURGE").close(); http://git-wip-us.apache.org/repos/asf/tajo/blob/95292d29/tajo-storage/src/main/java/org/apache/tajo/storage/v2/RCFile.java ---------------------------------------------------------------------- diff --git a/tajo-storage/src/main/java/org/apache/tajo/storage/v2/RCFile.java b/tajo-storage/src/main/java/org/apache/tajo/storage/v2/RCFile.java index 4b79c51..47dce74 100644 --- a/tajo-storage/src/main/java/org/apache/tajo/storage/v2/RCFile.java +++ b/tajo-storage/src/main/java/org/apache/tajo/storage/v2/RCFile.java @@ -1457,11 +1457,6 @@ public class RCFile { currentKeyLength = sin.readInt(); compressedKeyLen = sin.readInt(); -// System.out.println(">>>currentRecordLength=" + currentRecordLength + -// ",currentKeyLength=" + currentKeyLength + -// ",compressedKeyLen=" + compressedKeyLen + -// ",decompress=" + decompress); - if (decompress) { keyTempBuffer.reset(); keyTempBuffer.write(sin, compressedKeyLen);
