[ 
https://issues.apache.org/jira/browse/TAJO-601?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13906690#comment-13906690
 ] 

Hudson commented on TAJO-601:
-----------------------------

SUCCESS: Integrated in Tajo-master-build #72 (See 
[https://builds.apache.org/job/Tajo-master-build/72/])
TAJO-601: Improve distinct aggregation query processing. (hyunsik: 
https://git-wip-us.apache.org/repos/asf?p=incubator-tajo.git&a=commit&h=6053ad11efc31ed25c6ea8be10d4ce967c34dda3)
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/planner/rewrite/ProjectionPushDownRule.java
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/master/querymaster/QueryMasterManagerService.java
* 
tajo-core/tajo-core-backend/src/test/resources/queries/TestGroupByQuery/testDistinctAggregation1.sql
* 
tajo-core/tajo-core-backend/src/test/resources/queries/TestGroupByQuery/testCountDistinct2.sql
* 
tajo-core/tajo-core-backend/src/test/resources/results/TestGroupByQuery/testDistinctAggregation4.result
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/function/builtin/SumLongDistinct.java
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/planner/LogicalPlanner.java
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/eval/EvalTreeUtil.java
* 
tajo-core/tajo-core-backend/src/test/resources/results/TestGroupByQuery/testDistinctAggregationWithHaving1.result
* 
tajo-core/tajo-core-backend/src/test/resources/queries/TestGroupByQuery/testDistinctAggregation3.sql
* 
tajo-core/tajo-core-backend/src/test/java/org/apache/tajo/engine/eval/TestEvalTreeUtil.java
* 
tajo-core/tajo-core-backend/src/test/resources/queries/TestGroupByQuery/testDistinctAggregationWithHaving1.sql
* tajo-storage/src/main/java/org/apache/tajo/storage/RawFile.java
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/planner/PreLogicalPlanVerifier.java
* 
tajo-core/tajo-core-backend/src/test/resources/queries/TestGroupByQuery/testCountDistinct.sql
* 
tajo-core/tajo-core-backend/src/test/resources/results/TestGroupByQuery/testDistinctAggregation1.result
* 
tajo-core/tajo-core-backend/src/test/resources/results/TestGroupByQuery/testCountDistinct2.result
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/planner/rewrite/PartitionedTableRewriter.java
* 
tajo-core/tajo-core-backend/src/test/java/org/apache/tajo/engine/query/TestGroupByQuery.java
* 
tajo-core/tajo-core-backend/src/test/resources/queries/TestGroupByQuery/testDistinctAggregationWithUnion1.sql
* 
tajo-core/tajo-core-backend/src/test/resources/results/TestGroupByQuery/testDistinctAggregationWithUnion1.result
* 
tajo-core/tajo-core-backend/src/test/java/org/apache/tajo/master/TestExecutionBlockCursor.java
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/function/builtin/SumIntDistinct.java
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/function/builtin/SumFloatDistinct.java
* 
tajo-core/tajo-core-backend/src/test/resources/queries/TestGroupByQuery/testDistinctAggregation4.sql
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/planner/physical/SeqScanExec.java
* 
tajo-core/tajo-core-backend/src/test/java/org/apache/tajo/master/TestGlobalPlanner.java
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/planner/PlannerUtil.java
* 
tajo-core/tajo-core-backend/src/test/resources/results/TestGroupByQuery/testDistinctAggregation3.result
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/master/querymaster/QueryMaster.java
* CHANGES.txt
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/master/querymaster/Repartitioner.java
* tajo-common/src/main/java/org/apache/tajo/util/TUtil.java
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/function/builtin/SumFloat.java
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/planner/global/MasterPlan.java
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/planner/enforce/Enforcer.java
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/planner/global/GlobalPlanner.java
* 
tajo-core/tajo-core-backend/src/test/resources/results/TestGroupByQuery/testCountDistinct.result
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/planner/rewrite/FilterPushDownRule.java
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/planner/PhysicalPlannerImpl.java
* 
tajo-core/tajo-core-backend/src/test/resources/results/TestGroupByQuery/testDistinctAggregation2.result
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/function/builtin/SumDoubleDistinct.java
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/planner/ExprsVerifier.java
* 
tajo-core/tajo-core-backend/src/test/resources/results/TestGroupByQuery/testDistinctAggregation5.result
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/eval/FunctionEval.java
* 
tajo-core/tajo-core-backend/src/test/resources/queries/TestGroupByQuery/testDistinctAggregation2.sql
* 
tajo-core/tajo-core-backend/src/test/resources/queries/TestGroupByQuery/testDistinctAggregation5.sql
* 
tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/SortSpec.java
* 
tajo-core/tajo-core-backend/src/main/java/org/apache/tajo/engine/planner/global/DataChannel.java


> Improve distinct aggregation query processing
> ---------------------------------------------
>
>                 Key: TAJO-601
>                 URL: https://issues.apache.org/jira/browse/TAJO-601
>             Project: Tajo
>          Issue Type: Improvement
>          Components: planner/optimizer
>            Reporter: Hyunsik Choi
>            Assignee: Hyunsik Choi
>             Fix For: 0.8-incubating
>
>         Attachments: TAJO-601.patch, TAJO-601_140220_142800.patch
>
>
> Currently, distinct aggregation queries are executed as follows:
> * the first stage: it just shuffles tuples by hashing grouping keys.
> * the second stage: it sorts them and executes sort aggregation.
> This approach executes queries that include distinct aggregation functions in 
> only two stages, but it produces large intermediate data during the shuffle phase.
> This kind of query can be rewritten as two queries:
> {code:title=original query}
> SELECT grp1, grp2, count(*) as total, count(distinct grp3) as distinct_col 
> from rel1 group by grp1, grp2;
> {code}
> {code:title=rewritten query}
> SELECT grp1, grp2, sum(cnt) as total, count(grp3) as distinct_col from (
>   SELECT grp1, grp2, grp3, count(*) as cnt from rel1 group by grp1, grp2, 
> grp3) tmp1 group by grp1, grp2;
> {code}
> I'm expecting that this rewrite will significantly reduce the intermediate 
> data volume and query response time in most cases.



--
This message was sent by Atlassian JIRA
(v6.1.5#6160)

Reply via email to