Some additional information:
for the last query, there are three execution blocks (EBs); each EB's running time:
eb_1425543096152_0001_000001 -> 15secs
eb_1425543096152_0001_000002 -> 40secs
eb_1425543096152_0001_000003 -> 80secs
Distributed Query Plan
-------------------------------------------------------------------------------
Execution Block Graph (TERMINAL - eb_1425543096152_0001_000004)
-------------------------------------------------------------------------------
|-eb_1425543096152_0001_000004
|-eb_1425543096152_0001_000003
|-eb_1425543096152_0001_000002
|-eb_1425543096152_0001_000001
-------------------------------------------------------------------------------
Order of Execution
-------------------------------------------------------------------------------
1: eb_1425543096152_0001_000001
2: eb_1425543096152_0001_000002
3: eb_1425543096152_0001_000003
4: eb_1425543096152_0001_000004
-------------------------------------------------------------------------------
=======================================================
Block Id: eb_1425543096152_0001_000001 [LEAF]
=======================================================
[Outgoing]
[q_1425543096152_0001] 1 => 2 (type=HASH_SHUFFLE, key=?distinctseq
(INT2), default.tds_did_user_targ_day.dvc_id (TEXT), num=7)
[Enforcers]
0: type=Distinct,alg=hash
DISTINCT_GROUP_BY(9)()
=> exprs: (count( distinct default.tds_did_user_targ_day.dvc_id (TEXT)))
=> target list: ?distinctseq (INT2),
default.tds_did_user_targ_day.dvc_id (TEXT)
=> out schema:{(2) ?distinctseq
(INT2),default.tds_did_user_targ_day.dvc_id (TEXT)}
=> in schema:{(1) default.tds_did_user_targ_day.dvc_id (TEXT)}
=> distinct: true, GROUP_BY(10)(dvc_id), exprs: (count( distinct
default.tds_did_user_targ_day.dvc_id (TEXT))), target
list:{default.tds_did_user_targ_day.dvc_id (TEXT), ?count (INT8)}, out
schema:{(2) default.tds_did_user_targ_day.dvc_id (TEXT),?count
(INT8)}, in schema:{(1) default.tds_did_user_targ_day.dvc_id (TEXT)}
PARTITIONS_SCAN(5) on default.tds_did_user_targ_day
=> target list: default.tds_did_user_targ_day.dvc_id (TEXT)
=> out schema: {(1) default.tds_did_user_targ_day.dvc_id (TEXT)}
=> in schema: {(91) default.tds_did_user_targ_day.dvc_id
(TEXT),default.tds_did_user_targ_day.user_id
(TEXT),default.tds_did_user_targ_day.p1
(TEXT),default.tds_did_user_targ_day.p2
(TEXT),default.tds_did_user_targ_day.p3
(TEXT),default.tds_did_user_targ_day.prod_code
(TEXT),default.tds_did_user_targ_day.login_ip
(TEXT),default.tds_did_user_targ_day.cntry_name
(TEXT),default.tds_did_user_targ_day.area_name
(TEXT),default.tds_did_user_targ_day.prov_name
(TEXT),default.tds_did_user_targ_day.city_name
(TEXT),default.tds_did_user_targ_day.chnl_type
(TEXT),default.tds_did_user_targ_day.chnl_type_name
(TEXT),default.tds_did_user_targ_day.chnl_code
(TEXT),default.tds_did_user_targ_day.chnl_name
(TEXT),default.tds_did_user_targ_day.login_ref
(TEXT),default.tds_did_user_targ_day.net_type
(TEXT),default.tds_did_user_targ_day.oper_sys
(TEXT),default.tds_did_user_targ_day.oper_sys_ver
(TEXT),default.tds_did_user_targ_day.dvc_brand
(TEXT),default.tds_did_user_targ_day.dvc_model
(TEXT),default.tds_did_user_targ_day.dvc_type
(TEXT),default.tds_did_user_targ_day.dev_dpi
(TEXT),default.tds_did_user_targ_day.brows_name
(TEXT),default.tds_did_user_targ_day.login_ts
(TEXT),default.tds_did_user_targ_day.first_login_date
(TEXT),default.tds_did_user_targ_day.first_login_ver
(TEXT),default.tds_did_user_targ_day.last_login_date
(TEXT),default.tds_did_user_targ_day.last_app_ver
(TEXT),default.tds_did_user_targ_day.evil_ip
(TEXT),default.tds_did_user_targ_day.day_pv
(TEXT),default.tds_did_user_targ_day.day_input_pv
(TEXT),default.tds_did_user_targ_day.day_ins_pv
(TEXT),default.tds_did_user_targ_day.day_qry_pv
(TEXT),default.tds_did_user_targ_day.day_outs_pv
(TEXT),default.tds_did_user_targ_day.day_coop_pv
(TEXT),default.tds_did_user_targ_day.day_vv
(TEXT),default.tds_did_user_targ_day.day_cv
(TEXT),default.tds_did_user_targ_day.day_pt
(TEXT),default.tds_did_user_targ_day.day_vod_vv
(TEXT),default.tds_did_user_targ_day.day_vod_cv
(TEXT),default.tds_did_user_targ_day.day_vod_pt
(TEXT),default.tds_did_user_targ_day.day_live_vv
(TEXT),default.tds_did_user_targ_day.day_live_cv
(TEXT),default.tds_did_user_targ_day.day_live_pt
(TEXT),default.tds_did_user_targ_day.day_ca_vv
(TEXT),default.tds_did_user_targ_day.day_ca_cv
(TEXT),default.tds_did_user_targ_day.day_ca_pt
(TEXT),default.tds_did_user_targ_day.day_try_vv
(TEXT),default.tds_did_user_targ_day.day_try_cv
(TEXT),default.tds_did_user_targ_day.day_try_pt
(TEXT),default.tds_did_user_targ_day.day_pay_vv
(TEXT),default.tds_did_user_targ_day.day_pay_cv
(TEXT),default.tds_did_user_targ_day.day_pay_pt
(TEXT),default.tds_did_user_targ_day.day_off_vv
(TEXT),default.tds_did_user_targ_day.day_off_cv
(TEXT),default.tds_did_user_targ_day.day_off_pt
(TEXT),default.tds_did_user_targ_day.block_ts
(TEXT),default.tds_did_user_targ_day.day_drag_ts
(TEXT),default.tds_did_user_targ_day.day_drag_ahd_ts
(TEXT),default.tds_did_user_targ_day.day_drag_bwd_ts
(TEXT),default.tds_did_user_targ_day.day_click_ts
(TEXT),default.tds_did_user_targ_day.day_instl_ts
(TEXT),default.tds_did_user_targ_day.day_stup_ts
(TEXT),default.tds_did_user_targ_day.day_movie_vv
(TEXT),default.tds_did_user_targ_day.day_movie_cv
(TEXT),default.tds_did_user_targ_day.day_movie_pt
(TEXT),default.tds_did_user_targ_day.day_tvp_vv
(TEXT),default.tds_did_user_targ_day.day_tvp_cv
(TEXT),default.tds_did_user_targ_day.day_tvp_pt
(TEXT),default.tds_did_user_targ_day.day_cartn_vv
(TEXT),default.tds_did_user_targ_day.day_cartn_cv
(TEXT),default.tds_did_user_targ_day.day_cartn_pt
(TEXT),default.tds_did_user_targ_day.day_var_vv
(TEXT),default.tds_did_user_targ_day.day_var_cv
(TEXT),default.tds_did_user_targ_day.day_var_pt
(TEXT),default.tds_did_user_targ_day.day_amuse_vv
(TEXT),default.tds_did_user_targ_day.day_amuse_cv
(TEXT),default.tds_did_user_targ_day.day_amuse_pt
(TEXT),default.tds_did_user_targ_day.day_sport_vv
(TEXT),default.tds_did_user_targ_day.day_sport_cv
(TEXT),default.tds_did_user_targ_day.day_sport_pt
(TEXT),default.tds_did_user_targ_day.day_music_vv
(TEXT),default.tds_did_user_targ_day.day_music_cv
(TEXT),default.tds_did_user_targ_day.day_music_pt
(TEXT),default.tds_did_user_targ_day.day_fin_vv
(TEXT),default.tds_did_user_targ_day.day_fin_cv
(TEXT),default.tds_did_user_targ_day.day_fin_pt
(TEXT),default.tds_did_user_targ_day.day_hot_vv
(TEXT),default.tds_did_user_targ_day.day_hot_cv
(TEXT),default.tds_did_user_targ_day.day_hot_pt (TEXT)}
=======================================================
Block Id: eb_1425543096152_0001_000002 [INTERMEDIATE]
=======================================================
[Incoming]
[q_1425543096152_0001] 1 => 2 (type=HASH_SHUFFLE, key=?distinctseq
(INT2), default.tds_did_user_targ_day.dvc_id (TEXT), num=7)
[Outgoing]
[q_1425543096152_0001] 2 => 3 (type=HASH_SHUFFLE, key=, num=1)
[Enforcers]
0: type=Distinct,alg=hash
DISTINCT_GROUP_BY(11)()
=> exprs: (count( distinct default.tds_did_user_targ_day.dvc_id (TEXT)))
=> target list: ?distinctseq (INT2),
default.tds_did_user_targ_day.dvc_id (TEXT)
=> out schema:{(2) ?distinctseq
(INT2),default.tds_did_user_targ_day.dvc_id (TEXT)}
=> in schema:{(2) ?distinctseq
(INT2),default.tds_did_user_targ_day.dvc_id (TEXT)}
=> distinct: true, GROUP_BY(12)(dvc_id), exprs: (count( distinct
default.tds_did_user_targ_day.dvc_id (TEXT))), target
list:{default.tds_did_user_targ_day.dvc_id (TEXT), ?count (INT8)}, out
schema:{(2) default.tds_did_user_targ_day.dvc_id (TEXT),?count
(INT8)}, in schema:{(1) default.tds_did_user_targ_day.dvc_id (TEXT)}
SCAN(15) on eb_1425543096152_0001_000001
=> out schema: {(2) ?distinctseq
(INT2),default.tds_did_user_targ_day.dvc_id (TEXT)}
=> in schema: {(2) ?distinctseq
(INT2),default.tds_did_user_targ_day.dvc_id (TEXT)}
=======================================================
Block Id: eb_1425543096152_0001_000003 [ROOT]
=======================================================
[Incoming]
[q_1425543096152_0001] 2 => 3 (type=HASH_SHUFFLE, key=, num=1)
[Enforcers]
0: type=Distinct,alg=sort,keys=default.tds_did_user_targ_day.dvc_id
DISTINCT_GROUP_BY(13)()
=> exprs: (count( distinct default.tds_did_user_targ_day.dvc_id (TEXT)))
=> target list: ?distinctseq (INT2),
default.tds_did_user_targ_day.dvc_id (TEXT)
=> out schema:{(1) ?count (INT8)}
=> in schema:{(2) ?distinctseq
(INT2),default.tds_did_user_targ_day.dvc_id (TEXT)}
=> distinct: true, GROUP_BY(14)(dvc_id), exprs: (count( distinct
default.tds_did_user_targ_day.dvc_id (TEXT))), target
list:{default.tds_did_user_targ_day.dvc_id (TEXT), ?count (INT8)}, out
schema:{(2) default.tds_did_user_targ_day.dvc_id (TEXT),?count
(INT8)}, in schema:{(1) default.tds_did_user_targ_day.dvc_id (TEXT)}
SCAN(16) on eb_1425543096152_0001_000002
=> out schema: {(2) ?distinctseq
(INT2),default.tds_did_user_targ_day.dvc_id (TEXT)}
=> in schema: {(2) ?distinctseq
(INT2),default.tds_did_user_targ_day.dvc_id (TEXT)}
=======================================================
Block Id: eb_1425543096152_0001_000004 [TERMINAL]
=======================================================
On Thu, Mar 5, 2015 at 3:49 PM, Azuryy Yu <[email protected]> wrote:
> Thanks Jihoon.
>
> My test dataset has 20 files under one partition with RCFile format. (96
> columns), the first column is deviceID.
> I only tested this one partition; if I use all partitions,
> count(distinct) is even slower.
>
> I've set HDFS replication to 9.(I have 9 datanodes) , HDFS block size:
> 64MB, set dfs.datanode.hdfs-blocks-metadata.enabled=true
>
> the following is some test results:
>
> 1.
> default> select count(*) from tds_did_user_targ_day where dt='20150228'
> and platform='pc';
> Progress: 0%, response time: 0.306 sec
> Progress: 0%, response time: 0.307 sec
> Progress: 3%, response time: 0.711 sec
> Progress: 21%, response time: 1.513 sec
> Progress: 43%, response time: 2.514 sec
> Progress: 100%, response time: 3.139 sec
> ?count
> -------------------------------
> 35743711
> (1 rows, 3.139 sec, 9 B selected)
>
> 2. default> select sum(cast(day_movie_vv as bigint)),
> sum(cast(day_movie_cv as bigint)), sum(cast(day_movie_pt as bigint))
> from tds_did_user_targ_day where dt= '20150228' and platform='pc';
> Progress: 0%, response time: 0.299 sec
> Progress: 0%, response time: 0.299 sec
> Progress: 0%, response time: 0.7 sec
> Progress: 6%, response time: 1.502 sec
> Progress: 21%, response time: 2.503 sec
> Progress: 32%, response time: 3.504 sec
> Progress: 44%, response time: 4.505 sec
> Progress: 50%, response time: 5.506 sec
> Progress: 100%, response time: 5.568 sec
> ?sum_3, ?sum_4, ?sum_5
> -------------------------------
> 7302934, 6453007, 6504000842
> (1 rows, 5.568 sec, 27 B selected)
>
>
> 3)
> default> select count(distinct dvc_id) from tds_did_user_targ_day where
> dt= '20150228' and platform='pc';
> Progress: 0%, response time: 0.3 sec
> Progress: 0%, response time: 0.301 sec
> Progress: 0%, response time: 0.702 sec
> Progress: 2%, response time: 1.503 sec
> Progress: 3%, response time: 2.504 sec
> Progress: 4%, response time: 3.506 sec
> Progress: 10%, response time: 4.507 sec
> Progress: 14%, response time: 5.508 sec
> Progress: 17%, response time: 6.509 sec
> Progress: 21%, response time: 7.51 sec
> Progress: 25%, response time: 8.511 sec
> Progress: 27%, response time: 9.512 sec
> Progress: 28%, response time: 10.513 sec
> Progress: 33%, response time: 11.514 sec
> Progress: 33%, response time: 12.516 sec
> Progress: 33%, response time: 13.52 sec
> Progress: 50%, response time: 14.523 sec
> Progress: 50%, response time: 15.525 sec
> Progress: 50%, response time: 16.527 sec
> Progress: 50%, response time: 17.529 sec
> Progress: 50%, response time: 18.53 sec
> Progress: 50%, response time: 19.531 sec
> Progress: 50%, response time: 20.533 sec
> Progress: 51%, response time: 21.534 sec
> Progress: 51%, response time: 22.535 sec
> Progress: 51%, response time: 23.536 sec
> Progress: 52%, response time: 24.538 sec
> Progress: 52%, response time: 25.539 sec
> Progress: 54%, response time: 26.54 sec
> Progress: 54%, response time: 27.542 sec
> Progress: 54%, response time: 28.543 sec
> Progress: 54%, response time: 29.545 sec
> Progress: 55%, response time: 30.546 sec
> Progress: 56%, response time: 31.547 sec
> Progress: 57%, response time: 32.549 sec
> Progress: 60%, response time: 33.55 sec
> Progress: 60%, response time: 34.551 sec
> Progress: 63%, response time: 35.552 sec
> Progress: 64%, response time: 36.554 sec
> Progress: 65%, response time: 37.555 sec
> Progress: 66%, response time: 38.556 sec
> Progress: 66%, response time: 39.559 sec
> Progress: 66%, response time: 40.563 sec
> Progress: 66%, response time: 41.567 sec
> Progress: 66%, response time: 42.571 sec
> Progress: 66%, response time: 43.575 sec
> Progress: 66%, response time: 44.579 sec
> Progress: 66%, response time: 45.584 sec
> Progress: 66%, response time: 46.588 sec
> Progress: 66%, response time: 47.592 sec
> Progress: 66%, response time: 48.596 sec
> Progress: 66%, response time: 49.6 sec
> Progress: 66%, response time: 50.601 sec
> Progress: 83%, response time: 51.602 sec
> Progress: 83%, response time: 52.603 sec
> Progress: 83%, response time: 53.604 sec
> Progress: 83%, response time: 54.605 sec
> Progress: 84%, response time: 55.606 sec
> Progress: 84%, response time: 56.607 sec
> Progress: 84%, response time: 57.608 sec
> Progress: 84%, response time: 58.609 sec
> Progress: 84%, response time: 59.61 sec
> Progress: 85%, response time: 60.612 sec
> Progress: 85%, response time: 61.613 sec
> Progress: 85%, response time: 62.614 sec
> Progress: 86%, response time: 63.615 sec
> Progress: 86%, response time: 64.616 sec
> Progress: 86%, response time: 65.617 sec
> Progress: 88%, response time: 66.618 sec
> Progress: 88%, response time: 67.619 sec
> Progress: 88%, response time: 68.62 sec
> Progress: 89%, response time: 69.621 sec
> Progress: 89%, response time: 70.622 sec
> Progress: 89%, response time: 71.623 sec
> Progress: 89%, response time: 72.624 sec
> Progress: 90%, response time: 73.625 sec
> Progress: 90%, response time: 74.627 sec
> Progress: 90%, response time: 75.628 sec
> Progress: 90%, response time: 76.629 sec
> Progress: 90%, response time: 77.63 sec
> Progress: 90%, response time: 78.632 sec
> Progress: 90%, response time: 79.633 sec
> Progress: 90%, response time: 80.634 sec
> Progress: 90%, response time: 81.636 sec
> Progress: 86%, response time: 82.637 sec
> Progress: 86%, response time: 83.638 sec
> Progress: 86%, response time: 84.64 sec
> Progress: 86%, response time: 85.641 sec
> Progress: 86%, response time: 86.642 sec
> Progress: 86%, response time: 87.643 sec
> Progress: 88%, response time: 88.645 sec
> Progress: 88%, response time: 89.646 sec
> Progress: 88%, response time: 90.647 sec
> Progress: 92%, response time: 91.648 sec
> Progress: 92%, response time: 92.649 sec
> Progress: 92%, response time: 93.65 sec
> Progress: 92%, response time: 94.651 sec
> Progress: 93%, response time: 95.652 sec
> Progress: 93%, response time: 96.653 sec
> Progress: 94%, response time: 97.654 sec
> Progress: 94%, response time: 98.655 sec
> Progress: 95%, response time: 99.656 sec
> Progress: 95%, response time: 100.657 sec
> Progress: 95%, response time: 101.658 sec
> Progress: 95%, response time: 102.659 sec
> Progress: 96%, response time: 103.66 sec
> Progress: 96%, response time: 104.661 sec
> Progress: 97%, response time: 105.662 sec
> Progress: 97%, response time: 106.663 sec
> Progress: 99%, response time: 107.665 sec
> Progress: 99%, response time: 108.666 sec
> Progress: 100%, response time: 109.074 sec
> ?count
> -------------------------------
> 35620158
> (1 rows, 109.074 sec, 9 B selected)
>
> For the last query, Logic plan:
> Logical Plan
>
> -----------------------------
> Query Block Graph
> -----------------------------
> |-#ROOT
> -----------------------------
> Optimization Log:
> [LogicalPlan]
> > ProjectionNode is eliminated.
> > PartitionTableRewriter chooses 1 of partitions
> -----------------------------
>
> GROUP_BY(3)()
> => exprs: (count( distinct default.tds_did_user_targ_day.dvc_id (TEXT)))
> => target list: ?count (INT8)
> => out schema:{(1) ?count (INT8)}
> => in schema:{(1) default.tds_did_user_targ_day.dvc_id (TEXT)}
> PARTITIONS_SCAN(5) on default.tds_did_user_targ_day
> => target list: default.tds_did_user_targ_day.dvc_id (TEXT)
> => num of filtered paths: 1
> => out schema: {(1) default.tds_did_user_targ_day.dvc_id (TEXT)}
> => in schema: {(91) default.tds_did_user_targ_day.dvc_id
> (TEXT),default.tds_did_user_targ_day.user_id
> (TEXT),default.tds_did_user_targ_day.p1
> (TEXT),default.tds_did_user_targ_day.p2
> (TEXT),default.tds_did_user_targ_day.p3
> (TEXT),default.tds_did_user_targ_day.prod_code
> (TEXT),default.tds_did_user_targ_day.login_ip
> (TEXT),default.tds_did_user_targ_day.cntry_name
> (TEXT),default.tds_did_user_targ_day.area_name
> (TEXT),default.tds_did_user_targ_day.prov_name
> (TEXT),default.tds_did_user_targ_day.city_name
> (TEXT),default.tds_did_user_targ_day.chnl_type
> (TEXT),default.tds_did_user_targ_day.chnl_type_name
> (TEXT),default.tds_did_user_targ_day.chnl_code
> (TEXT),default.tds_did_user_targ_day.chnl_name
> (TEXT),default.tds_did_user_targ_day.login_ref
> (TEXT),default.tds_did_user_targ_day.net_type
> (TEXT),default.tds_did_user_targ_day.oper_sys
> (TEXT),default.tds_did_user_targ_day.oper_sys_ver
> (TEXT),default.tds_did_user_targ_day.dvc_brand
> (TEXT),default.tds_did_user_targ_day.dvc_model
> (TEXT),default.tds_did_user_targ_day.dvc_type
> (TEXT),default.tds_did_user_targ_day.dev_dpi
> (TEXT),default.tds_did_user_targ_day.brows_name
> (TEXT),default.tds_did_user_targ_day.login_ts
> (TEXT),default.tds_did_user_targ_day.first_login_date
> (TEXT),default.tds_did_user_targ_day.first_login_ver
> (TEXT),default.tds_did_user_targ_day.last_login_date
> (TEXT),default.tds_did_user_targ_day.last_app_ver
> (TEXT),default.tds_did_user_targ_day.evil_ip
> (TEXT),default.tds_did_user_targ_day.day_pv
> (TEXT),default.tds_did_user_targ_day.day_input_pv
> (TEXT),default.tds_did_user_targ_day.day_ins_pv
> (TEXT),default.tds_did_user_targ_day.day_qry_pv
> (TEXT),default.tds_did_user_targ_day.day_outs_pv
> (TEXT),default.tds_did_user_targ_day.day_coop_pv
> (TEXT),default.tds_did_user_targ_day.day_vv
> (TEXT),default.tds_did_user_targ_day.day_cv
> (TEXT),default.tds_did_user_targ_day.day_pt
> (TEXT),default.tds_did_user_targ_day.day_vod_vv
> (TEXT),default.tds_did_user_targ_day.day_vod_cv
> (TEXT),default.tds_did_user_targ_day.day_vod_pt
> (TEXT),default.tds_did_user_targ_day.day_live_vv
> (TEXT),default.tds_did_user_targ_day.day_live_cv
> (TEXT),default.tds_did_user_targ_day.day_live_pt
> (TEXT),default.tds_did_user_targ_day.day_ca_vv
> (TEXT),default.tds_did_user_targ_day.day_ca_cv
> (TEXT),default.tds_did_user_targ_day.day_ca_pt
> (TEXT),default.tds_did_user_targ_day.day_try_vv
> (TEXT),default.tds_did_user_targ_day.day_try_cv
> (TEXT),default.tds_did_user_targ_day.day_try_pt
> (TEXT),default.tds_did_user_targ_day.day_pay_vv
> (TEXT),default.tds_did_user_targ_day.day_pay_cv
> (TEXT),default.tds_did_user_targ_day.day_pay_pt
> (TEXT),default.tds_did_user_targ_day.day_off_vv
> (TEXT),default.tds_did_user_targ_day.day_off_cv
> (TEXT),default.tds_did_user_targ_day.day_off_pt
> (TEXT),default.tds_did_user_targ_day.block_ts
> (TEXT),default.tds_did_user_targ_day.day_drag_ts
> (TEXT),default.tds_did_user_targ_day.day_drag_ahd_ts
> (TEXT),default.tds_did_user_targ_day.day_drag_bwd_ts
> (TEXT),default.tds_did_user_targ_day.day_click_ts
> (TEXT),default.tds_did_user_targ_day.day_instl_ts
> (TEXT),default.tds_did_user_targ_day.day_stup_ts
> (TEXT),default.tds_did_user_targ_day.day_movie_vv
> (TEXT),default.tds_did_user_targ_day.day_movie_cv
> (TEXT),default.tds_did_user_targ_day.day_movie_pt
> (TEXT),default.tds_did_user_targ_day.day_tvp_vv
> (TEXT),default.tds_did_user_targ_day.day_tvp_cv
> (TEXT),default.tds_did_user_targ_day.day_tvp_pt
> (TEXT),default.tds_did_user_targ_day.day_cartn_vv
> (TEXT),default.tds_did_user_targ_day.day_cartn_cv
> (TEXT),default.tds_did_user_targ_day.day_cartn_pt
> (TEXT),default.tds_did_user_targ_day.day_var_vv
> (TEXT),default.tds_did_user_targ_day.day_var_cv
> (TEXT),default.tds_did_user_targ_day.day_var_pt
> (TEXT),default.tds_did_user_targ_day.day_amuse_vv
> (TEXT),default.tds_did_user_targ_day.day_amuse_cv
> (TEXT),default.tds_did_user_targ_day.day_amuse_pt
> (TEXT),default.tds_did_user_targ_day.day_sport_vv
> (TEXT),default.tds_did_user_targ_day.day_sport_cv
> (TEXT),default.tds_did_user_targ_day.day_sport_pt
> (TEXT),default.tds_did_user_targ_day.day_music_vv
> (TEXT),default.tds_did_user_targ_day.day_music_cv
> (TEXT),default.tds_did_user_targ_day.day_music_pt
> (TEXT),default.tds_did_user_targ_day.day_fin_vv
> (TEXT),default.tds_did_user_targ_day.day_fin_cv
> (TEXT),default.tds_did_user_targ_day.day_fin_pt
> (TEXT),default.tds_did_user_targ_day.day_hot_vv
> (TEXT),default.tds_did_user_targ_day.day_hot_cv
> (TEXT),default.tds_did_user_targ_day.day_hot_pt (TEXT)}
> => 0:
> hdfs://realtime-cluster/data/basetable/tds_did_user_targ_day/dt=20150228/platform=pc
>
>
>
>
>
>
>
>
> On Thu, Mar 5, 2015 at 3:17 PM, Jihoon Son <[email protected]> wrote:
>
>> Hi Azuryy,
>> truly sorry for late response.
>> I left some comments below.
>>
>> Sincerely,
>> Jihoon
>>
>> On Wed, Mar 4, 2015 at 7:15 PM Azuryy Yu <[email protected]> wrote:
>>
>> > Hi,
>> >
>> > I read the Tajo-0.9.0 source code, and I found that Tajo uses a simple FIFO
>> > scheduler.
>> >
>> > I accept this in the current stage. But when Tajo peeks a query from the
>> > scheduler queue, it then allocates workers for this query.
>> >
>> > The allocator only considers available resources on a random worker list, and then
>> > specifies a set of workers.
>> >
>> > 1)
>> > So my question is: why don't we consider HDFS locality? Otherwise the
>> network
>> > will be the bottleneck.
>> >
>> > I understand Tajo doesn't use YARN as a scheduler currently, and wrote a
>> > temporary simple FIFO scheduler instead. I have also looked at
>> > https://issues.apache.org/jira/browse/TAJO-540 , I hope new Tajo
>> scheduler
>> > similar to Sparrow.
>> >
>> It seems that there are some misunderstandings on our resource
>> scheduling. The
>> FIFO scheduler has a role of the query scheduler. That is, given a list of
>> submitted queries, it reserves resources required to execute queries
>> consecutively. The Sparrow-like scheduler can be used for the concurrent
>> execution of multiple queries.
>>
>> Once a query is started, the *task scheduler* is responsible for
>> allocating
>> tasks to workers. As you said, tasks are allocated to workers if they have
>> enough resources. However when allocating tasks, our task scheduler
>> considers the physical disk location where the data is stored on as well
>> as
>> the location of the node containing data. For example, with your cluster,
>> each worker can be assigned 12 tasks, each of which processes data stored
>> on
>> 12 different disks. Since a worker is generally equipped with multiple disks,
>> this approach can utilize the disk bandwidth efficiently.
>>
>> You can see the locality information in the Tajo's query master log. Here
>> is an example.
>> ...
>> 2015-03-05 15:14:12,662 INFO
>> org.apache.tajo.querymaster.DefaultTaskScheduler: Assigned
>> Local/Rack/Total: (9264/1555/10819), Locality: 85.63%, Rack host:
>> xxx.xxx.xxx.xxx
>> ...
>>
>>
>> > 2) performance related.
>> > I setup 10 nodes clusters, (1 master, 9 workers)
>> >
>> > 64GB mem, 24CPU, 12*4TB HDD, 1.6GB test data.(160 million records)
>> >
>> > It's works good for some agg sql tests except count(distinct)
>> >
>> > count(distinct) is very slow - ten minutes.
>> >
>> This result looks strange and difficult to find what makes the query
>> execution slow.
>> Would you mind sharing some logs and additional information of input data
>> (# of files, the distribution of data on HDFS)?
>> In addition, it would be great if you share the evaluation results of
>> other
>> queries which you think the response time is sufficiently short.
>>
>> >
>> > who can give me a simple explanation of how Tajo works with
>> > count(distinct), I can share my tajo-site here:
>> >
>> > <configuration>
>> > <property>
>> > <name>tajo.rootdir</name>
>> > <value>hdfs://realtime-cluster/tajo</value>
>> > </property>
>> >
>> > <!-- master -->
>> > <property>
>> > <name>tajo.master.umbilical-rpc.address</name>
>> > <value>xx:26001</value>
>> > </property>
>> > <property>
>> > <name>tajo.master.client-rpc.address</name>
>> > <value>xx:26002</value>
>> > </property>
>> > <property>
>> > <name>tajo.master.info-http.address</name>
>> > <value>xx:26080</value>
>> > </property>
>> > <property>
>> > <name>tajo.resource-tracker.rpc.address</name>
>> > <value>xx:26003</value>
>> > </property>
>> > <property>
>> > <name>tajo.catalog.client-rpc.address</name>
>> > <value>xx:26005</value>
>> > </property>
>> > <!-- worker -->
>> > <property>
>> > <name>tajo.worker.tmpdir.locations</name>
>> >
>> > <value>file:///data/hadoop/data1/tajo,file:///data/hadoop/
>> > data2/tajo,file:///data/hadoop/data3/tajo,file:///
>> > data/hadoop/data4/tajo,file:///data/hadoop/data5/tajo,file:/
>> > //data/hadoop/data6/tajo,file:///data/hadoop/data7/tajo,
>> > file:///data/hadoop/data8/tajo,file:///data/hadoop/data9/tajo,file:///
>> > data/hadoop/data10/tajo,file:///data/hadoop/data11/tajo,file:/
>> > //data/hadoop/data12/tajo</value>
>> > </property>
>> > <property>
>> > <name>tajo.worker.tmpdir.cleanup-at-startup</name>
>> > <value>true</value>
>> > </property>
>> > <property>
>> > <name>tajo.worker.history.expire-interval-minutes</name>
>> > <value>60</value>
>> > </property>
>> > <property>
>> > <name>tajo.worker.resource.tajo.worker.resource.cpu-cores</name>
>> > <value>24</value>
>> > </property>
>> > <property>
>> > <name>tajo.worker.resource.memory-mb</name>
>> > <value>60512</value> <!-- 3584 3 tasks + 1 qm task -->
>> > </property>
>> > <property>
>> > <name>tajo.task.memory-slot-mb.default</name>
>> > <value>3000</value> <!-- default 512 -->
>> > </property>
>> > <property>
>> > <name>tajo.task.disk-slot.default</name>
>> > <value>1.0f</value> <!-- default 0.5 -->
>> > </property>
>> > <property>
>> > <name>tajo.shuffle.fetcher.parallel-execution.max-num</name>
>> > <value>5</value>
>> > </property>
>> > <property>
>> > <name>tajo.executor.external-sort.thread-num</name>
>> > <value>2</value>
>> > </property>
>> > <!-- client -->
>> > <property>
>> > <name>tajo.rpc.client.worker-thread-num</name>
>> > <value>4</value>
>> > </property>
>> > <property>
>> > <name>tajo.cli.print.pause</name>
>> > <value>false</value>
>> > </property>
>> > <!--
>> > <property>
>> > <name>tajo.worker.resource.dfs-dir-aware</name>
>> > <value>true</value>
>> > </property>
>> > <property>
>> > <name>tajo.worker.resource.dedicated</name>
>> > <value>true</value>
>> > </property>
>> > <property>
>> > <name>tajo.worker.resource.dedicated-memory-ratio</name>
>> > <value>0.6</value>
>> > </property>
>> > -->
>> > </configuration>
>> >
>> >
>> > tajo-env:
>> >
>> > export TAJO_WORKER_HEAPSIZE=60000
>> >
>>
>
>