Hi,

From what I can see, the join columns are dictionary-included, so during the join those columns all have to be dictionary-decoded, which can hurt performance. Please try keeping the join columns out of the dictionary (dictionary exclude) to avoid the decoding overhead.
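For example, a minimal sketch of the store_sales DDL with the join keys left out of DICTIONARY_INCLUDE (abbreviated to a few columns for illustration; in 1.x, integer columns not listed in DICTIONARY_INCLUDE are stored without dictionary encoding by default):

```sql
-- Sketch only: abbreviated store_sales schema. The join keys
-- ss_sold_date_sk and ss_item_sk are no longer listed under
-- DICTIONARY_INCLUDE, so they are stored without dictionary
-- encoding and the join does not need to decode them.
create table if not exists tpcds_carbon2.store_sales
(
  ss_sold_date_sk int,
  ss_item_sk int,
  ss_ticket_number int,
  ss_ext_sales_price decimal(7,2)
)
STORED BY 'carbondata'
TBLPROPERTIES ('DICTIONARY_INCLUDE'='ss_ticket_number')
```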
Regards,
Ravindra.

On 28 August 2017 at 09:19, lk_hadoop <[email protected]> wrote:

> hi, all:
>
> I have generated 20G of tpcds data in total and converted it to both
> carbondata and parquet; query3 performance was slower on carbondata:
>
>           carbon           parquet
> time1     2391.686318ms    899.256838ms
> time2     4129.92724ms     745.656853ms
> time3     1488.651428ms
>
> *query3:*
>
> SELECT dt.d_year, item.i_brand_id brand_id, item.i_brand brand,
>        SUM(ss_ext_sales_price) sum_agg
> FROM date_dim dt, store_sales, item
> WHERE dt.d_date_sk = store_sales.ss_sold_date_sk
>   AND store_sales.ss_item_sk = item.i_item_sk
>   AND item.i_manufact_id = 128
>   AND dt.d_moy = 11
> GROUP BY dt.d_year, item.i_brand, item.i_brand_id
> ORDER BY dt.d_year, sum_agg desc, brand_id
> LIMIT 100
>
> *carbondata tables:*
>
> create table if not exists tpcds_carbon2.date_dim
> (
>   d_date_sk int,
>   d_date_id char(16),
>   d_date date,
>   d_month_seq int,
>   d_week_seq int,
>   d_quarter_seq int,
>   d_year int,
>   d_dow int,
>   d_moy int,
>   d_dom int,
>   d_qoy int,
>   d_fy_year int,
>   d_fy_quarter_seq int,
>   d_fy_week_seq int,
>   d_day_name char(9),
>   d_quarter_name char(6),
>   d_holiday char(1),
>   d_weekend char(1),
>   d_following_holiday char(1),
>   d_first_dom int,
>   d_last_dom int,
>   d_same_day_ly int,
>   d_same_day_lq int,
>   d_current_day char(1),
>   d_current_week char(1),
>   d_current_month char(1),
>   d_current_quarter char(1),
>   d_current_year char(1)
> )
> STORED BY 'carbondata'
> TBLPROPERTIES ('DICTIONARY_INCLUDE'='d_date_sk,d_moy'
>   ,'DICTIONARY_EXCLUDE'='d_current_day,d_current_week,d_current_month,d_current_quarter,d_current_year,d_day_name,d_quarter_name,d_holiday,d_weekend,d_following_holiday'
>   ,'SORT_COLUMNS'='d_year')
>
> create table if not exists tpcds_carbon2.store_sales
> (
>   ss_sold_date_sk int,
>   ss_sold_time_sk int,
>   ss_item_sk int,
>   ss_customer_sk int,
>   ss_cdemo_sk int,
>   ss_hdemo_sk int,
>   ss_addr_sk int,
>   ss_store_sk int,
>   ss_promo_sk int,
>   ss_ticket_number int,
>   ss_quantity int,
>   ss_wholesale_cost decimal(7,2),
>   ss_list_price decimal(7,2),
>   ss_sales_price decimal(7,2),
>   ss_ext_discount_amt decimal(7,2),
>   ss_ext_sales_price decimal(7,2),
>   ss_ext_wholesale_cost decimal(7,2),
>   ss_ext_list_price decimal(7,2),
>   ss_ext_tax decimal(7,2),
>   ss_coupon_amt decimal(7,2),
>   ss_net_paid decimal(7,2),
>   ss_net_paid_inc_tax decimal(7,2),
>   ss_net_profit decimal(7,2)
> )
> STORED BY 'carbondata'
> TBLPROPERTIES ('DICTIONARY_INCLUDE'='ss_item_sk,ss_ticket_number,ss_sold_date_sk')
>
> create table if not exists tpcds_carbon2.item
> (
>   i_item_sk int,
>   i_item_id char(16),
>   i_rec_start_date date,
>   i_rec_end_date date,
>   i_item_desc varchar(200),
>   i_current_price decimal(7,2),
>   i_wholesale_cost decimal(7,2),
>   i_brand_id int,
>   i_brand char(50),
>   i_class_id int,
>   i_class char(50),
>   i_category_id int,
>   i_category char(50),
>   i_manufact_id int,
>   i_manufact char(50),
>   i_size char(20),
>   i_formulation char(20),
>   i_color char(20),
>   i_units char(10),
>   i_container char(10),
>   i_manager_id int,
>   i_product_name char(50)
> )
> STORED BY 'carbondata'
> TBLPROPERTIES ('DICTIONARY_INCLUDE'='i_item_sk,i_brand_id,i_class_id,i_category_id,i_manufact_id'
>   ,'DICTIONARY_EXCLUDE'='i_item_desc,i_brand,i_class,i_category,i_manufact'
>   ,'SORT_COLUMNS'='i_brand_id')
>
> Does anyone know how to improve the performance?
>
> 2017-08-28
> ------------------------------
> lk_hadoop

--
Thanks & Regards,
Ravi
