tpcds query3 slower than parquet

classic Classic list List threaded Threaded
3 messages Options
Reply | Threaded
Open this post in threaded view
|

tpcds query3 slower than parquet

lk_hadoop
hi,all:
 
I have make total 20G tpcds data , and I chang it to both carbondata and parquet type, query3  performance was slower:
 
                     carbon                     parquet
time1        2391.686318ms         899.256838ms
time2        4129.92724ms           745.656853ms
time3        1488.651428ms
 
 
query3:
 
SELECT dt.d_year, item.i_brand_id brand_id, item.i_brand brand,SUM(ss_ext_sales_price) sum_agg
 FROM  date_dim dt, store_sales, item
 WHERE dt.d_date_sk = store_sales.ss_sold_date_sk
   AND store_sales.ss_item_sk = item.i_item_sk
   AND item.i_manufact_id = 128
   AND dt.d_moy=11
 GROUP BY dt.d_year, item.i_brand, item.i_brand_id
 ORDER BY dt.d_year, sum_agg desc, brand_id
 LIMIT 100
 
 
carbondata table :
 
create table if not exists tpcds_carbon2.date_dim
(
    d_date_sk                 int               ,
    d_date_id                 char(16)              ,
    d_date                    date                          ,
    d_month_seq               int                       ,
    d_week_seq                int                       ,
    d_quarter_seq             int                       ,
    d_year                    int                       ,
    d_dow                     int                       ,
    d_moy                     int                       ,
    d_dom                     int                       ,
    d_qoy                     int                       ,
    d_fy_year                 int                       ,
    d_fy_quarter_seq          int                       ,
    d_fy_week_seq             int                       ,
    d_day_name                char(9)                       ,
    d_quarter_name            char(6)                       ,
    d_holiday                 char(1)                       ,
    d_weekend                 char(1)                       ,
    d_following_holiday       char(1)                       ,
    d_first_dom               int                       ,
    d_last_dom                int                       ,
    d_same_day_ly             int                       ,
    d_same_day_lq             int                       ,
    d_current_day             char(1)                       ,
    d_current_week            char(1)                       ,
    d_current_month           char(1)                       ,
    d_current_quarter         char(1)                       ,
    d_current_year            char(1)        
)
STORED BY 'carbondata'
TBLPROPERTIES ('DICTIONARY_INCLUDE'='d_date_sk,d_moy'
,'DICTIONARY_EXCLUDE'='d_current_day,d_current_week,d_current_month
,d_current_quarter,d_current_year,d_day_name,d_quarter_name,d_holiday,d_weekend,d_following_holiday'
,'SORT_COLUMNS'='d_year')
 

create table if not exists tpcds_carbon2.store_sales
(
    ss_sold_date_sk           int                       ,
    ss_sold_time_sk           int                       ,
    ss_item_sk                int               ,
    ss_customer_sk            int                       ,
    ss_cdemo_sk               int                       ,
    ss_hdemo_sk               int                       ,
    ss_addr_sk                int                       ,
    ss_store_sk               int                       ,
    ss_promo_sk               int                       ,
    ss_ticket_number          int               ,
    ss_quantity               int                       ,
    ss_wholesale_cost         decimal(7,2)                  ,
    ss_list_price             decimal(7,2)                  ,
    ss_sales_price            decimal(7,2)                  ,
    ss_ext_discount_amt       decimal(7,2)                  ,
    ss_ext_sales_price        decimal(7,2)                  ,
    ss_ext_wholesale_cost     decimal(7,2)                  ,
    ss_ext_list_price         decimal(7,2)                  ,
    ss_ext_tax                decimal(7,2)                  ,
    ss_coupon_amt             decimal(7,2)                  ,
    ss_net_paid               decimal(7,2)                  ,
    ss_net_paid_inc_tax       decimal(7,2)                  ,
    ss_net_profit             decimal(7,2) 
)
STORED BY 'carbondata'
TBLPROPERTIES ('DICTIONARY_INCLUDE'='ss_item_sk,ss_ticket_number,ss_sold_date_sk')
 

create table if not exists tpcds_carbon2.item
(
    i_item_sk                 int               ,
    i_item_id                 char(16)              ,
    i_rec_start_date          date                          ,
    i_rec_end_date            date                          ,
    i_item_desc               varchar(200)                  ,
    i_current_price           decimal(7,2)                  ,
    i_wholesale_cost          decimal(7,2)                  , 
    i_brand_id                int                       ,
    i_brand                   char(50)                      ,
    i_class_id                int                       ,
    i_class                   char(50)                      ,
    i_category_id             int                       ,
    i_category                char(50)                      ,
    i_manufact_id             int                       ,
    i_manufact                char(50)                      ,
    i_size                    char(20)                      ,
    i_formulation             char(20)                      ,
    i_color                   char(20)                      ,
    i_units                   char(10)                      ,
    i_container               char(10)                      ,
    i_manager_id              int                       ,
    i_product_name            char(50)            
)
STORED BY 'carbondata'
TBLPROPERTIES ('DICTIONARY_INCLUDE'='i_item_sk,i_brand_id,i_class_id,i_category_id,i_manufact_id'
,'DICTIONARY_EXCLUDE'='i_item_desc,i_brand,i_class,i_category,i_manufact'
,'SORT_COLUMNS'='i_brand_id')
 
anyone know how to improve performance?
 
2017-08-28

lk_hadoop
Reply | Threaded
Open this post in threaded view
|

Re: tpcds query3 slower than parquet

Ravindra Pesala
Hi,

Here what I can observe those join columns are dictionary include so during join it needs to convert all columns to decode, so it may affect the performance. Please try to keep the join columns to dictionary exclude to avoid the dictionary decoding.

Regards,
Ravindra.

On 28 August 2017 at 09:19, lk_hadoop <[hidden email]> wrote:
hi,all:
 
I have make total 20G tpcds data , and I chang it to both carbondata and parquet type, query3  performance was slower:
 
                     carbon                     parquet
time1        2391.686318ms         899.256838ms
time2        4129.92724ms           745.656853ms
time3        1488.651428ms
 
 
query3:
 
SELECT dt.d_year, item.i_brand_id brand_id, item.i_brand brand,SUM(ss_ext_sales_price) sum_agg
 FROM  date_dim dt, store_sales, item
 WHERE dt.d_date_sk = store_sales.ss_sold_date_sk
   AND store_sales.ss_item_sk = item.i_item_sk
   AND item.i_manufact_id = 128
   AND dt.d_moy=11
 GROUP BY dt.d_year, item.i_brand, item.i_brand_id
 ORDER BY dt.d_year, sum_agg desc, brand_id
 LIMIT 100
 
 
carbondata table :
 
create table if not exists tpcds_carbon2.date_dim
(
    d_date_sk                 int               ,
    d_date_id                 char(16)              ,
    d_date                    date                          ,
    d_month_seq               int                       ,
    d_week_seq                int                       ,
    d_quarter_seq             int                       ,
    d_year                    int                       ,
    d_dow                     int                       ,
    d_moy                     int                       ,
    d_dom                     int                       ,
    d_qoy                     int                       ,
    d_fy_year                 int                       ,
    d_fy_quarter_seq          int                       ,
    d_fy_week_seq             int                       ,
    d_day_name                char(9)                       ,
    d_quarter_name            char(6)                       ,
    d_holiday                 char(1)                       ,
    d_weekend                 char(1)                       ,
    d_following_holiday       char(1)                       ,
    d_first_dom               int                       ,
    d_last_dom                int                       ,
    d_same_day_ly             int                       ,
    d_same_day_lq             int                       ,
    d_current_day             char(1)                       ,
    d_current_week            char(1)                       ,
    d_current_month           char(1)                       ,
    d_current_quarter         char(1)                       ,
    d_current_year            char(1)        
)
STORED BY 'carbondata'
TBLPROPERTIES ('DICTIONARY_INCLUDE'='d_date_sk,d_moy'
,'DICTIONARY_EXCLUDE'='d_current_day,d_current_week,d_current_month
,d_current_quarter,d_current_year,d_day_name,d_quarter_name,d_holiday,d_weekend,d_following_holiday'
,'SORT_COLUMNS'='d_year')
 

create table if not exists tpcds_carbon2.store_sales
(
    ss_sold_date_sk           int                       ,
    ss_sold_time_sk           int                       ,
    ss_item_sk                int               ,
    ss_customer_sk            int                       ,
    ss_cdemo_sk               int                       ,
    ss_hdemo_sk               int                       ,
    ss_addr_sk                int                       ,
    ss_store_sk               int                       ,
    ss_promo_sk               int                       ,
    ss_ticket_number          int               ,
    ss_quantity               int                       ,
    ss_wholesale_cost         decimal(7,2)                  ,
    ss_list_price             decimal(7,2)                  ,
    ss_sales_price            decimal(7,2)                  ,
    ss_ext_discount_amt       decimal(7,2)                  ,
    ss_ext_sales_price        decimal(7,2)                  ,
    ss_ext_wholesale_cost     decimal(7,2)                  ,
    ss_ext_list_price         decimal(7,2)                  ,
    ss_ext_tax                decimal(7,2)                  ,
    ss_coupon_amt             decimal(7,2)                  ,
    ss_net_paid               decimal(7,2)                  ,
    ss_net_paid_inc_tax       decimal(7,2)                  ,
    ss_net_profit             decimal(7,2) 
)
STORED BY 'carbondata'
TBLPROPERTIES ('DICTIONARY_INCLUDE'='ss_item_sk,ss_ticket_number,ss_sold_date_sk')
 

create table if not exists tpcds_carbon2.item
(
    i_item_sk                 int               ,
    i_item_id                 char(16)              ,
    i_rec_start_date          date                          ,
    i_rec_end_date            date                          ,
    i_item_desc               varchar(200)                  ,
    i_current_price           decimal(7,2)                  ,
    i_wholesale_cost          decimal(7,2)                  , 
    i_brand_id                int                       ,
    i_brand                   char(50)                      ,
    i_class_id                int                       ,
    i_class                   char(50)                      ,
    i_category_id             int                       ,
    i_category                char(50)                      ,
    i_manufact_id             int                       ,
    i_manufact                char(50)                      ,
    i_size                    char(20)                      ,
    i_formulation             char(20)                      ,
    i_color                   char(20)                      ,
    i_units                   char(10)                      ,
    i_container               char(10)                      ,
    i_manager_id              int                       ,
    i_product_name            char(50)            
)
STORED BY 'carbondata'
TBLPROPERTIES ('DICTIONARY_INCLUDE'='i_item_sk,i_brand_id,i_class_id,i_category_id,i_manufact_id'
,'DICTIONARY_EXCLUDE'='i_item_desc,i_brand,i_class,i_category,i_manufact'
,'SORT_COLUMNS'='i_brand_id')
 
anyone know how to improve performance?
 
2017-08-28

lk_hadoop



--
Thanks & Regards,
Ravi
Reply | Threaded
Open this post in threaded view
|

Re: Re: tpcds query3 slower than parquet

lk_hadoop
thank you Ravindra , after try your suggestion , performance improved :
 
time1      1550.942057ms
time2      1384.018871ms
time3      1009.129836ms
time4      1052.680097ms
 
but it also slower than parquet , I will keep trying.
 
2017-08-28
lk_hadoop

发件人:Ravindra Pesala <[hidden email]>
发送时间:2017-08-28 19:13
主题:Re: tpcds query3 slower than parquet
收件人:"user"<[hidden email]>
抄送:
 
Hi,

Here what I can observe those join columns are dictionary include so during join it needs to convert all columns to decode, so it may affect the performance. Please try to keep the join columns to dictionary exclude to avoid the dictionary decoding.

Regards,
Ravindra.

On 28 August 2017 at 09:19, lk_hadoop <[hidden email]> wrote:
hi,all:
 
I have make total 20G tpcds data , and I chang it to both carbondata and parquet type, query3  performance was slower:
 
                     carbon                     parquet
time1        2391.686318ms         899.256838ms
time2        4129.92724ms           745.656853ms
time3        1488.651428ms
 
 
query3:
 
SELECT dt.d_year, item.i_brand_id brand_id, item.i_brand brand,SUM(ss_ext_sales_price) sum_agg
 FROM  date_dim dt, store_sales, item
 WHERE dt.d_date_sk = store_sales.ss_sold_date_sk
   AND store_sales.ss_item_sk = item.i_item_sk
   AND item.i_manufact_id = 128
   AND dt.d_moy=11
 GROUP BY dt.d_year, item.i_brand, item.i_brand_id
 ORDER BY dt.d_year, sum_agg desc, brand_id
 LIMIT 100
 
 
carbondata table :
 
create table if not exists tpcds_carbon2.date_dim
(
    d_date_sk                 int               ,
    d_date_id                 char(16)              ,
    d_date                    date                          ,
    d_month_seq               int                       ,
    d_week_seq                int                       ,
    d_quarter_seq             int                       ,
    d_year                    int                       ,
    d_dow                     int                       ,
    d_moy                     int                       ,
    d_dom                     int                       ,
    d_qoy                     int                       ,
    d_fy_year                 int                       ,
    d_fy_quarter_seq          int                       ,
    d_fy_week_seq             int                       ,
    d_day_name                char(9)                       ,
    d_quarter_name            char(6)                       ,
    d_holiday                 char(1)                       ,
    d_weekend                 char(1)                       ,
    d_following_holiday       char(1)                       ,
    d_first_dom               int                       ,
    d_last_dom                int                       ,
    d_same_day_ly             int                       ,
    d_same_day_lq             int                       ,
    d_current_day             char(1)                       ,
    d_current_week            char(1)                       ,
    d_current_month           char(1)                       ,
    d_current_quarter         char(1)                       ,
    d_current_year            char(1)        
)
STORED BY 'carbondata'
TBLPROPERTIES ('DICTIONARY_INCLUDE'='d_date_sk,d_moy'
,'DICTIONARY_EXCLUDE'='d_current_day,d_current_week,d_current_month
,d_current_quarter,d_current_year,d_day_name,d_quarter_name,d_holiday,d_weekend,d_following_holiday'
,'SORT_COLUMNS'='d_year')
 

create table if not exists tpcds_carbon2.store_sales
(
    ss_sold_date_sk           int                       ,
    ss_sold_time_sk           int                       ,
    ss_item_sk                int               ,
    ss_customer_sk            int                       ,
    ss_cdemo_sk               int                       ,
    ss_hdemo_sk               int                       ,
    ss_addr_sk                int                       ,
    ss_store_sk               int                       ,
    ss_promo_sk               int                       ,
    ss_ticket_number          int               ,
    ss_quantity               int                       ,
    ss_wholesale_cost         decimal(7,2)                  ,
    ss_list_price             decimal(7,2)                  ,
    ss_sales_price            decimal(7,2)                  ,
    ss_ext_discount_amt       decimal(7,2)                  ,
    ss_ext_sales_price        decimal(7,2)                  ,
    ss_ext_wholesale_cost     decimal(7,2)                  ,
    ss_ext_list_price         decimal(7,2)                  ,
    ss_ext_tax                decimal(7,2)                  ,
    ss_coupon_amt             decimal(7,2)                  ,
    ss_net_paid               decimal(7,2)                  ,
    ss_net_paid_inc_tax       decimal(7,2)                  ,
    ss_net_profit             decimal(7,2) 
)
STORED BY 'carbondata'
TBLPROPERTIES ('DICTIONARY_INCLUDE'='ss_item_sk,ss_ticket_number,ss_sold_date_sk')
 

create table if not exists tpcds_carbon2.item
(
    i_item_sk                 int               ,
    i_item_id                 char(16)              ,
    i_rec_start_date          date                          ,
    i_rec_end_date            date                          ,
    i_item_desc               varchar(200)                  ,
    i_current_price           decimal(7,2)                  ,
    i_wholesale_cost          decimal(7,2)                  , 
    i_brand_id                int                       ,
    i_brand                   char(50)                      ,
    i_class_id                int                       ,
    i_class                   char(50)                      ,
    i_category_id             int                       ,
    i_category                char(50)                      ,
    i_manufact_id             int                       ,
    i_manufact                char(50)                      ,
    i_size                    char(20)                      ,
    i_formulation             char(20)                      ,
    i_color                   char(20)                      ,
    i_units                   char(10)                      ,
    i_container               char(10)                      ,
    i_manager_id              int                       ,
    i_product_name            char(50)            
)
STORED BY 'carbondata'
TBLPROPERTIES ('DICTIONARY_INCLUDE'='i_item_sk,i_brand_id,i_class_id,i_category_id,i_manufact_id'
,'DICTIONARY_EXCLUDE'='i_item_desc,i_brand,i_class,i_category,i_manufact'
,'SORT_COLUMNS'='i_brand_id')
 
anyone know how to improve performance?
 
2017-08-28

lk_hadoop



--
Thanks & Regards,
Ravi