Hi,
       I am a analyst from banggood.com,China.We have use impala to optimize 
hive or spark's work.But we found impala don't support aggregate function.There 
are some example below.While statement like 'set APPX_COUNT_DISTINCT=true; ' is 
not so sure for us to use.I would like to hear from you.Thank you!

https://segmentfault.com/a/1190000007530586


--复购率
set mapreduce.job.queuename=os;
select a.date1,
count(distinct a.customers_id) users,
count(distinct case when datediff(b.date1,a.date1) between 1 and 7 then 
b.customers_id end) rebuy_1,
count(distinct case when datediff(b.date1,a.date1) between 8 and 15 then 
b.customers_id end) rebuy_2,
count(distinct case when datediff(b.date1,a.date1) between 16 and 30 then 
b.customers_id end) rebuy_3,
count(distinct case when datediff(b.date1,a.date1) between 31 and 60 then 
b.customers_id end) rebuy_4,
count(distinct case when datediff(b.date1,a.date1) between 61 and 90 then 
b.customers_id end) rebuy_5,
count(distinct case when datediff(b.date1,a.date1) between 91 and 120 then 
b.customers_id end) rebuy_6,
count(distinct case when datediff(b.date1,a.date1) between 121 and 180 then 
b.customers_id end) rebuy_7,
count(distinct case when datediff(b.date1,a.date1) between 181 and 360 then 
b.customers_id end) rebuy_8,
count(distinct case when datediff(b.date1,a.date1) > 361 then b.customers_id 
end) rebuy_9
from
(select from_unixtime(add_time,'yyyy-MM-dd') date1,customers_id,row_number() 
over(partition by customers_id order by from_unixtime(add_time,'yyyy-MM-dd')) 
rown
from datacube.dc_orders
where from_unixtime(add_time,'yyyy-MM-dd') >= '2017-01-01' 
and domain = 1
and order_type not in ('dropship','dropshipping','wholesale','gift','snatch')
and customers_id <> 0
and orders_status NOT IN (1 , 4, 6, 12, 17, 20, 21, 22, 23, 27)
-- and site in ('pwa.yoins.com','android','ios','www.yoins.com','m.yoins.com')
group by from_unixtime(add_time,'yyyy-MM-dd'),customers_id
) a
left outer join
(
select from_unixtime(add_time,'yyyy-MM-dd') as date1,customers_id,row_number() 
over(partition by customers_id order by from_unixtime(add_time,'yyyy-MM-dd')) 
as rown
from datacube.dc_orders
where from_unixtime(add_time,'yyyy-MM-dd') >= '2017-01-01' 
and domain = 1
and order_type not in ('dropship','dropshipping','wholesale','gift','snatch')
and customers_id <> 0
and orders_status NOT IN (1 , 4, 6, 12, 17, 20, 21, 22, 23, 27)
-- and site in ('pwa.yoins.com','android','ios','www.yoins.com','m.yoins.com')
group by from_unixtime(add_time,'yyyy-MM-dd'),customers_id
) b
on (a.customers_id=b.customers_id and a.rown=b.rown-1)
group by a.date1
order by date1
;


chenbix...@banggood.com

Reply via email to