3.
This is a working code for consecutive values.
MyColumn should be a column (or list of columns) with good uniformed
distribution.
with group_rows
as
(
select abs(hash(MyColumn))%10000 as group_id
,count (*) as cnt
from INTER_ETL
group by abs(hash(MyColumn))%10000
)
,group_rows_accumulated
as
(
select g1.group_id
,sum (g2.cnt) - min (g1.cnt) as accumulated_rows
from
group_rows as g1
cross join group_rows as g2
where g2.group_id <= g1.group_id
group by g1.group_id
)
select t.*
,row_number () over (partition by a.group_id order by null) +
a.accumulated_rows as ETL_ROW_ID
from INTER_ETL as t
join group_rows_accumulated as a
on a.group_id =
abs(hash(MyColumn))%10000
;
From: Markovitz, Dudu [mailto:[email protected]]
Sent: Thursday, June 30, 2016 12:43 PM
To: [email protected]; [email protected]
Subject: RE: Query Performance Issue : Group By and Distinct and load on reducer
1.
This works.
I’ve recalled that the CAST is needed since FLOOR defaults to FLOAT.
select (cast (floor(r*1000000) as bigint)+ 1) + 1000000L * (row_number ()
over (partition by (cast (floor(r*1000000) as bigint) + 1) order by null) - 1)
as ETL_ROW_ID
from (select *,rand() as r from INTER_ETL) as t
;
Here is a test result from our dev system
select min (ETL_ROW_ID) as min_ETL_ROW_ID
,count (ETL_ROW_ID) as count_ETL_ROW_ID
,max (ETL_ROW_ID) as max_ETL_ROW_ID
from (select (cast (floor(r*1000000) as bigint)+ 1) + 1000000L *
(row_number () over (partition by (cast (floor(r*1000000) as bigint) + 1) order
by null) - 1) as ETL_ROW_ID
from (select *,rand() as r from INTER_ETL) as t
)
as t
;
min_ETL_ROW_ID
count_ETL_ROW_ID
max_ETL_ROW_ID
1
39567412227
40529759537
From: Markovitz, Dudu [mailto:[email protected]]
Sent: Wednesday, June 29, 2016 11:37 PM
To: [email protected]<mailto:[email protected]>
Cc: [email protected]<mailto:[email protected]>
Subject: RE: Query Performance Issue : Group By and Distinct and load on reducer
1.
This is strange.
The negative numbers are due to overflow of the ‘int’ type, but for that reason
exactly I’ve casted the expressions in my code to ‘bigint’.
I’ve tested this code before sending it to you and it worked fine, returning
results that are beyond the range of the ‘int’ type.
Please try this:
select *
,(floor(r*1000000) + 1) + (1000000L * (row_number () over (partition
by (floor(r*1000000) + 1) order by null) - 1) as ETL_ROW_ID
from (select *,rand() as r from INTER_ETL) as t
;
2.
Great
3.
Sorry, hadn’t had the time to test it (nor the change I’m going to suggest
now…☺)
Please check if the following code works and if so, replace the ‘a’ subquery
code with it.
select a1.group_id
,sum (a2.cnt) - a1.cnt as accum_rows
from (select abs(hash(MyCol1,MyCol2))%1000 as group_id
,count (*) as cnt
from INTER_ETL
group by abs(hash(MyCol1,MyCol2))%1000
)
as a1
cross join (select abs(hash(MyCol1,MyCol2))%1000 as group_id
,count (*) as cnt
from INTER_ETL
group by abs(hash(MyCol1,MyCol2))%1000
)
as a2
where a2.group_id <= a1.group_id
group by a1.group_id
;
From: @Sanjiv Singh [mailto:[email protected]]
Sent: Wednesday, June 29, 2016 10:55 PM
To: Markovitz, Dudu <[email protected]<mailto:[email protected]>>
Cc: [email protected]<mailto:[email protected]>
Subject: Re: Query Performance Issue : Group By and Distinct and load on reducer
Hi Dudu,
I tried the same on same table which has 6357592675 rows. See response of all
three.
I tried 1st one , its giving duplicates for rows.
> CREATE TEMPORARY TABLE INTER_ETL_T AS
select *
,cast (floor(r*1000000) + 1 as bigint) + (1000000 * (row_number () over
(partition by cast (floor(r*1000000) + 1 as bigint) order by null) - 1)) as
ROW_NUM
from (select *,rand() as r from INTER_ETL) as t ;
> select ROW_NUM, count(*) from INTER_ETL_T by ROW_NUM having count(*) > 1
> limit 10;
+--------------+------+--+
| ROW_NUM| _c1 |
+--------------+------+--+
| -2146932303 | 2 |
| -2146924922 | 2 |
| -2146922710 | 2 |
| -2146901450 | 2 |
| -2146897115 | 2 |
| -2146874805 | 2 |
| -2146869449 | 2 |
| -2146865918 | 2 |
| -2146864595 | 2 |
| -2146857688 | 2 |
+--------------+------+--+
On 2nd one, it is not giving any duplicate and was much faster than
ROW_NUMBER() atleast.
numRows=6357592675, totalSize=405516934422, rawDataSize=399159341747
And on 3rd for consecutive number, query is not compatible to HIVE.
CREATE TEMPORARY TABLE INTER_ETL_T AS
select *
,a.accum_rows + row_number () over (partition by
abs(hash(t.m_d_key,t.s_g_key))%10000 order by null) as ROW_NUM
from INTER_ETL as t
join (select abs(hash(m_d_key,s_g_key))%10000 as group_id
,sum (count (*)) over (order by m_d_key,s_g_key rows between unbounded
preceding and 1 preceding) - count(*) as accum_rows
from INTER_ETL
group by abs(hash(m_d_key,s_g_key))%10000
) as a
on a.group_id = abs(hash(t.m_d_key,t.s_g_key))%10000
;
Error :
Error: Error while compiling statement: FAILED: SemanticException End of a
WindowFrame cannot be UNBOUNDED PRECEDING (state=42000,code=40000)
Regards
Sanjiv Singh
Mob : +091 9990-447-339
On Tue, Jun 28, 2016 at 6:16 PM, @Sanjiv Singh
<[email protected]<mailto:[email protected]>> wrote:
thanks a lot.
let me give it a try.
Regards
Sanjiv Singh
Mob : +091 9990-447-339
On Tue, Jun 28, 2016 at 5:32 PM, Markovitz, Dudu
<[email protected]<mailto:[email protected]>> wrote:
There’s a distributed algorithm for windows function that is based on the ORDER
BY clause rather than the PARTITION BY clause.
I doubt if is implemented in Hive, but it’s worth a shot.
select *
,row_number () over (order by rand()) as ETL_ROW_ID
from INTER_ETL
;
For unique, not consecutive values you can try this:
select *
,cast (floor(r*1000000) + 1 as bigint) + (1000000 * (row_number ()
over (partition by cast (floor(r*1000000) + 1 as bigint) order by null) - 1))
as ETL_ROW_ID
from (select *,rand() as r from INTER_ETL) as t
;
If you have in your table a column/combination of columns with unified
distribution you can also do something like this:
select *
, (abs(hash(MyCol1,MyCol2))%1000000 + 1) + (row_number () over
(partition by (abs(hash(MyCol1,MyCol2))%1000000 + 1) order by null) - 1) *
1000000L as ETL_ROW_ID
from INTER_ETL
;
For consecutive values you can do something (ugly…) like this:
select *
,a.accum_rows + row_number () over (partition by
abs(hash(t.MyCol1,t.MyCol2))%10000 order by null) as ETL_ROW_ID
from INTER_ETL as t
join (select abs(hash(MyCol1,MyCol2))%10000
as group_id
,sum (count (*)) over (order by
MyCol1,MyCol2 rows between unbounded preceding and 1 preceding) - count(*) as
accum_rows
from INTER_ETL
group by abs(hash(MyCol1,MyCol2))%10000
)
as a
on a.group_id = abs(hash(t.MyCol1,t.MyCol2))%10000
;
From: @Sanjiv Singh
[mailto:[email protected]<mailto:[email protected]>]
Sent: Tuesday, June 28, 2016 11:52 PM
To: Markovitz, Dudu <[email protected]<mailto:[email protected]>>
Cc: [email protected]<mailto:[email protected]>
Subject: Re: Query Performance Issue : Group By and Distinct and load on reducer
ETL_ROW_ID is to be consecutive number. I need to check if having unique number
would not break any logic.
Considering unique number for ETL_ROW_ID column, what are optimum options
available?
What id it has to be consecutive number only?
Regards
Sanjiv Singh
Mob : +091 9990-447-339
On Tue, Jun 28, 2016 at 4:17 PM, Markovitz, Dudu
<[email protected]<mailto:[email protected]>> wrote:
I’m guessing ETL_ROW_ID should be unique but not necessarily contain only
consecutive numbers?
From: @Sanjiv Singh
[mailto:[email protected]<mailto:[email protected]>]
Sent: Tuesday, June 28, 2016 10:57 PM
To: Markovitz, Dudu <[email protected]<mailto:[email protected]>>
Cc: [email protected]<mailto:[email protected]>
Subject: Re: Query Performance Issue : Group By and Distinct and load on reducer
Hi Dudu,
You are correct ...ROW_NUMBER() is main culprit.
ROW_NUMBER() OVER Not Fast Enough With Large Result Set, any good solution?
Regards
Sanjiv Singh
Mob : +091 9990-447-339
On Tue, Jun 28, 2016 at 3:42 PM, Markovitz, Dudu
<[email protected]<mailto:[email protected]>> wrote:
The row_number operation seems to be skewed.
Dudu
From: @Sanjiv Singh
[mailto:[email protected]<mailto:[email protected]>]
Sent: Tuesday, June 28, 2016 8:54 PM
To: [email protected]<mailto:[email protected]>
Subject: Query Performance Issue : Group By and Distinct and load on reducer
Hi All,
I am having performance issue with data skew of the distinct statement in
Hive<http://stackoverflow.com/questions/37894023/understanding-the-data-skew-of-the-countdistinct-statement-in-hive>.
See below query with DISTINCT operator.
Original Query :
SELECT DISTINCT
SD.REGION
,SD.HEADEND
,SD.NETWORK
,SD.RETAILUNITCODE
,SD.LOGTIMEDATE
,SD.SPOTKEY
,SD.CRE_DT
,CASE
WHEN SD.LOGTIMEDATE IS NULL
THEN 'Y'
ELSE 'N'
END AS DROP_REASON
,ROW_NUMBER() OVER (
ORDER BY NULL
) AS ETL_ROW_ID
FROM INTER_ETL AS SD;
Table INTER_ETL used for query is big enough.
From the logs , it seems that data skew for specific set of values , causing
one of reducer have to do all the job. I tried to achieve the same through
GROUP BY still having the same issue. Help me to understand the issue and
resolution.
Query with Distinct V2 :
CREATE TEMPORARY TABLE ETL_TMP AS
SELECT DISTINCT dt.*
FROM (
SELECT SD.REGION
,SD.HEADEND
,SD.NETWORK
,SD.RETAILUNITCODE
,SD.LOGTIMEDATE
,SD.SPOTKEY
,SD.CRE_DT
,CASE
WHEN SD.LOGTIMEDATE IS NULL
THEN 'Y'
ELSE 'N'
END AS DROP_REASON
,ROW_NUMBER() OVER (
ORDER BY NULL
) AS ETL_ROW_ID
FROM INTER_ETL AS SD
) AS dt;
Logs:
INFO : Map 1: 107/107 Reducer 2: 417(+1)/418 Reducer 3: 0(+56)/418
INFO : Map 1: 107/107 Reducer 2: 417(+1)/418 Reducer 3: 0(+56)/418
INFO : Map 1: 107/107 Reducer 2: 417(+1)/418 Reducer 3: 0(+56)/418
INFO : Map 1: 107/107 Reducer 2: 417(+1)/418 Reducer 3: 0(+56)/418
INFO : Map 1: 107/107 Reducer 2: 417(+1)/418 Reducer 3: 0(+418)/418
Query With Group By:
CREATE TEMPORARY TABLE ETL_TMP AS
SELECT REGION
,HEADEND
,NETWORK
,RETAILUNITCODE
,LOGTIMEDATE
,SPOTKEY
,CRE_DT
,DROP_REASON
,ETL_ROW_ID
FROM (
SELECT SD.REGION
,SD.HEADEND
,SD.NETWORK
,SD.RETAILUNITCODE
,SD.LOGTIMEDATE
,SD.SPOTKEY
,SD.CRE_DT
,CASE
WHEN SD.LOGTIMEDATE IS NULL
THEN 'Y'
ELSE 'N'
END AS DROP_REASON
,ROW_NUMBER() OVER (
ORDER BY NULL
) AS ETL_ROW_ID
FROM INTER_ETL AS SD
) AS dt
GROUP BY
REGION
,HEADEND
,NETWORK
,RETAILUNITCODE
,LOGTIMEDATE
,SPOTKEY
,CRE_DT
,DROP_REASON
,ETL_ROW_ID;
Logs:
INFO : Map 1: 818/818 Reducer 2: 417(+1)/418 Reducer 3: 0(+418)/418
INFO : Map 1: 818/818 Reducer 2: 417(+1)/418 Reducer 3: 0(+418)/418
INFO : Map 1: 818/818 Reducer 2: 417(+1)/418 Reducer 3: 0(+418)/418
INFO : Map 1: 818/818 Reducer 2: 417(+1)/418 Reducer 3: 0(+418)/418
INFO : Map 1: 818/818 Reducer 2: 417(+1)/418 Reducer 3: 0(+418)/418
Table details :
Beeline > dfs -ls /apps/hive/warehouse/PRD_DB.db/INTER_ETL ;
+--------------------------------------------------------------------------------------------------------------------------------------------+--+
| DFS Output
|
+--------------------------------------------------------------------------------------------------------------------------------------------+--+
| Found 15 items
|
| -rwxrwxrwx 3 Z56034 hdfs 2075244899 2016-06-28 10:24
/apps/hive/warehouse/PRD_DB.db/INTER_ETL/000000_0 |
| -rwxrwxrwx 3 Z56034 hdfs 2090030620 2016-06-28 10:24
/apps/hive/warehouse/PRD_DB.db/INTER_ETL/000001_0 |
| -rwxrwxrwx 3 Z56034 hdfs 2025516774 2016-06-28 10:24
/apps/hive/warehouse/PRD_DB.db/INTER_ETL/000002_0 |
| -rwxrwxrwx 3 Z56034 hdfs 1986848213 2016-06-28 10:24
/apps/hive/warehouse/PRD_DB.db/INTER_ETL/000003_0 |
| -rwxrwxrwx 3 Z56034 hdfs 2018883723 2016-06-28 10:24
/apps/hive/warehouse/PRD_DB.db/INTER_ETL/000004_0 |
| -rwxrwxrwx 3 Z56034 hdfs 1984690335 2016-06-28 10:24
/apps/hive/warehouse/PRD_DB.db/INTER_ETL/000005_0 |
| -rwxrwxrwx 3 Z56034 hdfs 1987494444 2016-06-28 10:24
/apps/hive/warehouse/PRD_DB.db/INTER_ETL/000006_0 |
| -rwxrwxrwx 3 Z56034 hdfs 1974674515 2016-06-28 10:24
/apps/hive/warehouse/PRD_DB.db/INTER_ETL/000007_0 |
| -rwxrwxrwx 3 Z56034 hdfs 1963720218 2016-06-28 10:24
/apps/hive/warehouse/PRD_DB.db/INTER_ETL/000008_0 |
| -rwxrwxrwx 3 Z56034 hdfs 1965892384 2016-06-28 10:24
/apps/hive/warehouse/PRD_DB.db/INTER_ETL/000009_0 |
| -rwxrwxrwx 3 Z56034 hdfs 1974272622 2016-06-28 10:24
/apps/hive/warehouse/PRD_DB.db/INTER_ETL/000010_0 |
| -rwxrwxrwx 3 Z56034 hdfs 1971948208 2016-06-28 10:24
/apps/hive/warehouse/PRD_DB.db/INTER_ETL/000011_0 |
| -rwxrwxrwx 3 Z56034 hdfs 1968141886 2016-06-28 10:24
/apps/hive/warehouse/PRD_DB.db/INTER_ETL/000012_0 |
| -rwxrwxrwx 3 Z56034 hdfs 1970930771 2016-06-28 10:24
/apps/hive/warehouse/PRD_DB.db/INTER_ETL/000013_0 |
| -rwxrwxrwx 3 Z56034 hdfs 192820628 2016-06-28 10:23
/apps/hive/warehouse/PRD_DB.db/INTER_ETL/000014_0 |
+--------------------------------------------------------------------------------------------------------------------------------------------+--+
'numFiles'='15',
'numRows'='108363614',
Regards
Sanjiv Singh
Mob : +091 9990-447-339