[
https://issues.apache.org/jira/browse/ASTERIXDB-1852?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Wenhai updated ASTERIXDB-1852:
------------------------------
Description:
In a binary join query, the optimizer should generate a hash join plan. Due to
the two functions has been used before the join condition, we get a wrong plan
that employs the nested loop join.
Schema
{noformat}
drop dataverse test if exists;
create dataverse test;
use dataverse test;
create type DBLPType as
closed {
id : bigint,
dblpid : string,
title : string,
authors : string,
misc : string
}
create type CSXType as
closed {
id : bigint,
csxid : string,
title : string,
authors : string,
misc : string
}
create dataset DBLP(DBLPType) primary key id;
create dataset CSX(CSXType) primary key id;
{noformat}
One function-nested query
{noformat}
use dataverse test
set import-private-functions 'true'
for $paperDBLP in dataset('DBLP')
let $idDBLP := $paperDBLP.id
let $unrankedTokensDBLP := word-tokens($paperDBLP.title)
let $lenDBLP := len($unrankedTokensDBLP)
let $tokensDBLP :=
for $token in $unrankedTokensDBLP
for $tokenRanked at $i in
//
// -- - Stage 1 - --
//
for $paper in dataset('DBLP')
let $id := $paper.id
for $token in word-tokens($paper.title)
/*+ hash */
group by $tokenGroupped := $token with $id
/*+ inmem 16 16384 */
order by count($id), $tokenGroupped
return $tokenGroupped
where $token = /*+ bcast */ $tokenRanked
order by $i
return $i
for $prefixTokenDBLP in subset-collection($tokensDBLP, 0,
prefix-len-jaccard(len($tokensDBLP), .8f))
for $paperCSX in dataset('CSX')
let $idCSX := $paperCSX.id
let $unrankedTokensCSX := word-tokens($paperCSX.title)
let $lenCSX := len($unrankedTokensCSX)
let $tokensCSX :=
for $token in $unrankedTokensCSX
for $tokenRanked at $i in
//
// -- - Stage 1 - --
//
for $paper in dataset('DBLP')
let $id := $paper.id
for $token in word-tokens($paper.title)
/*+ hash */
group by $tokenGroupped := $token with $id
/*+ inmem 16 16384 */
order by count($id), $tokenGroupped
return $tokenGroupped
where $token = /*+ bcast */ $tokenRanked
order by $i
return $i
let $csxLen := prefix-len-jaccard(len($unrankedTokensCSX), .8f)
for $prefixTokenCSX in subset-collection($tokensCSX, 0, $csxLen)
where $prefixTokenDBLP = $prefixTokenCSX
let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX,
.8f)
where $sim[1] >= .8f
/*+ hash */
group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
return {'idDBLP': $idDBLP, 'idCSX': $idCSX}
{noformat}
and another nested functions-based equal query
{noformat}
use dataverse test
set import-private-functions 'true'
for $paperDBLP in dataset('DBLP')
let $idDBLP := $paperDBLP.id
let $unrankedTokensDBLP := word-tokens($paperDBLP.title)
let $lenDBLP := len($unrankedTokensDBLP)
let $tokensDBLP :=
for $token in $unrankedTokensDBLP
for $tokenRanked at $i in
//
// -- - Stage 1 - --
//
for $paper in dataset('DBLP')
let $id := $paper.id
for $token in word-tokens($paper.title)
/*+ hash */
group by $tokenGroupped := $token with $id
/*+ inmem 16 16384 */
order by count($id), $tokenGroupped
return $tokenGroupped
where $token = /*+ bcast */ $tokenRanked
order by $i
return $i
for $prefixTokenDBLP in subset-collection($tokensDBLP, 0,
prefix-len-jaccard(len($tokensDBLP), .8f))
for $paperCSX in dataset('CSX')
let $idCSX := $paperCSX.id
let $unrankedTokensCSX := word-tokens($paperCSX.title)
let $lenCSX := len($unrankedTokensCSX)
let $tokensCSX :=
for $token in $unrankedTokensCSX
for $tokenRanked at $i in
//
// -- - Stage 1 - --
//
for $paper in dataset('DBLP')
let $id := $paper.id
for $token in word-tokens($paper.title)
/*+ hash */
group by $tokenGroupped := $token with $id
/*+ inmem 16 16384 */
order by count($id), $tokenGroupped
return $tokenGroupped
where $token = /*+ bcast */ $tokenRanked
order by $i
return $i
for $prefixTokenCSX in subset-collection($tokensCSX, 0,
prefix-len-jaccard(len($unrankedTokensCSX), .8f))
where $prefixTokenDBLP = $prefixTokenCSX
let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX,
.8f)
where $sim[1] >= .8f
/*+ hash */
group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
return {'idDBLP': $idDBLP, 'idCSX': $idCSX}
{noformat}
generate different plans, where the later will produce a nested loop join.
was:
In a binary join query, the optimizer should generate a hash join plan. Due to
the two functions has been used before the join condition, we get a wrong plan
that employs the nested loop join.
Schema
{noformat}
drop dataverse test if exists;
create dataverse test;
use dataverse test;
create type DBLPType as
closed {
id : bigint,
dblpid : string,
title : string,
authors : string,
misc : string
}
create type CSXType as
closed {
id : bigint,
csxid : string,
title : string,
authors : string,
misc : string
}
create dataset DBLP(DBLPType) primary key id;
create dataset CSX(CSXType) primary key id;
{noformat}
One function-nested query
{noformat}
use dataverse test
set import-private-functions 'true'
for $paperDBLP in dataset('DBLP')
let $idDBLP := $paperDBLP.id
let $unrankedTokensDBLP := word-tokens($paperDBLP.title)
for $prefixTokenDBLP in subset-collection($unrankedTokensDBLP, 0,
len($unrankedTokensDBLP))
for $paperCSX in dataset('CSX')
let $idCSX := $paperCSX.id
let $unrankedTokensCSX := word-tokens($paperCSX.title)
for $prefixTokenCSX in subset-collection($unrankedTokensCSX, 0,
len($unrankedTokensCSX))
where $prefixTokenDBLP = $prefixTokenCSX
let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX,
.8f)
where $sim[1] >= .8f
/*+ hash */
group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
return {'idDBLP': $idDBLP, 'idCSX': $idCSX}
{noformat}
will generate the hash-based join plan
{noformat}
distribute result [%0->$$29]
-- DISTRIBUTE_RESULT |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
project ([$$29])
-- STREAM_PROJECT |PARTITIONED|
assign [$$29] <- [function-call: asterix:closed-object-constructor,
Args:[AString: {idDBLP}, %0->$$9, AString: {idCSX}, %0->$$10]]
-- ASSIGN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
group by ([$$9 := %0->$$30; $$10 := %0->$$31]) decor ([]) {
aggregate [] <- []
-- AGGREGATE |LOCAL|
nested tuple source
-- NESTED_TUPLE_SOURCE |LOCAL|
}
-- EXTERNAL_GROUP_BY[$$30, $$31] |PARTITIONED|
exchange
-- HASH_PARTITION_EXCHANGE [$$30, $$31] |PARTITIONED|
project ([$$30, $$31])
-- STREAM_PROJECT |PARTITIONED|
select (function-call: algebricks:ge, Args:[function-call:
asterix:get-item, Args:[function-call: asterix:similarity-jaccard-check,
Args:[%0->$$2, %0->$$6, AFloat: {0.8}], AInt64: {1}], AFloat: {0.8}])
-- STREAM_SELECT |PARTITIONED|
project ([$$2, $$6, $$30, $$31])
-- STREAM_PROJECT |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
join (function-call: algebricks:eq, Args:[%0->$$3,
%0->$$7])
-- HYBRID_HASH_JOIN [$$3][$$7] |PARTITIONED|
exchange
-- HASH_PARTITION_EXCHANGE [$$3] |PARTITIONED|
unnest $$3 <- function-call:
asterix:subset-collection, Args:[%0->$$2, AInt64: {0}, function-call:
asterix:len, Args:[%0->$$2]]
-- UNNEST |PARTITIONED|
project ([$$2, $$30])
-- STREAM_PROJECT |PARTITIONED|
assign [$$2] <- [function-call:
asterix:word-tokens, Args:[function-call: asterix:field-access-by-index,
Args:[%0->$$0, AInt32: {2}]]]
-- ASSIGN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
data-scan []<-[$$30, $$0] <- test:DBLP
-- DATASOURCE_SCAN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
empty-tuple-source
-- EMPTY_TUPLE_SOURCE |PARTITIONED|
exchange
-- HASH_PARTITION_EXCHANGE [$$7] |PARTITIONED|
unnest $$7 <- function-call:
asterix:subset-collection, Args:[%0->$$6, AInt64: {0}, function-call:
asterix:len, Args:[%0->$$6]]
-- UNNEST |PARTITIONED|
project ([$$6, $$31])
-- STREAM_PROJECT |PARTITIONED|
assign [$$6] <- [function-call:
asterix:word-tokens, Args:[function-call: asterix:field-access-by-index,
Args:[%0->$$4, AInt32: {2}]]]
-- ASSIGN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
data-scan []<-[$$31, $$4] <- test:CSX
-- DATASOURCE_SCAN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
empty-tuple-source
-- EMPTY_TUPLE_SOURCE |PARTITIONED|
{noformat}
In contrast, when we conduct two functions in the query like
{noformat}
use dataverse test
set import-private-functions 'true'
for $paperDBLP in dataset('DBLP')
let $idDBLP := $paperDBLP.id
let $unrankedTokensDBLP := word-tokens($paperDBLP.title)
for $prefixTokenDBLP in subset-collection($unrankedTokensDBLP, 0,
abs(len($unrankedTokensDBLP)))
for $paperCSX in dataset('CSX')
let $idCSX := $paperCSX.id
let $unrankedTokensCSX := word-tokens($paperCSX.title)
for $prefixTokenCSX in subset-collection($unrankedTokensCSX, 0,
abs(len($unrankedTokensDBLP)))
where $prefixTokenDBLP = $prefixTokenCSX
let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX,
.8f)
where $sim[1] >= .8f
/*+ hash */
group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
return {'idDBLP': $idDBLP, 'idCSX': $idCSX}
{noformat}
We got a wrong plan with a nested loop join as follows
{noformat}
distribute result [%0->$$31]
-- DISTRIBUTE_RESULT |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
project ([$$31])
-- STREAM_PROJECT |PARTITIONED|
assign [$$31] <- [function-call: asterix:closed-object-constructor,
Args:[AString: {idDBLP}, %0->$$9, AString: {idCSX}, %0->$$10]]
-- ASSIGN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
group by ([$$9 := %0->$$32; $$10 := %0->$$33]) decor ([]) {
aggregate [] <- []
-- AGGREGATE |LOCAL|
nested tuple source
-- NESTED_TUPLE_SOURCE |LOCAL|
}
-- EXTERNAL_GROUP_BY[$$32, $$33] |PARTITIONED|
exchange
-- HASH_PARTITION_EXCHANGE [$$32, $$33] |PARTITIONED|
project ([$$32, $$33])
-- STREAM_PROJECT |PARTITIONED|
select (function-call: algebricks:eq, Args:[%0->$$3, %0->$$7])
-- STREAM_SELECT |PARTITIONED|
project ([$$32, $$33, $$3, $$7])
-- STREAM_PROJECT |PARTITIONED|
unnest $$7 <- function-call: asterix:subset-collection,
Args:[%0->$$6, AInt64: {0}, function-call: asterix:abs, Args:[function-call:
asterix:len, Args:[%0->$$2]]]
-- UNNEST |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
join (function-call: algebricks:ge,
Args:[function-call: asterix:get-item, Args:[function-call:
asterix:similarity-jaccard-check, Args:[%0->$$2, %0->$$6, AFloat: {0.8}],
AInt64: {1}], AFloat: {0.8}])
-- NESTED_LOOP |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
unnest $$3 <- function-call:
asterix:subset-collection, Args:[%0->$$2, AInt64: {0}, function-call:
asterix:abs, Args:[function-call: asterix:len, Args:[%0->$$2]]]
-- UNNEST |PARTITIONED|
project ([$$32, $$2])
-- STREAM_PROJECT |PARTITIONED|
assign [$$2] <- [function-call:
asterix:word-tokens, Args:[function-call: asterix:field-access-by-index,
Args:[%0->$$0, AInt32: {2}]]]
-- ASSIGN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
data-scan []<-[$$32, $$0] <- test:DBLP
-- DATASOURCE_SCAN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
empty-tuple-source
-- EMPTY_TUPLE_SOURCE |PARTITIONED|
exchange
-- BROADCAST_EXCHANGE |PARTITIONED|
project ([$$33, $$6])
-- STREAM_PROJECT |PARTITIONED|
assign [$$6] <- [function-call:
asterix:word-tokens, Args:[function-call: asterix:field-access-by-index,
Args:[%0->$$4, AInt32: {2}]]]
-- ASSIGN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
data-scan []<-[$$33, $$4] <- test:CSX
-- DATASOURCE_SCAN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
empty-tuple-source
-- EMPTY_TUPLE_SOURCE |PARTITIONED|
{noformat}
> Nested function poses wrong nested loop plan.
> ---------------------------------------------
>
> Key: ASTERIXDB-1852
> URL: https://issues.apache.org/jira/browse/ASTERIXDB-1852
> Project: Apache AsterixDB
> Issue Type: Bug
> Components: AsterixDB
> Environment: MAC Linux
> Reporter: Wenhai
> Assignee: Wenhai
>
> In a binary join query, the optimizer should generate a hash join plan. Due
> to the two functions has been used before the join condition, we get a wrong
> plan that employs the nested loop join.
> Schema
> {noformat}
> drop dataverse test if exists;
> create dataverse test;
> use dataverse test;
> create type DBLPType as
> closed {
> id : bigint,
> dblpid : string,
> title : string,
> authors : string,
> misc : string
> }
> create type CSXType as
> closed {
> id : bigint,
> csxid : string,
> title : string,
> authors : string,
> misc : string
> }
> create dataset DBLP(DBLPType) primary key id;
> create dataset CSX(CSXType) primary key id;
> {noformat}
> One function-nested query
> {noformat}
> use dataverse test
> set import-private-functions 'true'
> for $paperDBLP in dataset('DBLP')
> let $idDBLP := $paperDBLP.id
> let $unrankedTokensDBLP := word-tokens($paperDBLP.title)
> let $lenDBLP := len($unrankedTokensDBLP)
> let $tokensDBLP :=
> for $token in $unrankedTokensDBLP
> for $tokenRanked at $i in
> //
> // -- - Stage 1 - --
> //
> for $paper in dataset('DBLP')
> let $id := $paper.id
> for $token in word-tokens($paper.title)
> /*+ hash */
> group by $tokenGroupped := $token with $id
> /*+ inmem 16 16384 */
> order by count($id), $tokenGroupped
> return $tokenGroupped
> where $token = /*+ bcast */ $tokenRanked
> order by $i
> return $i
> for $prefixTokenDBLP in subset-collection($tokensDBLP, 0,
> prefix-len-jaccard(len($tokensDBLP), .8f))
> for $paperCSX in dataset('CSX')
> let $idCSX := $paperCSX.id
> let $unrankedTokensCSX := word-tokens($paperCSX.title)
> let $lenCSX := len($unrankedTokensCSX)
> let $tokensCSX :=
> for $token in $unrankedTokensCSX
> for $tokenRanked at $i in
> //
> // -- - Stage 1 - --
> //
> for $paper in dataset('DBLP')
> let $id := $paper.id
> for $token in word-tokens($paper.title)
> /*+ hash */
> group by $tokenGroupped := $token with $id
> /*+ inmem 16 16384 */
> order by count($id), $tokenGroupped
> return $tokenGroupped
> where $token = /*+ bcast */ $tokenRanked
> order by $i
> return $i
> let $csxLen := prefix-len-jaccard(len($unrankedTokensCSX), .8f)
> for $prefixTokenCSX in subset-collection($tokensCSX, 0, $csxLen)
> where $prefixTokenDBLP = $prefixTokenCSX
> let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX,
> .8f)
> where $sim[1] >= .8f
> /*+ hash */
> group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
> return {'idDBLP': $idDBLP, 'idCSX': $idCSX}
> {noformat}
> and another nested functions-based equal query
> {noformat}
> use dataverse test
> set import-private-functions 'true'
> for $paperDBLP in dataset('DBLP')
> let $idDBLP := $paperDBLP.id
> let $unrankedTokensDBLP := word-tokens($paperDBLP.title)
> let $lenDBLP := len($unrankedTokensDBLP)
> let $tokensDBLP :=
> for $token in $unrankedTokensDBLP
> for $tokenRanked at $i in
> //
> // -- - Stage 1 - --
> //
> for $paper in dataset('DBLP')
> let $id := $paper.id
> for $token in word-tokens($paper.title)
> /*+ hash */
> group by $tokenGroupped := $token with $id
> /*+ inmem 16 16384 */
> order by count($id), $tokenGroupped
> return $tokenGroupped
> where $token = /*+ bcast */ $tokenRanked
> order by $i
> return $i
> for $prefixTokenDBLP in subset-collection($tokensDBLP, 0,
> prefix-len-jaccard(len($tokensDBLP), .8f))
> for $paperCSX in dataset('CSX')
> let $idCSX := $paperCSX.id
> let $unrankedTokensCSX := word-tokens($paperCSX.title)
> let $lenCSX := len($unrankedTokensCSX)
> let $tokensCSX :=
> for $token in $unrankedTokensCSX
> for $tokenRanked at $i in
> //
> // -- - Stage 1 - --
> //
> for $paper in dataset('DBLP')
> let $id := $paper.id
> for $token in word-tokens($paper.title)
> /*+ hash */
> group by $tokenGroupped := $token with $id
> /*+ inmem 16 16384 */
> order by count($id), $tokenGroupped
> return $tokenGroupped
> where $token = /*+ bcast */ $tokenRanked
> order by $i
> return $i
> for $prefixTokenCSX in subset-collection($tokensCSX, 0,
> prefix-len-jaccard(len($unrankedTokensCSX), .8f))
> where $prefixTokenDBLP = $prefixTokenCSX
> let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX,
> .8f)
> where $sim[1] >= .8f
> /*+ hash */
> group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
> return {'idDBLP': $idDBLP, 'idCSX': $idCSX}
> {noformat}
> generate different plans, where the later will produce a nested loop join.
--
This message was sent by Atlassian JIRA
(v6.3.15#6346)