[ https://issues.apache.org/jira/browse/ASTERIXDB-1852?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Wenhai updated ASTERIXDB-1852: ------------------------------ Description: In a binary join query, the optimizer should generate a hash join plan. Due to the two functions has been used before the join condition, we get a wrong plan that employs the nested loop join. Schema {noformat} drop dataverse test if exists; create dataverse test; use dataverse test; create type DBLPType as closed { id : bigint, dblpid : string, title : string, authors : string, misc : string } create type CSXType as closed { id : bigint, csxid : string, title : string, authors : string, misc : string } create dataset DBLP(DBLPType) primary key id; create dataset CSX(CSXType) primary key id; {noformat} One function-nested query {noformat} use dataverse test set import-private-functions 'true' for $paperDBLP in dataset('DBLP') let $idDBLP := $paperDBLP.id let $unrankedTokensDBLP := word-tokens($paperDBLP.title) let $lenDBLP := len($unrankedTokensDBLP) let $tokensDBLP := for $token in $unrankedTokensDBLP for $tokenRanked at $i in // // -- - Stage 1 - -- // for $paper in dataset('DBLP') let $id := $paper.id for $token in word-tokens($paper.title) /*+ hash */ group by $tokenGroupped := $token with $id /*+ inmem 16 16384 */ order by count($id), $tokenGroupped return $tokenGroupped where $token = /*+ bcast */ $tokenRanked order by $i return $i for $prefixTokenDBLP in subset-collection($tokensDBLP, 0, prefix-len-jaccard(len($tokensDBLP), .8f)) for $paperCSX in dataset('CSX') let $idCSX := $paperCSX.id let $unrankedTokensCSX := word-tokens($paperCSX.title) let $lenCSX := len($unrankedTokensCSX) let $tokensCSX := for $token in $unrankedTokensCSX for $tokenRanked at $i in // // -- - Stage 1 - -- // for $paper in dataset('DBLP') let $id := $paper.id for $token in word-tokens($paper.title) /*+ hash */ group by $tokenGroupped := $token with $id /*+ inmem 16 16384 */ order by count($id), $tokenGroupped return $tokenGroupped where $token = /*+ bcast */ $tokenRanked order by $i return $i let $csxLen := prefix-len-jaccard(len($unrankedTokensCSX), .8f) for $prefixTokenCSX in subset-collection($tokensCSX, 0, $csxLen) where $prefixTokenDBLP = $prefixTokenCSX let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX, .8f) where $sim[1] >= .8f /*+ hash */ group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim return {'idDBLP': $idDBLP, 'idCSX': $idCSX} {noformat} and another nested functions-based equal query {noformat} use dataverse test set import-private-functions 'true' for $paperDBLP in dataset('DBLP') let $idDBLP := $paperDBLP.id let $unrankedTokensDBLP := word-tokens($paperDBLP.title) let $lenDBLP := len($unrankedTokensDBLP) let $tokensDBLP := for $token in $unrankedTokensDBLP for $tokenRanked at $i in // // -- - Stage 1 - -- // for $paper in dataset('DBLP') let $id := $paper.id for $token in word-tokens($paper.title) /*+ hash */ group by $tokenGroupped := $token with $id /*+ inmem 16 16384 */ order by count($id), $tokenGroupped return $tokenGroupped where $token = /*+ bcast */ $tokenRanked order by $i return $i for $prefixTokenDBLP in subset-collection($tokensDBLP, 0, prefix-len-jaccard(len($tokensDBLP), .8f)) for $paperCSX in dataset('CSX') let $idCSX := $paperCSX.id let $unrankedTokensCSX := word-tokens($paperCSX.title) let $lenCSX := len($unrankedTokensCSX) let $tokensCSX := for $token in $unrankedTokensCSX for $tokenRanked at $i in // // -- - Stage 1 - -- // for $paper in dataset('DBLP') let $id := $paper.id for $token in word-tokens($paper.title) /*+ hash */ group by $tokenGroupped := $token with $id /*+ inmem 16 16384 */ order by count($id), $tokenGroupped return $tokenGroupped where $token = /*+ bcast */ $tokenRanked order by $i return $i for $prefixTokenCSX in subset-collection($tokensCSX, 0, prefix-len-jaccard(len($unrankedTokensCSX), .8f)) where $prefixTokenDBLP = $prefixTokenCSX let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX, .8f) where $sim[1] >= .8f /*+ hash */ group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim return {'idDBLP': $idDBLP, 'idCSX': $idCSX} {noformat} generate different plans, where the later will produce a nested loop join. was: In a binary join query, the optimizer should generate a hash join plan. Due to the two functions has been used before the join condition, we get a wrong plan that employs the nested loop join. Schema {noformat} drop dataverse test if exists; create dataverse test; use dataverse test; create type DBLPType as closed { id : bigint, dblpid : string, title : string, authors : string, misc : string } create type CSXType as closed { id : bigint, csxid : string, title : string, authors : string, misc : string } create dataset DBLP(DBLPType) primary key id; create dataset CSX(CSXType) primary key id; {noformat} One function-nested query {noformat} use dataverse test set import-private-functions 'true' for $paperDBLP in dataset('DBLP') let $idDBLP := $paperDBLP.id let $unrankedTokensDBLP := word-tokens($paperDBLP.title) for $prefixTokenDBLP in subset-collection($unrankedTokensDBLP, 0, len($unrankedTokensDBLP)) for $paperCSX in dataset('CSX') let $idCSX := $paperCSX.id let $unrankedTokensCSX := word-tokens($paperCSX.title) for $prefixTokenCSX in subset-collection($unrankedTokensCSX, 0, len($unrankedTokensCSX)) where $prefixTokenDBLP = $prefixTokenCSX let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX, .8f) where $sim[1] >= .8f /*+ hash */ group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim return {'idDBLP': $idDBLP, 'idCSX': $idCSX} {noformat} will generate the hash-based join plan {noformat} distribute result [%0->$$29] -- DISTRIBUTE_RESULT |PARTITIONED| exchange -- ONE_TO_ONE_EXCHANGE |PARTITIONED| project ([$$29]) -- STREAM_PROJECT |PARTITIONED| assign [$$29] <- [function-call: asterix:closed-object-constructor, Args:[AString: {idDBLP}, %0->$$9, AString: {idCSX}, %0->$$10]] -- ASSIGN |PARTITIONED| exchange -- ONE_TO_ONE_EXCHANGE |PARTITIONED| group by ([$$9 := %0->$$30; $$10 := %0->$$31]) decor ([]) { aggregate [] <- [] -- AGGREGATE |LOCAL| nested tuple source -- NESTED_TUPLE_SOURCE |LOCAL| } -- EXTERNAL_GROUP_BY[$$30, $$31] |PARTITIONED| exchange -- HASH_PARTITION_EXCHANGE [$$30, $$31] |PARTITIONED| project ([$$30, $$31]) -- STREAM_PROJECT |PARTITIONED| select (function-call: algebricks:ge, Args:[function-call: asterix:get-item, Args:[function-call: asterix:similarity-jaccard-check, Args:[%0->$$2, %0->$$6, AFloat: {0.8}], AInt64: {1}], AFloat: {0.8}]) -- STREAM_SELECT |PARTITIONED| project ([$$2, $$6, $$30, $$31]) -- STREAM_PROJECT |PARTITIONED| exchange -- ONE_TO_ONE_EXCHANGE |PARTITIONED| join (function-call: algebricks:eq, Args:[%0->$$3, %0->$$7]) -- HYBRID_HASH_JOIN [$$3][$$7] |PARTITIONED| exchange -- HASH_PARTITION_EXCHANGE [$$3] |PARTITIONED| unnest $$3 <- function-call: asterix:subset-collection, Args:[%0->$$2, AInt64: {0}, function-call: asterix:len, Args:[%0->$$2]] -- UNNEST |PARTITIONED| project ([$$2, $$30]) -- STREAM_PROJECT |PARTITIONED| assign [$$2] <- [function-call: asterix:word-tokens, Args:[function-call: asterix:field-access-by-index, Args:[%0->$$0, AInt32: {2}]]] -- ASSIGN |PARTITIONED| exchange -- ONE_TO_ONE_EXCHANGE |PARTITIONED| data-scan []<-[$$30, $$0] <- test:DBLP -- DATASOURCE_SCAN |PARTITIONED| exchange -- ONE_TO_ONE_EXCHANGE |PARTITIONED| empty-tuple-source -- EMPTY_TUPLE_SOURCE |PARTITIONED| exchange -- HASH_PARTITION_EXCHANGE [$$7] |PARTITIONED| unnest $$7 <- function-call: asterix:subset-collection, Args:[%0->$$6, AInt64: {0}, function-call: asterix:len, Args:[%0->$$6]] -- UNNEST |PARTITIONED| project ([$$6, $$31]) -- STREAM_PROJECT |PARTITIONED| assign [$$6] <- [function-call: asterix:word-tokens, Args:[function-call: asterix:field-access-by-index, Args:[%0->$$4, AInt32: {2}]]] -- ASSIGN |PARTITIONED| exchange -- ONE_TO_ONE_EXCHANGE |PARTITIONED| data-scan []<-[$$31, $$4] <- test:CSX -- DATASOURCE_SCAN |PARTITIONED| exchange -- ONE_TO_ONE_EXCHANGE |PARTITIONED| empty-tuple-source -- EMPTY_TUPLE_SOURCE |PARTITIONED| {noformat} In contrast, when we conduct two functions in the query like {noformat} use dataverse test set import-private-functions 'true' for $paperDBLP in dataset('DBLP') let $idDBLP := $paperDBLP.id let $unrankedTokensDBLP := word-tokens($paperDBLP.title) for $prefixTokenDBLP in subset-collection($unrankedTokensDBLP, 0, abs(len($unrankedTokensDBLP))) for $paperCSX in dataset('CSX') let $idCSX := $paperCSX.id let $unrankedTokensCSX := word-tokens($paperCSX.title) for $prefixTokenCSX in subset-collection($unrankedTokensCSX, 0, abs(len($unrankedTokensDBLP))) where $prefixTokenDBLP = $prefixTokenCSX let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX, .8f) where $sim[1] >= .8f /*+ hash */ group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim return {'idDBLP': $idDBLP, 'idCSX': $idCSX} {noformat} We got a wrong plan with a nested loop join as follows {noformat} distribute result [%0->$$31] -- DISTRIBUTE_RESULT |PARTITIONED| exchange -- ONE_TO_ONE_EXCHANGE |PARTITIONED| project ([$$31]) -- STREAM_PROJECT |PARTITIONED| assign [$$31] <- [function-call: asterix:closed-object-constructor, Args:[AString: {idDBLP}, %0->$$9, AString: {idCSX}, %0->$$10]] -- ASSIGN |PARTITIONED| exchange -- ONE_TO_ONE_EXCHANGE |PARTITIONED| group by ([$$9 := %0->$$32; $$10 := %0->$$33]) decor ([]) { aggregate [] <- [] -- AGGREGATE |LOCAL| nested tuple source -- NESTED_TUPLE_SOURCE |LOCAL| } -- EXTERNAL_GROUP_BY[$$32, $$33] |PARTITIONED| exchange -- HASH_PARTITION_EXCHANGE [$$32, $$33] |PARTITIONED| project ([$$32, $$33]) -- STREAM_PROJECT |PARTITIONED| select (function-call: algebricks:eq, Args:[%0->$$3, %0->$$7]) -- STREAM_SELECT |PARTITIONED| project ([$$32, $$33, $$3, $$7]) -- STREAM_PROJECT |PARTITIONED| unnest $$7 <- function-call: asterix:subset-collection, Args:[%0->$$6, AInt64: {0}, function-call: asterix:abs, Args:[function-call: asterix:len, Args:[%0->$$2]]] -- UNNEST |PARTITIONED| exchange -- ONE_TO_ONE_EXCHANGE |PARTITIONED| join (function-call: algebricks:ge, Args:[function-call: asterix:get-item, Args:[function-call: asterix:similarity-jaccard-check, Args:[%0->$$2, %0->$$6, AFloat: {0.8}], AInt64: {1}], AFloat: {0.8}]) -- NESTED_LOOP |PARTITIONED| exchange -- ONE_TO_ONE_EXCHANGE |PARTITIONED| unnest $$3 <- function-call: asterix:subset-collection, Args:[%0->$$2, AInt64: {0}, function-call: asterix:abs, Args:[function-call: asterix:len, Args:[%0->$$2]]] -- UNNEST |PARTITIONED| project ([$$32, $$2]) -- STREAM_PROJECT |PARTITIONED| assign [$$2] <- [function-call: asterix:word-tokens, Args:[function-call: asterix:field-access-by-index, Args:[%0->$$0, AInt32: {2}]]] -- ASSIGN |PARTITIONED| exchange -- ONE_TO_ONE_EXCHANGE |PARTITIONED| data-scan []<-[$$32, $$0] <- test:DBLP -- DATASOURCE_SCAN |PARTITIONED| exchange -- ONE_TO_ONE_EXCHANGE |PARTITIONED| empty-tuple-source -- EMPTY_TUPLE_SOURCE |PARTITIONED| exchange -- BROADCAST_EXCHANGE |PARTITIONED| project ([$$33, $$6]) -- STREAM_PROJECT |PARTITIONED| assign [$$6] <- [function-call: asterix:word-tokens, Args:[function-call: asterix:field-access-by-index, Args:[%0->$$4, AInt32: {2}]]] -- ASSIGN |PARTITIONED| exchange -- ONE_TO_ONE_EXCHANGE |PARTITIONED| data-scan []<-[$$33, $$4] <- test:CSX -- DATASOURCE_SCAN |PARTITIONED| exchange -- ONE_TO_ONE_EXCHANGE |PARTITIONED| empty-tuple-source -- EMPTY_TUPLE_SOURCE |PARTITIONED| {noformat} > Nested function poses wrong nested loop plan. > --------------------------------------------- > > Key: ASTERIXDB-1852 > URL: https://issues.apache.org/jira/browse/ASTERIXDB-1852 > Project: Apache AsterixDB > Issue Type: Bug > Components: AsterixDB > Environment: MAC Linux > Reporter: Wenhai > Assignee: Wenhai > > In a binary join query, the optimizer should generate a hash join plan. Due > to the two functions has been used before the join condition, we get a wrong > plan that employs the nested loop join. > Schema > {noformat} > drop dataverse test if exists; > create dataverse test; > use dataverse test; > create type DBLPType as > closed { > id : bigint, > dblpid : string, > title : string, > authors : string, > misc : string > } > create type CSXType as > closed { > id : bigint, > csxid : string, > title : string, > authors : string, > misc : string > } > create dataset DBLP(DBLPType) primary key id; > create dataset CSX(CSXType) primary key id; > {noformat} > One function-nested query > {noformat} > use dataverse test > set import-private-functions 'true' > for $paperDBLP in dataset('DBLP') > let $idDBLP := $paperDBLP.id > let $unrankedTokensDBLP := word-tokens($paperDBLP.title) > let $lenDBLP := len($unrankedTokensDBLP) > let $tokensDBLP := > for $token in $unrankedTokensDBLP > for $tokenRanked at $i in > // > // -- - Stage 1 - -- > // > for $paper in dataset('DBLP') > let $id := $paper.id > for $token in word-tokens($paper.title) > /*+ hash */ > group by $tokenGroupped := $token with $id > /*+ inmem 16 16384 */ > order by count($id), $tokenGroupped > return $tokenGroupped > where $token = /*+ bcast */ $tokenRanked > order by $i > return $i > for $prefixTokenDBLP in subset-collection($tokensDBLP, 0, > prefix-len-jaccard(len($tokensDBLP), .8f)) > for $paperCSX in dataset('CSX') > let $idCSX := $paperCSX.id > let $unrankedTokensCSX := word-tokens($paperCSX.title) > let $lenCSX := len($unrankedTokensCSX) > let $tokensCSX := > for $token in $unrankedTokensCSX > for $tokenRanked at $i in > // > // -- - Stage 1 - -- > // > for $paper in dataset('DBLP') > let $id := $paper.id > for $token in word-tokens($paper.title) > /*+ hash */ > group by $tokenGroupped := $token with $id > /*+ inmem 16 16384 */ > order by count($id), $tokenGroupped > return $tokenGroupped > where $token = /*+ bcast */ $tokenRanked > order by $i > return $i > let $csxLen := prefix-len-jaccard(len($unrankedTokensCSX), .8f) > for $prefixTokenCSX in subset-collection($tokensCSX, 0, $csxLen) > where $prefixTokenDBLP = $prefixTokenCSX > let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX, > .8f) > where $sim[1] >= .8f > /*+ hash */ > group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim > return {'idDBLP': $idDBLP, 'idCSX': $idCSX} > {noformat} > and another nested functions-based equal query > {noformat} > use dataverse test > set import-private-functions 'true' > for $paperDBLP in dataset('DBLP') > let $idDBLP := $paperDBLP.id > let $unrankedTokensDBLP := word-tokens($paperDBLP.title) > let $lenDBLP := len($unrankedTokensDBLP) > let $tokensDBLP := > for $token in $unrankedTokensDBLP > for $tokenRanked at $i in > // > // -- - Stage 1 - -- > // > for $paper in dataset('DBLP') > let $id := $paper.id > for $token in word-tokens($paper.title) > /*+ hash */ > group by $tokenGroupped := $token with $id > /*+ inmem 16 16384 */ > order by count($id), $tokenGroupped > return $tokenGroupped > where $token = /*+ bcast */ $tokenRanked > order by $i > return $i > for $prefixTokenDBLP in subset-collection($tokensDBLP, 0, > prefix-len-jaccard(len($tokensDBLP), .8f)) > for $paperCSX in dataset('CSX') > let $idCSX := $paperCSX.id > let $unrankedTokensCSX := word-tokens($paperCSX.title) > let $lenCSX := len($unrankedTokensCSX) > let $tokensCSX := > for $token in $unrankedTokensCSX > for $tokenRanked at $i in > // > // -- - Stage 1 - -- > // > for $paper in dataset('DBLP') > let $id := $paper.id > for $token in word-tokens($paper.title) > /*+ hash */ > group by $tokenGroupped := $token with $id > /*+ inmem 16 16384 */ > order by count($id), $tokenGroupped > return $tokenGroupped > where $token = /*+ bcast */ $tokenRanked > order by $i > return $i > for $prefixTokenCSX in subset-collection($tokensCSX, 0, > prefix-len-jaccard(len($unrankedTokensCSX), .8f)) > where $prefixTokenDBLP = $prefixTokenCSX > let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX, > .8f) > where $sim[1] >= .8f > /*+ hash */ > group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim > return {'idDBLP': $idDBLP, 'idCSX': $idCSX} > {noformat} > generate different plans, where the later will produce a nested loop join. -- This message was sent by Atlassian JIRA (v6.3.15#6346)