[ 
https://issues.apache.org/jira/browse/ASTERIXDB-1852?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Wenhai updated ASTERIXDB-1852:
------------------------------
    Description: 
In a binary join query, the optimizer should generate a hash-join plan. However, because two function calls are nested before the join condition, we get a wrong plan that employs a nested-loop join.
Schema
{noformat}
drop  dataverse test if exists;
create  dataverse test;
use dataverse test;
create type DBLPType as
 closed {
  id : bigint,
  dblpid : string,
  title : string,
  authors : string,
  misc : string
}
create type CSXType as
 closed {
  id : bigint,
  csxid : string,
  title : string,
  authors : string,
  misc : string
}
create  dataset DBLP(DBLPType) primary key id;
create  dataset CSX(CSXType) primary key id;
{noformat}
One function-nested query
{noformat}
use dataverse test
set import-private-functions 'true'

for $paperDBLP in dataset('DBLP')
let $idDBLP := $paperDBLP.id
let $unrankedTokensDBLP := word-tokens($paperDBLP.title)
let $lenDBLP := len($unrankedTokensDBLP)
let $tokensDBLP :=
    for $token in $unrankedTokensDBLP
    for $tokenRanked at $i in
        //
        // -- - Stage 1 - --
        //
        for $paper in dataset('DBLP')
        let $id := $paper.id
        for $token in word-tokens($paper.title)
        /*+ hash */
        group by $tokenGroupped := $token with $id
        /*+ inmem 16 16384 */
        order by count($id), $tokenGroupped
        return $tokenGroupped
    where $token = /*+ bcast */ $tokenRanked
    order by $i
    return $i
for $prefixTokenDBLP in subset-collection($tokensDBLP, 0, 
prefix-len-jaccard(len($tokensDBLP), .8f))

for $paperCSX in dataset('CSX')
let $idCSX := $paperCSX.id
let $unrankedTokensCSX := word-tokens($paperCSX.title)
let $lenCSX := len($unrankedTokensCSX)
let $tokensCSX :=
    for $token in $unrankedTokensCSX
    for $tokenRanked at $i in
        //
        // -- - Stage 1 - --
        //
        for $paper in dataset('DBLP')
        let $id := $paper.id
        for $token in word-tokens($paper.title)
        /*+ hash */
        group by $tokenGroupped := $token with $id
        /*+ inmem 16 16384 */
        order by count($id), $tokenGroupped
        return $tokenGroupped
    where $token = /*+ bcast */ $tokenRanked
    order by $i
    return $i
let $csxLen := prefix-len-jaccard(len($unrankedTokensCSX), .8f)
for $prefixTokenCSX in subset-collection($tokensCSX, 0, $csxLen)

where $prefixTokenDBLP = $prefixTokenCSX
let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX, 
.8f)
where $sim[1] >= .8f
/*+ hash */
group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
return {'idDBLP': $idDBLP, 'idCSX': $idCSX}
{noformat}
and another, equivalent query based on nested function calls
{noformat}
use dataverse test
set import-private-functions 'true'

for $paperDBLP in dataset('DBLP')
let $idDBLP := $paperDBLP.id
let $unrankedTokensDBLP := word-tokens($paperDBLP.title)
let $lenDBLP := len($unrankedTokensDBLP)
let $tokensDBLP :=
    for $token in $unrankedTokensDBLP
    for $tokenRanked at $i in
        //
        // -- - Stage 1 - --
        //
        for $paper in dataset('DBLP')
        let $id := $paper.id
        for $token in word-tokens($paper.title)
        /*+ hash */
        group by $tokenGroupped := $token with $id
        /*+ inmem 16 16384 */
        order by count($id), $tokenGroupped
        return $tokenGroupped
    where $token = /*+ bcast */ $tokenRanked
    order by $i
    return $i
for $prefixTokenDBLP in subset-collection($tokensDBLP, 0, 
prefix-len-jaccard(len($tokensDBLP), .8f))

for $paperCSX in dataset('CSX')
let $idCSX := $paperCSX.id
let $unrankedTokensCSX := word-tokens($paperCSX.title)
let $lenCSX := len($unrankedTokensCSX)
let $tokensCSX :=
    for $token in $unrankedTokensCSX
    for $tokenRanked at $i in
        //
        // -- - Stage 1 - --
        //
        for $paper in dataset('DBLP')
        let $id := $paper.id
        for $token in word-tokens($paper.title)
        /*+ hash */
        group by $tokenGroupped := $token with $id
        /*+ inmem 16 16384 */
        order by count($id), $tokenGroupped
        return $tokenGroupped
    where $token = /*+ bcast */ $tokenRanked
    order by $i
    return $i
for $prefixTokenCSX in subset-collection($tokensCSX, 0, 
prefix-len-jaccard(len($unrankedTokensCSX), .8f))

where $prefixTokenDBLP = $prefixTokenCSX
let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX, 
.8f)
where $sim[1] >= .8f
/*+ hash */
group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
return {'idDBLP': $idDBLP, 'idCSX': $idCSX}
{noformat}
generate different plans; the latter produces a nested-loop join.
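For quick reference, the two queries differ only in how the CSX prefix bound is computed: the first factors the nested call into a let, while the second inlines it in the for clause. The relevant lines, extracted from the queries above:
{noformat}
// first query: nested call factored into a let
let $csxLen := prefix-len-jaccard(len($unrankedTokensCSX), .8f)
for $prefixTokenCSX in subset-collection($tokensCSX, 0, $csxLen)

// second query: nested call inlined in the for bound
for $prefixTokenCSX in subset-collection($tokensCSX, 0,
    prefix-len-jaccard(len($unrankedTokensCSX), .8f))
{noformat}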

  was:
In a binary join query, the optimizer should generate a hash-join plan. However, because two function calls are nested before the join condition, we get a wrong plan that employs a nested-loop join.
Schema
{noformat}
drop  dataverse test if exists;
create  dataverse test;
use dataverse test;
create type DBLPType as
 closed {
  id : bigint,
  dblpid : string,
  title : string,
  authors : string,
  misc : string
}
create type CSXType as
 closed {
  id : bigint,
  csxid : string,
  title : string,
  authors : string,
  misc : string
}
create  dataset DBLP(DBLPType) primary key id;
create  dataset CSX(CSXType) primary key id;
{noformat}
One function-nested query
{noformat}
use dataverse test
set import-private-functions 'true'

for $paperDBLP in dataset('DBLP')
let $idDBLP := $paperDBLP.id
let $unrankedTokensDBLP := word-tokens($paperDBLP.title)
for $prefixTokenDBLP in subset-collection($unrankedTokensDBLP, 0, 
len($unrankedTokensDBLP))

for $paperCSX in dataset('CSX')
let $idCSX := $paperCSX.id
let $unrankedTokensCSX := word-tokens($paperCSX.title)
for $prefixTokenCSX in subset-collection($unrankedTokensCSX, 0, 
len($unrankedTokensCSX))

where $prefixTokenDBLP = $prefixTokenCSX
let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX, 
.8f)
where $sim[1] >= .8f
/*+ hash */
group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
return {'idDBLP': $idDBLP, 'idCSX': $idCSX}
{noformat}
will generate the following hash-join plan
{noformat}
distribute result [%0->$$29]
-- DISTRIBUTE_RESULT  |PARTITIONED|
  exchange
  -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
    project ([$$29])
    -- STREAM_PROJECT  |PARTITIONED|
      assign [$$29] <- [function-call: asterix:closed-object-constructor, 
Args:[AString: {idDBLP}, %0->$$9, AString: {idCSX}, %0->$$10]]
      -- ASSIGN  |PARTITIONED|
        exchange
        -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
          group by ([$$9 := %0->$$30; $$10 := %0->$$31]) decor ([]) {
                    aggregate [] <- []
                    -- AGGREGATE  |LOCAL|
                      nested tuple source
                      -- NESTED_TUPLE_SOURCE  |LOCAL|
                 }
          -- EXTERNAL_GROUP_BY[$$30, $$31]  |PARTITIONED|
            exchange
            -- HASH_PARTITION_EXCHANGE [$$30, $$31]  |PARTITIONED|
              project ([$$30, $$31])
              -- STREAM_PROJECT  |PARTITIONED|
                select (function-call: algebricks:ge, Args:[function-call: 
asterix:get-item, Args:[function-call: asterix:similarity-jaccard-check, 
Args:[%0->$$2, %0->$$6, AFloat: {0.8}], AInt64: {1}], AFloat: {0.8}])
                -- STREAM_SELECT  |PARTITIONED|
                  project ([$$2, $$6, $$30, $$31])
                  -- STREAM_PROJECT  |PARTITIONED|
                    exchange
                    -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                      join (function-call: algebricks:eq, Args:[%0->$$3, 
%0->$$7])
                      -- HYBRID_HASH_JOIN [$$3][$$7]  |PARTITIONED|
                        exchange
                        -- HASH_PARTITION_EXCHANGE [$$3]  |PARTITIONED|
                          unnest $$3 <- function-call: 
asterix:subset-collection, Args:[%0->$$2, AInt64: {0}, function-call: 
asterix:len, Args:[%0->$$2]]
                          -- UNNEST  |PARTITIONED|
                            project ([$$2, $$30])
                            -- STREAM_PROJECT  |PARTITIONED|
                              assign [$$2] <- [function-call: 
asterix:word-tokens, Args:[function-call: asterix:field-access-by-index, 
Args:[%0->$$0, AInt32: {2}]]]
                              -- ASSIGN  |PARTITIONED|
                                exchange
                                -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                  data-scan []<-[$$30, $$0] <- test:DBLP
                                  -- DATASOURCE_SCAN  |PARTITIONED|
                                    exchange
                                    -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                      empty-tuple-source
                                      -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
                        exchange
                        -- HASH_PARTITION_EXCHANGE [$$7]  |PARTITIONED|
                          unnest $$7 <- function-call: 
asterix:subset-collection, Args:[%0->$$6, AInt64: {0}, function-call: 
asterix:len, Args:[%0->$$6]]
                          -- UNNEST  |PARTITIONED|
                            project ([$$6, $$31])
                            -- STREAM_PROJECT  |PARTITIONED|
                              assign [$$6] <- [function-call: 
asterix:word-tokens, Args:[function-call: asterix:field-access-by-index, 
Args:[%0->$$4, AInt32: {2}]]]
                              -- ASSIGN  |PARTITIONED|
                                exchange
                                -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                  data-scan []<-[$$31, $$4] <- test:CSX
                                  -- DATASOURCE_SCAN  |PARTITIONED|
                                    exchange
                                    -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                      empty-tuple-source
                                      -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
{noformat}
In contrast, when we nest two function calls in the query, as in
{noformat}
use dataverse test
set import-private-functions 'true'

for $paperDBLP in dataset('DBLP')
let $idDBLP := $paperDBLP.id
let $unrankedTokensDBLP := word-tokens($paperDBLP.title)
for $prefixTokenDBLP in subset-collection($unrankedTokensDBLP, 0, 
abs(len($unrankedTokensDBLP)))

for $paperCSX in dataset('CSX')
let $idCSX := $paperCSX.id
let $unrankedTokensCSX := word-tokens($paperCSX.title)
for $prefixTokenCSX in subset-collection($unrankedTokensCSX, 0, 
abs(len($unrankedTokensDBLP)))

where $prefixTokenDBLP = $prefixTokenCSX
let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX, 
.8f)
where $sim[1] >= .8f
/*+ hash */
group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
return {'idDBLP': $idDBLP, 'idCSX': $idCSX}
{noformat}
we get a wrong plan with a nested-loop join, as follows
{noformat}
distribute result [%0->$$31]
-- DISTRIBUTE_RESULT  |PARTITIONED|
  exchange
  -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
    project ([$$31])
    -- STREAM_PROJECT  |PARTITIONED|
      assign [$$31] <- [function-call: asterix:closed-object-constructor, 
Args:[AString: {idDBLP}, %0->$$9, AString: {idCSX}, %0->$$10]]
      -- ASSIGN  |PARTITIONED|
        exchange
        -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
          group by ([$$9 := %0->$$32; $$10 := %0->$$33]) decor ([]) {
                    aggregate [] <- []
                    -- AGGREGATE  |LOCAL|
                      nested tuple source
                      -- NESTED_TUPLE_SOURCE  |LOCAL|
                 }
          -- EXTERNAL_GROUP_BY[$$32, $$33]  |PARTITIONED|
            exchange
            -- HASH_PARTITION_EXCHANGE [$$32, $$33]  |PARTITIONED|
              project ([$$32, $$33])
              -- STREAM_PROJECT  |PARTITIONED|
                select (function-call: algebricks:eq, Args:[%0->$$3, %0->$$7])
                -- STREAM_SELECT  |PARTITIONED|
                  project ([$$32, $$33, $$3, $$7])
                  -- STREAM_PROJECT  |PARTITIONED|
                    unnest $$7 <- function-call: asterix:subset-collection, 
Args:[%0->$$6, AInt64: {0}, function-call: asterix:abs, Args:[function-call: 
asterix:len, Args:[%0->$$2]]]
                    -- UNNEST  |PARTITIONED|
                      exchange
                      -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                        join (function-call: algebricks:ge, 
Args:[function-call: asterix:get-item, Args:[function-call: 
asterix:similarity-jaccard-check, Args:[%0->$$2, %0->$$6, AFloat: {0.8}], 
AInt64: {1}], AFloat: {0.8}])
                        -- NESTED_LOOP  |PARTITIONED|
                          exchange
                          -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                            unnest $$3 <- function-call: 
asterix:subset-collection, Args:[%0->$$2, AInt64: {0}, function-call: 
asterix:abs, Args:[function-call: asterix:len, Args:[%0->$$2]]]
                            -- UNNEST  |PARTITIONED|
                              project ([$$32, $$2])
                              -- STREAM_PROJECT  |PARTITIONED|
                                assign [$$2] <- [function-call: 
asterix:word-tokens, Args:[function-call: asterix:field-access-by-index, 
Args:[%0->$$0, AInt32: {2}]]]
                                -- ASSIGN  |PARTITIONED|
                                  exchange
                                  -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                    data-scan []<-[$$32, $$0] <- test:DBLP
                                    -- DATASOURCE_SCAN  |PARTITIONED|
                                      exchange
                                      -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                        empty-tuple-source
                                        -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
                          exchange
                          -- BROADCAST_EXCHANGE  |PARTITIONED|
                            project ([$$33, $$6])
                            -- STREAM_PROJECT  |PARTITIONED|
                              assign [$$6] <- [function-call: 
asterix:word-tokens, Args:[function-call: asterix:field-access-by-index, 
Args:[%0->$$4, AInt32: {2}]]]
                              -- ASSIGN  |PARTITIONED|
                                exchange
                                -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                  data-scan []<-[$$33, $$4] <- test:CSX
                                  -- DATASOURCE_SCAN  |PARTITIONED|
                                    exchange
                                    -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                      empty-tuple-source
                                      -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
{noformat}
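For comparison, the only change between the two queries in this earlier description is the bound passed to subset-collection: wrapping len(...) in abs(...) (with the CSX branch now referring to the DBLP token list) is enough to flip the plan from a hybrid hash join to a nested-loop join. The relevant lines, copied from the queries above:
{noformat}
// hash-join version
for $prefixTokenDBLP in subset-collection($unrankedTokensDBLP, 0, len($unrankedTokensDBLP))
for $prefixTokenCSX in subset-collection($unrankedTokensCSX, 0, len($unrankedTokensCSX))

// nested-loop version
for $prefixTokenDBLP in subset-collection($unrankedTokensDBLP, 0, abs(len($unrankedTokensDBLP)))
for $prefixTokenCSX in subset-collection($unrankedTokensCSX, 0, abs(len($unrankedTokensDBLP)))
{noformat}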


> Nested function poses wrong nested loop plan.
> ---------------------------------------------
>
>                 Key: ASTERIXDB-1852
>                 URL: https://issues.apache.org/jira/browse/ASTERIXDB-1852
>             Project: Apache AsterixDB
>          Issue Type: Bug
>          Components: AsterixDB
>         Environment: MAC Linux
>            Reporter: Wenhai
>            Assignee: Wenhai
>
> In a binary join query, the optimizer should generate a hash-join plan. However, 
> because two function calls are nested before the join condition, we get a wrong 
> plan that employs a nested-loop join.
> Schema
> {noformat}
> drop  dataverse test if exists;
> create  dataverse test;
> use dataverse test;
> create type DBLPType as
>  closed {
>   id : bigint,
>   dblpid : string,
>   title : string,
>   authors : string,
>   misc : string
> }
> create type CSXType as
>  closed {
>   id : bigint,
>   csxid : string,
>   title : string,
>   authors : string,
>   misc : string
> }
> create  dataset DBLP(DBLPType) primary key id;
> create  dataset CSX(CSXType) primary key id;
> {noformat}
> One function-nested query
> {noformat}
> use dataverse test
> set import-private-functions 'true'
> for $paperDBLP in dataset('DBLP')
> let $idDBLP := $paperDBLP.id
> let $unrankedTokensDBLP := word-tokens($paperDBLP.title)
> let $lenDBLP := len($unrankedTokensDBLP)
> let $tokensDBLP :=
>     for $token in $unrankedTokensDBLP
>     for $tokenRanked at $i in
>         //
>         // -- - Stage 1 - --
>         //
>         for $paper in dataset('DBLP')
>         let $id := $paper.id
>         for $token in word-tokens($paper.title)
>         /*+ hash */
>         group by $tokenGroupped := $token with $id
>         /*+ inmem 16 16384 */
>         order by count($id), $tokenGroupped
>         return $tokenGroupped
>     where $token = /*+ bcast */ $tokenRanked
>     order by $i
>     return $i
> for $prefixTokenDBLP in subset-collection($tokensDBLP, 0, 
> prefix-len-jaccard(len($tokensDBLP), .8f))
> for $paperCSX in dataset('CSX')
> let $idCSX := $paperCSX.id
> let $unrankedTokensCSX := word-tokens($paperCSX.title)
> let $lenCSX := len($unrankedTokensCSX)
> let $tokensCSX :=
>     for $token in $unrankedTokensCSX
>     for $tokenRanked at $i in
>         //
>         // -- - Stage 1 - --
>         //
>         for $paper in dataset('DBLP')
>         let $id := $paper.id
>         for $token in word-tokens($paper.title)
>         /*+ hash */
>         group by $tokenGroupped := $token with $id
>         /*+ inmem 16 16384 */
>         order by count($id), $tokenGroupped
>         return $tokenGroupped
>     where $token = /*+ bcast */ $tokenRanked
>     order by $i
>     return $i
> let $csxLen := prefix-len-jaccard(len($unrankedTokensCSX), .8f)
> for $prefixTokenCSX in subset-collection($tokensCSX, 0, $csxLen)
> where $prefixTokenDBLP = $prefixTokenCSX
> let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX, 
> .8f)
> where $sim[1] >= .8f
> /*+ hash */
> group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
> return {'idDBLP': $idDBLP, 'idCSX': $idCSX}
> {noformat}
> and another, equivalent query based on nested function calls
> {noformat}
> use dataverse test
> set import-private-functions 'true'
> for $paperDBLP in dataset('DBLP')
> let $idDBLP := $paperDBLP.id
> let $unrankedTokensDBLP := word-tokens($paperDBLP.title)
> let $lenDBLP := len($unrankedTokensDBLP)
> let $tokensDBLP :=
>     for $token in $unrankedTokensDBLP
>     for $tokenRanked at $i in
>         //
>         // -- - Stage 1 - --
>         //
>         for $paper in dataset('DBLP')
>         let $id := $paper.id
>         for $token in word-tokens($paper.title)
>         /*+ hash */
>         group by $tokenGroupped := $token with $id
>         /*+ inmem 16 16384 */
>         order by count($id), $tokenGroupped
>         return $tokenGroupped
>     where $token = /*+ bcast */ $tokenRanked
>     order by $i
>     return $i
> for $prefixTokenDBLP in subset-collection($tokensDBLP, 0, 
> prefix-len-jaccard(len($tokensDBLP), .8f))
> for $paperCSX in dataset('CSX')
> let $idCSX := $paperCSX.id
> let $unrankedTokensCSX := word-tokens($paperCSX.title)
> let $lenCSX := len($unrankedTokensCSX)
> let $tokensCSX :=
>     for $token in $unrankedTokensCSX
>     for $tokenRanked at $i in
>         //
>         // -- - Stage 1 - --
>         //
>         for $paper in dataset('DBLP')
>         let $id := $paper.id
>         for $token in word-tokens($paper.title)
>         /*+ hash */
>         group by $tokenGroupped := $token with $id
>         /*+ inmem 16 16384 */
>         order by count($id), $tokenGroupped
>         return $tokenGroupped
>     where $token = /*+ bcast */ $tokenRanked
>     order by $i
>     return $i
> for $prefixTokenCSX in subset-collection($tokensCSX, 0, 
> prefix-len-jaccard(len($unrankedTokensCSX), .8f))
> where $prefixTokenDBLP = $prefixTokenCSX
> let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX, 
> .8f)
> where $sim[1] >= .8f
> /*+ hash */
> group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
> return {'idDBLP': $idDBLP, 'idCSX': $idCSX}
> {noformat}
> generate different plans; the latter produces a nested-loop join.



--
This message was sent by Atlassian JIRA
(v6.3.15#6346)
