[ 
https://issues.apache.org/jira/browse/ASTERIXDB-1637?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Yingyi Bu updated ASTERIXDB-1637:
---------------------------------
    Description: 
For the optimizer test 
asterixdb/asterix-app/src/test/resources/optimizerts/queries/inverted-index-join/issue741.aql,
 the generated optimized plan is incorrect.

{noformat}
for $t in dataset('TweetMessages')
where $t.send_time >= datetime('2011-06-18T14:10:17')
and
$t.send_time < datetime('2011-06-18T15:10:17')
return {
    "tweet": $t.tweetid,
    "similar-tweets": for $t2 in dataset('TweetMessages')
                      let $sim := similarity-jaccard-check($t.referred_topics, $t2.referred_topics, 0.6f)
                      where $sim[0] and
                      $t2.tweetid != $t.tweetid
                      return $t2.tweetid
}
{noformat}

{noformat}
distribute result [%0->$$11]
-- DISTRIBUTE_RESULT  |PARTITIONED|
  exchange
  -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
    project ([$$11])
    -- STREAM_PROJECT  |PARTITIONED|
      assign [$$11] <- [function-call: asterix:closed-record-constructor, 
Args:[AString: {tweet}, %0->$$33, AString: {similar-tweets}, %0->$$23]]
      -- ASSIGN  |PARTITIONED|
        exchange
        -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
          group by ([$$33 := %0->$$25]) decor ([]) {
                    aggregate [$$23] <- [function-call: asterix:listify, 
Args:[%0->$$26]]
                    -- AGGREGATE  |LOCAL|
                      select (function-call: algebricks:not, 
Args:[function-call: algebricks:is-missing, Args:[%0->$$26]])
                      -- STREAM_SELECT  |LOCAL|
                        nested tuple source
                        -- NESTED_TUPLE_SOURCE  |LOCAL|
                 }
          -- PRE_CLUSTERED_GROUP_BY[$$25]  |PARTITIONED|
            exchange
            -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
              order (ASC, %0->$$25) 
              -- STABLE_SORT [$$25(ASC)]  |PARTITIONED|
                exchange
                -- HASH_PARTITION_EXCHANGE [$$25]  |PARTITIONED|
                  project ([$$25, $$26])
                  -- STREAM_PROJECT  |PARTITIONED|
                    exchange
                    -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                      join (function-call: algebricks:eq, Args:[%0->$$36, 
%0->$$25])
                      -- HYBRID_HASH_JOIN [$$36][$$25]  |PARTITIONED|
                        exchange
                        -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                          project ([$$36])
                          -- STREAM_PROJECT  |PARTITIONED|
                            select (function-call: algebricks:and, 
Args:[function-call: algebricks:ge, Args:[%0->$$24, ADateTime: { 
2011-06-18T14:10:17.000Z }], function-call: algebricks:lt, Args:[%0->$$24, 
ADateTime: { 2011-06-18T15:10:17.000Z }]])
                            -- STREAM_SELECT  |PARTITIONED|
                              project ([$$36, $$24])
                              -- STREAM_PROJECT  |PARTITIONED|
                                assign [$$24] <- [function-call: 
asterix:field-access-by-index, Args:[%0->$$0, AInt32: {3}]]
                                -- ASSIGN  |PARTITIONED|
                                  exchange
                                  -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                    data-scan []<-[$$36, $$0] <- 
test:TweetMessages
                                    -- DATASOURCE_SCAN  |PARTITIONED|
                                      exchange
                                      -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                        empty-tuple-source
                                        -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
                        exchange
                        -- HASH_PARTITION_EXCHANGE [$$25]  |PARTITIONED|
                          project ([$$25, $$26])
                          -- STREAM_PROJECT  |PARTITIONED|
                            select (function-call: algebricks:and, 
Args:[function-call: algebricks:neq, Args:[%0->$$26, %0->$$25], function-call: 
asterix:get-item, Args:[function-call: asterix:similarity-jaccard-check, 
Args:[%0->$$29, function-call: asterix:field-access-by-index, Args:[%0->$$1, 
AInt32: {4}], AFloat: {0.6}], AInt64: {0}]])
                            -- STREAM_SELECT  |PARTITIONED|
                              project ([$$1, $$25, $$26, $$29])
                              -- STREAM_PROJECT  |PARTITIONED|
                                exchange
                                -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                  left-outer-unnest-map [$$26, $$1] <- 
function-call: asterix:index-search, Args:[AString: {TweetMessages}, AInt32: 
{0}, AString: {test}, AString: {TweetMessages}, ABoolean: {true}, ABoolean: 
{false}, AInt32: {1}, %0->$$39, AInt32: {1}, %0->$$39, TRUE, TRUE, TRUE]
                                  -- BTREE_SEARCH  |PARTITIONED|
                                    exchange
                                    -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                      order (ASC, %0->$$39) 
                                      -- STABLE_SORT [$$39(ASC)]  |PARTITIONED|
                                        exchange
                                        -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                          left-outer-unnest-map [$$39] <- 
function-call: asterix:index-search, Args:[AString: {topicIIx}, AInt32: {4}, 
AString: {test}, AString: {TweetMessages}, ABoolean: {true}, ABoolean: {true}, 
AInt32: {1}, AFloat: {0.6}, AInt32: {22}, AInt32: {1}, %0->$$29]
                                          -- LENGTH_PARTITIONED_INVERTED_INDEX_SEARCH  |PARTITIONED|
                                            exchange
                                            -- BROADCAST_EXCHANGE  |PARTITIONED|
                                              project ([$$25, $$29])
                                              -- STREAM_PROJECT  |PARTITIONED|
                                                select (function-call: 
algebricks:and, Args:[function-call: algebricks:ge, Args:[%0->$$37, ADateTime: 
{ 2011-06-18T14:10:17.000Z }], function-call: algebricks:lt, Args:[%0->$$37, 
ADateTime: { 2011-06-18T15:10:17.000Z }]])
                                                -- STREAM_SELECT  |PARTITIONED|
                                                  project ([$$37, $$25, $$29])
                                                  -- STREAM_PROJECT  |PARTITIONED|
                                                    assign [$$29, $$37] <- [function-call: asterix:field-access-by-index, Args:[%0->$$38, AInt32: {4}], function-call: asterix:field-access-by-index, Args:[%0->$$38, AInt32: {3}]]
                                                    -- ASSIGN  |PARTITIONED|
                                                      exchange
                                                      -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                                        data-scan []<-[$$25, $$38] <- test:TweetMessages
                                                        -- DATASOURCE_SCAN  |PARTITIONED|
                                                          exchange
                                                          -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                                            empty-tuple-source
                                                            -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
{noformat}

There are several issues here:

1. The filtering condition on $t.send_time is pushed into both input branches 
of the join. It should be applied *only* on the outer branch, not on the inner branch.

2. The left-outer-unnest-map operators in the plan should be unnest-map operators.

3. The join in the plan should be a left outer join instead of an inner join.


  was:
For optimizer test 
asterixdb/asterix-app/src/test/resources/optimizerts/queries/inverted-index-join/issue741.aql,
 the optimized plan is not right.

{noformat}
for $t in dataset('TweetMessages')
where $t.send_time >= datetime('2011-06-18T14:10:17')
and
$t.send_time < datetime('2011-06-18T15:10:17')
return {
    "tweet": $t.tweetid,
    "similar-tweets": for $t2 in dataset('TweetMessages')
                      let $sim := similarity-jaccard-check($t.referred_topics, 
$t2.referred_topics, 0.6f)
              where $sim[0] and
                      $t2.tweetid != $t.tweetid
                      return $t2.tweetid
}
{noformat}

{noformat}
distribute result [%0->$$11]
-- DISTRIBUTE_RESULT  |PARTITIONED|
  exchange
  -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
    project ([$$11])
    -- STREAM_PROJECT  |PARTITIONED|
      assign [$$11] <- [function-call: asterix:closed-record-constructor, 
Args:[AString: {tweet}, %0->$$33, AString: {similar-tweets}, %0->$$23]]
      -- ASSIGN  |PARTITIONED|
        exchange
        -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
          group by ([$$33 := %0->$$25]) decor ([]) {
                    aggregate [$$23] <- [function-call: asterix:listify, 
Args:[%0->$$26]]
                    -- AGGREGATE  |LOCAL|
                      select (function-call: algebricks:not, 
Args:[function-call: algebricks:is-missing, Args:[%0->$$26]])
                      -- STREAM_SELECT  |LOCAL|
                        nested tuple source
                        -- NESTED_TUPLE_SOURCE  |LOCAL|
                 }
          -- PRE_CLUSTERED_GROUP_BY[$$25]  |PARTITIONED|
            exchange
            -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
              order (ASC, %0->$$25) 
              -- STABLE_SORT [$$25(ASC)]  |PARTITIONED|
                exchange
                -- HASH_PARTITION_EXCHANGE [$$25]  |PARTITIONED|
                  project ([$$25, $$26])
                  -- STREAM_PROJECT  |PARTITIONED|
                    exchange
                    -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                      join (function-call: algebricks:eq, Args:[%0->$$36, 
%0->$$25])
                      -- HYBRID_HASH_JOIN [$$36][$$25]  |PARTITIONED|
                        exchange
                        -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                          project ([$$36])
                          -- STREAM_PROJECT  |PARTITIONED|
                            select (function-call: algebricks:and, 
Args:[function-call: algebricks:ge, Args:[%0->$$24, ADateTime: { 
2011-06-18T14:10:17.000Z }], function-call: algebricks:lt, Args:[%0->$$24, 
ADateTime: { 2011-06-18T15:10:17.000Z }]])
                            -- STREAM_SELECT  |PARTITIONED|
                              project ([$$36, $$24])
                              -- STREAM_PROJECT  |PARTITIONED|
                                assign [$$24] <- [function-call: 
asterix:field-access-by-index, Args:[%0->$$0, AInt32: {3}]]
                                -- ASSIGN  |PARTITIONED|
                                  exchange
                                  -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                    data-scan []<-[$$36, $$0] <- 
test:TweetMessages
                                    -- DATASOURCE_SCAN  |PARTITIONED|
                                      exchange
                                      -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                        empty-tuple-source
                                        -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
                        exchange
                        -- HASH_PARTITION_EXCHANGE [$$25]  |PARTITIONED|
                          project ([$$25, $$26])
                          -- STREAM_PROJECT  |PARTITIONED|
                            select (function-call: algebricks:and, 
Args:[function-call: algebricks:neq, Args:[%0->$$26, %0->$$25], function-call: 
asterix:get-item, Args:[function-call: asterix:similarity-jaccard-check, 
Args:[%0->$$29, function-call: asterix:field-access-by-index, Args:[%0->$$1, 
AInt32: {4}], AFloat: {0.6}], AInt64: {0}]])
                            -- STREAM_SELECT  |PARTITIONED|
                              project ([$$1, $$25, $$26, $$29])
                              -- STREAM_PROJECT  |PARTITIONED|
                                exchange
                                -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                  left-outer-unnest-map [$$26, $$1] <- 
function-call: asterix:index-search, Args:[AString: {TweetMessages}, AInt32: 
{0}, AString: {test}, AString: {TweetMessages}, ABoolean: {true}, ABoolean: 
{false}, AInt32: {1}, %0->$$39, AInt32: {1}, %0->$$39, TRUE, TRUE, TRUE]
                                  -- BTREE_SEARCH  |PARTITIONED|
                                    exchange
                                    -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                      order (ASC, %0->$$39) 
                                      -- STABLE_SORT [$$39(ASC)]  |PARTITIONED|
                                        exchange
                                        -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                          left-outer-unnest-map [$$39] <- 
function-call: asterix:index-search, Args:[AString: {topicIIx}, AInt32: {4}, 
AString: {test}, AString: {TweetMessages}, ABoolean: {true}, ABoolean: {true}, 
AInt32: {1}, AFloat: {0.6}, AInt32: {22}, AInt32: {1}, %0->$$29]
                                          -- 
LENGTH_PARTITIONED_INVERTED_INDEX_SEARCH  |PARTITIONED|
                                            exchange
                                            -- BROADCAST_EXCHANGE  |PARTITIONED|
                                              project ([$$25, $$29])
                                              -- STREAM_PROJECT  |PARTITIONED|
                                                select (function-call: 
algebricks:and, Args:[function-call: algebricks:ge, Args:[%0->$$37, ADateTime: 
{ 2011-06-18T14:10:17.000Z }], function-call: algebricks:lt, Args:[%0->$$37, 
ADateTime: { 2011-06-18T15:10:17.000Z }]])
                                                -- STREAM_SELECT  |PARTITIONED|
                                                  project ([$$37, $$25, $$29])
                                                  -- STREAM_PROJECT  
|PARTITIONED|
                                                    assign [$$29, $$37] <- 
[function-call: asterix:field-access-by-index, Args:[%0->$$38, AInt32: {4}], 
function-call: asterix:field-access-by-index, Args:[%0->$$38, AInt32: {3}]]
                                                    -- ASSIGN  |PARTITIONED|
                                                      exchange
                                                      -- ONE_TO_ONE_EXCHANGE  
|PARTITIONED|
                                                        data-scan []<-[$$25, 
$$38] <- test:TweetMessages
                                                        -- DATASOURCE_SCAN  
|PARTITIONED|
                                                          exchange
                                                          -- 
ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                                            empty-tuple-source
                                                            -- 
EMPTY_TUPLE_SOURCE  |PARTITIONED|
{noformat}

There are several issues here:

1. The filtering condition on $t.send_time gets pushed to a wrong input branch. 
 It should be on the outer branch, rather than the inner branch.

2. The left_outer_unnest_maps in the plan should be unnest_map.

3. The join in the plan should be a left outer join instead of an inner join.



> Incorrect plan generated by left outer index join rewriting
> -----------------------------------------------------------
>
>                 Key: ASTERIXDB-1637
>                 URL: https://issues.apache.org/jira/browse/ASTERIXDB-1637
>             Project: Apache AsterixDB
>          Issue Type: Bug
>          Components: Optimizer
>            Reporter: Yingyi Bu
>            Assignee: Taewoo Kim
>
> For the optimizer test 
> asterixdb/asterix-app/src/test/resources/optimizerts/queries/inverted-index-join/issue741.aql,
>  the generated optimized plan is incorrect.
> {noformat}
> for $t in dataset('TweetMessages')
> where $t.send_time >= datetime('2011-06-18T14:10:17')
> and
> $t.send_time < datetime('2011-06-18T15:10:17')
> return {
>     "tweet": $t.tweetid,
>     "similar-tweets": for $t2 in dataset('TweetMessages')
>                       let $sim := 
> similarity-jaccard-check($t.referred_topics, $t2.referred_topics, 0.6f)
>               where $sim[0] and
>                       $t2.tweetid != $t.tweetid
>                       return $t2.tweetid
> }
> {noformat}
> {noformat}
> distribute result [%0->$$11]
> -- DISTRIBUTE_RESULT  |PARTITIONED|
>   exchange
>   -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>     project ([$$11])
>     -- STREAM_PROJECT  |PARTITIONED|
>       assign [$$11] <- [function-call: asterix:closed-record-constructor, 
> Args:[AString: {tweet}, %0->$$33, AString: {similar-tweets}, %0->$$23]]
>       -- ASSIGN  |PARTITIONED|
>         exchange
>         -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>           group by ([$$33 := %0->$$25]) decor ([]) {
>                     aggregate [$$23] <- [function-call: asterix:listify, 
> Args:[%0->$$26]]
>                     -- AGGREGATE  |LOCAL|
>                       select (function-call: algebricks:not, 
> Args:[function-call: algebricks:is-missing, Args:[%0->$$26]])
>                       -- STREAM_SELECT  |LOCAL|
>                         nested tuple source
>                         -- NESTED_TUPLE_SOURCE  |LOCAL|
>                  }
>           -- PRE_CLUSTERED_GROUP_BY[$$25]  |PARTITIONED|
>             exchange
>             -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>               order (ASC, %0->$$25) 
>               -- STABLE_SORT [$$25(ASC)]  |PARTITIONED|
>                 exchange
>                 -- HASH_PARTITION_EXCHANGE [$$25]  |PARTITIONED|
>                   project ([$$25, $$26])
>                   -- STREAM_PROJECT  |PARTITIONED|
>                     exchange
>                     -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                       join (function-call: algebricks:eq, Args:[%0->$$36, 
> %0->$$25])
>                       -- HYBRID_HASH_JOIN [$$36][$$25]  |PARTITIONED|
>                         exchange
>                         -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                           project ([$$36])
>                           -- STREAM_PROJECT  |PARTITIONED|
>                             select (function-call: algebricks:and, 
> Args:[function-call: algebricks:ge, Args:[%0->$$24, ADateTime: { 
> 2011-06-18T14:10:17.000Z }], function-call: algebricks:lt, Args:[%0->$$24, 
> ADateTime: { 2011-06-18T15:10:17.000Z }]])
>                             -- STREAM_SELECT  |PARTITIONED|
>                               project ([$$36, $$24])
>                               -- STREAM_PROJECT  |PARTITIONED|
>                                 assign [$$24] <- [function-call: 
> asterix:field-access-by-index, Args:[%0->$$0, AInt32: {3}]]
>                                 -- ASSIGN  |PARTITIONED|
>                                   exchange
>                                   -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                                     data-scan []<-[$$36, $$0] <- 
> test:TweetMessages
>                                     -- DATASOURCE_SCAN  |PARTITIONED|
>                                       exchange
>                                       -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                                         empty-tuple-source
>                                         -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
>                         exchange
>                         -- HASH_PARTITION_EXCHANGE [$$25]  |PARTITIONED|
>                           project ([$$25, $$26])
>                           -- STREAM_PROJECT  |PARTITIONED|
>                             select (function-call: algebricks:and, 
> Args:[function-call: algebricks:neq, Args:[%0->$$26, %0->$$25], 
> function-call: asterix:get-item, Args:[function-call: 
> asterix:similarity-jaccard-check, Args:[%0->$$29, function-call: 
> asterix:field-access-by-index, Args:[%0->$$1, AInt32: {4}], AFloat: {0.6}], 
> AInt64: {0}]])
>                             -- STREAM_SELECT  |PARTITIONED|
>                               project ([$$1, $$25, $$26, $$29])
>                               -- STREAM_PROJECT  |PARTITIONED|
>                                 exchange
>                                 -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                                   left-outer-unnest-map [$$26, $$1] <- 
> function-call: asterix:index-search, Args:[AString: {TweetMessages}, AInt32: 
> {0}, AString: {test}, AString: {TweetMessages}, ABoolean: {true}, ABoolean: 
> {false}, AInt32: {1}, %0->$$39, AInt32: {1}, %0->$$39, TRUE, TRUE, TRUE]
>                                   -- BTREE_SEARCH  |PARTITIONED|
>                                     exchange
>                                     -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                                       order (ASC, %0->$$39) 
>                                       -- STABLE_SORT [$$39(ASC)]  
> |PARTITIONED|
>                                         exchange
>                                         -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                                           left-outer-unnest-map [$$39] <- 
> function-call: asterix:index-search, Args:[AString: {topicIIx}, AInt32: {4}, 
> AString: {test}, AString: {TweetMessages}, ABoolean: {true}, ABoolean: 
> {true}, AInt32: {1}, AFloat: {0.6}, AInt32: {22}, AInt32: {1}, %0->$$29]
>                                           -- 
> LENGTH_PARTITIONED_INVERTED_INDEX_SEARCH  |PARTITIONED|
>                                             exchange
>                                             -- BROADCAST_EXCHANGE  
> |PARTITIONED|
>                                               project ([$$25, $$29])
>                                               -- STREAM_PROJECT  |PARTITIONED|
>                                                 select (function-call: 
> algebricks:and, Args:[function-call: algebricks:ge, Args:[%0->$$37, 
> ADateTime: { 2011-06-18T14:10:17.000Z }], function-call: algebricks:lt, 
> Args:[%0->$$37, ADateTime: { 2011-06-18T15:10:17.000Z }]])
>                                                 -- STREAM_SELECT  
> |PARTITIONED|
>                                                   project ([$$37, $$25, $$29])
>                                                   -- STREAM_PROJECT  
> |PARTITIONED|
>                                                     assign [$$29, $$37] <- 
> [function-call: asterix:field-access-by-index, Args:[%0->$$38, AInt32: {4}], 
> function-call: asterix:field-access-by-index, Args:[%0->$$38, AInt32: {3}]]
>                                                     -- ASSIGN  |PARTITIONED|
>                                                       exchange
>                                                       -- ONE_TO_ONE_EXCHANGE  
> |PARTITIONED|
>                                                         data-scan []<-[$$25, 
> $$38] <- test:TweetMessages
>                                                         -- DATASOURCE_SCAN  
> |PARTITIONED|
>                                                           exchange
>                                                           -- 
> ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                                                             empty-tuple-source
>                                                             -- 
> EMPTY_TUPLE_SOURCE  |PARTITIONED|
> {noformat}
> There are several issues here:
> 1. The filtering condition on $t.send_time is pushed into both input 
> branches of the join. It should be applied *only* on the outer branch, not 
> on the inner branch.
> 2. The left-outer-unnest-map operators in the plan should be unnest-map operators.
> 3. The join in the plan should be a left outer join instead of an inner join.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to