[ 
https://issues.apache.org/jira/browse/ASTERIXDB-2815?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Wail Y. Alkowaileet updated ASTERIXDB-2815:
-------------------------------------------
    Description: 
Summary:

The issue happens when you have multiway join queries in a subquery with 
DISTINCT. The DISTINCT values might be not computed correctly (see details 
below). However, changing the join order fixes the issue.

DDL: 
{code:sql}
DROP DATAVERSE checkedtweets_schemaless IF EXISTS;
CREATE DATAVERSE checkedtweets_schemaless;
USE checkedtweets_schemaless;
// Create the data type for all our datasets.
CREATE TYPE GenericType AS {
 _id: uuid
};
// Create our datasets.
CREATE DATASET RawTweet(GenericType) PRIMARY KEY _id AUTOGENERATED;
CREATE DATASET User(GenericType) PRIMARY KEY _id AUTOGENERATED;
CREATE DATASET Evidence(GenericType) PRIMARY KEY _id AUTOGENERATED;
CREATE DATASET EvidenceFrom(GenericType) PRIMARY KEY _id AUTOGENERATED;
CREATE DATASET Verification(GenericType) PRIMARY KEY _id AUTOGENERATED;
{code}
 
 Query:
{code:sql}
USE checkedtweets_schemaless;
SELECT t.id, urls
FROM RawTweet t
LET urls = (SELECT DISTINCT VALUE e.url -- the issue is here
            FROM Verification v, v.evidence ve, Evidence e
            WHERE t.id = v.tweet_id
            AND   ve = e.ev_id
            )
WHERE array_count(urls) > 5;
{code}
Part of the plan that's not correct (see attached for the full plan):
{code:java}
                            distinct ([$$63])
                            -- PRE_SORTED_DISTINCT_BY  |PARTITIONED|
                              exchange
                              -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                order (ASC, $$63)
                                -- STABLE_SORT [$$63(ASC)]  |PARTITIONED|
                                  exchange
{code}
The issue is *DISTINCT url* is computed globally not locally for each 'group'

 

Changing the order of the join would produce the correct plan:
{code:sql}
USE checkedtweets_schemaless;
SELECT t.id, urls
FROM RawTweet t
LET urls = (SELECT DISTINCT VALUE e.url -- the issue is here
            FROM Evidence e, Verification v, v.evidence ve
            WHERE t.id = v.tweet_id
            AND   ve = e.ev_id
            )
WHERE array_count(urls) > 5;
{code}
 Correct plan:
{code:java}
                group by ([$$79 := $$69]) decor ([$$73]) {
                          aggregate [$$63, $$72] <- [listify($$62), 
agg-sql-count($$62)]
                          -- AGGREGATE  |LOCAL|
                            distinct ([$$62])
                            -- MICRO_PRE_SORTED_DISTINCT_BY  |LOCAL|
                              order (ASC, $$62)
                              -- MICRO_STABLE_SORT [$$62(ASC)]  |LOCAL|
                                select (not(is-missing($$78)))
                                -- STREAM_SELECT  |LOCAL|
                                  nested tuple source
                                  -- NESTED_TUPLE_SOURCE  |LOCAL|
                       }
{code}
 

 

Another variation needs to be inspected:
{code:sql}
USE checkedtweets_schemaless;
//To make the plan easier to read
set `compiler.sort.parallel` "false";
EXPLAIN
SELECT t.id, urls
FROM RawTweet t
LET urls = (SELECT DISTINCT VALUE e.url
            FROM Verification v, v.evidence ve, Evidence e
            WHERE 
                t.id = v.tweet_id
            AND   
                ve = e.ev_id
            AND
                -- The problem is here, when you have a nested subplan in a 
subplan
                (SOME ver_ev IN v.evidence SATISFIES ver_ev = 10)
            )
WHERE array_count(urls) > 2;
{code}

  was:
Summary:

The issue happens when you have multiway join queries in a subquery with 
DISTINCT. The DISTINCT values might be not computed correctly (see details 
below). However, changing the join order fixes the issue.

DDL: 
{code:sql}
DROP DATAVERSE checkedtweets_schemaless IF EXISTS;
CREATE DATAVERSE checkedtweets_schemaless;
USE checkedtweets_schemaless;
// Create the data type for all our datasets.
CREATE TYPE GenericType AS {
 _id: uuid
};
// Create our datasets.
CREATE DATASET RawTweet(GenericType) PRIMARY KEY _id AUTOGENERATED;
CREATE DATASET User(GenericType) PRIMARY KEY _id AUTOGENERATED;
CREATE DATASET Evidence(GenericType) PRIMARY KEY _id AUTOGENERATED;
CREATE DATASET EvidenceFrom(GenericType) PRIMARY KEY _id AUTOGENERATED;
CREATE DATASET Verification(GenericType) PRIMARY KEY _id AUTOGENERATED;
{code}
 
 Query:
{code:sql}
USE checkedtweets_schemaless;
//To make the plan easier to read
set `compiler.sort.parallel` "false";
SELECT t.id, urls
FROM RawTweet t
LET urls = (SELECT DISTINCT VALUE e.url -- the issue is here
            FROM Verification v, v.evidence ve, Evidence e
            WHERE t.id = v.tweet_id
            AND   ve = e.ev_id
            )
WHERE array_count(urls) > 5;
{code}
Part of the plan that's not correct (see attached for the full plan):
{code:java}
                            distinct ([$$63])
                            -- PRE_SORTED_DISTINCT_BY  |PARTITIONED|
                              exchange
                              -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                order (ASC, $$63)
                                -- STABLE_SORT [$$63(ASC)]  |PARTITIONED|
                                  exchange
{code}
The issue is *DISTINCT url* is computed globally not locally for each 'group'

 

Changing the order of the join would produce the correct plan:
{code:sql}
USE checkedtweets_schemaless;
//To make the plan easier to read
set `compiler.sort.parallel` "false";
SELECT t.id, urls
FROM RawTweet t
LET urls = (SELECT DISTINCT VALUE e.url -- the issue is here
            FROM Evidence e, Verification v, v.evidence ve
            WHERE t.id = v.tweet_id
            AND   ve = e.ev_id
            )
WHERE array_count(urls) > 5;
{code}
 Correct plan:
{code:java}
                group by ([$$79 := $$69]) decor ([$$73]) {
                          aggregate [$$63, $$72] <- [listify($$62), 
agg-sql-count($$62)]
                          -- AGGREGATE  |LOCAL|
                            distinct ([$$62])
                            -- MICRO_PRE_SORTED_DISTINCT_BY  |LOCAL|
                              order (ASC, $$62)
                              -- MICRO_STABLE_SORT [$$62(ASC)]  |LOCAL|
                                select (not(is-missing($$78)))
                                -- STREAM_SELECT  |LOCAL|
                                  nested tuple source
                                  -- NESTED_TUPLE_SOURCE  |LOCAL|
                       }
{code}


> DISTINCT and multiway join in a subquery computes wrong result
> --------------------------------------------------------------
>
>                 Key: ASTERIXDB-2815
>                 URL: https://issues.apache.org/jira/browse/ASTERIXDB-2815
>             Project: Apache AsterixDB
>          Issue Type: Bug
>            Reporter: Wail Y. Alkowaileet
>            Assignee: Wail Y. Alkowaileet
>            Priority: Major
>         Attachments: correct_plan, incorrect_plan
>
>
> Summary:
> The issue happens when you have multiway join queries in a subquery with 
> DISTINCT. The DISTINCT values might be not computed correctly (see details 
> below). However, changing the join order fixes the issue.
> DDL: 
> {code:sql}
> DROP DATAVERSE checkedtweets_schemaless IF EXISTS;
> CREATE DATAVERSE checkedtweets_schemaless;
> USE checkedtweets_schemaless;
> // Create the data type for all our datasets.
> CREATE TYPE GenericType AS {
>  _id: uuid
> };
> // Create our datasets.
> CREATE DATASET RawTweet(GenericType) PRIMARY KEY _id AUTOGENERATED;
> CREATE DATASET User(GenericType) PRIMARY KEY _id AUTOGENERATED;
> CREATE DATASET Evidence(GenericType) PRIMARY KEY _id AUTOGENERATED;
> CREATE DATASET EvidenceFrom(GenericType) PRIMARY KEY _id AUTOGENERATED;
> CREATE DATASET Verification(GenericType) PRIMARY KEY _id AUTOGENERATED;
> {code}
>  
>  Query:
> {code:sql}
> USE checkedtweets_schemaless;
> SELECT t.id, urls
> FROM RawTweet t
> LET urls = (SELECT DISTINCT VALUE e.url -- the issue is here
>             FROM Verification v, v.evidence ve, Evidence e
>             WHERE t.id = v.tweet_id
>             AND   ve = e.ev_id
>             )
> WHERE array_count(urls) > 5;
> {code}
> Part of the plan that's not correct (see attached for the full plan):
> {code:java}
>                             distinct ([$$63])
>                             -- PRE_SORTED_DISTINCT_BY  |PARTITIONED|
>                               exchange
>                               -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                                 order (ASC, $$63)
>                                 -- STABLE_SORT [$$63(ASC)]  |PARTITIONED|
>                                   exchange
> {code}
> The issue is *DISTINCT url* is computed globally not locally for each 'group'
>  
> Changing the order of the join would produce the correct plan:
> {code:sql}
> USE checkedtweets_schemaless;
> SELECT t.id, urls
> FROM RawTweet t
> LET urls = (SELECT DISTINCT VALUE e.url -- the issue is here
>             FROM Evidence e, Verification v, v.evidence ve
>             WHERE t.id = v.tweet_id
>             AND   ve = e.ev_id
>             )
> WHERE array_count(urls) > 5;
> {code}
>  Correct plan:
> {code:java}
>                 group by ([$$79 := $$69]) decor ([$$73]) {
>                           aggregate [$$63, $$72] <- [listify($$62), 
> agg-sql-count($$62)]
>                           -- AGGREGATE  |LOCAL|
>                             distinct ([$$62])
>                             -- MICRO_PRE_SORTED_DISTINCT_BY  |LOCAL|
>                               order (ASC, $$62)
>                               -- MICRO_STABLE_SORT [$$62(ASC)]  |LOCAL|
>                                 select (not(is-missing($$78)))
>                                 -- STREAM_SELECT  |LOCAL|
>                                   nested tuple source
>                                   -- NESTED_TUPLE_SOURCE  |LOCAL|
>                        }
> {code}
>  
>  
> Another variation needs to be inspected:
> {code:sql}
> USE checkedtweets_schemaless;
> //To make the plan easier to read
> set `compiler.sort.parallel` "false";
> EXPLAIN
> SELECT t.id, urls
> FROM RawTweet t
> LET urls = (SELECT DISTINCT VALUE e.url
>             FROM Verification v, v.evidence ve, Evidence e
>             WHERE 
>                 t.id = v.tweet_id
>             AND   
>                 ve = e.ev_id
>             AND
>                 -- The problem is here, when you have a nested subplan in a 
> subplan
>                 (SOME ver_ev IN v.evidence SATISFIES ver_ev = 10)
>             )
> WHERE array_count(urls) > 2;
> {code}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to