Wenhai created ASTERIXDB-1487:
---------------------------------

             Summary: Fuzzy select-join on inverted index produces inconsistent 
results.
                 Key: ASTERIXDB-1487
                 URL: https://issues.apache.org/jira/browse/ASTERIXDB-1487
             Project: Apache AsterixDB
          Issue Type: Bug
          Components: AsterixDB
         Environment: Mac, 4 cores, 8 GB memory. Current master as of 
3/17/2016.
            Reporter: Wenhai
            Priority: Critical


As shown below, after switching the order of the two "for" clauses of the fuzzy 
join over a select, the results are inconsistent.
Schema
{noformat}
drop dataverse test if exists;
create dataverse test;
use dataverse test;

create type DBLPNestedType as closed {
  id: int64,
  dblpid: string,
  title: string,
  authors: string,
  misc: string
}

create type DBLPType as closed {
  nested: DBLPNestedType
}

create type CSXNestedType as closed {
  id: int64,
  csxid: string,
  title: string,
  authors: string,
  misc: string
}

create type CSXType as closed {
  nested: CSXNestedType
}

create dataset DBLPtmp(DBLPNestedType) primary key id;
create dataset CSXtmp(CSXNestedType) primary key id;

create dataset DBLP(DBLPType) primary key nested.id;
create dataset CSX(CSXType) primary key nested.id;

use dataverse test;

load dataset DBLPtmp
using localfs
(("path"="asterix_nc1://data/dblp-small/dblp-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"))
 pre-sorted;

load dataset CSXtmp
using localfs
(("path"="asterix_nc1://data/pub-small/csx-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));

insert into dataset DBLP(
        for $x in dataset DBLPtmp
        return {
                "nested": $x
        }
);

insert into dataset CSX(
        for $x in dataset CSXtmp
        return {
                "nested": $x
        }
);

{noformat}
Indexes
{noformat}
create index keyword_index on DBLP(nested.title) type keyword; 
create index keyword_indexdbauhors on DBLP(nested.authors) type keyword;
create index keyword_indexcsxauthors on CSX(nested.authors) type keyword;
{noformat}
The following query
{noformat}
use dataverse test;
set simthresholds '.1'
let $s := count(
for $o in dataset DBLP
for $t in dataset CSX
where contains($o.nested.title, "System") and word-tokens($o.nested.authors) ~= 
word-tokens($t.nested.authors)
return $o
)
return $s
{noformat}
will return 28, while the query
{noformat}
use dataverse test;
set simthresholds '.1'
let $s := count(
for $t in dataset CSX
for $o in dataset DBLP
where contains($o.nested.title, "System") and word-tokens($o.nested.authors) ~= 
word-tokens($t.nested.authors)
return $o
)
return $s
{noformat}
will return 3, or raise an error on a large dataset.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to