[
https://issues.apache.org/jira/browse/ASTERIXDB-1556?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15399664#comment-15399664
]
Wenhai edited comment on ASTERIXDB-1556 at 7/29/16 5:18 PM:
------------------------------------------------------------
{noformat}
drop dataverse fuzzyjointest if exists;
create dataverse fuzzyjointest;
use dataverse fuzzyjointest;
create type DBLPType as open {
tid: uuid,
id: int64,
dblpid: string?,
title: string?,
authors: string?,
misc: string?
}
create dataset DBLP(DBLPType) primary key tid autogenerated;
create dataset CSX(DBLPType) primary key tid autogenerated;
create dataset ACM(DBLPType) primary key tid autogenerated;
load dataset DBLP
using localfs
(("path"="127.0.0.1:///Users/michael/Research/asterixdb-src/asterixdb-fuzzy/asterixdb/asterixdb/asterix-app/data/dblp-small/dblp-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
load dataset CSX
using localfs
(("path"="127.0.0.1:///Users/michael/Research/asterixdb-src/asterixdb-fuzzy/asterixdb/asterixdb/asterix-app/data/pub-small/csx-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
load dataset ACM
using localfs
(("path"="127.0.0.1:///Users/michael/Research/asterixdb-src/asterixdb-fuzzy/asterixdb/asterixdb/asterix-app/data/pub-small/csx-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
{noformat}
//The 2wayjoin
{noformat}
for $r in dataset DBLP
for $s in dataset CSX
where word-tokens($r.title) ~= word-tokens($s.title)
return {"rid": $r.tid, "sid": $s.tid}
{noformat}
//The 3wayjoin
{noformat}
use dataverse fuzzyjointest;
for $r in dataset DBLP
for $s in dataset CSX
for $t in dataset ACM
where word-tokens($r.title) ~= word-tokens($s.title)
and word-tokens($r.authors) ~= word-tokens($t.authors)
return {"rid": $r.tid, "sid": $s.tid, "tid": $t.tid}
{noformat}
was (Author: lwhay):
{noformat}
drop dataverse fuzzyjointest if exists;
create dataverse fuzzyjointest;
use dataverse fuzzyjointest;
create type DBLPType as open {
tid: uuid,
id: int64,
dblpid: string?,
title: string?,
authors: string?,
misc: string?
}
create type CSXType as closed {
tid: uuid,
id: int64,
csxid: string?,
title: string?,
authors: string?,
misc: string?
}
create dataset DBLP(DBLPType) primary key tid autogenerated;
create dataset CSX(CSXType) primary key tid autogenerated;
load dataset DBLP
using localfs
(("path"="127.0.0.1:///Users/michael/Research/asterixdb-src/asterixdb-fuzzy/asterixdb/asterixdb/asterix-app/data/dblp-small/dblp-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"))
pre-sorted;
load dataset CSX
using localfs
(("path"="127.0.0.1:///Users/michael/Research/asterixdb-src/asterixdb-fuzzy/asterixdb/asterixdb/asterix-app/data/pub-small/csx-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
{noformat}
//The 2wayjoin
{noformat}
use dataverse fuzzyjointest;
for $r in dataset DBLP
for $s in dataset DBLP
where word-tokens($r.title) ~= word-tokens($s.title)
return {"rid": $r.tid, "sid": $s.tid}
{noformat}
//The 3wayjoin
{noformat}
use dataverse fuzzyjointest;
for $r in dataset DBLP
for $s in dataset DBLP
for $t in dataset DBLP
where word-tokens($r.title) ~= word-tokens($s.title)
and word-tokens($r.authors) ~= word-tokens($t.authors)
return {"rid": $r.tid, "sid": $s.tid, "tid": $t.tid}
{noformat}
> Prefix-based multi-way Fuzzy-join generates an exception.
> ---------------------------------------------------------
>
> Key: ASTERIXDB-1556
> URL: https://issues.apache.org/jira/browse/ASTERIXDB-1556
> Project: Apache AsterixDB
> Issue Type: Bug
> Reporter: Taewoo Kim
> Attachments: 2wayjoin.pdf, 2wayjoin.rtf, 2wayjoinplan.rtf,
> 3wayjoin.pdf, 3wayjoin.rtf, 3wayjoinplan.rtf
>
>
> When we enable prefix-based fuzzy-join and apply a multi-way fuzzy-join
> (more than 2 ways), the system generates an out-of-memory exception.
> Since a fuzzy-join is created using 30-40 lines of AQL code and this AQL is
> translated into a massive number of operators (more than 200 operators in the
> plan for a 3-way fuzzy join), it could generate an out-of-memory exception.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)