[
https://issues.apache.org/jira/browse/ASTERIXDB-1435?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Wenhai updated ASTERIXDB-1435:
------------------------------
Description:
While testing the fuzzy-join patch
(https://asterix-gerrit.ics.uci.edu/#/c/531/) on a one-million-record ACM
dataset joined with a half-million-record DBLP dataset and a two-million-record
CITE dataset, we encountered a heap error that should not occur. We divided the
three-way prefix-based fuzzy join into two rounds, as follows.
Schema
{noformat}
drop dataverse testtype if exists;
create dataverse testtype;
use dataverse testtype;
create type ZipfanType as closed {
id: uuid,
zipfan_double1: double,
uniform_int1: int32,
zipfan_long1: int64,
gaussian_short1: int16,
zipfan_double2: double,
uniform_int2: int32,
zipfan_long2: int64,
gaussian_short2: int16,
log_string1: string}
create dataset Zipfan(ZipfanType)
primary key id autogenerated;
drop dataverse test if exists;
create dataverse test;
use dataverse test;
create type PaperType as open {
tid:uuid,
title: string,
authors: string?,
year: int?,
conf: string?,
idx: string,
abstract: string?
}
create dataset ACM(PaperType) primary key tid autogenerated;
use dataverse test;
drop dataset ACM if exists;
create dataset ACM(PaperType) primary key tid autogenerated;
load dataset ACM
using localfs
(("path"="127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.aa,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ab,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ac,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ad,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ae"),("format"="delimited-text"),("delimiter"="#"),("quote"="\u0000"));
use dataverse test;
create dataset DBLP(PaperType) primary key tid autogenerated;
load dataset DBLP
using localfs
(("path"="127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.aa,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ab,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ac,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ad,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ae,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.af"),("format"="delimited-text"),("delimiter"="#"),("quote"="\u0000"));
use dataverse test;
drop dataset CITE if exists;
create dataset CITE(PaperType) primary key tid autogenerated;
load dataset CITE
using localfs
(("path"="127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/citation_split.aa,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/citation_split.ab,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/citation_split.ac"),("format"="delimited-text"),("delimiter"="#"),("quote"="\u0000"));
use dataverse test;
drop index ACM.word_index if exists;
create index word_index on ACM(title) type keyword;
use dataverse test;
drop index ACM.ngram_index if exists;
create index ngram_index on ACM(title) type ngram(3);
{noformat}
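With the data loaded, the record counts can be sanity-checked with the builtin
count() before running any joins (a minimal sketch; substitute DBLP or CITE for
ACM to check the other inputs):
{noformat}
use dataverse test;

count(for $t in dataset('ACM') return $t);
{noformat}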
First, we split the three-way join into two two-way joins and joined ACM with
DBLP into a temporary dataset Stage1:
{noformat}
use dataverse test;
drop dataset Stage1 if exists;
create dataset Stage1(PaperType) primary key tid autogenerated;
set import-private-functions 'true';
set simthreshold '.9f';
insert into dataset Stage1 (
for $t in dataset ('ACM')
for $o in dataset('DBLP')
where word-tokens($o.authors) ~= word-tokens($t.authors)
return {"title":$t.title,
"authors":$t.authors,"year":$t.year,"conf":$t.conf,"idx":$t.idx,"abstract":$t.abstract})
{noformat}
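Before the second round, a couple of Stage1 records can be spot-checked to
confirm the insert (a minimal sketch using a FLWOR limit clause):
{noformat}
use dataverse test;

for $t in dataset('Stage1')
limit 2
return $t;
{noformat}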
Afterwards, we executed another two-way join as follows.
{noformat}
use dataverse test;
set import-private-functions 'true';
set simthreshold '.9f';
let $s := sum(
for $t in dataset ('Stage1')
for $o in dataset('CITE')
where word-tokens($o.authors) ~= word-tokens($t.authors)
order by $o.tid
return 1)
return $s
{noformat}
This successfully generated the final 7 million results.
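Incidentally, since the query sums a literal 1 per match, it is simply counting
join results, and the order by clause does not affect the total; an equivalent
count() formulation (a sketch) is:
{noformat}
use dataverse test;
set import-private-functions 'true';
set simthreshold '.9f';

count(
for $t in dataset('Stage1')
for $o in dataset('CITE')
where word-tokens($o.authors) ~= word-tokens($t.authors)
return 1)
{noformat}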
Nevertheless, if we run the following query directly,
{noformat}
use dataverse test;
set import-private-functions 'true';
set simthreshold '.9f';
let $s := sum(
for $t in dataset ('ACM')
for $o in dataset('DBLP')
for $g in dataset('CITE')
where word-tokens($o.authors) ~= word-tokens($t.authors) and
word-tokens($t.authors) ~= word-tokens($g.authors)
order by $o.tid
return 1)
return $s
{noformat}
we got the error
{noformat}
Java heap error.
{noformat}
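A possible mitigation while the root cause is investigated (a hedged sketch,
not a verified fix) is to raise the join memory budget in
asterix-configuration.xml so the fuzzy join has more working memory before
exhausting the heap; the property name is taken from the AsterixDB
configuration documentation, and the 64 MB value is illustrative only:
{noformat}
<property>
  <name>compiler.joinmemory</name>
  <!-- illustrative value (64 MB); adjust to the machine's capacity -->
  <value>67108864</value>
</property>
{noformat}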
was:
While testing the fuzzy-join patch
(https://asterix-gerrit.ics.uci.edu/#/c/531/) on a one-million-record dataset
joined with a half-million-record dataset and a two-million-record dataset, we
encountered a heap error that should not occur. We divided the three-way
prefix-based fuzzy join into two rounds.
Schema
{noformat}
drop dataverse test if exists;
create dataverse test;
use dataverse test;
create type PaperType as open {
tid:uuid,
title: string,
authors: string?,
year: int?,
conf: string?,
idx: string,
abstract: string?
}
create dataset ACM(PaperType) primary key tid autogenerated;
use dataverse test;
drop dataset ACM if exists;
create dataset ACM(PaperType) primary key tid autogenerated;
load dataset ACM
using localfs
(("path"="127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.aa,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ab,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ac,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ad,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ae"),("format"="delimited-text"),("delimiter"="#"),("quote"="\u0000"));
use dataverse test;
create dataset OUTPUT(PaperType) primary key tid autogenerated;
load dataset OUTPUT
using localfs
(("path"="127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/outputacm_raw.aa,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/outputacm_raw.ab,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/outputacm_raw.ac,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/outputacm_raw.ad,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/outputacm_raw.ae,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/outputacm_raw.af,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/outputacm_raw.ag"),("format"="delimited-text"),("delimiter"="#"),("quote"="\u0000"));
use dataverse test;
create dataset DBLP(PaperType) primary key tid autogenerated;
load dataset DBLP
using localfs
(("path"="127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.aa,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ab,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ac,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ad,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ae,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.af"),("format"="delimited-text"),("delimiter"="#"),("quote"="\u0000"));
{noformat}
First, we split the three-way join into two two-way joins and joined ACM with
DBLP into a temporary dataset Stage1:
{noformat}
use dataverse test;
drop dataset Stage1 if exists;
create dataset Stage1(PaperType) primary key tid autogenerated;
set import-private-functions 'true';
set simthreshold '.9f';
insert into dataset Stage1 (
for $t in dataset ('ACM')
for $o in dataset('DBLP')
where word-tokens($o.authors) ~= word-tokens($t.authors)
return {"title":$t.title,
"authors":$t.authors,"year":$t.year,"conf":$t.conf,"idx":$t.idx,"abstract":$t.abstract})
{noformat}
Afterwards, we executed another two-way join as follows.
{noformat}
use dataverse test;
set import-private-functions 'true';
set simthreshold '.9f';
let $s := sum(
for $t in dataset ('Stage1')
for $o in dataset('OUTPUT')
where word-tokens($o.authors) ~= word-tokens($t.authors)
order by $o.tid
return 1)
return $s
{noformat}
This successfully generated the final 10 million results.
Nevertheless, if we run the following query directly,
{noformat}
use dataverse test;
set import-private-functions 'true';
set simthreshold '.9f';
let $s := sum(
for $t in dataset ('ACM')
for $o in dataset('DBLP')
for $g in dataset('OUTPUT')
where word-tokens($o.authors) ~= word-tokens($t.authors) and
word-tokens($t.authors) ~= word-tokens($g.authors)
order by $o.tid
return 1)
return $s
{noformat}
we got the error
{noformat}
{noformat}
> Massive operators will induce a heap error that should not be.
> ---------------------------------------------------------------
>
> Key: ASTERIXDB-1435
> URL: https://issues.apache.org/jira/browse/ASTERIXDB-1435
> Project: Apache AsterixDB
> Issue Type: Improvement
> Components: AsterixDB
> Environment: Ubuntu 12.04 on a single machine with 2 X 12 partitions
> on 6 CPUs X 4 hard threads
> Reporter: Wenhai
>
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)