[ 
https://issues.apache.org/jira/browse/ASTERIXDB-1435?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Wenhai updated ASTERIXDB-1435:
------------------------------
    Description: 
When we tested the fuzzy-join patch 
(https://asterix-gerrit.ics.uci.edu/#/c/531/) on a million-record ACM table 
joined with a half-million-record DBLP table joined with a 2-million-record CITE 
dataset, we encountered a heap error that should not occur. We divided the 
three-way prefix-based fuzzy join into two rounds, as follows.
Schema
{noformat}
drop dataverse testtype if exists;
create dataverse testtype;
use dataverse testtype;
create type ZipfanType as closed {
id: uuid,
zipfan_double1: double,
uniform_int1: int32,
zipfan_long1: int64,
gaussian_short1: int16,
zipfan_double2: double,
uniform_int2: int32,
zipfan_long2: int64,
gaussian_short2: int16,
log_string1: string}
create dataset Zipfan(ZipfanType)
primary key id autogenerated;



drop dataverse test if exists;

create dataverse test;

use dataverse test;

create type PaperType as open {
  tid:uuid,
  title: string,
  authors: string?,
  year: int?,
  conf: string?,
  idx: string,
  abstract: string?
}

create dataset ACM(PaperType) primary key tid autogenerated;

use dataverse test;
drop dataset ACM if exists;
create dataset ACM(PaperType) primary key tid autogenerated;
load dataset ACM
using localfs
(("path"="127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.aa,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ab,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ac,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ad,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ae"),("format"="delimited-text"),("delimiter"="#"),("quote"="\u0000"));

use dataverse test;

create dataset DBLP(PaperType) primary key tid autogenerated;

load dataset DBLP
using localfs
(("path"="127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.aa,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ab,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ac,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ad,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ae,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.af"),("format"="delimited-text"),("delimiter"="#"),("quote"="\u0000"));

create dataset ACM(PaperType) primary key tid autogenerated;

use dataverse test;
drop dataset CITE if exists;
create dataset CITE(PaperType) primary key tid autogenerated;
load dataset CITE
using localfs
(("path"="127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/citation_split.aa,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/citation_split.ab,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/citation_split.ac"),("format"="delimited-text"),("delimiter"="#"),("quote"="\u0000"));

use dataverse test;
drop index ACM.word_index if exists
create index word_index on ACM(title) type keyword
use dataverse test;
drop index ACM.ngram_index if exists
create index ngram_index on ACM(title) type ngram(3)
{noformat}
First, we divided the three-way join into two two-way joins, and joined ACM with 
DBLP into a temporary dataset Stage1,
{noformat}
use dataverse test;
drop dataset Stage1 if exists;
create dataset Stage1(PaperType) primary key tid autogenerated;

set import-private-functions 'true'
set simthreshold '.9f';
insert into dataset Stage1 (
for $t in dataset ('ACM')
for $o in dataset('DBLP')
where word-tokens($o.authors) ~= word-tokens($t.authors)
return {"title":$t.title, 
"authors":$t.authors,"year":$t.year,"conf":$t.conf,"idx":$t.idx,"abstract":$t.abstract})
{noformat}
Afterwards, we executed another two-way join as follows.
{noformat}
use dataverse test;
set import-private-functions 'true'
set simthreshold '.9f';
let $s := sum(
for $t in dataset ('Stage1')
for $o in dataset('CITE')
where word-tokens($o.authors) ~= word-tokens($t.authors)
order by $o.id
return 1)
return $s
{noformat}
This successfully generated the final 7 million results.
Nevertheless, if we run the following query directly,
{noformat}
use dataverse test;
set import-private-functions 'true'
set simthreshold '.9f';
let $s := sum(
for $t in dataset ('ACM')
for $o in dataset('DBLP')
for $g in dataset('CITE')
where word-tokens($o.authors) ~= word-tokens($t.authors) and 
word-tokens($t.authors) ~= word-tokens($g.authors)
order by $o.id
return 1)
return $s
{noformat}

we got the error
{noformat}
Java heap error.
{noformat}

  was:
When we testing the fuzzy-join patch 
(https://asterix-gerrit.ics.uci.edu/#/c/531/) on a million records table ACM 
joining half-million DBLP joining 2million CITE datasets, we encountered a heap 
error that should not to be. We divide the three-way prefix-based fuzzy join in 
two rounds as following.
Schema
{noformat}
drop dataverse testtype if exists;
create dataverse testtype;
use dataverse testtype;
create type ZipfanType as closed {
id: uuid,
zipfan_double1: double,
uniform_int1: int32,
zipfan_long1: int64,
gaussian_short1: int16,
zipfan_double2: double,
uniform_int2: int32,
zipfan_long2: int64,
gaussian_short2: int16,
log_string1: string}
create dataset Zipfan(ZipfanType)
primary key id autogenerated;



drop dataverse test if exists;

create dataverse test;

use dataverse test;

create type PaperType as open {
  tid:uuid,
  title: string,
  authors: string?,
  year: int?,
  conf: string?,
  idx: string,
  abstract: string?
}

create dataset ACM(PaperType) primary key tid autogenerated;

use dataverse test;
drop dataset ACM if exists;
create dataset ACM(PaperType) primary key tid autogenerated;
load dataset ACM
using localfs
(("path"="127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.aa,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ab,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ac,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ad,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ae"),("format"="delimited-text"),("delimiter"="#"),("quote"="\u0000"));

use dataverse test;

create dataset DBLP(PaperType) primary key tid autogenerated;

load dataset DBLP
using localfs
(("path"="127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.aa,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ab,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ac,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ad,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ae,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.af"),("format"="delimited-text"),("delimiter"="#"),("quote"="\u0000"));

create dataset ACM(PaperType) primary key tid autogenerated;

use dataverse test;
drop dataset CITE if exists;
create dataset CITE(PaperType) primary key tid autogenerated;
load dataset CITE
using localfs
(("path"="127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/citation_split.aa,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/citation_split.ab,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/citation_split.ac"),("format"="delimited-text"),("delimiter"="#"),("quote"="\u0000"));

use dataverse test;
drop index ACM.word_index if exists
create index word_index on ACM(title) type keyword
use dataverse test;
drop index ACM.ngram_index if exists
create index ngram_index on ACM(title) type ngram(3)
{noformat}
Firstly, we divide the three join onto two two-way joins and joined ACM with 
DBLP onto a temporary table STATGE1,
{noformat}
use dataverse test;
drop dataset Stage1 if exists;
create dataset Stage1(PaperType) primary key tid autogenerated;

set import-private-functions 'true'
set simthreshold '.9f';
insert into dataset Stage1 (
for $t in dataset ('ACM')
for $o in dataset('DBLP')
where word-tokens($o.authors) ~= word-tokens($t.authors)
return {"title":$t.title, 
"authors":$t.authors,"year":$t.year,"conf":$t.conf,"idx":$t.idx,"abstract":$t.abstract})
{noformat}
Afterwards, we executed another two-way join as follows.
{noformat}
use dataverse test;
set import-private-functions 'true'
set simthreshold '.9f';
let $s := sum(
for $t in dataset ('Stage1')
for $o in dataset('CITE')
where word-tokens($o.authors) ~= word-tokens($t.authors)
order by $o.id
return 1)
return $s
{noformat}
It is successful to generate the final 7-million results.
Nevertheless, if we enforce the following query directly,
{noformat}
use dataverse test;
set import-private-functions 'true'
set simthreshold '.9f';
let $s := sum(
for $t in dataset ('ACM')
for $o in dataset('DBLP')
for $g in dataset('CITE')
where word-tokens($o.authors) ~= word-tokens($t.authors) and 
word-tokens($t.authors) ~= word-tokens($g.authors)
order by $o.id
return 1)
return $s
{noformat}

we got the error
{noformat}
Java heap error.
{noformat}


> Massive operators will induce a heap error that should not occur.
> ---------------------------------------------------------------
>
>                 Key: ASTERIXDB-1435
>                 URL: https://issues.apache.org/jira/browse/ASTERIXDB-1435
>             Project: Apache AsterixDB
>          Issue Type: Improvement
>          Components: AsterixDB
>         Environment: Ubuntu 12.04 on a single machine with 2 X 12 partitions 
> on 6 CPUs X 4 hard threads
>            Reporter: Wenhai
>
> When we testing the fuzzy-join patch 
> (https://asterix-gerrit.ics.uci.edu/#/c/531/) on a million records table ACM 
> joining half-million DBLP joining 2million CITE datasets, we encountered a 
> heap error that should not to be. We divide the three-way prefix-based fuzzy 
> join in two rounds as following.
> Schema
> {noformat}
> drop dataverse testtype if exists;
> create dataverse testtype;
> use dataverse testtype;
> create type ZipfanType as closed {
> id: uuid,
> zipfan_double1: double,
> uniform_int1: int32,
> zipfan_long1: int64,
> gaussian_short1: int16,
> zipfan_double2: double,
> uniform_int2: int32,
> zipfan_long2: int64,
> gaussian_short2: int16,
> log_string1: string}
> create dataset Zipfan(ZipfanType)
> primary key id autogenerated;
> drop dataverse test if exists;
> create dataverse test;
> use dataverse test;
> create type PaperType as open {
>   tid:uuid,
>   title: string,
>   authors: string?,
>   year: int?,
>   conf: string?,
>   idx: string,
>   abstract: string?
> }
> create dataset ACM(PaperType) primary key tid autogenerated;
> use dataverse test;
> drop dataset ACM if exists;
> create dataset ACM(PaperType) primary key tid autogenerated;
> load dataset ACM
> using localfs
> (("path"="127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.aa,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ab,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ac,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ad,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/acm_split.ae"),("format"="delimited-text"),("delimiter"="#"),("quote"="\u0000"));
> use dataverse test;
> create dataset DBLP(PaperType) primary key tid autogenerated;
> load dataset DBLP
> using localfs
> (("path"="127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.aa,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ab,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ac,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ad,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.ae,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/dblp_split.af"),("format"="delimited-text"),("delimiter"="#"),("quote"="\u0000"));
> create dataset ACM(PaperType) primary key tid autogenerated;
> use dataverse test;
> drop dataset CITE if exists;
> create dataset CITE(PaperType) primary key tid autogenerated;
> load dataset CITE
> using localfs
> (("path"="127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/citation_split.aa,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/citation_split.ab,127.0.0.1:///home/hadoop/Downloads/doccorpus/reproduce/citation_split.ac"),("format"="delimited-text"),("delimiter"="#"),("quote"="\u0000"));
> use dataverse test;
> drop index ACM.word_index if exists
> create index word_index on ACM(title) type keyword
> use dataverse test;
> drop index ACM.ngram_index if exists
> create index ngram_index on ACM(title) type ngram(3)
> {noformat}
> Firstly, we divide the three join onto two two-way joins and joined ACM with 
> DBLP onto a temporary table STATGE1,
> {noformat}
> use dataverse test;
> drop dataset Stage1 if exists;
> create dataset Stage1(PaperType) primary key tid autogenerated;
> set import-private-functions 'true'
> set simthreshold '.9f';
> insert into dataset Stage1 (
> for $t in dataset ('ACM')
> for $o in dataset('DBLP')
> where word-tokens($o.authors) ~= word-tokens($t.authors)
> return {"title":$t.title, 
> "authors":$t.authors,"year":$t.year,"conf":$t.conf,"idx":$t.idx,"abstract":$t.abstract})
> {noformat}
> Afterwards, we executed another two-way join as follows.
> {noformat}
> use dataverse test;
> set import-private-functions 'true'
> set simthreshold '.9f';
> let $s := sum(
> for $t in dataset ('Stage1')
> for $o in dataset('CITE')
> where word-tokens($o.authors) ~= word-tokens($t.authors)
> order by $o.id
> return 1)
> return $s
> {noformat}
> It is successful to generate the final 7-million results.
> Nevertheless, if we enforce the following query directly,
> {noformat}
> use dataverse test;
> set import-private-functions 'true'
> set simthreshold '.9f';
> let $s := sum(
> for $t in dataset ('ACM')
> for $o in dataset('DBLP')
> for $g in dataset('CITE')
> where word-tokens($o.authors) ~= word-tokens($t.authors) and 
> word-tokens($t.authors) ~= word-tokens($g.authors)
> order by $o.id
> return 1)
> return $s
> {noformat}
> we got the error
> {noformat}
> Java heap error.
> {noformat}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to