http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.6.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.6.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.6.query.aql new file mode 100644 index 0000000..310caa9 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.6.query.aql @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+use dataverse fuzzyjoin_basic;
+
+set import-private-functions 'true'
+
+// Stage 1: distinct tokens of dataset `left`, ordered by ascending occurrence count, then by token
+let $rankedTokens := (
+    for $right in dataset left // NOTE(review): named $right but iterates dataset `left` -- confirm intent
+    let $id := $right.id
+    for $token in word-tokens($right.authors)
+    /*+ hash */ group by $tokenGroupped := $token with $id
+    /*+ inmem 34 198608 */ order by count($id), $tokenGroupped
+    return $tokenGroupped
+)
+
+// Stage 2_2 of right: per `right` record, its raw tokens next to their positions in $rankedTokens
+let $tokenRightVerify := (
+    for $right in dataset right
+    let $tokenUnrankedLeft := word-tokens($right.authors) // NOTE(review): holds tokens of `right`, despite the name
+    let $tokens := (
+        for $token in $tokenUnrankedLeft
+        let $index := // positions $i at which $token equals a ranked token
+            for $tokenRanked at $i in $rankedTokens
+            where $token = /*+ bcast */ $tokenRanked
+            return $i
+        order by $index
+        return $index
+    )
+    order by $right.authors
+    return {"authors": $right.authors, "tokens": $tokenUnrankedLeft, "ranked": $tokens}
+)
+let $tokenRight := ( // NOTE(review): bound here but not referenced by the final return
+    for $right in dataset right
+    let $tokenUnrankedRight := word-tokens($right.authors)
+    for $token in $tokenUnrankedRight
+    for $tokenRanked at $i in $rankedTokens
+    where $token = /*+ bcast */ $tokenRanked
+    order by $i
+    return $i
+)
+
+for $r in $tokenRightVerify
+return $r
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.7.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.7.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.7.query.aql new file mode 100644 index 0000000..4e51613 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.7.query.aql @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+use dataverse fuzzyjoin_basic;
+
+set import-private-functions 'true'
+
+for $paperDBLP in dataset('right') // one output record per record of dataset 'right'
+    let $tokensUnrankedDBLP := word-tokens($paperDBLP.authors)
+    let $lenDBLP := len($tokensUnrankedDBLP) // NOTE(review): not referenced again in this query
+    let $tokensDBLP :=
+        for $tokenUnranked in $tokensUnrankedDBLP
+        let $index := // positions $i at which this token equals a Stage 1 token
+            for $tokenRanked at $i in
+                //
+                // -- - Stage 1: tokens of 'left' ordered by ascending count, then by token - --
+                //
+                for $paper in dataset('left')
+                let $id := $paper.id
+                for $token in word-tokens($paper.authors)
+                /*+ hash */
+                group by $tokenGrouped := $token with $id
+                /*+ inmem 1 302 */
+                order by count($id), $tokenGrouped
+                return $tokenGrouped
+            where $tokenUnranked = /*+ bcast */ $tokenRanked
+            return $i
+        order by $index[0]
+        return $index[0]
+order by $paperDBLP.authors
+return {"tokens": $tokensUnrankedDBLP, "ranks": $tokensDBLP}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.1.ddl.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.1.ddl.aql
new file mode 100644
index 0000000..45cc975
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.1.ddl.aql
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+drop dataverse fuzzyjoin_basic if exists; // start from a clean dataverse
+
+create dataverse fuzzyjoin_basic;
+
+use dataverse fuzzyjoin_basic;
+
+create type BasicType as closed { // record type shared by both datasets below
+    id: uuid,
+    authors: string
+}
+
+create dataset left(BasicType) primary key id autogenerated; // the uuid key is autogenerated, not read from the data
+create dataset right(BasicType) primary key id autogenerated;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.2.update.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.2.update.aql
new file mode 100644
index 0000000..c9aceb2
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.2.update.aql
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+
+load dataset left // bulk-load `left` from the dblpauthors ADM file
+using localfs
+(("path"="asterix_nc1://data/pub-small/dblpauthors.adm"),("format"="adm"));
+
+load dataset right // bulk-load `right` from the csxauthors ADM file
+using localfs
+(("path"="asterix_nc1://data/pub-small/csxauthors.adm"),("format"="adm"));
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.3.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.3.query.aql
new file mode 100644
index 0000000..510c1f0
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.3.query.aql
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+
+set import-private-functions 'true'
+
+let $r := count( // number of (idDBLP, idCSX) groups returned by the join below
+    for $paperDBLP in dataset('left')
+    let $idDBLP := $paperDBLP.id
+    let $tokensUnrankedDBLP := word-tokens($paperDBLP.authors)
+    let $lenDBLP := len($tokensUnrankedDBLP)
+    let $tokensDBLP := // Stage 1 positions of this record's tokens, ascending
+        for $tokenUnranked in $tokensUnrankedDBLP
+        for $tokenRanked at $i in
+            //
+            // -- - Stage 1: tokens of 'left' ordered by ascending count, then by token - --
+            //
+            for $paper in dataset('left')
+            let $id := $paper.id
+            for $token in word-tokens($paper.authors)
+            /*+ hash */
+            group by $tokenGrouped := $token with $id
+            /*+ inmem 1 302 */
+            order by count($id), $tokenGrouped
+            return $tokenGrouped
+        where $tokenUnranked = /*+ bcast */ $tokenRanked
+        order by $i
+        return $i
+    for $prefixTokenDBLP in subset-collection($tokensDBLP, 0, prefix-len-jaccard(len($tokensDBLP), .8f))
+
+    for $paperCSX in dataset('right')
+    let $idCSX := $paperCSX.id
+    let $tokensUnrankedCSX := word-tokens($paperCSX.authors)
+    let $lenCSX := len($tokensUnrankedCSX)
+    let $tokensCSX := // same ranking as above; tokens with no match in the 'left' ordering produce no position
+        for $tokenUnranked in $tokensUnrankedCSX
+        for $tokenRanked at $i in
+            //
+            // -- - Stage 1: tokens of 'left' ordered by ascending count, then by token - --
+            //
+            for $paper in dataset('left')
+            let $id := $paper.id
+            for $token in word-tokens($paper.authors)
+            /*+ hash */
+            group by $tokenGrouped := $token with $id
+            /*+ inmem 1 302 */
+            order by count($id), $tokenGrouped
+            return $tokenGrouped
+        where $tokenUnranked = /*+ bcast */ $tokenRanked
+        order by $i
+        return $i
+    for $prefixTokenCSX in subset-collection($tokensCSX, 0, prefix-len-jaccard(len($tokensCSX), .8f))
+
+    where $prefixTokenDBLP = $prefixTokenCSX // keep pairs whose prefixes share a token
+    let $sim := similarity-jaccard-prefix($lenDBLP, $tokensDBLP, $lenCSX, $tokensCSX, $prefixTokenCSX, .8f)
+    where $sim >= .8f
+    /*+ hash*/
+    group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
+    return {'idDBLP': $idDBLP, 'idCSX': $idCSX, "sim": $sim[0]}
+)
+return $r
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.4.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.4.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.4.query.aql
new file mode 100644
index 0000000..b903881
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.4.query.aql
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+
+set import-private-functions 'true'
+
+let $s := count( // number of (idDBLP, idCSX) groups returned by the join below
+    for $paperDBLP in dataset('left')
+    let $idDBLP := $paperDBLP.id
+    let $tokensUnrankedDBLP := word-tokens($paperDBLP.authors)
+    let $lenDBLP := len($tokensUnrankedDBLP)
+    let $tokensDBLP := // Stage 1 positions of this record's tokens, ascending
+        for $tokenUnranked in $tokensUnrankedDBLP
+        for $tokenRanked at $i in
+            //
+            // -- - Stage 1: tokens of 'left' ordered by ascending count, then by token - --
+            //
+            for $paper in dataset('left')
+            let $id := $paper.id
+            for $token in word-tokens($paper.authors)
+            /*+ hash */
+            group by $tokenGrouped := $token with $id
+            /*+ inmem 1 302 */
+            order by count($id), $tokenGrouped
+            return $tokenGrouped
+        where $tokenUnranked = /*+ bcast */ $tokenRanked
+        order by $i
+        return $i
+    for $prefixTokenDBLP in subset-collection($tokensDBLP, 0, prefix-len-jaccard(len($tokensDBLP), .8f))
+
+    for $paperCSX in dataset('right')
+    let $idCSX := $paperCSX.id
+    let $tokensUnrankedCSX := word-tokens($paperCSX.authors)
+    let $lenCSX := len($tokensUnrankedCSX)
+    let $tokensCSX := // same ranking as above; tokens with no match in the 'left' ordering produce no position
+        for $tokenUnranked in $tokensUnrankedCSX
+        for $tokenRanked at $i in
+            //
+            // -- - Stage 1: tokens of 'left' ordered by ascending count, then by token - --
+            //
+            for $paper in dataset('left')
+            let $id := $paper.id
+            for $token in word-tokens($paper.authors)
+            /*+ hash */
+            group by $tokenGrouped := $token with $id
+            /*+ inmem 1 302 */
+            order by count($id), $tokenGrouped
+            return $tokenGrouped
+        where $tokenUnranked = /*+ bcast */ $tokenRanked
+        order by $i
+        return $i
+    let $actualPrefixLen := prefix-len-jaccard(len($tokensUnrankedCSX), .8f) - len($tokensUnrankedCSX) + len($tokensCSX)
+    for $prefixTokenCSX in subset-collection($tokensCSX, 0, $actualPrefixLen) // prefix length for all tokens, minus those dropped by the rank join
+
+    where $prefixTokenDBLP = $prefixTokenCSX // keep pairs whose prefixes share a token
+    let $sim := similarity-jaccard-prefix($lenDBLP, $tokensDBLP, $lenCSX, $tokensCSX, $prefixTokenCSX, .8f)
+    where $sim >= .8f
+    /*+ hash*/
+    group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
+    return {'idDBLP': $idDBLP, 'idCSX': $idCSX, "sim": $sim[0]}
+)
+return $s // yield the pair count bound above; a constant here would leave the join unused
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.5.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.5.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.5.query.aql new file mode 100644 index 0000000..119520a --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.5.query.aql @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+use dataverse fuzzyjoin_basic;
+
+set import-private-functions 'true'
+
+let $t := count( // number of (idDBLP, idCSX) groups that pass the similarity check below
+    for $paperDBLP in dataset('left')
+    let $idDBLP := $paperDBLP.id
+    let $tokensUnrankedDBLP := word-tokens($paperDBLP.authors)
+    let $lenDBLP := len($tokensUnrankedDBLP)
+    let $tokensDBLP := // Stage 1 positions of this record's tokens, ascending
+        for $tokenUnranked in $tokensUnrankedDBLP
+        for $tokenRanked at $i in
+            //
+            // -- - Stage 1: tokens of 'left' ordered by ascending count, then by token - --
+            //
+            for $paper in dataset('left')
+            let $id := $paper.id
+            for $token in word-tokens($paper.authors)
+            /*+ hash */
+            group by $tokenGrouped := $token with $id
+            /*+ inmem 1 302 */
+            order by count($id), $tokenGrouped
+            return $tokenGrouped
+        where $tokenUnranked = /*+ bcast */ $tokenRanked
+        order by $i
+        return $i
+    for $prefixTokenDBLP in subset-collection($tokensDBLP, 0, prefix-len-jaccard(len($tokensDBLP), .8f))
+
+    for $paperCSX in dataset('right')
+    let $idCSX := $paperCSX.id
+    let $tokensUnrankedCSX := word-tokens($paperCSX.authors)
+    let $lenCSX := len($tokensUnrankedCSX)
+    let $tokensCSX := // same ranking as above; tokens with no match in the 'left' ordering produce no position
+        for $tokenUnranked in $tokensUnrankedCSX
+        for $tokenRanked at $i in
+            //
+            // -- - Stage 1: tokens of 'left' ordered by ascending count, then by token - --
+            //
+            for $paper in dataset('left')
+            let $id := $paper.id
+            for $token in word-tokens($paper.authors)
+            /*+ hash */
+            group by $tokenGrouped := $token with $id
+            /*+ inmem 1 302 */
+            order by count($id), $tokenGrouped
+            return $tokenGrouped
+        where $tokenUnranked = /*+ bcast */ $tokenRanked
+        order by $i
+        return $i
+    let $actualPrefixLen := prefix-len-jaccard(len($tokensUnrankedCSX), .8f) - len($tokensUnrankedCSX) + len($tokensCSX)
+    for $prefixTokenCSX in subset-collection($tokensCSX, 0, $actualPrefixLen) // prefix length for all tokens, minus those dropped by the rank join
+
+    where $prefixTokenDBLP = $prefixTokenCSX // keep pairs whose prefixes share a token
+
+    /*+ hash*/
+    group by $idDBLP := $idDBLP, $idCSX := $idCSX with $tokensUnrankedDBLP, $tokensUnrankedCSX
+    let $sim := similarity-jaccard-check($tokensUnrankedDBLP[0], $tokensUnrankedCSX[0], .8f) // verified on the raw token lists of the group's first member
+    where $sim[1] >= .8f
+    return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[1]} // presumably $sim[1] is the similarity value -- confirm against the function docs
+)
+return $t
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.6.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.6.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.6.query.aql new file mode 100644 index 0000000..465cda9 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.6.query.aql @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+use dataverse fuzzyjoin_basic;
+
+set import-private-functions 'true'
+
+for $paperDBLP in dataset('left')
+let $idDBLP := $paperDBLP.id
+let $tokensUnrankedDBLP := word-tokens($paperDBLP.authors)
+let $lenDBLP := len($tokensUnrankedDBLP) // NOTE(review): not referenced again in this query
+let $tokensDBLP := // Stage 1 positions of this record's tokens, ascending
+    for $tokenUnranked in $tokensUnrankedDBLP
+    for $tokenRanked at $i in
+        //
+        // -- - Stage 1: tokens of 'left' ordered by ascending count, then by token - --
+        //
+        for $paper in dataset('left')
+        let $id := $paper.id
+        for $token in word-tokens($paper.authors)
+        /*+ hash */
+        group by $tokenGrouped := $token with $id
+        /*+ inmem 1 302 */
+        order by count($id), $tokenGrouped
+        return $tokenGrouped
+    where $tokenUnranked = /*+ bcast */ $tokenRanked
+    order by $i
+    return $i
+for $prefixTokenDBLP in subset-collection($tokensDBLP, 0, prefix-len-jaccard(len($tokensDBLP), .8f))
+
+for $paperCSX in dataset('right')
+let $idCSX := $paperCSX.id
+let $tokensUnrankedCSX := word-tokens($paperCSX.authors)
+let $lenCSX := len($tokensUnrankedCSX) // NOTE(review): not referenced again in this query
+let $tokensCSX := // same ranking as above; tokens with no match in the 'left' ordering produce no position
+    for $tokenUnranked in $tokensUnrankedCSX
+    for $tokenRanked at $i in
+        //
+        // -- - Stage 1: tokens of 'left' ordered by ascending count, then by token - --
+        //
+        for $paper in dataset('left')
+        let $id := $paper.id
+        for $token in word-tokens($paper.authors)
+        /*+ hash */
+        group by $tokenGrouped := $token with $id
+        /*+ inmem 1 302 */
+        order by count($id), $tokenGrouped
+        return $tokenGrouped
+    where $tokenUnranked = /*+ bcast */ $tokenRanked
+    order by $i
+    return $i
+let $actualPrefixLen := prefix-len-jaccard(len($tokensUnrankedCSX), .8f) - len($tokensUnrankedCSX) + len($tokensCSX)
+for $prefixTokenCSX in subset-collection($tokensCSX, 0, $actualPrefixLen) // prefix length for all tokens, minus those dropped by the rank join
+
+where $prefixTokenDBLP = $prefixTokenCSX // keep pairs whose prefixes share a token
+
+/*+ hash*/
+distinct by $idDBLP, $idCSX
+let $sim := similarity-jaccard-check($tokensUnrankedDBLP, $tokensUnrankedCSX, .8f) // verified on the raw token lists
+where $sim[1] >= .8f
+order by $tokensUnrankedDBLP, $tokensUnrankedCSX
+return {'DBLP': $tokensUnrankedDBLP, 'CSX': $tokensUnrankedCSX, 'sim': $sim[1]}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.1.ddl.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.1.ddl.aql new file mode 100644 index 0000000..45cc975 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.1.ddl.aql @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+drop dataverse fuzzyjoin_basic if exists; // start from a clean dataverse
+
+create dataverse fuzzyjoin_basic;
+
+use dataverse fuzzyjoin_basic;
+
+create type BasicType as closed { // record type shared by both datasets below
+    id: uuid,
+    authors: string
+}
+
+create dataset left(BasicType) primary key id autogenerated; // the uuid key is autogenerated, not read from the data
+create dataset right(BasicType) primary key id autogenerated;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.2.update.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.2.update.aql
new file mode 100644
index 0000000..c9aceb2
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.2.update.aql
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+
+load dataset left // bulk-load `left` from the dblpauthors ADM file
+using localfs
+(("path"="asterix_nc1://data/pub-small/dblpauthors.adm"),("format"="adm"));
+
+load dataset right // bulk-load `right` from the csxauthors ADM file
+using localfs
+(("path"="asterix_nc1://data/pub-small/csxauthors.adm"),("format"="adm"));
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.3.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.3.query.aql
new file mode 100644
index 0000000..009c2b9
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.3.query.aql
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+
+set import-private-functions 'true'
+
+set simthreshold '.2f';
+
+let $pj := ( // fuzzy join of left/right on author tokens, no join hint
+    for $r in dataset left
+    for $s in dataset right
+    where word-tokens($r.authors) ~= word-tokens($s.authors)
+    return {"rid": $r.id, "sid": $s.id, "rstr": $r.authors, "sstr": $s.authors}
+)
+
+let $nl := ( // the same join, written with the indexnl hint
+    for $r in dataset left
+    for $s in dataset right
+    where word-tokens($r.authors) /*+ indexnl */ ~= word-tokens($s.authors)
+    return {"rid": $r.id, "sid": $s.id, "rstr": $r.authors, "sstr": $s.authors}
+)
+
+let $orderedTokens := ( // tokens of 'right' ordered by ascending count, then by token
+for $paper in dataset('right')
+    let $id := $paper.id
+    for $token in word-tokens($paper.authors)
+    /*+ hash */
+    group by $tokenGrouped := $token with $id
+    /*+ inmem 1 302 */
+    order by count($id), $tokenGrouped
+    return $tokenGrouped
+)
+
+let $simpairs := ( // every $nl pair, annotated with the $pj pairs that have the same (rid, sid)
+    for $r in $nl
+    return {
+        "rid": $r.rid,
+        "sid": $r.sid,
+        "rstr": $r.rstr,
+        "sstr": $r.sstr,
+        "simpairs":
+            for $s in $pj
+            where $r.rid = $s.rid and $r.sid = $s.sid
+            return {"rid": $s.rid, "sid": $s.sid}
+    }
+)
+
+for $d in $simpairs
+where count($d.simpairs) = 0 // only pairs found by $nl that have no counterpart in $pj
+    let $rid := $d.rid
+    let $sid := $d.sid
+    let $rstr := for $t in word-tokens($d.rstr) order by $t return $t
+    let $sstr := for $t in word-tokens($d.sstr) order by $t return $t
+    let $rlen := len(for $t in word-tokens($d.rstr) order by $t return $t)
+    let $slen := len(for $t in word-tokens($d.sstr) order by $t return $t)
+    let $orstr := for $t in word-tokens($d.rstr) for $token at $i in $orderedTokens where $t /*+ bcast */ = $token order by $i return $i
+    let $osstr := for $t in word-tokens($d.sstr) for $token at $i in $orderedTokens where $t /*+ bcast */ = $token order by $i return $i
+    let $lorstr := len(for $t in word-tokens($d.rstr) for $token at $i in $orderedTokens where $t /*+ bcast */ = $token order by $i return $i)
+    let $losstr := len(for $t in word-tokens($d.sstr) for $token at $i in $orderedTokens where $t /*+ bcast */ = $token order by $i return $i)
+return {
+    "rid": $rid,
+    "sid": $sid,
+    "rstr": $rstr,
+    "sstr": $sstr,
+    "rlen": $rlen,
+    "slen": $slen,
+    "orstr": $orstr,
+    "osstr": $osstr,
+    "lorstr": $lorstr,
+    "losstr": $losstr,
+    "simpairs": $d,
+    "sim": let $cmmon := for $r in $orstr for $s in $osstr where $r = $s return $r
+        return similarity-jaccard-prefix($rlen, $orstr, $slen, $osstr, $cmmon[0], 0.2f)}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.1.ddl.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.1.ddl.aql
new file mode 100644
index 0000000..45cc975
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.1.ddl.aql
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+drop dataverse fuzzyjoin_basic if exists; // start from a clean dataverse
+
+create dataverse fuzzyjoin_basic;
+
+use dataverse fuzzyjoin_basic;
+
+create type BasicType as closed { // record type shared by both datasets below
+    id: uuid,
+    authors: string
+}
+
+create dataset left(BasicType) primary key id autogenerated; // the uuid key is autogenerated, not read from the data
+create dataset right(BasicType) primary key id autogenerated;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.2.update.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.2.update.aql
new file mode 100644
index 0000000..c9aceb2
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.2.update.aql
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +use dataverse fuzzyjoin_basic; + +load dataset left +using localfs +(("path"="asterix_nc1://data/pub-small/dblpauthors.adm"),("format"="adm")); + +load dataset right +using localfs +(("path"="asterix_nc1://data/pub-small/csxauthors.adm"),("format"="adm")); \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.3.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.3.query.aql new file mode 100644 index 0000000..b6976bd --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.3.query.aql @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +use dataverse fuzzyjoin_basic; + +set import-private-functions 'true' + +let $tokensUnrankedLeft := [ "a", "baesens", "baestaens", "bart", "den", "dirk", "dirk", "emma", "gestel", "johan", "k", +"marleen", "poel", "suykens", "tony", "van", "van", "willekens" ] + +let $tokensUnrankedRight := [ "a", "baesens", "baestaens", "bart", "bedrijfskunde", "den", "dirk", "dirk", "emma", "en", +"gent", "gestel", "johan", "k", "marleen", "poel", "suykens", "tony", "van", "van", "willekens" ] + +let $lenLeft := len($tokensUnrankedLeft) +let $tokensLeft := + for $tokenUnranked in $tokensUnrankedLeft + for $tokenRanked at $i in + // + // -- - Stage 1 - -- + // + for $orders in dataset left + let $id := $orders.id + for $token in word-tokens($orders.authors) + /*+ hash */ + group by $tokenGrouped := $token with $id + /*+ inmem 1 302 */ + order by count($id), $tokenGrouped + return $tokenGrouped + where $tokenUnranked = /*+ bcast */ $tokenRanked + order by $i + return $i + +let $lenRight := len($tokensUnrankedRight) +let $tokensRight := + for $tokenUnranked in $tokensUnrankedRight + for $tokenRanked at $i in + // + // -- - Stage 1 - -- + // + for $orders in dataset left + let $id := $orders.id + for $token in word-tokens($orders.authors) + /*+ hash */ + group by $tokenGrouped := $token with $id + /*+ inmem 1 302 */ + order by count($id), $tokenGrouped + return $tokenGrouped + where $tokenUnranked = /*+ bcast */ $tokenRanked + order by $i + return $i + +return { + "leftTokens": $tokensLeft, "rightTokens": $tokensRight, "lenLeftTrue": len($tokensUnrankedLeft), + "lenLeft": len($tokensLeft), "lenRightTrue": len($tokensUnrankedRight), "lenRight": len($tokensRight), + "full_sim": similarity-jaccard-check($tokensLeft, $tokensRight, .8f), + "true_sim": similarity-jaccard-check($tokensUnrankedLeft, $tokensUnrankedRight, .8f), + "pref_sim": similarity-jaccard-prefix(len($tokensLeft), $tokensLeft, len($tokensRight), + $tokensRight, $tokensLeft[0], .8f), + "fast_sim": 
similarity-jaccard-prefix(len($tokensUnrankedLeft), $tokensLeft, len($tokensUnrankedRight), + $tokensRight, $tokensLeft[0], .8f) + } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.1.ddl.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.1.ddl.aql new file mode 100644 index 0000000..45cc975 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.1.ddl.aql @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +drop dataverse fuzzyjoin_basic if exists; + +create dataverse fuzzyjoin_basic; + +use dataverse fuzzyjoin_basic; + +create type BasicType as closed { + id: uuid, + authors: string +} + +create dataset left(BasicType) primary key id autogenerated; +create dataset right(BasicType) primary key id autogenerated; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.2.update.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.2.update.aql new file mode 100644 index 0000000..c9aceb2 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.2.update.aql @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +use dataverse fuzzyjoin_basic; + +load dataset left +using localfs +(("path"="asterix_nc1://data/pub-small/dblpauthors.adm"),("format"="adm")); + +load dataset right +using localfs +(("path"="asterix_nc1://data/pub-small/csxauthors.adm"),("format"="adm")); \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.3.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.3.query.aql new file mode 100644 index 0000000..3b5d44b --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.3.query.aql @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +use dataverse fuzzyjoin_basic; + +set simthreshold '.2f'; + +let $cpj := count( + for $r in dataset left + for $s in dataset right + where word-tokens($r.authors) ~= word-tokens($s.authors) + return {"rid": $r.id, "sid": $s.id} +) + +let $cnl := count( + for $r in dataset left + for $s in dataset right + where word-tokens($r.authors) /*+ indexnl */ ~= word-tokens($s.authors) + return {"rid": $r.id, "sid": $s.id} +) + +return [$cnl, $cpj] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.1.ddl.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.1.ddl.aql new file mode 100644 index 0000000..a72efb5 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.1.ddl.aql @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +drop dataverse fuzzyjoin if exists; + +create dataverse fuzzyjoin; + +use dataverse fuzzyjoin; + +create type DBLPType as closed { + id: int64, + dblpid: string, + title: string, + authors: string, + misc: string +} + +create dataset DBLP(DBLPType) primary key id; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.2.update.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.2.update.aql new file mode 100644 index 0000000..d3d02d4 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.2.update.aql @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +use dataverse fuzzyjoin; + +load dataset DBLP +using localfs +(("path"="asterix_nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")); \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.3.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.3.query.aql new file mode 100644 index 0000000..c53475f --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.3.query.aql @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +use dataverse fuzzyjoin; + +set simthreshold '.15f'; + +let $cpj := count( + for $dblp in dataset('DBLP') + for $dblp2 in dataset('DBLP') + where word-tokens($dblp.title) ~= word-tokens($dblp2.title) + order by $dblp.id, $dblp2.id + return {'dblp': $dblp.id, 'dblp2': $dblp2.id} +) + +let $cnl := count( + for $dblp in dataset('DBLP') + for $dblp2 in dataset('DBLP') + where word-tokens($dblp.title) /*+indexnl*/ ~= word-tokens($dblp2.title) + order by $dblp.id, $dblp2.id + return {'dblp': $dblp.id, 'dblp2': $dblp2.id} +) + +return [$cnl, $cpj] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.1.ddl.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.1.ddl.aql new file mode 100644 index 0000000..bd84097 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.1.ddl.aql @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +drop dataverse fuzzyjoin if exists; + +create dataverse fuzzyjoin; + +use dataverse fuzzyjoin; + +create type DBLPType as closed { + id: int64, + dblpid: string, + title: string, + authors: string, + misc: string +} + +create dataset DBLP(DBLPType) primary key id; + +create dataset TO(DBLPType) primary key id; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.2.update.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.2.update.aql new file mode 100644 index 0000000..7674827 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.2.update.aql @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +use dataverse fuzzyjoin; + +load dataset DBLP +using localfs +(("path"="asterix_nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")); + +load dataset TO +using localfs +(("path"="asterix_nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")); \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.3.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.3.query.aql new file mode 100644 index 0000000..597e8a1 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.3.query.aql @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +use dataverse fuzzyjoin; + +set import-private-functions 'true' + +set simthreshold '.61f'; + +let $pj := ( + for $dblp in dataset DBLP + for $dblp2 in dataset TO + where word-tokens($dblp.title) ~= word-tokens($dblp2.title) + return {"rid": $dblp.id, "sid": $dblp2.id, "rstr": $dblp.title, "sstr": $dblp2.title} +) + +let $nl := ( + for $dblp in dataset DBLP + for $dblp2 in dataset TO + where word-tokens($dblp.title) /* +indexnl */ ~= word-tokens($dblp2.title) + return {"rid": $dblp.id, "sid": $dblp2.id, "rstr": $dblp.title, "sstr": $dblp2.title} +) + +let $orderedTokens := ( + for $paper in dataset TO + let $id := $paper.id + for $token in word-tokens($paper.title) + /*+ hash */ + group by $tokenGrouped := $token with $id + /*+ inmem 1 302 */ + order by count($id), $tokenGrouped + return $tokenGrouped +) + +let $simpairs := ( + for $r in $nl + return { + "rid": $r.rid, + "sid": $r.sid, + "rstr": $r.rstr, + "sstr": $r.sstr, + "simpairs": + for $s in $pj + where $r.rid = $s.rid and $r.sid = $s.sid + return {"rid": $s.rid, "sid": $s.sid} + } +) + +for $d in $simpairs +where count($d.simpairs) = 0 + let $rid := $d.rid + let $sid := $d.sid + let $rstr := for $t in word-tokens($d.rstr) order by $t return $t + let $sstr := for $t in word-tokens($d.sstr) order by $t return $t + let $rlen := len(for $t in word-tokens($d.rstr) order by $t return $t) + let $slen := len(for $t in word-tokens($d.sstr) order by $t return $t) + let $orstr := for $t in word-tokens($d.rstr) for $token at $i in $orderedTokens where $t /*+ bcast */ = $token order by $i return $i + let $osstr := for $t in word-tokens($d.sstr) for $token at $i in $orderedTokens where $t /*+ bcast */ = $token order by $i return $i + let $lorstr := len(for $t in word-tokens($d.rstr) for $token at $i in $orderedTokens where $t /*+ bcast */ = $token order by $i return $i) + let $losstr := len(for $t in word-tokens($d.sstr) for $token at $i in $orderedTokens where $t /*+ bcast */ = $token order by $i return $i) 
+return { + "rid": $rid, + "sid": $sid, + "rstr": $rstr, + "sstr": $sstr, + "rlen": $rlen, + "slen": $slen, + "orstr": $orstr, + "osstr": $osstr, + "lorstr": $lorstr, + "losstr": $losstr, + "simpairs": $d, + "sim": let $cmmon := for $r in $orstr for $s in $osstr where $r = $s return $r + return similarity-jaccard-prefix($rlen, $orstr, $slen, $osstr, $cmmon[0], 0.61f)} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.1.ddl.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.1.ddl.aql new file mode 100644 index 0000000..45cc975 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.1.ddl.aql @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +drop dataverse fuzzyjoin_basic if exists; + +create dataverse fuzzyjoin_basic; + +use dataverse fuzzyjoin_basic; + +create type BasicType as closed { + id: uuid, + authors: string +} + +create dataset left(BasicType) primary key id autogenerated; +create dataset right(BasicType) primary key id autogenerated; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.2.update.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.2.update.aql new file mode 100644 index 0000000..c9aceb2 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.2.update.aql @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +use dataverse fuzzyjoin_basic; + +load dataset left +using localfs +(("path"="asterix_nc1://data/pub-small/dblpauthors.adm"),("format"="adm")); + +load dataset right +using localfs +(("path"="asterix_nc1://data/pub-small/csxauthors.adm"),("format"="adm")); \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.3.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.3.query.aql new file mode 100644 index 0000000..f91e841 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.3.query.aql @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +use dataverse fuzzyjoin_basic; + +// +// -- - Stage 1 - -- +// +for $orderRight in dataset('right') +let $rightId := $orderRight.id +for $orderTokenRight in word-tokens($orderRight.authors) + /*+ hash */ group by $tokenRightGrouped := $orderTokenRight with $rightId +/*+ inmem 1 302 */ order by count($rightId), $tokenRightGrouped +return [ $tokenRightGrouped, count($rightId) ] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.4.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.4.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.4.query.aql new file mode 100644 index 0000000..66dbbbc --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.4.query.aql @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +use dataverse fuzzyjoin_basic; + +// +// -- - Stage 1 - -- +// +for $r in +for $orderRight in dataset('right') +let $rightId := $orderRight.id +for $orderTokenRight in word-tokens($orderRight.authors) + /*+ hash */ group by $tokenRightGrouped := $orderTokenRight with $rightId +return {"rt": $tokenRightGrouped, "rc": count($rightId)} + +for $l in +for $orderLeft in dataset('left') +let $leftId := $orderLeft.id +for $orderTokenLeft in word-tokens($orderLeft.authors) + /*+ hash */ group by $tokenLeftGrouped := $orderTokenLeft with $leftId +return {"lt": $tokenLeftGrouped, "lc": count($leftId)} + +where $r.rt = $l.lt +/*+ inmem 1 302 */ order by $r.rc * $l.lc, $r.rt +return [ $r.rt, $r.rc * $l.lc ] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.5.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.5.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.5.query.aql new file mode 100644 index 0000000..ad93db1 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.5.query.aql @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +use dataverse fuzzyjoin_basic; + +set import-private-functions 'true' + +let $r := count( + for $right in dataset('right') + let $idRight := $right.id + let $tokensUnrankedRight := word-tokens($right.authors) + let $lenRight := len($tokensUnrankedRight) + let $tokensRight := + for $tokenUnranked in $tokensUnrankedRight + for $tokenRanked at $i in + // + // -- - Stage 1 - -- + // + for $orderRight in dataset('right') + let $rightId := $orderRight.id + for $orderTokenRight in word-tokens($orderRight.authors) + /*+ hash */ group by $tokenRightGrouped := $orderTokenRight with $rightId + /*+ inmem 1 302 */ order by count($rightId) + return $tokenRightGrouped + where $tokenUnranked = /*+ bcast */ $tokenRanked + order by $i + return $i + for $prefixTokenRight in subset-collection($tokensRight, 0, prefix-len-jaccard(len($tokensRight), .8f)) + + for $left in dataset('left') + let $idLeft := $left.id + let $tokensUnrankedLeft := word-tokens($left.authors) + let $lenLeft := len($tokensUnrankedLeft) + let $tokensLeft := + for $tokenUnranked in $tokensUnrankedLeft + for $tokenRanked at $i in + // + // -- - Stage 1 - -- + // + for $orderRight in dataset('right') + let $rightId := $orderRight.id + for $orderTokenRight in word-tokens($orderRight.authors) + /*+ hash */ group by $tokenRightGrouped := $orderTokenRight with $rightId + /*+ inmem 1 302 */ order by count($rightId) + return $tokenRightGrouped + where $tokenUnranked = /*+ bcast */ $tokenRanked + order by $i + return $i + let $actualPrefixLen := prefix-len-jaccard(len($tokensUnrankedLeft), .8f) - 
len($tokensUnrankedLeft) + len($tokensLeft) + for $prefixTokenLeft in subset-collection($tokensLeft, 0, $actualPrefixLen) + + where $prefixTokenRight = $prefixTokenLeft + let $sim := similarity-jaccard-prefix($lenRight, $tokensRight, $lenLeft, $tokensLeft, $prefixTokenLeft, .8f) + where $sim >= .8f + /*+ hash*/ group by $idRight := $idRight, $idLeft := $idLeft with $sim + return {'idDBLP': $idRight, 'idCSX': $idLeft, "sim": $sim[0]} +) +return $r \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.6.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.6.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.6.query.aql new file mode 100644 index 0000000..5594de3 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.6.query.aql @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +use dataverse fuzzyjoin_basic; + +set import-private-functions 'true' + +let $r := count( + for $right in dataset('right') + let $idRight := $right.id + let $tokensUnrankedRight := word-tokens($right.authors) + let $lenRight := len($tokensUnrankedRight) + let $tokensRight := + for $tokenUnranked in $tokensUnrankedRight + for $tokenRanked at $i in + // + // -- - Stage 1 - -- + // + for $r in + for $orderRight in dataset('right') + let $rightId := $orderRight.id + for $orderTokenRight in word-tokens($orderRight.authors) + /*+ hash */ group by $tokenRightGrouped := $orderTokenRight with $rightId + return {"rt": $tokenRightGrouped, "rc": count($rightId)} + for $l in + for $orderLeft in dataset('left') + let $leftId := $orderLeft.id + for $orderTokenLeft in word-tokens($orderLeft.authors) + /*+ hash */ group by $tokenLeftGrouped := $orderTokenLeft with $leftId + return {"lt": $tokenLeftGrouped, "lc": count($leftId)} + where $r.rt = $l.lt + /*+ inmem 1 302 */ order by $r.rc * $l.lc + return $r.rt + + where $tokenUnranked = /*+ bcast */ $tokenRanked + order by $i + return $i + for $prefixTokenRight in subset-collection($tokensRight, 0, prefix-len-jaccard(len($tokensRight), .8f)) + + for $left in dataset('left') + let $idLeft := $left.id + let $tokensUnrankedLeft := word-tokens($left.authors) + let $lenLeft := len($tokensUnrankedLeft) + let $tokensLeft := + for $tokenUnranked in $tokensUnrankedLeft + for $tokenRanked at $i in + // + // -- - Stage 1 - -- + // + for $r in + for $orderRight in dataset('right') + let $rightId := $orderRight.id + for $orderTokenRight in word-tokens($orderRight.authors) + /*+ hash */ group by $tokenRightGrouped := $orderTokenRight with $rightId + return {"rt": $tokenRightGrouped, "rc": count($rightId)} + for $l in + for $orderLeft in dataset('left') + let $leftId := $orderLeft.id + for $orderTokenLeft in word-tokens($orderLeft.authors) + /*+ hash */ group by $tokenLeftGrouped := $orderTokenLeft with $leftId + return {"lt": 
$tokenLeftGrouped, "lc": count($leftId)} + where $r.rt = $l.lt + /*+ inmem 1 302 */ order by $r.rc * $l.lc + return $r.rt + + where $tokenUnranked = /*+ bcast */ $tokenRanked + order by $i + return $i + let $actualPrefixLen := prefix-len-jaccard(len($tokensUnrankedLeft), .8f) - len($tokensUnrankedLeft) + len($tokensLeft) + for $prefixTokenLeft in subset-collection($tokensLeft, 0, $actualPrefixLen) + + where $prefixTokenRight = $prefixTokenLeft + let $sim := similarity-jaccard-prefix($lenRight, $tokensRight, $lenLeft, $tokensLeft, $prefixTokenLeft, .8f) + where $sim >= .8f + /*+ hash*/ group by $idRight := $idRight, $idLeft := $idLeft with $sim + return {'idDBLP': $idRight, 'idCSX': $idLeft, "sim": $sim[0]} +) +return $r \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.2/dblp-csx-2_5.2.3.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.2/dblp-csx-2_5.2.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.2/dblp-csx-2_5.2.3.query.aql index 1cff8fc..7ecca70 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.2/dblp-csx-2_5.2.3.query.aql +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.2/dblp-csx-2_5.2.3.query.aql @@ -71,9 +71,7 @@ set import-private-functions 'true'; $tokensCSX, 0, prefix-len-jaccard(len($tokensCSX), .5f)) - where $prefixTokenDBLP = $prefixTokenCSX - let $sim := similarity-jaccard-prefix( $lenDBLP, $tokensDBLP, http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.1.ddl.aql ---------------------------------------------------------------------- diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.1.ddl.aql new file mode 100644 index 0000000..3573f47 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.1.ddl.aql @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy joins two datasets, DBLP and CSX, based on the similarity-jaccard function of their titles' word tokens. + * We expect the join to be transformed into a prefix-based fuzzy join followed by a select on the < condition. 
+ * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPNestedType as closed { + id: int64, + dblpid: string, + title: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested: DBLPNestedType +} + +create type CSXNestedType as closed { + id: int64, + csxid: string, + title: string, + authors: string, + misc: string +} + +create type CSXType as closed { + nested: CSXNestedType +} + +create dataset DBLPtmp(DBLPNestedType) primary key id; +create dataset CSXtmp(CSXNestedType) primary key id; + +create dataset DBLP(DBLPType) primary key nested.id; +create dataset CSX(CSXType) primary key nested.id; + http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.2.update.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.2.update.aql new file mode 100644 index 0000000..a2633b1 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.2.update.aql @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use dataverse test; + +load dataset DBLPtmp +using localfs +(("path"="asterix_nc1://data/dblp-small/dblp-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000")) pre-sorted; + +load dataset CSXtmp +using localfs +(("path"="asterix_nc1://data/pub-small/csx-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000")); + +insert into dataset DBLP( + for $x in dataset DBLPtmp + return { + "nested": $x + } +); + +insert into dataset CSX( + for $x in dataset CSXtmp + return { + "nested": $x + } +); http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.3.ddl.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.3.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.3.ddl.aql new file mode 100644 index 0000000..0359448 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.3.ddl.aql @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use dataverse test; + +create index keyword_index on DBLP(nested.title) type keyword; + http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.4.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.4.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.4.query.aql new file mode 100644 index 0000000..65e2576 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.4.query.aql @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use dataverse test; + +for $a in dataset('DBLP') +for $b in dataset('CSX') +where word-tokens($a.nested.title) ~= word-tokens($b.nested.title) + and $a.nested.id < $b.nested.id +order by $a.nested.id, $b.nested.id +return { "arec": $a.nested, "brec": $b.nested } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.1.ddl.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.1.ddl.aql new file mode 100644 index 0000000..72458b9 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.1.ddl.aql @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy joins two datasets, DBLP and CSX, based on the similarity-jaccard function of their titles' 3-gram tokens. + * We expect the join to be transformed into an indexed prefix-based fuzzy join. + * We treat the < condition as a select over the fuzzy join results. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPNestedType as closed { + id: int64, + dblpid: string, + title: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested: DBLPNestedType +} + +create type CSXNestedType as closed { + id: int64, + csxid: string, + title: string, + authors: string, + misc: string +} + +create type CSXType as closed { + nested: CSXNestedType +} + +create dataset DBLPtmp(DBLPNestedType) primary key id; +create dataset CSXtmp(CSXNestedType) primary key id; + +create dataset DBLP(DBLPType) primary key nested.id; +create dataset CSX(CSXType) primary key nested.id; + http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.2.update.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.2.update.aql new file mode 100644 index 0000000..a2633b1 --- /dev/null +++ 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.2.update.aql @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use dataverse test; + +load dataset DBLPtmp +using localfs +(("path"="asterix_nc1://data/dblp-small/dblp-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000")) pre-sorted; + +load dataset CSXtmp +using localfs +(("path"="asterix_nc1://data/pub-small/csx-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000")); + +insert into dataset DBLP( + for $x in dataset DBLPtmp + return { + "nested": $x + } +); + +insert into dataset CSX( + for $x in dataset CSXtmp + return { + "nested": $x + } +); http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.3.ddl.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.3.ddl.aql 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.3.ddl.aql new file mode 100644 index 0000000..9307af9 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.3.ddl.aql @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use dataverse test; + +create index ngram_index on DBLP(nested.title) type ngram(3); +
