http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/optimizerts/results/similarity/jaccard-similarity-join-dual-order.plan ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/results/similarity/jaccard-similarity-join-dual-order.plan b/asterixdb/asterix-app/src/test/resources/optimizerts/results/similarity/jaccard-similarity-join-dual-order.plan new file mode 100644 index 0000000..e8b029a --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/results/similarity/jaccard-similarity-join-dual-order.plan @@ -0,0 +1,191 @@ +-- DISTRIBUTE_RESULT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- PRE_CLUSTERED_GROUP_BY[$$162, $$164] |PARTITIONED| + { + -- AGGREGATE |LOCAL| + -- NESTED_TUPLE_SOURCE |LOCAL| + } + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STABLE_SORT [$$162(ASC), $$164(ASC)] |PARTITIONED| + -- HASH_PARTITION_EXCHANGE [$$162, $$164] |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- STREAM_SELECT |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- HYBRID_HASH_JOIN [$$prefixTokenRight][$$prefixTokenLeft] |PARTITIONED| + -- HASH_PARTITION_EXCHANGE [$$prefixTokenRight] |PARTITIONED| + -- UNNEST |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- PRE_CLUSTERED_GROUP_BY[$$126] |PARTITIONED| + { + -- AGGREGATE |LOCAL| + -- IN_MEMORY_STABLE_SORT [$$i(ASC)] |LOCAL| + -- STREAM_SELECT |LOCAL| + -- NESTED_TUPLE_SOURCE |LOCAL| + } + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- IN_MEMORY_HASH_JOIN [$$tokenUnranked][$$tokenRightGrouped] |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- UNNEST |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- DATASOURCE_SCAN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- EMPTY_TUPLE_SOURCE |PARTITIONED| + -- BROADCAST_EXCHANGE |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- RUNNING_AGGREGATE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- SORT_MERGE_EXCHANGE [$$124(ASC) ] |PARTITIONED| + -- STABLE_SORT [$$124(ASC)] |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- HYBRID_HASH_JOIN [$$tokenRightGrouped][$$tokenLeftGrouped] |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- REPLICATE |PARTITIONED| + -- HASH_PARTITION_EXCHANGE [$$tokenRightGrouped] |PARTITIONED| + -- EXTERNAL_GROUP_BY[$$172] |PARTITIONED| + { + -- AGGREGATE |LOCAL| + -- NESTED_TUPLE_SOURCE |LOCAL| + } + -- HASH_PARTITION_EXCHANGE [$$172] |PARTITIONED| + -- EXTERNAL_GROUP_BY[$$orderTokenRight] |PARTITIONED| + { + -- AGGREGATE |LOCAL| + -- NESTED_TUPLE_SOURCE |LOCAL| + } + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- UNNEST |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- DATASOURCE_SCAN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- EMPTY_TUPLE_SOURCE |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- REPLICATE |PARTITIONED| + -- HASH_PARTITION_EXCHANGE [$$tokenLeftGrouped] |PARTITIONED| + -- EXTERNAL_GROUP_BY[$$174] |PARTITIONED| + { + -- AGGREGATE |LOCAL| + -- NESTED_TUPLE_SOURCE |LOCAL| + } + -- HASH_PARTITION_EXCHANGE [$$174] |PARTITIONED| + -- EXTERNAL_GROUP_BY[$$orderTokenLeft] |PARTITIONED| + { + -- AGGREGATE |LOCAL| + -- NESTED_TUPLE_SOURCE |LOCAL| + } + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- UNNEST |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- DATASOURCE_SCAN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- EMPTY_TUPLE_SOURCE |PARTITIONED| + -- HASH_PARTITION_EXCHANGE [$$prefixTokenLeft] |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- UNNEST |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- PRE_CLUSTERED_GROUP_BY[$$129] |PARTITIONED| + { + -- AGGREGATE |LOCAL| + -- IN_MEMORY_STABLE_SORT [$$i(ASC)] |LOCAL| + -- STREAM_SELECT |LOCAL| + -- NESTED_TUPLE_SOURCE |LOCAL| + } + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- IN_MEMORY_HASH_JOIN [$$tokenUnranked][$$tokenRightGrouped] |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- UNNEST |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- DATASOURCE_SCAN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- EMPTY_TUPLE_SOURCE |PARTITIONED| + -- BROADCAST_EXCHANGE |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- RUNNING_AGGREGATE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- SORT_MERGE_EXCHANGE [$$125(ASC) ] |PARTITIONED| + -- STABLE_SORT [$$125(ASC)] |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- HYBRID_HASH_JOIN [$$tokenRightGrouped][$$tokenLeftGrouped] |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- REPLICATE |PARTITIONED| + -- HASH_PARTITION_EXCHANGE [$$tokenRightGrouped] |PARTITIONED| + -- EXTERNAL_GROUP_BY[$$172] |PARTITIONED| + { + -- AGGREGATE |LOCAL| + -- NESTED_TUPLE_SOURCE |LOCAL| + } + -- HASH_PARTITION_EXCHANGE [$$172] |PARTITIONED| + -- EXTERNAL_GROUP_BY[$$orderTokenRight] |PARTITIONED| + { + -- AGGREGATE |LOCAL| + -- NESTED_TUPLE_SOURCE |LOCAL| + } + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- UNNEST |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- DATASOURCE_SCAN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- EMPTY_TUPLE_SOURCE |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- REPLICATE |PARTITIONED| + -- HASH_PARTITION_EXCHANGE [$$tokenLeftGrouped] |PARTITIONED| + -- EXTERNAL_GROUP_BY[$$174] |PARTITIONED| + { + -- AGGREGATE |LOCAL| + -- NESTED_TUPLE_SOURCE |LOCAL| + } + -- HASH_PARTITION_EXCHANGE [$$174] |PARTITIONED| + -- EXTERNAL_GROUP_BY[$$orderTokenLeft] |PARTITIONED| + { + -- AGGREGATE |LOCAL| + -- NESTED_TUPLE_SOURCE |LOCAL| + } + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- UNNEST |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- DATASOURCE_SCAN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- EMPTY_TUPLE_SOURCE |PARTITIONED|
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/optimizerts/results/similarity/jaccard-similarity-join-right-ahead.plan ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/results/similarity/jaccard-similarity-join-right-ahead.plan b/asterixdb/asterix-app/src/test/resources/optimizerts/results/similarity/jaccard-similarity-join-right-ahead.plan new file mode 100644 index 0000000..ab700b6 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/results/similarity/jaccard-similarity-join-right-ahead.plan @@ -0,0 +1,128 @@ +-- DISTRIBUTE_RESULT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- PRE_CLUSTERED_GROUP_BY[$$98, $$100] |PARTITIONED| + { + -- AGGREGATE |LOCAL| + -- NESTED_TUPLE_SOURCE |LOCAL| + } + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STABLE_SORT [$$98(ASC), $$100(ASC)] |PARTITIONED| + -- HASH_PARTITION_EXCHANGE [$$98, $$100] |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- STREAM_SELECT |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- HYBRID_HASH_JOIN [$$prefixTokenRight][$$prefixTokenLeft] |PARTITIONED| + -- HASH_PARTITION_EXCHANGE [$$prefixTokenRight] |PARTITIONED| + -- UNNEST |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- PRE_CLUSTERED_GROUP_BY[$$78] |PARTITIONED| + { + -- AGGREGATE |LOCAL| + -- IN_MEMORY_STABLE_SORT [$$i(ASC)] |LOCAL| + -- STREAM_SELECT |LOCAL| + -- NESTED_TUPLE_SOURCE |LOCAL| + } + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- IN_MEMORY_HASH_JOIN [$$tokenUnranked][$$tokenRightGrouped] |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- UNNEST |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- DATASOURCE_SCAN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- EMPTY_TUPLE_SOURCE |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- REPLICATE |PARTITIONED| + -- BROADCAST_EXCHANGE |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- RUNNING_AGGREGATE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- SORT_MERGE_EXCHANGE [$$83(ASC) ] |PARTITIONED| + -- STABLE_SORT [$$83(ASC)] |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- EXTERNAL_GROUP_BY[$$106] |PARTITIONED| + { + -- AGGREGATE |LOCAL| + -- NESTED_TUPLE_SOURCE |LOCAL| + } + -- HASH_PARTITION_EXCHANGE [$$106] |PARTITIONED| + -- EXTERNAL_GROUP_BY[$$orderTokenRight] |PARTITIONED| + { + -- AGGREGATE |LOCAL| + -- NESTED_TUPLE_SOURCE |LOCAL| + } + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- UNNEST |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- DATASOURCE_SCAN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- EMPTY_TUPLE_SOURCE |PARTITIONED| + -- HASH_PARTITION_EXCHANGE [$$prefixTokenLeft] |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- UNNEST |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- PRE_CLUSTERED_GROUP_BY[$$80] |PARTITIONED| + { + -- AGGREGATE |LOCAL| + -- IN_MEMORY_STABLE_SORT [$$i(ASC)] |LOCAL| + -- STREAM_SELECT |LOCAL| + -- NESTED_TUPLE_SOURCE |LOCAL| + } + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- IN_MEMORY_HASH_JOIN [$$tokenUnranked][$$tokenRightGrouped] |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- UNNEST |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- DATASOURCE_SCAN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- EMPTY_TUPLE_SOURCE |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- REPLICATE |PARTITIONED| + -- BROADCAST_EXCHANGE |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- RUNNING_AGGREGATE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- SORT_MERGE_EXCHANGE [$$83(ASC) ] |PARTITIONED| + -- STABLE_SORT [$$83(ASC)] |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- EXTERNAL_GROUP_BY[$$106] |PARTITIONED| + { + -- AGGREGATE |LOCAL| + -- NESTED_TUPLE_SOURCE |LOCAL| + } + -- HASH_PARTITION_EXCHANGE [$$106] |PARTITIONED| + -- EXTERNAL_GROUP_BY[$$orderTokenRight] |PARTITIONED| + { + -- AGGREGATE |LOCAL| + -- NESTED_TUPLE_SOURCE |LOCAL| + } + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- UNNEST |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- DATASOURCE_SCAN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- EMPTY_TUPLE_SOURCE |PARTITIONED| http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1/basic-1_1.1.ddl.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1/basic-1_1.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1/basic-1_1.1.ddl.aql new file mode 100644 index 0000000..a3b6ec2 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1/basic-1_1.1.ddl.aql @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +drop dataverse fuzzyjoin_basic if exists; + +create dataverse fuzzyjoin_basic; + +use dataverse fuzzyjoin_basic; + +create type BasicType as open { + id: int32, + summary: string? +} + +create type BasicWithIndexType as open { + id: int32 +} + +create dataset Basic(BasicType) primary key id; + +create dataset BasicWithIndex(BasicWithIndexType) primary key id; + +create index BasicWithIndex_summary_bt_idx on BasicWithIndex(summary: string?) type btree enforced; + +create index BasicWithIndex_summary_kw_idx on BasicWithIndex(summary: string?) type keyword enforced; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1/basic-1_1.2.update.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1/basic-1_1.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1/basic-1_1.2.update.aql new file mode 100644 index 0000000..95ef5f1 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1/basic-1_1.2.update.aql @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +use dataverse fuzzyjoin_basic; + +insert into dataset Basic( +{"id": 1, "summary": "Clear, Concise, and fun!"} +); +insert into dataset Basic( +{"id": 2, "summary": "Clear, Concise, and Charitable"} +); + +insert into dataset BasicWithIndex( +{"id": 1, "summary": "Clear, Concise, and fun!"} +); +insert into dataset BasicWithIndex( +{"id": 2, "summary": "Clear, Concise, and Charitable"} +); + http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1/basic-1_1.3.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1/basic-1_1.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1/basic-1_1.3.query.aql new file mode 100644 index 0000000..0e2aea7 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1/basic-1_1.3.query.aql @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +use dataverse fuzzyjoin_basic; + +set simthreshold '0.6f'; + +let $s1 := "Clear, Concise, and fun!" +let $s2 := "Clear, Concise, and Charitable" +let $psj := ( + for $s in [$s1] + for $t in [$s2] + where word-tokens($s) ~= word-tokens($t) + return {"s1": $s, "s2": $t} +) +let $nsj := ( + for $s in [$s1] + for $t in [$s2] + where word-tokens($s) /*+ indexnl */ ~= word-tokens($t) + return {"s1": $s, "s2": $t} +) +let $nvj := ( + for $s in [$s1] + for $t in [$s2] + where similarity-jaccard-check(word-tokens($s), word-tokens($t), .6f)[0] = true + return {"s1": $s, "s2": $t} +) +let $nvr := ( + for $s in [$s1] + for $t in [$s2] + where similarity-jaccard(word-tokens($s), word-tokens($t)) >= .6f + return {"s1": $s, "s2": $t} +) +let $tpsj := ( + for $s in dataset Basic + for $t in dataset Basic + where /*+ skip-index */ similarity-jaccard(word-tokens($s.summary), word-tokens($t.summary)) >= .6f + and $s.id < $t.id + return {"s1": $s.summary, "s2": $t.summary} +) +let $tnsj := ( + for $s in dataset Basic + for $t in dataset Basic + where /*+ indexnl */ similarity-jaccard(word-tokens($s.summary), word-tokens($t.summary)) >= .6f + and $s.id < $t.id + return {"s1": $s.summary, "s2": $t.summary} +) +let $itpsj := ( + for $s in dataset BasicWithIndex + for $t in dataset BasicWithIndex + where /*+ skip-index */ similarity-jaccard(word-tokens($s.summary), word-tokens($t.summary)) >= 0.6 + and $s.id < $t.id + return {"s1": $s.summary, "s2": $t.summary} +) +let $itnsj := ( + for $s in dataset BasicWithIndex + for $t in dataset BasicWithIndex + where /*+ indexnl */ similarity-jaccard(word-tokens($s.summary), word-tokens($t.summary)) >= 0.6 + and $s.id < $t.id + return {"s1": $s.summary, "s2": $t.summary} +) +let $left := word-tokens($s1) +let $right := word-tokens($s2) +let $vj := similarity-jaccard-check($left, $right, 0.6f) +let $sr := similarity-jaccard($left, $right) >= 0.6 +return {"psj": $psj, "nsj": $nsj, "nvj": $nvj, "nvr": $nvr, "tpsj": $tpsj, "tnsj": $tnsj, "itpsj": $itpsj, "itnsj": $itnsj, "vj": $vj, "sr": $sr} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_1/basic-1_1_1.1.ddl.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_1/basic-1_1_1.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_1/basic-1_1_1.1.ddl.aql new file mode 100644 index 0000000..45cc975 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_1/basic-1_1_1.1.ddl.aql @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +drop dataverse fuzzyjoin_basic if exists; + +create dataverse fuzzyjoin_basic; + +use dataverse fuzzyjoin_basic; + +create type BasicType as closed { + id: uuid, + authors: string +} + +create dataset left(BasicType) primary key id autogenerated; +create dataset right(BasicType) primary key id autogenerated; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_1/basic-1_1_1.2.update.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_1/basic-1_1_1.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_1/basic-1_1_1.2.update.aql new file mode 100644 index 0000000..c9aceb2 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_1/basic-1_1_1.2.update.aql @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +use dataverse fuzzyjoin_basic; + +load dataset left +using localfs +(("path"="asterix_nc1://data/pub-small/dblpauthors.adm"),("format"="adm")); + +load dataset right +using localfs +(("path"="asterix_nc1://data/pub-small/csxauthors.adm"),("format"="adm")); \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_1/basic-1_1_1.3.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_1/basic-1_1_1.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_1/basic-1_1_1.3.query.aql new file mode 100644 index 0000000..c00681b --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_1/basic-1_1_1.3.query.aql @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +use dataverse fuzzyjoin_basic; + +set import-private-functions 'true' + +let $hash := ( + for $r in dataset left + let $c := ( + for $t in counthashed-word-tokens($r.authors) + order by $t + distinct by $t + return $t + ) + order by $r.id + return {"id": $r.id, "authors": $r.authors, "nondup": $c} +) + +let $word := ( + for $r in dataset left + let $c := ( + for $t in word-tokens($r.authors) + order by $t + distinct by $t + return $t + ) + order by $r.id + return {"id": $r.id, "authors": $r.authors, "nondup": $c} +) + +for $s in $hash +for $t in $word +where $s.id = $t.id and count($s.nondup) != count($t.nondup) +order by $s.authors +return { + "authors": $s.authors, + "hdistinct": $s.nondup, + "hcount": count($s.nondup), + "hash": counthashed-word-tokens($s.authors), + "vhcount": count(counthashed-word-tokens($s.authors)), + "wdistinct": $t.nondup, + "wcount": count($t.nondup), + "word": word-tokens($t.authors), + "vwcount": count(word-tokens($t.authors)) + } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.1.ddl.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.1.ddl.aql new file mode 100644 index 0000000..65a52b1 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.1.ddl.aql @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +drop dataverse fuzzyjoin_basic if exists; + +create dataverse fuzzyjoin_basic; + +use dataverse fuzzyjoin_basic; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.10.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.10.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.10.query.aql new file mode 100644 index 0000000..e60be60 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.10.query.aql @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Support we have two records [1, 2, 3, 4, 5, 6, 7, 8, 9] and [2, 3, 4, 5, 6, 7, 8, 9, 10], we should return their + * similarity 0.8 by similarity-jaccard-check. + */ +use dataverse fuzzyjoin_basic; + +set import-private-functions 'true' + +let $left := [1, 2, 3, 4, 5, 6, 7, 8, 9] +let $right := [2, 3, 4, 5, 6, 7, 8, 9, 10] +return similarity-jaccard-check($left, $right, 0.8f) http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.2.update.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.2.update.aql new file mode 100644 index 0000000..042f3ce --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.2.update.aql @@ -0,0 +1,18 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.3.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.3.query.aql new file mode 100644 index 0000000..69dc59c --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.3.query.aql @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Support we have two records [1, 2, 3, 4, 5, 6, 7, 8, 9] and [2, 3, 4, 5, 6, 7, 8, 9, 10], we should return their + * similarity 0.8 by similarity-jaccard-prefix even the first token of the left side is removed so that it takes the + * form of [2, 3, 4, 5, 6, 7, 8, 9] with its actual length 9 as the first parameter of similarity-jaccard-prefix. + */ +use dataverse fuzzyjoin_basic; + +set import-private-functions 'true' + +let $left := [2, 3, 4, 5, 6, 7, 8, 9] +let $right := [2, 3, 4, 5, 6, 7, 8, 9, 10] +return similarity-jaccard-prefix(9, $left, 9, $right, $left[1], 0.8f) http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.4.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.4.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.4.query.aql new file mode 100644 index 0000000..ecfed74 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.4.query.aql @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Support we have two records [1, 2, 3, 4, 5, 6, 7, 8, 9] and [2, 3, 4, 5, 6, 7, 8, 9, 10], we should return their + * similarity 0.8 by similarity-jaccard-prefix even the first token of the right side is removed so that it takes the + * form of [2, 3, 4, 5, 6, 7, 8, 9] with its actual length 9 as the first parameter of similarity-jaccard-prefix. + */ +use dataverse fuzzyjoin_basic; + +set import-private-functions 'true' + +let $left := [1, 2, 3, 4, 5, 6, 7, 8, 9] +let $right := [2, 3, 4, 5, 6, 7, 8, 9] +return similarity-jaccard-prefix(9, $left, 9, $right, $left[1], 0.8f) http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.5.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.5.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.5.query.aql new file mode 100644 index 0000000..0b6482d --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.5.query.aql @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Support we have two records [2, 3, 4, 5, 6, 7, 8, 9, 10] and [1, 2, 3, 4, 5, 6, 7, 8, 9], we should return their + * similarity 0.8 by similarity-jaccard-prefix even the first token of the right side is removed so that it takes the + * form of [2, 3, 4, 5, 6, 7, 8, 9] with its actual length 9 as the first parameter of similarity-jaccard-prefix. + */ +use dataverse fuzzyjoin_basic; + +set import-private-functions 'true' + +let $left := [2, 3, 4, 5, 6, 7, 8, 9, 10] +let $right := [2, 3, 4, 5, 6, 7, 8, 9] +return similarity-jaccard-prefix(9, $left, 9, $right, $left[1], 0.8f) http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.6.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.6.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.6.query.aql new file mode 100644 index 0000000..21f1805 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.6.query.aql @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Support we have two records [2, 3, 4, 5, 6, 7, 8, 9, 10] and [1, 2, 3, 4, 5, 6, 7, 8, 9], we should return their + * similarity 0.8 by similarity-jaccard-prefix even the first token of the left side is removed so that it takes the + * form of [2, 3, 4, 5, 6, 7, 8, 9] with its actual length 9 as the first parameter of similarity-jaccard-prefix. + */ +use dataverse fuzzyjoin_basic; + +set import-private-functions 'true' + +let $left := [2, 3, 4, 5, 6, 7, 8, 9] +let $right := [1, 2, 3, 4, 5, 6, 7, 8, 9] +return similarity-jaccard-prefix(9, $left, 9, $right, $left[1], 0.8f) http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.7.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.7.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.7.query.aql new file mode 100644 index 0000000..5430f75 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.7.query.aql @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Support we have two records [-3, -2, -1, 2, 0, 3, 5, 7, 9] and [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], we should return their + * similarity 5/15 by similarity-jaccard-prefix even the tokens {-3, -2, -1} of the left side are removed so that it + * takes the form of [2, 0, 3, 5, 7, 9] with its actual length 9 as the first parameter of similarity-jaccard-prefix. + */ +use dataverse fuzzyjoin_basic; + +set import-private-functions 'true' + +let $left := [2, 0, 3, 5, 7, 9] +let $right := [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] +return similarity-jaccard-prefix(10, $left, 10, $right, $right[1], 0.33f) http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.8.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.8.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.8.query.aql new file mode 100644 index 0000000..eb20a08 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.8.query.aql @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Support we have two records [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] and [-3, -2, -1, 0, 2, 3, 5, 7, 9], we should return their + * similarity 5/14 by similarity-jaccard-prefix even the tokens {-3, -2, -1, 0} of the right side is removed so that it + * takes the form of [2, 3, 5, 7, 9] with its actual length 9 as the first parameter of similarity-jaccard-prefix. + */ +use dataverse fuzzyjoin_basic; + +set import-private-functions 'true' + +let $left := [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] +let $right := [2, 3, 5, 7, 9] +return similarity-jaccard-prefix(10, $left, 9, $right, $left[0], 0.35f) http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.9.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.9.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.9.query.aql new file mode 100644 index 0000000..586c162 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_2/basic-1_1_2.9.query.aql @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Support we have two records [1, 2, 3, 4, 5, 6, 7, 8, 9] and [2, 3, 4, 5, 6, 7, 8, 9, 10], we should return their + * similarity 0.8 by similarity-jaccard-sorted. + */ +use dataverse fuzzyjoin_basic; + +set import-private-functions 'true' + +let $left := [1, 2, 3, 4, 5, 6, 7, 8, 9] +let $right := [2, 3, 4, 5, 6, 7, 8, 9, 10] +return similarity-jaccard-sorted($left, $right) http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_3/basic-1_1_3.1.ddl.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_3/basic-1_1_3.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_3/basic-1_1_3.1.ddl.aql new file mode 100644 index 0000000..65a52b1 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_3/basic-1_1_3.1.ddl.aql @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +drop dataverse fuzzyjoin_basic if exists; + +create dataverse fuzzyjoin_basic; + +use dataverse fuzzyjoin_basic; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_3/basic-1_1_3.2.update.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_3/basic-1_1_3.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_3/basic-1_1_3.2.update.aql new file mode 100644 index 0000000..042f3ce --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_3/basic-1_1_3.2.update.aql @@ -0,0 +1,18 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_3/basic-1_1_3.3.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_3/basic-1_1_3.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_3/basic-1_1_3.3.query.aql new file mode 100644 index 0000000..b7e2025 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_3/basic-1_1_3.3.query.aql @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Support we have two records [3, 4, 5, 6, 8, 9] and [0, 1, 2, 3, 5, 6, 7], we should return their similarity 0.3 by + * similarity-jaccard-prefix even the first token of the left side is removed so that it takes the form of + * [2, 3, 4, 5, 6, 7, 8, 9] with its actual length 9 as the first parameter of similarity-jaccard-prefix. + */ +use dataverse fuzzyjoin_basic; + +set import-private-functions 'true' + +let $left := [3, 4, 5, 6, 8, 9] +let $right := [3, 5, 6, 7] +return similarity-jaccard-prefix(6, $left, 7, $right, $right[0], 0.3f) http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_3/basic-1_1_3.4.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_3/basic-1_1_3.4.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_3/basic-1_1_3.4.query.aql new file mode 100644 index 0000000..da8e765 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_1_3/basic-1_1_3.4.query.aql @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +use dataverse fuzzyjoin_basic; + +set import-private-functions 'true' + +let $sims := [ + similarity-jaccard-prefix(6, [3, 4, 5, 6, 8, 9], 7, [0, 1, 2, 3, 5, 6, 7], -1, 0.3f), + similarity-jaccard-prefix(6, [3, 4, 5, 6, 8, 9], 7, [3, 5, 6, 7], -1, 0.3f), + similarity-jaccard-prefix(7, [3, 5, 6, 7], 6, [3, 4, 5, 6, 8, 9], -1, 0.3f), + similarity-jaccard-prefix(6, [3, 4, 5, 6, 8, 9], 7, [3, 5, 6, 7], 3, 0.3f), + similarity-jaccard-prefix(7, [3, 5, 6, 7], 6, [3, 4, 5, 6, 8, 9], 3, 0.3f), + similarity-jaccard-prefix(6, [3, 4, 5, 6, 8, 9], 7, [3, 5, 6, 7], 6, 0.3f), + similarity-jaccard-prefix(7, [3, 5, 6, 7], 6, [3, 4, 5, 6, 8, 9], 6, 0.3f), + similarity-jaccard-prefix(7, [0, 1, 2, 3, 5, 6, 7], 6, [3, 5, 6, 8], -1, 0.3f), + similarity-jaccard-prefix(6, [3, 5, 6, 9], 7, [0, 1, 2, 3, 5, 6, 7], -1, 0.3f), + similarity-jaccard-prefix(7, [0, 1, 2, 3, 5, 6, 7], 6, [3, 5, 6, 8], 3, 0.3f), + similarity-jaccard-prefix(6, [3, 5, 6, 9], 7, [0, 1, 2, 3, 5, 6, 7], 3, 0.3f), + similarity-jaccard-prefix(7, [0, 1, 2, 3, 5, 6, 7], 6, [3, 5, 6, 8], 6, 0.3f), + similarity-jaccard-prefix(6, [3, 5, 6, 9], 7, [0, 1, 2, 3, 5, 6, 7], 6, 0.3f) +] +return $sims \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.1.ddl.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.1.ddl.aql new file mode 100644 index 0000000..45cc975 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.1.ddl.aql @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +drop dataverse fuzzyjoin_basic if exists; + +create dataverse fuzzyjoin_basic; + +use dataverse fuzzyjoin_basic; + +create type BasicType as closed { + id: uuid, + authors: string +} + +create dataset left(BasicType) primary key id autogenerated; +create dataset right(BasicType) primary key id autogenerated; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.2.update.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.2.update.aql new file mode 100644 index 0000000..c9aceb2 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.2.update.aql @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +use dataverse fuzzyjoin_basic; + +load dataset left +using localfs +(("path"="asterix_nc1://data/pub-small/dblpauthors.adm"),("format"="adm")); + +load dataset right +using localfs +(("path"="asterix_nc1://data/pub-small/csxauthors.adm"),("format"="adm")); \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.3.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.3.query.aql new file mode 100644 index 0000000..5d9ab27 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.3.query.aql @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +use dataverse fuzzyjoin_basic; + +set import-private-functions 'true' + +let $left := [1, 5, 6, 7] +let $right := [4, 5, 6] +let $leftnull := [null, null, 5, 6, 7] +let $rightnull := [4, 5, 6] +let $nullstring := [null, null, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + +let $prefix1 := subset-collection($nullstring, 0, prefix-len-jaccard(len($nullstring), 0.1f)) +let $prefix2 := subset-collection($nullstring, 0, prefix-len-jaccard(len($nullstring), 0.2f)) +let $prefix3 := subset-collection($nullstring, 0, prefix-len-jaccard(len($nullstring), 0.3f)) +let $prefix4 := subset-collection($nullstring, 0, prefix-len-jaccard(len($nullstring), 0.4f)) +let $prefix5 := subset-collection($nullstring, 0, prefix-len-jaccard(len($nullstring), 0.5f)) +let $prefix6 := subset-collection($nullstring, 0, prefix-len-jaccard(len($nullstring), 0.6f)) +let $prefix7 := subset-collection($nullstring, 0, prefix-len-jaccard(len($nullstring), 0.7f)) +let $prefix8 := subset-collection($nullstring, 0, prefix-len-jaccard(len($nullstring), 0.8f)) +let $prefix9 := subset-collection($nullstring, 0, prefix-len-jaccard(len($nullstring), 0.9f)) + +let $bound := +for $l in [1] +return [ + [ + similarity-jaccard-prefix(3, [1, 2, 3], 3, [1, 2, 3], 1, 1f), + similarity-jaccard-prefix(3, [1, 2, 3], 3, [1, 2, 4], 1, .5f), + similarity-jaccard-prefix(3, [1, 2, 3], 3, [1, 2, 4], 1, .6f), + similarity-jaccard-prefix(3, [1, 2, 3], 9, [1, 2, 3], 1, .5f), + similarity-jaccard-prefix(4, [1, 2, 3, 4], 2, [1, 2], 1, .5f), + similarity-jaccard-prefix(4, [1, 2, 3, 4], 4, [1, 2], 1, .33f) + ],[ + similarity-jaccard-prefix(3, [1, 2, 3], 3, [1, 2, 3], 2, 1f), + similarity-jaccard-prefix(3, [1, 2, 3], 3, [1, 2, 4], 2, .5f), + similarity-jaccard-prefix(3, [1, 2, 3], 3, [1, 2, 4], 2, .6f), + similarity-jaccard-prefix(3, [1, 2, 3], 9, [1, 2, 3], 2, .5f), + similarity-jaccard-prefix(4, [1, 2, 3, 4], 2, [1, 2], 2, .5f), + similarity-jaccard-prefix(4, [1, 2, 3, 4], 4, [1, 2], 2, .33f) + ],[ + similarity-jaccard-prefix(3, [1, 2, 3], 3, [1, 2, 3], 3, 1f), + similarity-jaccard-prefix(3, [1, 2, 3], 3, [1, 2, 4], 3, .5f), + similarity-jaccard-prefix(3, [1, 2, 3], 3, [1, 2, 4], 3, .6f), + similarity-jaccard-prefix(3, [1, 2, 3], 9, [1, 2, 3], 3, .5f), + similarity-jaccard-prefix(4, [1, 2, 3, 4], 2, [1, 2], 3, .5f), + similarity-jaccard-prefix(4, [1, 2, 3, 4], 4, [1, 2], 3, .33f) + ] +] + +let $trybound := +for $l in [1] +return [ +similarity-jaccard-prefix(8, [3, 4, 5, 6, 7, 8], 10, [0, 0, 3, 4, 5, 6, 7, 8, 9, 10], -1, 0.5f), +similarity-jaccard-prefix(8, [1, 2, 3, 4, 5, 6, 7, 8], 10, [3, 4, 5, 6, 7, 8, 9, 10], 0, 0.5f), +similarity-jaccard-prefix(8, [3, 4, 5, 6, 7, 8], 10, [0, 0, 3, 4, 5, 6, 7, 8, 9, 10], 1, 0.5f), +similarity-jaccard-prefix(8, [1, 2, 3, 4, 5, 6, 7, 8], 10, [3, 4, 5, 6, 7, 8, 9, 10], 2, 0.5f), +similarity-jaccard-prefix(8, [3, 4, 5, 6, 7, 8], 10, [0, 0, 3, 4, 5, 6, 7, 8, 9, 10], 3, 0.5f), +similarity-jaccard-prefix(8, [1, 2, 3, 4, 5, 6, 7, 8], 10, [3, 4, 5, 6, 7, 8, 9, 10], 4, 0.5f), +similarity-jaccard-prefix(8, [3, 4, 5, 6, 7, 8], 10, [0, 0, 3, 4, 5, 6, 7, 8, 9, 10], 5, 0.5f), +similarity-jaccard-prefix(8, [1, 2, 3, 4, 5, 6, 7, 8], 10, [3, 4, 5, 6, 7, 8, 9, 10], 6, 0.5f), +similarity-jaccard-prefix(8, [3, 4, 5, 6, 7, 8], 10, [0, 0, 3, 4, 5, 6, 7, 8, 9, 10], 7, 0.5f), +similarity-jaccard-prefix(8, [1, 2, 3, 4, 5, 6, 7, 8], 10, [0, 0, 3, 4, 5, 6, 7, 8, 9, 10], 8, 0.5f), +similarity-jaccard-prefix(8, [1, 2, 3, 4, 5, 6, 7, 8], 10, [0, 0, 3, 4, 5, 6, 7, 8, 9, 10], 9, 0.5f), +similarity-jaccard-prefix(8, [1, 2, 3, 4, 5, 6, 7, 8], 10, [0, 0, 3, 4, 5, 6, 7, 8, 9, 10], 10, 0.5f), +similarity-jaccard-prefix(8, [1, 2, 5, 6, 7, 3, 4, 8], 10, [4, 5, 0, 0, 3, 6, 7, 8, 9, 10], 11, 0.5f) +] + +let $checkbound := +for $l in [1] +return [ +similarity-jaccard-check([1, 2, 3, 4, 5, 6, 7, 8], [null, 0, 3, 4, 5, 6, 7, 8, 9, 10], 0.1f), +similarity-jaccard-check([null, 2, 3, 4, 5, 6, 7, 8], [null, 0, 3, 4, 5, 6, 7, 8, 9, 10], 0.2f), +similarity-jaccard-check([null, 2, 3, 4, 5, 6, 7, 8], [null, null, 3, 4, 5, 6, 7, 8, 9, 10], 0.3f), +similarity-jaccard-check([null, null, 3, 4, 5, 6, 7, 8], [null, null, 3, 4, 5, 6, 7, 8, 9, 10], 0.4f), +similarity-jaccard-check([1, 2, 3, 4, 5, 6, 7, 8], [null, null, 3, 4, 5, 6, 7, 8, 9, 10], 0.5f), +similarity-jaccard-check([1, 2, 3, 4, 5, 6, 7, 8], [0, 2, 3, 4, 5, 6, 7, 8, 9, 10], 0.6f), +similarity-jaccard-check([1, 2, 3, 4, 5, 6, 7, 8], [null, null, 3, 4, 5, 6, 7, 8, 2], 0.7f), +similarity-jaccard-check([1, 2, 3, 4, 5, 6, 7, 8], [null, null, 3, 4, 5, 6, 7, 8, 1, 2], 0.8f), +similarity-jaccard-check([1, 2, 3, 4, 5, 6, 7, 8, 9], [null, 2, 3, 4, 5, 6, 7, 8, 9, 1], 0.9f) +] + +let $len := len($right) + +let $negativesub1 := subset-collection($right, 6, prefix-len-jaccard($len, .4f)) +let $negativesub2 := subset-collection($right, 0, -1) +let $prefix := subset-collection($right, 0, prefix-len-jaccard($len, .4f)) +let $sim1 := similarity-jaccard($leftnull, $right) +let $sim2 := similarity-jaccard-prefix(len($left), $left, len($right), $right, -1, .4f) +let $sim3 := similarity-jaccard($leftnull, $rightnull) +let $sim4 := similarity-jaccard-prefix(8, [1, 2, 3, 4, 5, 6, 7, 8], 10, [0, 0, 3, 4, 5, 6, 7, 8, 9, 10], 1, .4f) +let $sim5 := similarity-jaccard([1, 2, 3, 4, 5, 6, 7, 8], [null, null, 3, 4, 5, 6, 7, 8, 9, 10]) + +let $joinpair := +for $s in $prefix4 +for $r in $prefix1 +where $s = $r +return $s + +let $clearnull := +for $index at $i in $nullstring +where $nullstring[$i] <= null +return int16(string($nullstring[$i])) + 64 + +let $cleanup := [$clearnull, $nullstring] + +return [$prefix, $sim1, $sim2, $sim3, $sim4, $sim5, $bound, $trybound, $checkbound, $negativesub1, $negativesub2, +[$prefix1, $prefix2, $prefix3, $prefix4, $prefix5, $prefix6, $prefix7, $prefix8, $prefix9], $joinpair, $cleanup] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.4.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.4.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.4.query.aql new file mode 100644 index 0000000..6eff4f6 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.4.query.aql @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +use dataverse fuzzyjoin_basic; + +let $records := [[5, 6, 7], [0, 3, 4]] +let $kv := [1, 2, 3, 4, 5, 6] + +let $lorecords := +for $r in $records +let $c := + for $t in $r + let $index := + for $k at $i in $kv + where $t = $k + return $i + return $index +return $c + +for $record in $lorecords +let $orecord := +for $d in $record +order by $d[0] +return $d[0] +return $orecord + +let $records := [[5, 6, 7], [0, 3, 4]] +let $kv := [1, 2, 3, 4, 5, 6] +for $r in $records +let $c := + for $t in $r + for $k at $i in $kv + where $t = $k + order by $i + return $i +return $c \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.5.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.5.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.5.query.aql new file mode 100644 index 0000000..9696695 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.5.query.aql @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +use dataverse fuzzyjoin_basic; + +set import-private-functions 'true' + +// Stage 1 +let $rankedTokens := ( + for $right in dataset left + let $id := $right.id + for $token in word-tokens($right.authors) + /*+ hash */ group by $tokenGroupped := $token with $id + /*+ inmem 34 198608 */ order by count($id), $tokenGroupped + return $tokenGroupped +) + +// Stage 2_1 of left +let $tokenLeftVerify := ( + for $left in dataset left + let $tokenUnrankedLeft := word-tokens($left.authors) + let $lenLeft := len($tokenUnrankedLeft) + let $tokens := ( + for $token in $tokenUnrankedLeft + let $index := + for $tokenRanked at $i in $rankedTokens + where $token = /*+ bcast */ $tokenRanked + return $i + order by $index + return $index + ) + where count($tokenUnrankedLeft) != count($tokens) + order by $left.id + return {"tid": $left.id, "authors": $left.authors, "tokens": $tokenUnrankedLeft, "ranked": $tokens} +) +let $tokenLeft := ( + for $left in dataset left + let $tokenUnrankedLeft := word-tokens($left.authors) + let $lenLeft := len($tokenUnrankedLeft) + for $token in $tokenUnrankedLeft + for $tokenRanked at $i in $rankedTokens + where $token = /*+ bcast */ $tokenRanked + order by $i + return $i +) + +for $r in $tokenLeftVerify +return $r \ No newline at end of file
