http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/CountHashedGramTokensDescriptor.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/CountHashedGramTokensDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/CountHashedGramTokensDescriptor.java new file mode 100644 index 0000000..dd36671 --- /dev/null +++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/CountHashedGramTokensDescriptor.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.asterix.runtime.evaluators.functions; + +import org.apache.asterix.om.functions.BuiltinFunctions; +import org.apache.asterix.om.functions.IFunctionDescriptor; +import org.apache.asterix.om.functions.IFunctionDescriptorFactory; +import org.apache.asterix.om.types.BuiltinType; +import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor; +import org.apache.asterix.runtime.evaluators.common.GramTokensEvaluator; +import org.apache.hyracks.algebricks.common.exceptions.AlgebricksException; +import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.api.exceptions.HyracksDataException; +import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.HashedUTF8NGramTokenFactory; +import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory; +import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer;

/**
 * Function descriptor registered under {@link BuiltinFunctions#COUNTHASHED_GRAM_TOKENS}.
 * <p>
 * The evaluators it produces are {@link GramTokensEvaluator} instances driven by an
 * {@link NGramUTF8StringBinaryTokenizer} that is backed by a {@link HashedUTF8NGramTokenFactory};
 * {@link BuiltinType#AINT32} is handed to the evaluator as its type argument.
 */
public class CountHashedGramTokensDescriptor extends AbstractScalarFunctionDynamicDescriptor {

    private static final long serialVersionUID = 1L;

    /** Factory that hands out a fresh descriptor instance on every call. */
    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
        @Override
        public IFunctionDescriptor createFunctionDescriptor() {
            return new CountHashedGramTokensDescriptor();
        }
    };

    /**
     * Wraps the argument evaluator factories in a factory for {@link GramTokensEvaluator}s.
     *
     * @param args factories for the function arguments; forwarded untouched to the evaluator
     * @return a serializable factory that builds one evaluator (with its own tokenizer) per call
     * @throws AlgebricksException declared by the overridden method; not thrown by this implementation
     */
    @Override
    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args)
            throws AlgebricksException {
        return new IScalarEvaluatorFactory() {
            private static final long serialVersionUID = 1L;

            @Override
            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext taskContext)
                    throws HyracksDataException {
                // A new tokenizer is created for every evaluator, so evaluators never share one.
                return new GramTokensEvaluator(args, taskContext, newHashedGramTokenizer(), BuiltinType.AINT32);
            }
        };
    }

    @Override
    public FunctionIdentifier getIdentifier() {
        return BuiltinFunctions.COUNTHASHED_GRAM_TOKENS;
    }

    /**
     * Builds the n-gram tokenizer used by every evaluator of this function.
     */
    private static NGramUTF8StringBinaryTokenizer newHashedGramTokenizer() {
        ITokenFactory hashedGramFactory = new HashedUTF8NGramTokenFactory();
        // NOTE(review): the literals presumably mean gramLength=3, usePrePost=true, ignoreTokenCount=false,
        // sourceHasTypeTag=true -- confirm against the NGramUTF8StringBinaryTokenizer constructor.
        return new NGramUTF8StringBinaryTokenizer(3, true, false, true, hashedGramFactory);
    }
}
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/CountHashedWordTokensDescriptor.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/CountHashedWordTokensDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/CountHashedWordTokensDescriptor.java new file mode 100644 index 0000000..e12ba2e --- /dev/null +++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/CountHashedWordTokensDescriptor.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.asterix.runtime.evaluators.functions; + +import org.apache.asterix.om.functions.BuiltinFunctions; +import org.apache.asterix.om.functions.IFunctionDescriptor; +import org.apache.asterix.om.functions.IFunctionDescriptorFactory; +import org.apache.asterix.om.types.BuiltinType; +import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor; +import org.apache.asterix.runtime.evaluators.common.WordTokensEvaluator; +import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.api.exceptions.HyracksDataException; +import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer; +import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.HashedUTF8WordTokenFactory; +import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer; +import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory;

/**
 * Function descriptor registered under {@link BuiltinFunctions#COUNTHASHED_WORD_TOKENS}.
 * <p>
 * The evaluators it produces are {@link WordTokensEvaluator} instances driven by a
 * {@link DelimitedUTF8StringBinaryTokenizer} that is backed by a {@link HashedUTF8WordTokenFactory};
 * {@link BuiltinType#AINT32} is handed to the evaluator as its type argument.
 */
public class CountHashedWordTokensDescriptor extends AbstractScalarFunctionDynamicDescriptor {

    private static final long serialVersionUID = 1L;

    /** Factory that hands out a fresh descriptor instance on every call. */
    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
        @Override
        public IFunctionDescriptor createFunctionDescriptor() {
            return new CountHashedWordTokensDescriptor();
        }
    };

    /**
     * Wraps the argument evaluator factories in a factory for {@link WordTokensEvaluator}s.
     *
     * @param args factories for the function arguments; forwarded untouched to the evaluator
     * @return a serializable factory that builds one evaluator (with its own tokenizer) per call
     */
    @Override
    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
        return new IScalarEvaluatorFactory() {
            private static final long serialVersionUID = 1L;

            @Override
            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext taskContext)
                    throws HyracksDataException {
                // A new tokenizer is created for every evaluator, so evaluators never share one.
                return new WordTokensEvaluator(args, taskContext, newHashedWordTokenizer(), BuiltinType.AINT32);
            }
        };
    }

    @Override
    public FunctionIdentifier getIdentifier() {
        return BuiltinFunctions.COUNTHASHED_WORD_TOKENS;
    }

    /**
     * Builds the delimiter-based word tokenizer used by every evaluator of this function.
     */
    private static IBinaryTokenizer newHashedWordTokenizer() {
        ITokenFactory hashedWordFactory = new HashedUTF8WordTokenFactory();
        // NOTE(review): the flags presumably mean ignoreTokenCount=false, sourceHasTypeTag=true --
        // confirm against the DelimitedUTF8StringBinaryTokenizer constructor.
        return new DelimitedUTF8StringBinaryTokenizer(false, true, hashedWordFactory);
    }
}
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceCheckDescriptor.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceCheckDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceCheckDescriptor.java new file mode 100644 index 0000000..e4b40b1 --- /dev/null +++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceCheckDescriptor.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ +package org.apache.asterix.runtime.evaluators.functions; + +import org.apache.asterix.om.functions.BuiltinFunctions; +import org.apache.asterix.om.functions.IFunctionDescriptor; +import org.apache.asterix.om.functions.IFunctionDescriptorFactory; +import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor; +import org.apache.asterix.runtime.evaluators.common.EditDistanceCheckEvaluator; +import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.api.exceptions.HyracksDataException;

/**
 * Function descriptor registered under {@link BuiltinFunctions#EDIT_DISTANCE_CHECK}; all runtime work
 * is delegated to {@link EditDistanceCheckEvaluator}.
 */
public class EditDistanceCheckDescriptor extends AbstractScalarFunctionDynamicDescriptor {

    private static final long serialVersionUID = 1L;

    /** Factory that hands out a fresh descriptor instance on every call. */
    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
        @Override
        public IFunctionDescriptor createFunctionDescriptor() {
            return new EditDistanceCheckDescriptor();
        }
    };

    @Override
    public FunctionIdentifier getIdentifier() {
        return BuiltinFunctions.EDIT_DISTANCE_CHECK;
    }

    /**
     * Wraps the argument evaluator factories in a factory for {@link EditDistanceCheckEvaluator}s.
     *
     * @param args factories for the function arguments; forwarded untouched to the evaluator
     * @return a serializable factory that builds one evaluator per call
     */
    @Override
    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
        return new IScalarEvaluatorFactory() {
            private static final long serialVersionUID = 1L;

            @Override
            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext taskContext)
                    throws HyracksDataException {
                return new EditDistanceCheckEvaluator(args, taskContext);
            }
        };
    }
}
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceContainsDescriptor.java ---------------------------------------------------------------------- diff --git
a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceContainsDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceContainsDescriptor.java new file mode 100644 index 0000000..4c7c257 --- /dev/null +++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceContainsDescriptor.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.asterix.runtime.evaluators.functions; + +import org.apache.asterix.om.functions.BuiltinFunctions; +import org.apache.asterix.om.functions.IFunctionDescriptor; +import org.apache.asterix.om.functions.IFunctionDescriptorFactory; +import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor; +import org.apache.asterix.runtime.evaluators.common.EditDistanceContainsEvaluator; +import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.api.exceptions.HyracksDataException;

/**
 * Function descriptor registered under {@link BuiltinFunctions#EDIT_DISTANCE_CONTAINS}; all runtime
 * work is delegated to {@link EditDistanceContainsEvaluator}.
 */
public class EditDistanceContainsDescriptor extends AbstractScalarFunctionDynamicDescriptor {

    private static final long serialVersionUID = 1L;

    /** Factory that hands out a fresh descriptor instance on every call. */
    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
        @Override
        public IFunctionDescriptor createFunctionDescriptor() {
            return new EditDistanceContainsDescriptor();
        }
    };

    @Override
    public FunctionIdentifier getIdentifier() {
        return BuiltinFunctions.EDIT_DISTANCE_CONTAINS;
    }

    /**
     * Wraps the argument evaluator factories in a factory for {@link EditDistanceContainsEvaluator}s.
     *
     * @param args factories for the function arguments; forwarded untouched to the evaluator
     * @return a serializable factory that builds one evaluator per call
     */
    @Override
    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
        return new IScalarEvaluatorFactory() {
            private static final long serialVersionUID = 1L;

            @Override
            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext taskContext)
                    throws HyracksDataException {
                return new EditDistanceContainsEvaluator(args, taskContext);
            }
        };
    }
}
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceDescriptor.java ---------------------------------------------------------------------- diff --git
a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceDescriptor.java new file mode 100644 index 0000000..8c6c9ed --- /dev/null +++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceDescriptor.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.asterix.runtime.evaluators.functions; + +import org.apache.asterix.om.functions.BuiltinFunctions; +import org.apache.asterix.om.functions.IFunctionDescriptor; +import org.apache.asterix.om.functions.IFunctionDescriptorFactory; +import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor; +import org.apache.asterix.runtime.evaluators.common.EditDistanceEvaluator; +import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.api.exceptions.HyracksDataException;

/**
 * Function descriptor registered under {@link BuiltinFunctions#EDIT_DISTANCE}; all runtime work is
 * delegated to {@link EditDistanceEvaluator}.
 */
public class EditDistanceDescriptor extends AbstractScalarFunctionDynamicDescriptor {

    private static final long serialVersionUID = 1L;

    /** Factory that hands out a fresh descriptor instance on every call. */
    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
        @Override
        public IFunctionDescriptor createFunctionDescriptor() {
            return new EditDistanceDescriptor();
        }
    };

    @Override
    public FunctionIdentifier getIdentifier() {
        return BuiltinFunctions.EDIT_DISTANCE;
    }

    /**
     * Wraps the argument evaluator factories in a factory for {@link EditDistanceEvaluator}s.
     *
     * @param args factories for the function arguments; forwarded untouched to the evaluator
     * @return a serializable factory that builds one evaluator per call
     */
    @Override
    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
        return new IScalarEvaluatorFactory() {
            private static final long serialVersionUID = 1L;

            @Override
            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext taskContext)
                    throws HyracksDataException {
                return new EditDistanceEvaluator(args, taskContext);
            }
        };
    }
}
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceListIsFilterableDescriptor.java ---------------------------------------------------------------------- diff --git
a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceListIsFilterableDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceListIsFilterableDescriptor.java new file mode 100644 index 0000000..0f4ebee --- /dev/null +++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceListIsFilterableDescriptor.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.asterix.runtime.evaluators.functions; + +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.asterix.dataflow.data.nontagged.serde.AOrderedListSerializerDeserializer; +import org.apache.asterix.dataflow.data.nontagged.serde.AUnorderedListSerializerDeserializer; +import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider; +import org.apache.asterix.om.base.ABoolean; +import org.apache.asterix.om.functions.BuiltinFunctions; +import org.apache.asterix.om.functions.IFunctionDescriptor; +import org.apache.asterix.om.functions.IFunctionDescriptorFactory; +import org.apache.asterix.om.types.ATypeTag; +import org.apache.asterix.om.types.BuiltinType; +import org.apache.asterix.om.types.EnumDeserializer; +import org.apache.asterix.om.types.hierachy.ATypeHierarchy; +import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor; +import org.apache.asterix.runtime.exceptions.TypeMismatchException; +import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer; +import org.apache.hyracks.api.exceptions.HyracksDataException; +import org.apache.hyracks.data.std.api.IPointable; +import org.apache.hyracks.data.std.primitive.VoidPointable; +import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; +import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference; + +/** + * Checks whether a list with an edit distance threshold can be filtered with a lower bounding on the number + * of common list elements. This function returns 'true' if the lower bound on the number of common elements + * is positive, 'false' otherwise. 
For example, this function is used during an indexed nested-loop join based + * on edit distance. We partition the tuples of the probing dataset into those that are filterable and those + * that are not. Those that are filterable are forwarded to the index. The others are fed into a (non + * indexed) nested-loop join. + */
public class EditDistanceListIsFilterableDescriptor extends AbstractScalarFunctionDynamicDescriptor {

    private static final long serialVersionUID = 1L;

    /** Factory that hands out a fresh descriptor instance on every call. */
    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
        @Override
        public IFunctionDescriptor createFunctionDescriptor() {
            return new EditDistanceListIsFilterableDescriptor();
        }
    };

    /**
     * Wraps the argument evaluator factories in a factory for the list-is-filterable evaluator.
     *
     * @param args factories for the two arguments: {@code args[0]} yields the list, {@code args[1]} the
     *            edit-distance threshold
     * @return a serializable factory that builds one evaluator per call
     */
    @Override
    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
        return new IScalarEvaluatorFactory() {
            private static final long serialVersionUID = 1L;

            @Override
            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException {
                return new EditDistanceListIsFilterableEvaluator(args, ctx);
            }
        };
    }

    @Override
    public FunctionIdentifier getIdentifier() {
        return BuiltinFunctions.EDIT_DISTANCE_LIST_IS_FILTERABLE;
    }

    /**
     * Writes {@link ABoolean#TRUE} when {@code (number of list items) - (edit-distance threshold)} is
     * strictly positive and {@link ABoolean#FALSE} otherwise.
     */
    private static class EditDistanceListIsFilterableEvaluator implements IScalarEvaluator {

        protected final IPointable listPtr = new VoidPointable();
        protected final IPointable edThreshPtr = new VoidPointable();
        // Reused across evaluate() calls; reset at the start of each call.
        protected final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();
        protected final DataOutput output = resultStorage.getDataOutput();

        protected final IScalarEvaluator listEval;
        protected final IScalarEvaluator edThreshEval;

        @SuppressWarnings("unchecked")
        private final ISerializerDeserializer<ABoolean> booleanSerde =
                SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.ABOOLEAN);

        /**
         * @param args factories for the list ({@code args[0]}) and threshold ({@code args[1]}) arguments
         * @param context task context used to instantiate the argument evaluators
         * @throws HyracksDataException if an argument evaluator cannot be created
         */
        public EditDistanceListIsFilterableEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context)
                throws HyracksDataException {
            listEval = args[0].createScalarEvaluator(context);
            edThreshEval = args[1].createScalarEvaluator(context);
        }

        /**
         * Evaluates both arguments against {@code tuple} and stores the serialized boolean in {@code result}.
         *
         * @throws TypeMismatchException if the first argument is neither a multiset nor an array
         * @throws HyracksDataException if an argument fails to evaluate or the result cannot be serialized
         */
        @Override
        public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
            resultStorage.reset();

            listEval.evaluate(tuple, listPtr);
            edThreshEval.evaluate(tuple, edThreshPtr);

            // Check type and compute list length.
            byte[] bytes = listPtr.getByteArray();
            int offset = listPtr.getStartOffset();

            ATypeTag typeTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(bytes[offset]);
            long listLen;
            switch (typeTag) {
                case MULTISET:
                    listLen = AUnorderedListSerializerDeserializer.getNumberOfItems(bytes, offset);
                    break;
                case ARRAY:
                    listLen = AOrderedListSerializerDeserializer.getNumberOfItems(bytes, offset);
                    break;
                default:
                    // Argument order is (function, argument index, actual tag, expected tags...). The
                    // actual tag must come first; otherwise the multiset tag is reported as the actual
                    // type and only the array tag as the expected one.
                    throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE_LIST_IS_FILTERABLE, 0,
                            bytes[offset], ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG,
                            ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG);
            }

            // Check type and extract edit-distance threshold.
            bytes = edThreshPtr.getByteArray();
            offset = edThreshPtr.getStartOffset();
            long edThresh = ATypeHierarchy.getIntegerValue(BuiltinFunctions.EDIT_DISTANCE_LIST_IS_FILTERABLE.getName(),
                    1, bytes, offset);

            // Compute result: filterable only when the lower bound on common elements is positive.
            long lowerBound = listLen - edThresh;
            try {
                booleanSerde.serialize(lowerBound > 0 ? ABoolean.TRUE : ABoolean.FALSE, output);
            } catch (IOException e) {
                throw new HyracksDataException(e);
            }
            result.set(resultStorage);
        }
    }
}
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceStringIsFilterableDescriptor.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceStringIsFilterableDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceStringIsFilterableDescriptor.java new file mode 100644 index 0000000..ddb18fc --- /dev/null +++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceStringIsFilterableDescriptor.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ +package org.apache.asterix.runtime.evaluators.functions; + +import org.apache.asterix.om.functions.BuiltinFunctions; +import org.apache.asterix.om.functions.IFunctionDescriptor; +import org.apache.asterix.om.functions.IFunctionDescriptorFactory; +import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor; +import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.api.exceptions.HyracksDataException; + +/** + * Checks whether a string with an edit distance threshold can be filtered with a lower bounding + * on number of common grams. This function returns 'true' if the lower bound on the number of + * common grams is positive, 'false' otherwise. For example, this function is used during an indexed + * nested-loop join based on edit distance. We partition the tuples of the probing dataset into those + * that are filterable and those that are not. Those that are filterable are forwarded to the index. + * The others are fed into a (non indexed) nested-loop join. 
+ */
public class EditDistanceStringIsFilterableDescriptor extends AbstractScalarFunctionDynamicDescriptor {

    private static final long serialVersionUID = 1L;

    /** Factory that hands out a fresh descriptor instance on every call. */
    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
        @Override
        public IFunctionDescriptor createFunctionDescriptor() {
            return new EditDistanceStringIsFilterableDescriptor();
        }
    };

    @Override
    public FunctionIdentifier getIdentifier() {
        return BuiltinFunctions.EDIT_DISTANCE_STRING_IS_FILTERABLE;
    }

    /**
     * Wraps the argument evaluator factories in a factory for
     * {@link EditDistanceStringIsFilterableEvaluator}s.
     *
     * @param args factories for the function arguments; forwarded untouched to the evaluator
     * @return a serializable factory that builds one evaluator per call
     */
    @Override
    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
        return new IScalarEvaluatorFactory() {
            private static final long serialVersionUID = 1L;

            @Override
            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext taskContext)
                    throws HyracksDataException {
                return new EditDistanceStringIsFilterableEvaluator(args, taskContext);
            }
        };
    }
}
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceStringIsFilterableEvaluator.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceStringIsFilterableEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceStringIsFilterableEvaluator.java new file mode 100644 index 0000000..0509f51 --- /dev/null +++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceStringIsFilterableEvaluator.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.asterix.runtime.evaluators.functions; + +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.asterix.dataflow.data.nontagged.serde.ABooleanSerializerDeserializer; +import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider; +import org.apache.asterix.om.base.ABoolean; +import org.apache.asterix.om.functions.BuiltinFunctions; +import org.apache.asterix.om.types.ATypeTag; +import org.apache.asterix.om.types.BuiltinType; +import org.apache.asterix.om.types.hierachy.ATypeHierarchy; +import org.apache.asterix.runtime.exceptions.TypeMismatchException; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer; +import org.apache.hyracks.api.exceptions.HyracksDataException; +import org.apache.hyracks.data.std.api.IPointable; +import org.apache.hyracks.data.std.primitive.UTF8StringPointable; +import org.apache.hyracks.data.std.primitive.VoidPointable; +import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; +import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference; + +public class EditDistanceStringIsFilterableEvaluator implements IScalarEvaluator { + + protected final 
ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage(); + protected final DataOutput output = resultStorage.getDataOutput(); + protected final IPointable stringPtr = new VoidPointable(); + protected final IPointable edThreshPtr = new VoidPointable(); + protected final IPointable gramLenPtr = new VoidPointable(); + protected final IPointable usePrePostPtr = new VoidPointable(); + + protected final IScalarEvaluator stringEval; + protected final IScalarEvaluator edThreshEval; + protected final IScalarEvaluator gramLenEval; + protected final IScalarEvaluator usePrePostEval; + + @SuppressWarnings("unchecked") + private final ISerializerDeserializer<ABoolean> booleanSerde = + SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.ABOOLEAN); + + private final UTF8StringPointable utf8Ptr = new UTF8StringPointable(); + + public EditDistanceStringIsFilterableEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context) + throws HyracksDataException { + stringEval = args[0].createScalarEvaluator(context); + edThreshEval = args[1].createScalarEvaluator(context); + gramLenEval = args[2].createScalarEvaluator(context); + usePrePostEval = args[3].createScalarEvaluator(context); + } + + @Override + public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException { + resultStorage.reset(); + + stringEval.evaluate(tuple, stringPtr); + edThreshEval.evaluate(tuple, edThreshPtr); + gramLenEval.evaluate(tuple, gramLenPtr); + usePrePostEval.evaluate(tuple, usePrePostPtr); + + // Check type and compute string length. 
+ byte typeTag = stringPtr.getByteArray()[stringPtr.getStartOffset()]; + if (typeTag != ATypeTag.SERIALIZED_STRING_TYPE_TAG) { + throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE_STRING_IS_FILTERABLE, 0, typeTag, + ATypeTag.SERIALIZED_STRING_TYPE_TAG); + } + utf8Ptr.set(stringPtr.getByteArray(), stringPtr.getStartOffset() + 1, stringPtr.getLength()); + int strLen = utf8Ptr.getStringLength(); + + // Check type and extract edit-distance threshold. + long edThresh = ATypeHierarchy.getIntegerValue(BuiltinFunctions.EDIT_DISTANCE_LIST_IS_FILTERABLE.getName(), 1, + edThreshPtr.getByteArray(), edThreshPtr.getStartOffset()); + + // Check type and extract gram length. + long gramLen = ATypeHierarchy.getIntegerValue(BuiltinFunctions.EDIT_DISTANCE_LIST_IS_FILTERABLE.getName(), 2, + gramLenPtr.getByteArray(), gramLenPtr.getStartOffset()); + + // Check type and extract usePrePost flag. + typeTag = usePrePostPtr.getByteArray()[usePrePostPtr.getStartOffset()]; + if (typeTag != ATypeTag.SERIALIZED_BOOLEAN_TYPE_TAG) { + throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE_STRING_IS_FILTERABLE, 3, typeTag, + ATypeTag.SERIALIZED_BOOLEAN_TYPE_TAG); + } + boolean usePrePost = ABooleanSerializerDeserializer.getBoolean(usePrePostPtr.getByteArray(), + usePrePostPtr.getStartOffset() + 1); + + // Compute result. + long numGrams = usePrePost ? 
strLen + gramLen - 1 : strLen - gramLen + 1; + long lowerBound = numGrams - edThresh * gramLen; + try { + if (lowerBound <= 0 || strLen == 0) { + booleanSerde.serialize(ABoolean.FALSE, output); + } else { + booleanSerde.serialize(ABoolean.TRUE, output); + } + } catch (IOException e) { + throw new HyracksDataException(e); + } + result.set(resultStorage); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/GramTokensDescriptor.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/GramTokensDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/GramTokensDescriptor.java new file mode 100644 index 0000000..190013b --- /dev/null +++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/GramTokensDescriptor.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.asterix.runtime.evaluators.functions; + +import org.apache.asterix.om.functions.BuiltinFunctions; +import org.apache.asterix.om.functions.IFunctionDescriptor; +import org.apache.asterix.om.functions.IFunctionDescriptorFactory; +import org.apache.asterix.om.types.BuiltinType; +import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor; +import org.apache.asterix.runtime.evaluators.common.GramTokensEvaluator; +import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.api.exceptions.HyracksDataException; +import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory; +import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer; +import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.UTF8NGramTokenFactory; + +public class GramTokensDescriptor extends AbstractScalarFunctionDynamicDescriptor { + + private static final long serialVersionUID = 1L; + public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() { + @Override + public IFunctionDescriptor createFunctionDescriptor() { + return new GramTokensDescriptor(); + } + }; + + @Override + public FunctionIdentifier getIdentifier() { + return BuiltinFunctions.GRAM_TOKENS; + } + + @Override + public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) { + return new IScalarEvaluatorFactory() { + private static final long serialVersionUID = 1L; + + @Override + public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException { + ITokenFactory tokenFactory = new UTF8NGramTokenFactory(); + NGramUTF8StringBinaryTokenizer tokenizer = + new 
NGramUTF8StringBinaryTokenizer(3, true, true, true, tokenFactory); + return new GramTokensEvaluator(args, ctx, tokenizer, BuiltinType.ASTRING); + } + }; + } + +} http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/HashedGramTokensDescriptor.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/HashedGramTokensDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/HashedGramTokensDescriptor.java new file mode 100644 index 0000000..32dc292 --- /dev/null +++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/HashedGramTokensDescriptor.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.asterix.runtime.evaluators.functions; + +import org.apache.asterix.om.functions.BuiltinFunctions; +import org.apache.asterix.om.functions.IFunctionDescriptor; +import org.apache.asterix.om.functions.IFunctionDescriptorFactory; +import org.apache.asterix.om.types.BuiltinType; +import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor; +import org.apache.asterix.runtime.evaluators.common.GramTokensEvaluator; +import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.api.exceptions.HyracksDataException; +import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.HashedUTF8NGramTokenFactory; +import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory; +import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer; + +public class HashedGramTokensDescriptor extends AbstractScalarFunctionDynamicDescriptor { + + private static final long serialVersionUID = 1L; + public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() { + @Override + public IFunctionDescriptor createFunctionDescriptor() { + return new HashedGramTokensDescriptor(); + } + }; + + @Override + public FunctionIdentifier getIdentifier() { + return BuiltinFunctions.HASHED_GRAM_TOKENS; + } + + @Override + public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) { + return new IScalarEvaluatorFactory() { + private static final long serialVersionUID = 1L; + + @Override + public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException { + ITokenFactory tokenFactory = new HashedUTF8NGramTokenFactory(); + NGramUTF8StringBinaryTokenizer tokenizer = + 
new NGramUTF8StringBinaryTokenizer(3, true, true, true, tokenFactory); + return new GramTokensEvaluator(args, ctx, tokenizer, BuiltinType.AINT32); + } + }; + } + +} http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/HashedWordTokensDescriptor.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/HashedWordTokensDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/HashedWordTokensDescriptor.java new file mode 100644 index 0000000..e224fce --- /dev/null +++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/HashedWordTokensDescriptor.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.asterix.runtime.evaluators.functions; + +import org.apache.asterix.om.functions.BuiltinFunctions; +import org.apache.asterix.om.functions.IFunctionDescriptor; +import org.apache.asterix.om.functions.IFunctionDescriptorFactory; +import org.apache.asterix.om.types.BuiltinType; +import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor; +import org.apache.asterix.runtime.evaluators.common.WordTokensEvaluator; +import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.api.exceptions.HyracksDataException; +import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer; +import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.HashedUTF8WordTokenFactory; +import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer; +import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory; + +public class HashedWordTokensDescriptor extends AbstractScalarFunctionDynamicDescriptor { + + private static final long serialVersionUID = 1L; + public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() { + @Override + public IFunctionDescriptor createFunctionDescriptor() { + return new HashedWordTokensDescriptor(); + } + }; + + @Override + public FunctionIdentifier getIdentifier() { + return BuiltinFunctions.HASHED_WORD_TOKENS; + } + + @Override + public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) { + return new IScalarEvaluatorFactory() { + private static final long serialVersionUID = 1L; + + @Override + public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException { + ITokenFactory 
tokenFactory = new HashedUTF8WordTokenFactory(); + IBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, true, tokenFactory); + return new WordTokensEvaluator(args, ctx, tokenizer, BuiltinType.AINT32); + } + }; + } + +} http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/PrefixLenDescriptor.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/PrefixLenDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/PrefixLenDescriptor.java new file mode 100644 index 0000000..c9a865b --- /dev/null +++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/PrefixLenDescriptor.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.asterix.runtime.evaluators.functions; + +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.asterix.common.functions.FunctionConstants; +import org.apache.asterix.dataflow.data.nontagged.serde.ADoubleSerializerDeserializer; +import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider; +import org.apache.asterix.fuzzyjoin.similarity.SimilarityFilters; +import org.apache.asterix.om.base.AInt32; +import org.apache.asterix.om.base.AMutableInt32; +import org.apache.asterix.runtime.exceptions.TypeMismatchException; +import org.apache.asterix.om.functions.IFunctionDescriptor; +import org.apache.asterix.om.functions.IFunctionDescriptorFactory; +import org.apache.asterix.om.types.ATypeTag; +import org.apache.asterix.om.types.BuiltinType; +import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor; +import org.apache.asterix.runtime.evaluators.common.SimilarityFiltersCache; +import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer; +import org.apache.hyracks.api.exceptions.HyracksDataException; +import org.apache.hyracks.data.std.api.IPointable; +import org.apache.hyracks.data.std.primitive.IntegerPointable; +import org.apache.hyracks.data.std.primitive.VoidPointable; +import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; +import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference; + +public class PrefixLenDescriptor extends AbstractScalarFunctionDynamicDescriptor { + + private static final long serialVersionUID = 1L; + private final static FunctionIdentifier FID = + new FunctionIdentifier(FunctionConstants.ASTERIX_NS, "prefix-len@3", 3); + public static 
final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() { + @Override + public IFunctionDescriptor createFunctionDescriptor() { + return new PrefixLenDescriptor(); + } + }; + + @Override + public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) { + return new IScalarEvaluatorFactory() { + private static final long serialVersionUID = 1L; + + @Override + public IScalarEvaluator createScalarEvaluator(final IHyracksTaskContext ctx) throws HyracksDataException { + + return new IScalarEvaluator() { + + private final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage(); + private final DataOutput out = resultStorage.getDataOutput(); + private final IPointable inputVal = new VoidPointable(); + private final IScalarEvaluator evalLen = args[0].createScalarEvaluator(ctx); + private final IScalarEvaluator evalSimilarity = args[1].createScalarEvaluator(ctx); + private final IScalarEvaluator evalThreshold = args[2].createScalarEvaluator(ctx); + + private final SimilarityFiltersCache similarityFiltersCache = new SimilarityFiltersCache(); + + // result + private final AMutableInt32 res = new AMutableInt32(0); + @SuppressWarnings("unchecked") + private final ISerializerDeserializer<AInt32> int32Serde = + SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.AINT32); + + @Override + public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException { + resultStorage.reset(); + // length + evalLen.evaluate(tuple, inputVal); + byte[] data = inputVal.getByteArray(); + int offset = inputVal.getStartOffset(); + if (data[offset] != ATypeTag.SERIALIZED_INT32_TYPE_TAG) { + throw new TypeMismatchException(getIdentifier(), 0, data[offset], + ATypeTag.SERIALIZED_INT32_TYPE_TAG); + } + int length = IntegerPointable.getInteger(data, offset + 1); + + // similarity threshold + evalThreshold.evaluate(tuple, inputVal); + data = inputVal.getByteArray(); + offset = 
inputVal.getStartOffset(); + if (data[offset] != ATypeTag.SERIALIZED_DOUBLE_TYPE_TAG) { + throw new TypeMismatchException(getIdentifier(), 1, data[offset], + ATypeTag.SERIALIZED_DOUBLE_TYPE_TAG); + } + float similarityThreshold = (float) ADoubleSerializerDeserializer.getDouble(data, offset + 1); + + // similarity name + evalSimilarity.evaluate(tuple, inputVal); + data = inputVal.getByteArray(); + offset = inputVal.getStartOffset(); + int len = inputVal.getLength(); + if (data[offset] != ATypeTag.SERIALIZED_STRING_TYPE_TAG) { + throw new TypeMismatchException(getIdentifier(), 2, data[offset], + ATypeTag.SERIALIZED_STRING_TYPE_TAG); + } + SimilarityFilters similarityFilters = + similarityFiltersCache.get(similarityThreshold, data, offset, len); + + int prefixLength = similarityFilters.getPrefixLength(length); + res.setValue(prefixLength); + + try { + int32Serde.serialize(res, out); + } catch (IOException e) { + throw new HyracksDataException(e); + } + result.set(resultStorage); + } + }; + } + }; + } + + @Override + public FunctionIdentifier getIdentifier() { + return FID; + } + +} http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/PrefixLenJaccardDescriptor.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/PrefixLenJaccardDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/PrefixLenJaccardDescriptor.java new file mode 100644 index 0000000..25d4be3 --- /dev/null +++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/PrefixLenJaccardDescriptor.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.asterix.runtime.evaluators.functions; + +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.asterix.dataflow.data.nontagged.serde.AFloatSerializerDeserializer; +import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider; +import org.apache.asterix.fuzzyjoin.similarity.SimilarityFiltersJaccard; +import org.apache.asterix.om.base.AInt32; +import org.apache.asterix.om.base.AMutableInt32; +import org.apache.asterix.om.functions.BuiltinFunctions; +import org.apache.asterix.om.functions.IFunctionDescriptor; +import org.apache.asterix.om.functions.IFunctionDescriptorFactory; +import org.apache.asterix.om.types.ATypeTag; +import org.apache.asterix.om.types.BuiltinType; +import org.apache.asterix.om.types.hierachy.ATypeHierarchy; +import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor; +import org.apache.asterix.runtime.exceptions.TypeMismatchException; +import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import 
org.apache.hyracks.api.dataflow.value.ISerializerDeserializer; +import org.apache.hyracks.api.exceptions.HyracksDataException; +import org.apache.hyracks.data.std.api.IPointable; +import org.apache.hyracks.data.std.primitive.VoidPointable; +import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; +import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference; + +public class PrefixLenJaccardDescriptor extends AbstractScalarFunctionDynamicDescriptor { + + private static final long serialVersionUID = 1L; + public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() { + @Override + public IFunctionDescriptor createFunctionDescriptor() { + return new PrefixLenJaccardDescriptor(); + } + }; + + @Override + public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) { + return new IScalarEvaluatorFactory() { + private static final long serialVersionUID = 1L; + + @Override + public IScalarEvaluator createScalarEvaluator(final IHyracksTaskContext ctx) throws HyracksDataException { + + return new IScalarEvaluator() { + + private final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage(); + private final DataOutput out = resultStorage.getDataOutput(); + private final IPointable lenPtr = new VoidPointable(); + private final IPointable thresholdPtr = new VoidPointable(); + private final IScalarEvaluator evalLen = args[0].createScalarEvaluator(ctx); + private final IScalarEvaluator evalThreshold = args[1].createScalarEvaluator(ctx); + + private float similarityThresholdCache; + private SimilarityFiltersJaccard similarityFilters; + + // result + private final AMutableInt32 res = new AMutableInt32(0); + @SuppressWarnings("unchecked") + private final ISerializerDeserializer<AInt32> int32Serde = + SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.AINT32); + + @Override + public void evaluate(IFrameTupleReference tuple, IPointable result) throws 
HyracksDataException { + resultStorage.reset(); + evalLen.evaluate(tuple, lenPtr); + evalThreshold.evaluate(tuple, thresholdPtr); + + // length + int length = ATypeHierarchy.getIntegerValue(getIdentifier().getName(), 0, lenPtr.getByteArray(), + lenPtr.getStartOffset()); + // similarity threshold + byte[] data = thresholdPtr.getByteArray(); + int offset = thresholdPtr.getStartOffset(); + if (data[offset] != ATypeTag.SERIALIZED_FLOAT_TYPE_TAG) { + throw new TypeMismatchException(getIdentifier(), 1, data[offset], + ATypeTag.SERIALIZED_FLOAT_TYPE_TAG); + } + float similarityThreshold = AFloatSerializerDeserializer.getFloat(data, offset + 1); + + if (similarityThreshold != similarityThresholdCache || similarityFilters == null) { + similarityFilters = new SimilarityFiltersJaccard(similarityThreshold); + } + + int prefixLength = similarityFilters.getPrefixLength(length); + res.setValue(prefixLength); + + try { + int32Serde.serialize(res, out); + } catch (IOException e) { + throw new HyracksDataException(e); + } + result.set(resultStorage); + } + }; + } + }; + } + + @Override + public FunctionIdentifier getIdentifier() { + return BuiltinFunctions.PREFIX_LEN_JACCARD; + } + +} http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityDescriptor.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityDescriptor.java new file mode 100644 index 0000000..8584d06 --- /dev/null +++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityDescriptor.java @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.asterix.runtime.evaluators.functions; + +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.asterix.common.functions.FunctionConstants; +import org.apache.asterix.dataflow.data.nontagged.serde.ADoubleSerializerDeserializer; +import org.apache.asterix.dataflow.data.nontagged.serde.AOrderedListSerializerDeserializer; +import org.apache.asterix.dataflow.data.nontagged.serde.AUnorderedListSerializerDeserializer; +import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider; +import org.apache.asterix.fuzzyjoin.IntArray; +import org.apache.asterix.fuzzyjoin.similarity.PartialIntersect; +import org.apache.asterix.fuzzyjoin.similarity.SimilarityFilters; +import org.apache.asterix.fuzzyjoin.similarity.SimilarityMetric; +import org.apache.asterix.om.base.ADouble; +import org.apache.asterix.om.base.AMutableDouble; +import org.apache.asterix.om.functions.IFunctionDescriptor; +import org.apache.asterix.om.functions.IFunctionDescriptorFactory; +import org.apache.asterix.om.types.ATypeTag; +import org.apache.asterix.om.types.BuiltinType; +import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor; +import org.apache.asterix.runtime.evaluators.common.SimilarityFiltersCache; +import 
org.apache.asterix.runtime.exceptions.TypeMismatchException; +import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer; +import org.apache.hyracks.api.exceptions.HyracksDataException; +import org.apache.hyracks.data.std.api.IPointable; +import org.apache.hyracks.data.std.primitive.IntegerPointable; +import org.apache.hyracks.data.std.primitive.VoidPointable; +import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; +import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference; + +public class SimilarityDescriptor extends AbstractScalarFunctionDynamicDescriptor { + + private static final long serialVersionUID = 1L; + private final static FunctionIdentifier FID = + new FunctionIdentifier(FunctionConstants.ASTERIX_NS, "similarity@7", 7); + public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() { + @Override + public IFunctionDescriptor createFunctionDescriptor() { + return new SimilarityDescriptor(); + } + }; + + @Override + public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) { + return new IScalarEvaluatorFactory() { + private static final long serialVersionUID = 1L; + + @Override + public IScalarEvaluator createScalarEvaluator(final IHyracksTaskContext ctx) throws HyracksDataException { + + return new IScalarEvaluator() { + + private final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage(); + private final DataOutput out = resultStorage.getDataOutput(); + private final IPointable inputVal = new VoidPointable(); + private final IScalarEvaluator evalLen1 = args[0].createScalarEvaluator(ctx); + private final IScalarEvaluator evalTokens1 = 
args[1].createScalarEvaluator(ctx); + private final IScalarEvaluator evalLen2 = args[2].createScalarEvaluator(ctx); + private final IScalarEvaluator evalTokens2 = args[3].createScalarEvaluator(ctx); + private final IScalarEvaluator evalTokenPrefix = args[4].createScalarEvaluator(ctx); + private final IScalarEvaluator evalSimilarity = args[5].createScalarEvaluator(ctx); + private final IScalarEvaluator evalThreshold = args[6].createScalarEvaluator(ctx); + + private final SimilarityFiltersCache similarityFiltersCache = new SimilarityFiltersCache(); + + private final IntArray tokens1 = new IntArray(); + private final IntArray tokens2 = new IntArray(); + private final PartialIntersect parInter = new PartialIntersect(); + + // result + private final AMutableDouble res = new AMutableDouble(0); + @SuppressWarnings("unchecked") + private final ISerializerDeserializer<ADouble> doubleSerde = + SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.ADOUBLE); + + @Override + public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException { + resultStorage.reset(); + // similarity threshold + evalThreshold.evaluate(tuple, inputVal); + byte[] data = inputVal.getByteArray(); + int offset = inputVal.getStartOffset(); + + if (data[offset] != ATypeTag.SERIALIZED_DOUBLE_TYPE_TAG) { + throw new TypeMismatchException(getIdentifier(), 0, data[offset], + ATypeTag.SERIALIZED_DOUBLE_TYPE_TAG); + } + float similarityThreshold = (float) ADoubleSerializerDeserializer.getDouble(data, offset + 1); + + // similarity name + evalSimilarity.evaluate(tuple, inputVal); + data = inputVal.getByteArray(); + offset = inputVal.getStartOffset(); + int len = inputVal.getLength(); + if (data[offset] != ATypeTag.SERIALIZED_STRING_TYPE_TAG) { + throw new TypeMismatchException(getIdentifier(), 1, data[offset], + ATypeTag.SERIALIZED_DOUBLE_TYPE_TAG); + } + SimilarityFilters similarityFilters = + similarityFiltersCache.get(similarityThreshold, data, offset, 
len); + + evalLen1.evaluate(tuple, inputVal); + data = inputVal.getByteArray(); + offset = inputVal.getStartOffset(); + if (data[offset] != ATypeTag.SERIALIZED_INT32_TYPE_TAG) { + throw new TypeMismatchException(getIdentifier(), 2, data[offset], + ATypeTag.SERIALIZED_INT32_TYPE_TAG); + } + int length1 = IntegerPointable.getInteger(data, offset + 1); + + evalLen2.evaluate(tuple, inputVal); + data = inputVal.getByteArray(); + offset = inputVal.getStartOffset(); + if (data[offset] != ATypeTag.SERIALIZED_INT32_TYPE_TAG) { + throw new TypeMismatchException(getIdentifier(), 3, data[offset], + ATypeTag.SERIALIZED_INT32_TYPE_TAG); + } + int length2 = IntegerPointable.getInteger(data, offset + 1); + + float sim = 0; + + // + // -- - length filter - -- + // + if (similarityFilters.passLengthFilter(length1, length2)) { + + // -- - tokens1 - -- + int i; + tokens1.reset(); + evalTokens1.evaluate(tuple, inputVal); + byte[] serList = inputVal.getByteArray(); + offset = inputVal.getStartOffset(); + + if (serList[offset] != ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG + && serList[offset] != ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG) { + throw new TypeMismatchException(getIdentifier(), 4, data[offset], + ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG, + ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG); + } + + int lengthTokens1; + if (serList[offset] == ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG) { + lengthTokens1 = AOrderedListSerializerDeserializer.getNumberOfItems(serList, offset); + // read tokens + for (i = 0; i < lengthTokens1; i++) { + int itemOffset = + AOrderedListSerializerDeserializer.getItemOffset(serList, offset, i); + tokens1.add(IntegerPointable.getInteger(serList, itemOffset)); + } + } else { + lengthTokens1 = AUnorderedListSerializerDeserializer.getNumberOfItems(serList, offset); + // read tokens + for (i = 0; i < lengthTokens1; i++) { + int itemOffset = + AUnorderedListSerializerDeserializer.getItemOffset(serList, offset, i); + tokens1.add(IntegerPointable.getInteger(serList, 
itemOffset)); + } + } + // pad tokens + for (; i < length1; i++) { + tokens1.add(Integer.MAX_VALUE); + } + + // -- - tokens2 - -- + tokens2.reset(); + evalTokens2.evaluate(tuple, inputVal); + serList = inputVal.getByteArray(); + offset = inputVal.getStartOffset(); + + if (serList[offset] != ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG + && serList[offset] != ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG) { + throw new TypeMismatchException(getIdentifier(), 5, data[offset], + ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG, + ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG); + } + + int lengthTokens2; + if (serList[0] == ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG) { + lengthTokens2 = AOrderedListSerializerDeserializer.getNumberOfItems(serList, offset); + // read tokens + for (i = 0; i < lengthTokens2; i++) { + int itemOffset = + AOrderedListSerializerDeserializer.getItemOffset(serList, offset, i); + tokens2.add(IntegerPointable.getInteger(serList, itemOffset)); + } + } else { + lengthTokens2 = AUnorderedListSerializerDeserializer.getNumberOfItems(serList, offset); + // read tokens + for (i = 0; i < lengthTokens2; i++) { + int itemOffset = + AUnorderedListSerializerDeserializer.getItemOffset(serList, offset, i); + tokens2.add(IntegerPointable.getInteger(serList, itemOffset)); + } + } + // pad tokens + for (; i < length2; i++) { + tokens2.add(Integer.MAX_VALUE); + } + + // -- - token prefix - -- + evalTokenPrefix.evaluate(tuple, inputVal); + int tokenPrefix = + IntegerPointable.getInteger(inputVal.getByteArray(), inputVal.getStartOffset() + 1); + + // + // -- - position filter - -- + // + SimilarityMetric.getPartialIntersectSize(tokens1.get(), 0, tokens1.length(), tokens2.get(), + 0, tokens2.length(), tokenPrefix, parInter); + if (similarityFilters.passPositionFilter(parInter.intersectSize, parInter.posXStop, length1, + parInter.posYStop, length2)) { + + // + // -- - suffix filter - -- + // + if (similarityFilters.passSuffixFilter(tokens1.get(), 0, tokens1.length(), + parInter.posXStart, 
tokens2.get(), 0, tokens2.length(), parInter.posYStart)) { + + sim = similarityFilters.passSimilarityFilter(tokens1.get(), 0, tokens1.length(), + parInter.posXStop + 1, tokens2.get(), 0, tokens2.length(), + parInter.posYStop + 1, parInter.intersectSize); + } + } + } + + res.setValue(sim); + + try { + doubleSerde.serialize(res, out); + } catch (IOException e) { + throw HyracksDataException.create(e); + } + result.set(resultStorage); + } + }; + } + }; + } + + @Override + public FunctionIdentifier getIdentifier() { + return FID; + } + +} http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityJaccardCheckDescriptor.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityJaccardCheckDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityJaccardCheckDescriptor.java new file mode 100644 index 0000000..9fc5b34 --- /dev/null +++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityJaccardCheckDescriptor.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.asterix.runtime.evaluators.functions; + +import org.apache.asterix.om.functions.BuiltinFunctions; +import org.apache.asterix.om.functions.IFunctionDescriptor; +import org.apache.asterix.om.functions.IFunctionDescriptorFactory; +import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor; +import org.apache.asterix.runtime.evaluators.common.SimilarityJaccardCheckEvaluator; +import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.api.exceptions.HyracksDataException; + +public class SimilarityJaccardCheckDescriptor extends AbstractScalarFunctionDynamicDescriptor { + + private static final long serialVersionUID = 1L; + public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() { + @Override + public IFunctionDescriptor createFunctionDescriptor() { + return new SimilarityJaccardCheckDescriptor(); + } + }; + + @Override + public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) { + return new IScalarEvaluatorFactory() { + private static final long serialVersionUID = 1L; + + @Override + public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException { + return new SimilarityJaccardCheckEvaluator(args, ctx); + } + }; + } + + @Override + public FunctionIdentifier getIdentifier() { + return BuiltinFunctions.SIMILARITY_JACCARD_CHECK; + } +} http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityJaccardDescriptor.java 
---------------------------------------------------------------------- diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityJaccardDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityJaccardDescriptor.java new file mode 100644 index 0000000..742f2ff --- /dev/null +++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityJaccardDescriptor.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.asterix.runtime.evaluators.functions; + +import org.apache.asterix.om.functions.BuiltinFunctions; +import org.apache.asterix.om.functions.IFunctionDescriptor; +import org.apache.asterix.om.functions.IFunctionDescriptorFactory; +import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor; +import org.apache.asterix.runtime.evaluators.common.SimilarityJaccardEvaluator; +import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator; +import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory; +import org.apache.hyracks.api.context.IHyracksTaskContext; +import org.apache.hyracks.api.exceptions.HyracksDataException; + +public class SimilarityJaccardDescriptor extends AbstractScalarFunctionDynamicDescriptor { + + private static final long serialVersionUID = 1L; + public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() { + @Override + public IFunctionDescriptor createFunctionDescriptor() { + return new SimilarityJaccardDescriptor(); + } + }; + + @Override + public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) { + return new IScalarEvaluatorFactory() { + private static final long serialVersionUID = 1L; + + @Override + public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException { + return new SimilarityJaccardEvaluator(args, ctx); + } + }; + } + + @Override + public FunctionIdentifier getIdentifier() { + return BuiltinFunctions.SIMILARITY_JACCARD; + } + +}