Author: tommaso
Date: Mon Jun 25 13:30:52 2018
New Revision: 1834326
URL: http://svn.apache.org/viewvc?rev=1834326&view=rev
Log:
OAK-7575 - Search over similar feature vectors
Added:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizer.java
(with props)
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilter.java
(with props)
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzer.java
(with props)
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/MinHashFilter.java
(with props)
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
(with props)
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/TruncateTokenFilter.java
(with props)
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizerTest.java
(with props)
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilterTest.java
(with props)
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzerTest.java
(with props)
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/TruncateTokenFilterTest.java
(with props)
jackrabbit/oak/trunk/oak-lucene/src/test/resources/org/apache/jackrabbit/oak/query/fvs.csv
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldNames.java
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/PropertyDefinition.java
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/writer/IndexWriterUtils.java
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java
(original)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java
Mon Jun 25 13:30:52 2018
@@ -16,15 +16,22 @@
*/
package org.apache.jackrabbit.oak.plugins.index.lucene;
+import java.io.IOException;
+import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collection;
import com.google.common.primitives.Ints;
+import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.commons.PathUtils;
+import org.apache.jackrabbit.oak.plugins.index.lucene.binary.BlobByteSource;
+import org.apache.jackrabbit.oak.plugins.index.lucene.util.fv.SimSearchUtils;
import org.apache.jackrabbit.util.ISO8601;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntField;
+import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
@@ -103,6 +110,34 @@ public final class FieldFactory {
return new StringField(name, value, NO);
}
+ static Collection<Field> newSimilarityFields(String name, Blob value)
throws IOException {
+ Collection<Field> fields = new ArrayList<>(1);
+ byte[] bytes = new BlobByteSource(value).read();
+// fields.add(newBinarySimilarityField(name, bytes));
+ fields.add(newSimilarityField(name, bytes));
+ return fields;
+ }
+
+ static Collection<Field> newSimilarityFields(String name, String value) {
+ Collection<Field> fields = new ArrayList<>(1);
+// byte[] bytes = SimSearchUtils.toByteArray(value);
+// fields.add(newBinarySimilarityField(name, bytes));
+ fields.add(newSimilarityField(name, value));
+ return fields;
+ }
+
+ private static Field newSimilarityField(String name, byte[] bytes) {
+ return newSimilarityField(name, SimSearchUtils.toDoubleString(bytes));
+ }
+
+ private static Field newSimilarityField(String name, String value) {
+ return new TextField(FieldNames.createSimilarityFieldName(name),
value, Field.Store.YES);
+ }
+
+ private static StoredField newBinarySimilarityField(String name, byte[]
bytes) {
+ return new StoredField(FieldNames.createBinSimilarityFieldName(name),
bytes);
+ }
+
public static Field newFulltextField(String value) {
return newFulltextField(value, false);
}
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldNames.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldNames.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldNames.java
(original)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldNames.java
Mon Jun 25 13:30:52 2018
@@ -70,6 +70,16 @@ public final class FieldNames {
public static final String ANALYZED_FIELD_PREFIX = "full:";
/**
+ * Prefix for all field names that contain the similarity search indexed tokens.
+ */
+ private static final String SIMILARITY_PREFIX = "sim:";
+
+ /**
+ * Prefix for all field names that contain the similarity search binary
values.
+ */
+ private static final String SIMILARITY_BINARY_PREFIX = "simbin:";
+
+ /**
* Prefix used for storing fulltext of relative node
*/
public static final String FULLTEXT_RELATIVE_NODE = "fullnode:";
@@ -138,4 +148,12 @@ public final class FieldNames {
&& !field.startsWith(":")
&& !field.endsWith("_facet");
}
+
+ public static String createBinSimilarityFieldName(String name) {
+ return SIMILARITY_BINARY_PREFIX + name;
+ }
+
+ public static String createSimilarityFieldName(String name) {
+ return SIMILARITY_PREFIX + name;
+ }
}
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java
(original)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java
Mon Jun 25 13:30:52 2018
@@ -911,6 +911,7 @@ public final class IndexDefinition imple
private final List<PropertyDefinition> notNullCheckEnabledProperties;
private final List<PropertyDefinition> nodeScopeAnalyzedProps;
private final List<PropertyDefinition> syncProps;
+ private final List<PropertyDefinition> similarityProperties;
private final boolean indexesAllNodesOfMatchingType;
private final boolean nodeNameIndexed;
@@ -925,6 +926,7 @@ public final class IndexDefinition imple
final Aggregate propAggregate;
+
IndexingRule(String nodeTypeName, NodeState config) {
this.nodeTypeName = nodeTypeName;
this.baseNodeType = nodeTypeName;
@@ -938,9 +940,10 @@ public final class IndexDefinition imple
List<PropertyDefinition> existentProperties = newArrayList();
List<PropertyDefinition> nodeScopeAnalyzedProps = newArrayList();
List<PropertyDefinition> syncProps = newArrayList();
+ List<PropertyDefinition> similarityProperties = newArrayList();
List<Aggregate.Include> propIncludes = newArrayList();
this.propConfigs = collectPropConfigs(config, namePatterns,
propIncludes, nonExistentProperties,
- existentProperties, nodeScopeAnalyzedProps,
functionRestrictions, syncProps);
+ existentProperties, nodeScopeAnalyzedProps,
functionRestrictions, syncProps, similarityProperties);
this.propAggregate = new Aggregate(nodeTypeName, propIncludes);
this.aggregate = combine(propAggregate, nodeTypeName);
@@ -949,6 +952,7 @@ public final class IndexDefinition imple
this.nullCheckEnabledProperties =
ImmutableList.copyOf(nonExistentProperties);
this.functionRestrictions =
ImmutableList.copyOf(functionRestrictions);
this.notNullCheckEnabledProperties =
ImmutableList.copyOf(existentProperties);
+ this.similarityProperties =
ImmutableList.copyOf(similarityProperties);
this.fulltextEnabled = aggregate.hasNodeAggregates() ||
hasAnyFullTextEnabledProperty();
this.nodeFullTextIndexed = aggregate.hasNodeAggregates() ||
anyNodeScopeIndexedProperty();
this.propertyIndexEnabled = hasAnyPropertyIndexConfigured();
@@ -985,6 +989,7 @@ public final class IndexDefinition imple
this.indexesAllNodesOfMatchingType =
areAlMatchingNodeByTypeIndexed();
this.nodeNameIndexed = original.nodeNameIndexed;
this.syncProps = original.syncProps;
+ this.similarityProperties = original.similarityProperties;
}
/**
@@ -1032,6 +1037,10 @@ public final class IndexDefinition imple
return nodeScopeAnalyzedProps;
}
+ public List<PropertyDefinition> getSimilarityProperties() {
+ return similarityProperties;
+ }
+
@Override
public String toString() {
String str = "IndexRule: "+ nodeTypeName;
@@ -1153,7 +1162,8 @@ public final class IndexDefinition imple
List<PropertyDefinition> existentProperties,
List<PropertyDefinition> nodeScopeAnalyzedProps,
List<PropertyDefinition> functionRestrictions,
-
List<PropertyDefinition> syncProps) {
+
List<PropertyDefinition> syncProps,
+
List<PropertyDefinition> similarityProperties) {
Map<String, PropertyDefinition> propDefns = newHashMap();
NodeState propNode =
config.getChildNode(LuceneIndexConstants.PROP_NODE);
@@ -1232,6 +1242,9 @@ public final class IndexDefinition imple
if (pd.sync) {
syncProps.add(pd);
}
+ if (pd.useInSimilarity) {
+ similarityProperties.add(pd);
+ }
}
}
ensureNodeTypeIndexingIsConsistent(propDefns, syncProps);
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
(original)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
Mon Jun 25 13:30:52 2018
@@ -62,6 +62,7 @@ import static org.apache.jackrabbit.oak.
import static
org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newFulltextField;
import static
org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newPathField;
import static
org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newPropertyField;
+import static
org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newSimilarityFields;
import static
org.apache.jackrabbit.oak.plugins.index.lucene.util.ConfigUtil.getPrimaryTypeName;
public class LuceneDocumentMaker {
@@ -253,7 +254,15 @@ public class LuceneDocumentMaker {
boolean includeTypeForFullText =
indexingRule.includePropertyType(property.getType().tag());
boolean dirty = false;
- if (Type.BINARY.tag() == property.getType().tag()
+ if (Type.BINARY.tag() == property.getType().tag() &&
pd.useInSimilarity) {
+ try {
+ log.trace("indexing similarity binaries for {}", pd.name);
+ fields.addAll(newSimilarityFields(pd.name,
property.getValue(Type.BINARY)));
+ dirty = true;
+ } catch (Exception e) {
+ log.error("could not index similarity field for property {}
and definition {}", property, pd);
+ }
+ } else if (Type.BINARY.tag() == property.getType().tag()
&& includeTypeForFullText) {
fields.addAll(newBinary(property, state, null, path + "@" +
pname));
dirty = true;
@@ -285,10 +294,17 @@ public class LuceneDocumentMaker {
if (pd.nodeScopeIndex) {
Field field = newFulltextField(value);
fields.add(field);
+ if (pd.useInSimilarity) {
+ log.trace("indexing similarity strings for {}",
pd.name);
+ fields.addAll(newSimilarityFields(pd.name,
value)); // fallback for when feature vectors are written in string typed
properties
+ }
}
+
+
dirty = true;
}
}
+
if (pd.facet && isFacetingEnabled()) {
dirty |= addFacetFields(fields, property, pname, pd);
}
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java
(original)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java
Mon Jun 25 13:30:52 2018
@@ -305,6 +305,11 @@ public interface LuceneIndexConstants {
String PROP_USE_IN_SPELLCHECK = "useInSpellcheck";
/**
+ * Whether to use this property's values for similarity search
+ */
+ String PROP_USE_IN_SIMILARITY = "useInSimilarity";
+
+ /**
* Property definition config indicating that null check support should be
* enabled for this property
*/
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
(original)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
Mon Jun 25 13:30:52 2018
@@ -18,10 +18,6 @@
*/
package org.apache.jackrabbit.oak.plugins.index.lucene;
-import javax.annotation.CheckForNull;
-import javax.annotation.Nonnull;
-import javax.annotation.Nullable;
-import javax.jcr.PropertyType;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
@@ -33,7 +29,11 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;
-import java.util.stream.Collectors;
+
+import javax.annotation.CheckForNull;
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+import javax.jcr.PropertyType;
import com.google.common.base.Joiner;
import com.google.common.collect.AbstractIterator;
@@ -52,6 +52,8 @@ import org.apache.jackrabbit.oak.commons
import org.apache.jackrabbit.oak.commons.PerfLogger;
import org.apache.jackrabbit.oak.commons.json.JsopBuilder;
import org.apache.jackrabbit.oak.commons.json.JsopWriter;
+import org.apache.jackrabbit.oak.plugins.index.Cursors;
+import org.apache.jackrabbit.oak.plugins.index.Cursors.PathCursor;
import
org.apache.jackrabbit.oak.plugins.index.lucene.IndexDefinition.IndexingRule;
import org.apache.jackrabbit.oak.plugins.index.lucene.IndexPlanner.PlanResult;
import
org.apache.jackrabbit.oak.plugins.index.lucene.IndexPlanner.PropertyIndexResult;
@@ -63,16 +65,9 @@ import org.apache.jackrabbit.oak.plugins
import
org.apache.jackrabbit.oak.plugins.index.lucene.util.PathStoredFieldVisitor;
import org.apache.jackrabbit.oak.plugins.index.lucene.util.SpellcheckHelper;
import org.apache.jackrabbit.oak.plugins.index.lucene.util.SuggestHelper;
+import org.apache.jackrabbit.oak.plugins.index.lucene.util.fv.SimSearchUtils;
import org.apache.jackrabbit.oak.plugins.memory.PropertyValues;
-import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextAnd;
-import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextContains;
-import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextExpression;
-import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextOr;
-import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextTerm;
-import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextVisitor;
import org.apache.jackrabbit.oak.spi.query.Cursor;
-import org.apache.jackrabbit.oak.plugins.index.Cursors;
-import org.apache.jackrabbit.oak.plugins.index.Cursors.PathCursor;
import org.apache.jackrabbit.oak.spi.query.Filter;
import org.apache.jackrabbit.oak.spi.query.Filter.PropertyRestriction;
import org.apache.jackrabbit.oak.spi.query.IndexRow;
@@ -80,6 +75,12 @@ import org.apache.jackrabbit.oak.spi.que
import org.apache.jackrabbit.oak.spi.query.QueryIndex;
import
org.apache.jackrabbit.oak.spi.query.QueryIndex.AdvanceFulltextQueryIndex;
import org.apache.jackrabbit.oak.spi.query.QueryLimits;
+import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextAnd;
+import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextContains;
+import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextExpression;
+import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextOr;
+import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextTerm;
+import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextVisitor;
import org.apache.jackrabbit.oak.spi.state.NodeState;
import org.apache.jackrabbit.oak.spi.state.NodeStateUtils;
import org.apache.lucene.analysis.Analyzer;
@@ -150,7 +151,9 @@ import static org.apache.jackrabbit.oak.
import static org.apache.jackrabbit.oak.spi.query.QueryConstants.REP_EXCERPT;
import static
org.apache.jackrabbit.oak.spi.query.QueryIndex.AdvancedQueryIndex;
import static org.apache.jackrabbit.oak.spi.query.QueryIndex.NativeQueryIndex;
-import static org.apache.lucene.search.BooleanClause.Occur.*;
+import static org.apache.lucene.search.BooleanClause.Occur.MUST;
+import static org.apache.lucene.search.BooleanClause.Occur.MUST_NOT;
+import static org.apache.lucene.search.BooleanClause.Occur.SHOULD;
/**
* Provides a QueryIndex that does lookups against a Lucene-based index
@@ -917,9 +920,20 @@ public class LucenePropertyIndex impleme
if (query.startsWith("mlt?")) {
String mltQueryString = query.replace("mlt?", "");
if (reader != null) {
- Query moreLikeThis =
MoreLikeThisHelper.getMoreLikeThis(reader, analyzer, mltQueryString);
- if (moreLikeThis != null) {
- qs.add(moreLikeThis);
+ List<PropertyDefinition> sp = new LinkedList<>();
+ for (IndexingRule r : defn.getDefinedRules()) {
+ sp.addAll(r.getSimilarityProperties());
+ }
+ if (sp.isEmpty()) {
+ Query moreLikeThis =
MoreLikeThisHelper.getMoreLikeThis(reader, analyzer, mltQueryString);
+ if (moreLikeThis != null) {
+ qs.add(moreLikeThis);
+ }
+ } else {
+ Query similarityQuery =
SimSearchUtils.getSimilarityQuery(sp, reader, mltQueryString);
+ if (similarityQuery != null) {
+ qs.add(similarityQuery);
+ }
}
}
} else if (query.startsWith("spellcheck?")) {
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/PropertyDefinition.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/PropertyDefinition.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/PropertyDefinition.java
(original)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/PropertyDefinition.java
Mon Jun 25 13:30:52 2018
@@ -58,7 +58,7 @@ public class PropertyDefinition {
* property etc then it should be defined via 'name' property in NodeState.
* In such case NodeState name can be set to anything
*/
- final String name;
+ public final String name;
private final int propertyType;
/**
@@ -123,7 +123,9 @@ public class PropertyDefinition {
public final boolean unique;
- public PropertyDefinition(IndexingRule idxDefn, String nodeName, NodeState
defn) {
+ public boolean useInSimilarity;
+
+ public PropertyDefinition(IndexingRule idxDefn, String nodeName, NodeState
defn) {
this.isRegexp = getOptionalValue(defn, PROP_IS_REGEX, false);
this.name = getName(defn, nodeName);
this.relative = isRelativeProperty(name);
@@ -151,6 +153,7 @@ public class PropertyDefinition {
this.propertyType = getPropertyType(idxDefn, nodeName, defn);
this.useInSuggest = getOptionalValueIfIndexed(defn,
LuceneIndexConstants.PROP_USE_IN_SUGGEST, false);
this.useInSpellcheck = getOptionalValueIfIndexed(defn,
LuceneIndexConstants.PROP_USE_IN_SPELLCHECK, false);
+ this.useInSimilarity = getOptionalValueIfIndexed(defn,
LuceneIndexConstants.PROP_USE_IN_SIMILARITY, false);
this.nullCheckEnabled = getOptionalValueIfIndexed(defn,
LuceneIndexConstants.PROP_NULL_CHECK_ENABLED, false);
this.notNullCheckEnabled = getOptionalValueIfIndexed(defn,
LuceneIndexConstants.PROP_NOT_NULL_CHECK_ENABLED, false);
this.excludeFromAggregate = getOptionalValueIfIndexed(defn,
LuceneIndexConstants.PROP_EXCLUDE_FROM_AGGREGATE, false);
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java
(original)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java
Mon Jun 25 13:30:52 2018
@@ -319,6 +319,11 @@ public final class IndexDefinitionBuilde
return this;
}
+ public PropertyRule useInSimilarity() {
+ propTree.setProperty(LuceneIndexConstants.PROP_USE_IN_SIMILARITY,
true);
+ return this;
+ }
+
public PropertyRule type(String type){
//This would throw an IAE if type is invalid
PropertyType.valueFromName(type);
Added:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizer.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizer.java?rev=1834326&view=auto
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizer.java
(added)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizer.java
Mon Jun 25 13:30:52 2018
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util.fv;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.util.CharTokenizer;
+import org.apache.lucene.util.Version;
+
+class FVTokenizer extends CharTokenizer {
+ FVTokenizer(Version matchVersion, Reader input) {
+ super(matchVersion, input);
+ }
+
+ @Override
+ protected boolean isTokenChar(int c) {
+ char c1 = Character.toChars(c)[0];
+ return c1 != ',' && !Character.isWhitespace(c);
+ }
+ }
\ No newline at end of file
Propchange:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilter.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilter.java?rev=1834326&view=auto
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilter.java
(added)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilter.java
Mon Jun 25 13:30:52 2018
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util.fv;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+final class FeaturePositionTokenFilter extends TokenFilter {
+
+ private final CharTermAttribute termAttribute =
addAttribute(CharTermAttribute.class);
+ private int tokenCount = 0;
+
+ FeaturePositionTokenFilter(TokenStream stream) {
+ super(stream);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ tokenCount++;
+ String token = new String(termAttribute.buffer(), 0,
termAttribute.length());
+ termAttribute.setEmpty();
+ termAttribute.append(String.valueOf(tokenCount));
+ termAttribute.append("_");
+ termAttribute.append(token);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ tokenCount = 0;
+ }
+
+ }
\ No newline at end of file
Propchange:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzer.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzer.java?rev=1834326&view=auto
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzer.java
(added)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzer.java
Mon Jun 25 13:30:52 2018
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util.fv;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.util.Version;
+
+/**
+ * {@link Analyzer} for LSH search
+ */
+public class LSHAnalyzer extends Analyzer {
+
+ private static final int DEFAULT_SHINGLE_SIZE = 4;
+
+ private final int min;
+ private final int max;
+ private final int hashCount;
+ private final int bucketCount;
+ private final int hashSetSize;
+
+ private LSHAnalyzer(int min, int max, int hashCount, int bucketCount, int
hashSetSize) {
+ super();
+ this.min = min;
+ this.max = max;
+ this.hashCount = hashCount;
+ this.bucketCount = bucketCount;
+ this.hashSetSize = hashSetSize;
+ }
+
+ public LSHAnalyzer() {
+ this(DEFAULT_SHINGLE_SIZE, DEFAULT_SHINGLE_SIZE,
MinHashFilter.DEFAULT_HASH_COUNT, MinHashFilter.DEFAULT_BUCKET_COUNT,
MinHashFilter.DEFAULT_HASH_SET_SIZE);
+ }
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader
reader) {
+ Tokenizer source = new FVTokenizer(Version.LUCENE_47, reader);
+ TokenFilter truncate = new TruncateTokenFilter(source, 5);
+ TokenFilter featurePos = new FeaturePositionTokenFilter(truncate);
+ ShingleFilter shingleFilter = new ShingleFilter(featurePos, min, max);
+ shingleFilter.setTokenSeparator(" ");
+ shingleFilter.setOutputUnigrams(false);
+ shingleFilter.setOutputUnigramsIfNoShingles(false);
+ TokenStream filter = new MinHashFilter(shingleFilter, hashCount,
bucketCount, hashSetSize, bucketCount > 1);
+ return new TokenStreamComponents(source, filter);
+ }
+
+}
Propchange:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/MinHashFilter.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/MinHashFilter.java?rev=1834326&view=auto
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/MinHashFilter.java
(added)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/MinHashFilter.java
Mon Jun 25 13:30:52 2018
@@ -0,0 +1,514 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util.fv;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.TreeSet;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
+/**
+ * Generate min hash tokens from an incoming stream of tokens. The incoming tokens would typically be 5 word shingles.
+ *
+ * The number of hashes used and the number of minimum values for each hash can be set. You could have 1 hash and keep
+ * the 100 lowest values, or 100 hashes and keep the lowest one for each. Hashes can also be bucketed in ranges over the
+ * 128-bit hash space.
+ *
+ * A 128-bit hash is used internally. 5 word shingles from 10^5 words generate 10^25 combinations, so a 64-bit hash
+ * (1.8e19 values) would have collisions.
+ *
+ * When using different hashes, 32 bits are used for the hash position, leaving scope for 8e28 unique hashes. A single
+ * hash will use all 128 bits.
+ *
+ */
+class MinHashFilter extends TokenFilter {
+    // Number of hash positions whose murmur hash is precomputed by the static initializer.
+    private static final int HASH_CACHE_SIZE = 512;
+
+    // Murmur hashes of the integers 0..HASH_CACHE_SIZE-1, indexed by hash position.
+    private static final LongPair[] cachedIntHashes = new LongPair[HASH_CACHE_SIZE];
+
+    public static final int DEFAULT_HASH_COUNT = 1;
+
+    public static final int DEFAULT_HASH_SET_SIZE = 1;
+
+    public static final int DEFAULT_BUCKET_COUNT = 512;
+
+    // Token type set on every emitted min hash token.
+    private static final String MIN_HASH_TYPE = "MIN_HASH";
+
+    // Retained minimum hashes, indexed first by hash position and then by bucket.
+    private final List<List<FixedSizeTreeSet<LongPair>>> minHashSets;
+
+    // The three settings below start at their defaults and are overwritten by the constructor.
+    private int hashSetSize = DEFAULT_HASH_SET_SIZE;
+
+    private int bucketCount = DEFAULT_BUCKET_COUNT;
+
+    private int hashCount = DEFAULT_HASH_COUNT;
+
+    // True until incrementToken() has consumed and hashed the whole input.
+    private boolean requiresInitialisation = true;
+
+    // Attribute state captured right after input.end(); restored in end().
+    private State endState;
+
+    // Emission cursors used by incrementToken(); -1 means "not started yet".
+    private int hashPosition = -1;
+
+    private int bucketPosition = -1;
+
+    // Width of one bucket over the unsigned 32 bit range, rounded up (set in the constructor).
+    private long bucketSize;
+
+    // Whether empty buckets are filled from a following non-empty bucket (only applied when hashSetSize == 1).
+    private final boolean withRotation;
+
+    // End offset of the last token read from the input; used as end offset of every emitted token.
+    private int endOffset;
+
+    // Set once incrementToken() has drained the input and called input.end().
+    private boolean exhausted = false;
+
+    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+    private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+    private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
+    private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
+    private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);
+
+    // Precompute the murmur hash of every hash position below HASH_CACHE_SIZE.
+    static {
+        int position = 0;
+        while (position < HASH_CACHE_SIZE) {
+            LongPair cached = new LongPair();
+            cachedIntHashes[position] = cached;
+            murmurhash3_x64_128(getBytes(position), 0, 4, 0, cached);
+            position++;
+        }
+    }
+
+    /**
+     * Encodes an int as four bytes, most significant byte first.
+     *
+     * @param i the value to encode
+     * @return a new 4 byte array holding {@code i} in big-endian order
+     */
+    static byte[] getBytes(int i) {
+        byte[] encoded = new byte[4];
+        for (int index = 0, shift = 24; index < encoded.length; index++, shift -= 8) {
+            encoded[index] = (byte) (i >> shift);
+        }
+        return encoded;
+    }
+
+    /**
+     * Creates a MinHash filter.
+     *
+     * @param input        the token stream to consume
+     * @param hashCount    the number of hashes, must be greater than zero
+     * @param bucketCount  the number of buckets each hash is split into, must be greater than zero
+     * @param hashSetSize  the number of minimum hashes kept per bucket, must be greater than zero
+     * @param withRotation whether empty buckets are filled from a following non-empty bucket
+     *                     (only applied when {@code hashSetSize} is 1)
+     * @throws IllegalArgumentException if any of the three counts is not positive
+     */
+    public MinHashFilter(TokenStream input, int hashCount, int bucketCount, int hashSetSize, boolean withRotation) {
+        super(input);
+        if (hashCount < 1) {
+            throw new IllegalArgumentException("hashCount must be greater than zero");
+        }
+        if (bucketCount < 1) {
+            throw new IllegalArgumentException("bucketCount must be greater than zero");
+        }
+        if (hashSetSize < 1) {
+            throw new IllegalArgumentException("hashSetSize must be greater than zero");
+        }
+        this.withRotation = withRotation;
+        this.hashSetSize = hashSetSize;
+        this.bucketCount = bucketCount;
+        this.hashCount = hashCount;
+
+        // width of one bucket over the unsigned 32 bit range, rounded up
+        long range = 1L << 32;
+        long width = range / bucketCount;
+        this.bucketSize = (range % bucketCount == 0) ? width : width + 1;
+
+        // one list of buckets per hash, one bounded sorted set per bucket
+        List<List<FixedSizeTreeSet<LongPair>>> sets = new ArrayList<>(hashCount);
+        for (int hash = 0; hash < hashCount; hash++) {
+            List<FixedSizeTreeSet<LongPair>> buckets = new ArrayList<>(bucketCount);
+            for (int bucket = 0; bucket < bucketCount; bucket++) {
+                buckets.add(new FixedSizeTreeSet<LongPair>(hashSetSize));
+            }
+            sets.add(buckets);
+        }
+        minHashSets = sets;
+        doRest();
+    }
+
+    /**
+     * Emits the retained min hashes one token at a time.
+     *
+     * The first call after construction or {@link #reset()} drains the whole input: every token is hashed,
+     * combined with each hash position and offered to the bucket selected by the upper 32 bits of the
+     * combined hash. That call and every later one then emit one retained hash, walking hash positions and
+     * buckets in order. The first emitted token has position increment 1, all later ones 0.
+     *
+     * @return {@code true} if a min hash token was produced, {@code false} if the input had no tokens or all
+     *         retained hashes have been emitted
+     * @throws IOException if reading the input fails
+     */
+    @Override
+    public final boolean incrementToken() throws IOException {
+        int positionIncrement = 0;
+        if (requiresInitialisation) {
+            requiresInitialisation = false;
+            boolean found = false;
+            // First time through so we pull and hash everything
+            while (input.incrementToken()) {
+                found = true;
+                String current = new String(termAttribute.buffer(), 0, termAttribute.length());
+
+                // The token hash does not depend on the hash position, so it is computed once per token
+                // and only the cheap combination step runs per hash position.
+                byte[] bytes = current.getBytes("UTF-16LE");
+                LongPair hash = new LongPair();
+                murmurhash3_x64_128(bytes, 0, bytes.length, 0, hash);
+
+                for (int i = 0; i < hashCount; i++) {
+                    LongPair rehashed = combineOrdered(hash, getIntHash(i));
+                    // the upper 32 bits of val2 select the bucket
+                    int bucket = (int) ((rehashed.val2 >>> 32) / bucketSize);
+                    minHashSets.get(i).get(bucket).add(rehashed);
+                }
+                endOffset = offsetAttribute.endOffset();
+            }
+            exhausted = true;
+            input.end();
+            // We need the end state so an underlying shingle filter can have its state restored correctly.
+            endState = captureState();
+            if (!found) {
+                return false;
+            }
+
+            positionIncrement = 1;
+            // fix up any wrap around bucket values
+            if (withRotation && (hashSetSize == 1)) {
+                fillEmptyBuckets();
+            }
+        }
+
+        clearAttributes();
+
+        // Resume the walk over (hashPosition, bucketPosition) where the previous call stopped;
+        // -1 marks a cursor that has not been started yet.
+        while (hashPosition < hashCount) {
+            if (hashPosition == -1) {
+                hashPosition++;
+            } else {
+                while (bucketPosition < bucketCount) {
+                    if (bucketPosition == -1) {
+                        bucketPosition++;
+                    } else {
+                        LongPair hash = minHashSets.get(hashPosition).get(bucketPosition).pollFirst();
+                        if (hash != null) {
+                            termAttribute.setEmpty();
+                            // with several hashes the term starts with the 32 bit hash position ...
+                            if (hashCount > 1) {
+                                termAttribute.append(int0(hashPosition));
+                                termAttribute.append(int1(hashPosition));
+                            }
+                            long high = hash.val2;
+                            termAttribute.append(long0(high));
+                            termAttribute.append(long1(high));
+                            termAttribute.append(long2(high));
+                            termAttribute.append(long3(high));
+                            long low = hash.val1;
+                            termAttribute.append(long0(low));
+                            termAttribute.append(long1(low));
+                            // ... and only a single hash writes the lowest 32 bits as well
+                            if (hashCount == 1) {
+                                termAttribute.append(long2(low));
+                                termAttribute.append(long3(low));
+                            }
+                            posIncAttribute.setPositionIncrement(positionIncrement);
+                            offsetAttribute.setOffset(0, endOffset);
+                            typeAttribute.setType(MIN_HASH_TYPE);
+                            posLenAttribute.setPositionLength(1);
+                            return true;
+                        } else {
+                            bucketPosition++;
+                        }
+                    }
+                }
+                bucketPosition = -1;
+                hashPosition++;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Gives every empty bucket the smallest hash of the first non-empty bucket that follows it,
+     * wrapping around at the last bucket. Buckets stay empty only if all buckets of that hash are empty.
+     */
+    private void fillEmptyBuckets() {
+        for (List<FixedSizeTreeSet<LongPair>> buckets : minHashSets) {
+            for (int bucket = 0; bucket < bucketCount; bucket++) {
+                if (buckets.get(bucket).size() != 0) {
+                    continue;
+                }
+                for (int offset = 1; offset < bucketCount; offset++) {
+                    FixedSizeTreeSet<LongPair> candidate = buckets.get((bucket + offset) % bucketCount);
+                    if (candidate.size() > 0) {
+                        buckets.get(bucket).add(candidate.first());
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Returns the murmur hash of hash position {@code i}, taken from the cache for positions below
+     * {@code HASH_CACHE_SIZE} and computed on the fly otherwise.
+     */
+    private static LongPair getIntHash(int i) {
+        if (i >= HASH_CACHE_SIZE) {
+            LongPair computed = new LongPair();
+            murmurhash3_x64_128(getBytes(i), 0, 4, 0, computed);
+            return computed;
+        }
+        return cachedIntHashes[i];
+    }
+
+ @Override
+ public void end() throws IOException {
+ if(!exhausted) {
+ input.end();
+ }
+
+ restoreState(endState);
+ }
+
+    /** Resets the wrapped stream first, then clears all min hash state held by this filter. */
+    @Override
+    public void reset() throws IOException {
+        super.reset();
+        doRest();
+    }
+
+    /**
+     * Empties every bucket and puts the emission cursors and flags back to their initial values, so the next
+     * {@link #incrementToken()} call drains the input again.
+     */
+    private void doRest() {
+        for (List<FixedSizeTreeSet<LongPair>> buckets : minHashSets) {
+            for (FixedSizeTreeSet<LongPair> bucket : buckets) {
+                bucket.clear();
+            }
+        }
+        exhausted = false;
+        requiresInitialisation = true;
+        bucketPosition = -1;
+        hashPosition = -1;
+        endState = null;
+    }
+
+    // The helpers below slice a long (or int) into 16 bit chars, most significant slice first.
+
+    /** Bits 48-63 of {@code x}. */
+    private static char long0(long x) {
+        return (char) (x >>> 48);
+    }
+
+    /** Bits 32-47 of {@code x}. */
+    private static char long1(long x) {
+        return (char) (x >>> 32);
+    }
+
+    /** Bits 16-31 of {@code x}. */
+    private static char long2(long x) {
+        return (char) (x >>> 16);
+    }
+
+    /** Bits 0-15 of {@code x}. */
+    private static char long3(long x) {
+        return (char) x;
+    }
+
+    /** Bits 16-31 of {@code x}. */
+    private static char int0(int x) {
+        return (char) (x >>> 16);
+    }
+
+    /** Bits 0-15 of {@code x}. */
+    private static char int1(int x) {
+        return (char) x;
+    }
+
+    /**
+     * Compares two longs as unsigned 64 bit values.
+     *
+     * @return {@code true} if {@code n1} is strictly smaller than {@code n2} when both are read as unsigned
+     */
+    public static boolean isLessThanUnsigned(long n1, long n2) {
+        // flipping the sign bit maps unsigned order onto signed order
+        return (n1 + Long.MIN_VALUE) < (n2 + Long.MIN_VALUE);
+    }
+
+    /**
+     * A sorted set that never holds more than {@code capacity} elements: once full, a new element is only
+     * accepted if it is smaller than the current largest one, which is then evicted.
+     */
+    static class FixedSizeTreeSet<E extends Comparable<E>> extends TreeSet<E> {
+
+        private static final long serialVersionUID = -8237117170340299630L;
+        private final int capacity;
+
+        /** Creates a set bounded to 20 elements. */
+        FixedSizeTreeSet() {
+            this(20);
+        }
+
+        /** Creates a set bounded to {@code capacity} elements. */
+        FixedSizeTreeSet(int capacity) {
+            super();
+            this.capacity = capacity;
+        }
+
+        /**
+         * Adds the element if there is room, or if it is smaller than the largest element currently held.
+         *
+         * @return {@code true} if the set changed
+         */
+        @Override
+        public boolean add(final E toAdd) {
+            if (size() < capacity) {
+                return super.add(toAdd);
+            }
+            // full: anything not smaller than the current maximum is rejected
+            if (toAdd.compareTo(last()) >= 0) {
+                return false;
+            }
+            pollLast();
+            return super.add(toAdd);
+        }
+    }
+
+    /**
+     * Folds the given hashes, in order, into one 128 bit value: each half is multiplied by 37 and the
+     * matching half of the next hash is added.
+     */
+    private static LongPair combineOrdered(LongPair... hashCodes) {
+        long low = 0;
+        long high = 0;
+        for (LongPair part : hashCodes) {
+            low = low * 37 + part.val1;
+            high = high * 37 + part.val2;
+        }
+        LongPair combined = new LongPair();
+        combined.val1 = low;
+        combined.val2 = high;
+        return combined;
+    }
+
+    /** 128 bits of state, ordered as an unsigned 128 bit number with {@code val2} as the upper half. */
+    static final class LongPair implements Comparable<LongPair> {
+        public long val1;
+        public long val2;
+
+        /** Orders by {@code val2} first and {@code val1} second, both compared as unsigned values. */
+        @Override
+        public int compareTo(LongPair other) {
+            if (val2 != other.val2) {
+                return isLessThanUnsigned(val2, other.val2) ? -1 : 1;
+            }
+            if (val1 != other.val1) {
+                return isLessThanUnsigned(val1, other.val1) ? -1 : 1;
+            }
+            return 0;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) {
+                return true;
+            }
+            if (o == null || getClass() != o.getClass()) {
+                return false;
+            }
+            LongPair that = (LongPair) o;
+            return val2 == that.val2 && val1 == that.val1;
+        }
+
+        @Override
+        public int hashCode() {
+            int lowHash = (int) (val1 ^ (val1 >>> 32));
+            int highHash = (int) (val2 ^ (val2 >>> 32));
+            return 31 * lowHash + highHash;
+        }
+    }
+
+    /**
+     * Reads the eight bytes starting at {@code offset} as a long in little endian byte order, i.e.
+     * {@code buf[offset]} becomes the least significant byte.
+     */
+    private static long getLongLittleEndian(byte[] buf, int offset) {
+        long value = 0;
+        // start with the most significant byte and shift it up while the lower ones are appended
+        for (int index = 7; index >= 0; index--) {
+            value = (value << 8) | (buf[offset + index] & 0xffL);
+        }
+        return value;
+    }
+
+    /**
+     * Returns the MurmurHash3_x64_128 hash, placing the result in "out".
+     *
+     * @param key    the bytes to hash
+     * @param offset index of the first byte of {@code key} to hash
+     * @param len    number of bytes to hash
+     * @param seed   hash seed, treated as an unsigned 32 bit value
+     * @param out    receives the result: {@code val1} is set to h1 and {@code val2} to h2
+     */
+    @SuppressWarnings("fallthrough") // the huge switch is designed to use fall through into cases!
+    static void murmurhash3_x64_128(byte[] key, int offset, int len, int seed, LongPair out) {
+        // The original algorithm does have a 32 bit unsigned seed.
+        // We have to mask to match the behavior of the unsigned types and prevent sign extension.
+        long h1 = seed & 0x00000000FFFFFFFFL;
+        long h2 = seed & 0x00000000FFFFFFFFL;
+
+        final long c1 = 0x87c37b91114253d5L;
+        final long c2 = 0x4cf5ad432745937fL;
+
+        // body: mix the input 16 bytes (two little endian longs) at a time
+        int roundedEnd = offset + (len & 0xFFFFFFF0); // round down to 16 byte block
+        for (int i = offset; i < roundedEnd; i += 16) {
+            long k1 = getLongLittleEndian(key, i);
+            long k2 = getLongLittleEndian(key, i + 8);
+            k1 *= c1;
+            k1 = Long.rotateLeft(k1, 31);
+            k1 *= c2;
+            h1 ^= k1;
+            h1 = Long.rotateLeft(h1, 27);
+            h1 += h2;
+            h1 = h1 * 5 + 0x52dce729;
+            k2 *= c2;
+            k2 = Long.rotateLeft(k2, 33);
+            k2 *= c1;
+            h2 ^= k2;
+            h2 = Long.rotateLeft(h2, 31);
+            h2 += h1;
+            h2 = h2 * 5 + 0x38495ab5;
+        }
+
+        // tail: the remaining (len & 15) bytes; each case falls through to the ones below it,
+        // so bytes 8..14 are gathered into k2 and bytes 0..7 into k1
+        long k1 = 0;
+        long k2 = 0;
+
+        switch (len & 15) {
+            case 15:
+                k2 = (key[roundedEnd + 14] & 0xffL) << 48;
+            case 14:
+                k2 |= (key[roundedEnd + 13] & 0xffL) << 40;
+            case 13:
+                k2 |= (key[roundedEnd + 12] & 0xffL) << 32;
+            case 12:
+                k2 |= (key[roundedEnd + 11] & 0xffL) << 24;
+            case 11:
+                k2 |= (key[roundedEnd + 10] & 0xffL) << 16;
+            case 10:
+                k2 |= (key[roundedEnd + 9] & 0xffL) << 8;
+            case 9:
+                k2 |= (key[roundedEnd + 8] & 0xffL);
+                k2 *= c2;
+                k2 = Long.rotateLeft(k2, 33);
+                k2 *= c1;
+                h2 ^= k2;
+            case 8:
+                k1 = ((long) key[roundedEnd + 7]) << 56;
+            case 7:
+                k1 |= (key[roundedEnd + 6] & 0xffL) << 48;
+            case 6:
+                k1 |= (key[roundedEnd + 5] & 0xffL) << 40;
+            case 5:
+                k1 |= (key[roundedEnd + 4] & 0xffL) << 32;
+            case 4:
+                k1 |= (key[roundedEnd + 3] & 0xffL) << 24;
+            case 3:
+                k1 |= (key[roundedEnd + 2] & 0xffL) << 16;
+            case 2:
+                k1 |= (key[roundedEnd + 1] & 0xffL) << 8;
+            case 1:
+                k1 |= (key[roundedEnd] & 0xffL);
+                k1 *= c1;
+                k1 = Long.rotateLeft(k1, 31);
+                k1 *= c2;
+                h1 ^= k1;
+        }
+
+        // ----------
+        // finalization
+
+        h1 ^= len;
+        h2 ^= len;
+
+        h1 += h2;
+        h2 += h1;
+
+        h1 = fmix64(h1);
+        h2 = fmix64(h2);
+
+        h1 += h2;
+        h2 += h1;
+
+        out.val1 = h1;
+        out.val2 = h2;
+    }
+
+    /** Final mixing step applied to each 64 bit half: three xor-shifts by 33 with two multiplications in between. */
+    private static long fmix64(long k) {
+        long mixed = k;
+        mixed = (mixed ^ (mixed >>> 33)) * 0xff51afd7ed558ccdL;
+        mixed = (mixed ^ (mixed >>> 33)) * 0xc4ceb9fe1a85ec53L;
+        return mixed ^ (mixed >>> 33);
+    }
+}
Propchange:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/MinHashFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java?rev=1834326&view=auto
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
(added)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
Mon Jun 25 13:30:52 2018
@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util.fv;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames;
+import org.apache.jackrabbit.oak.plugins.index.lucene.PropertyDefinition;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static org.apache.lucene.search.BooleanClause.Occur.SHOULD;
+
+/**
+ * Utility methods for indexing and searching for similar feature vectors
+ */
+public class SimSearchUtils {
+
+ private static final Logger log =
LoggerFactory.getLogger(SimSearchUtils.class);
+
+    /**
+     * Decodes the given bytes into doubles and joins their string forms with single blanks.
+     *
+     * @param bytes the encoded doubles, 8 bytes each
+     * @return the blank separated doubles, or an empty string if no complete double is present
+     */
+    public static String toDoubleString(byte[] bytes) {
+        StringBuilder joined = new StringBuilder();
+        boolean first = true;
+        for (Double value : toDoubleArray(bytes)) {
+            if (!first) {
+                joined.append(' ');
+            }
+            joined.append(value);
+            first = false;
+        }
+        return joined.toString();
+    }
+
+    /** Array variant of {@link #toDoubles(byte[])}. */
+    private static Double[] toDoubleArray(byte[] array) {
+        return toDoubles(array).toArray(new Double[0]);
+    }
+
+    /**
+     * Decodes consecutive 8 byte blocks of the array into doubles, using the default byte order of
+     * {@link ByteBuffer}. Trailing bytes that do not fill a whole block are ignored.
+     *
+     * @param array the encoded doubles
+     * @return the decoded values, in array order
+     */
+    public static List<Double> toDoubles(byte[] array) {
+        final int bytesPerDouble = Double.SIZE / Byte.SIZE;
+        ByteBuffer buffer = ByteBuffer.wrap(array);
+        int count = array.length / bytesPerDouble;
+        List<Double> decoded = new ArrayList<>(count);
+        while (decoded.size() < count) {
+            // relative read: advances the buffer by one block
+            decoded.add(buffer.getDouble());
+        }
+        return decoded;
+    }
+
+    /**
+     * Runs the analyzer over the given text and collects the produced terms.
+     *
+     * @param analyzer         the analyzer to use
+     * @param field            the field name handed to the analyzer
+     * @param sampleTextString the text to analyze
+     * @return the terms in the order the analyzer emitted them
+     * @throws IOException if the token stream fails
+     */
+    private static Collection<String> getTokens(Analyzer analyzer, String field, String sampleTextString) throws IOException {
+        Collection<String> tokens = new LinkedList<>();
+        // try-with-resources closes the stream even when reset/incrementToken/end throws
+        try (TokenStream ts = analyzer.tokenStream(field, sampleTextString)) {
+            // look the attribute up once instead of once per token
+            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
+            ts.reset();
+            while (ts.incrementToken()) {
+                tokens.add(new String(term.buffer(), 0, term.length()));
+            }
+            ts.end();
+        }
+        return tokens;
+    }
+
+    /**
+     * Builds a query with one constant score SHOULD clause per token the analyzer produces for the text;
+     * at least 3 of the clauses have to match.
+     *
+     * @param analyzer  the analyzer used to tokenize {@code text}
+     * @param fieldName the field the term queries run against
+     * @param text      the text to turn into a query
+     * @throws IOException if analysis fails
+     */
+    static BooleanQuery getSimQuery(Analyzer analyzer, String fieldName, String text) throws IOException {
+        Collection<String> tokens = getTokens(analyzer, fieldName, text);
+
+        BooleanQuery simQuery = new BooleanQuery(true);
+        simQuery.setMinimumNumberShouldMatch(3);
+        for (String token : tokens) {
+            Term term = new Term(fieldName, token);
+            simQuery.add(new ConstantScoreQuery(new TermQuery(term)), SHOULD);
+        }
+        return simQuery;
+    }
+
+
+ public static byte[] toByteArray(List<Double> values) {
+ int blockSize = Double.SIZE / Byte.SIZE;
+ byte[] bytes = new byte[values.size() * blockSize];
+ for (int i = 0, j = 0; i < values.size(); i++, j += blockSize) {
+ ByteBuffer.wrap(bytes, j, blockSize).putDouble(values.get(i));
+ }
+ return bytes;
+ }
+
+ public static byte[] toByteArray(String value) {
+ List<Double> doubles = new LinkedList<>();
+ for (String dv : value.split(",")) {
+ doubles.add(Double.parseDouble(dv));
+ }
+ return toByteArray(doubles);
+ }
+
+ public static Query getSimilarityQuery(List<PropertyDefinition> sp,
IndexReader reader, String queryString) {
+ try {
+ log.debug("parsing similarity query on {}", queryString);
+ Query similarityQuery = null;
+ String text = null;
+ for (String param : queryString.split("&")) {
+ String[] keyValuePair = param.split("=");
+ if (keyValuePair.length != 2 || keyValuePair[0] == null ||
keyValuePair[1] == null) {
+ throw new RuntimeException("Unparsable native Lucene query
for fv similarity: " + queryString);
+ } else {
+ if ("stream.body".equals(keyValuePair[0])) {
+ text = keyValuePair[1];
+ break;
+ }
+ }
+ }
+
+ if (text != null && !sp.isEmpty()) {
+ log.debug("generating similarity query for {}", text);
+ BooleanQuery booleanQuery = new BooleanQuery(true);
+ LSHAnalyzer analyzer = new LSHAnalyzer();
+ IndexSearcher searcher = new IndexSearcher(reader);
+ TermQuery q = new TermQuery(new Term(FieldNames.PATH, text));
+ TopDocs top = searcher.search(q, 1);
+ if (top.totalHits > 0) {
+ ScoreDoc d = top.scoreDocs[0];
+ Document doc = reader.document(d.doc);
+ for (PropertyDefinition pd : sp) {
+ log.debug("adding similarity clause for property {}",
pd.name);
+ String similarityFieldName =
FieldNames.createSimilarityFieldName(pd.name);
+ String fvString = doc.get(similarityFieldName);
+ if (fvString != null && fvString.trim().length() > 0) {
+ String fieldName =
FieldNames.createSimilarityFieldName(pd.name);
+ log.trace("generating sim query on field {} and
text {}", fieldName, fvString);
+ BooleanQuery simQuery =
SimSearchUtils.getSimQuery(analyzer, fieldName, fvString);
+ booleanQuery.add(new BooleanClause(simQuery,
SHOULD));
+ log.trace("similarity query generated for {}",
pd.name);
+ }
+ }
+ }
+ if (booleanQuery.clauses().size() > 0) {
+ similarityQuery = booleanQuery;
+ log.trace("final similarity query is {}", similarityQuery);
+ }
+ }
+
+ return similarityQuery;
+ } catch (Exception e) {
+ throw new RuntimeException("could not handle similarity query " +
queryString);
+ }
+ }
+
+}
Propchange:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/TruncateTokenFilter.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/TruncateTokenFilter.java?rev=1834326&view=auto
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/TruncateTokenFilter.java
(added)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/TruncateTokenFilter.java
Mon Jun 25 13:30:52 2018
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util.fv;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+class TruncateTokenFilter extends TokenFilter {
+
+ private final CharTermAttribute termAttribute =
addAttribute(CharTermAttribute.class);
+ private final KeywordAttribute keywordAttr =
addAttribute(KeywordAttribute.class);
+
+ private final int length;
+
+ TruncateTokenFilter(TokenStream input, int length) {
+ super(input);
+ if (length < 1) {
+ throw new IllegalArgumentException("length parameter must be a
positive number: " + length);
+ }
+ this.length = length;
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (!keywordAttr.isKeyword() && termAttribute.length() > length) {
+ termAttribute.setLength(length);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+ }
\ No newline at end of file
Propchange:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/TruncateTokenFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/writer/IndexWriterUtils.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/writer/IndexWriterUtils.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/writer/IndexWriterUtils.java
(original)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/writer/IndexWriterUtils.java
Mon Jun 25 13:30:52 2018
@@ -20,12 +20,15 @@
package org.apache.jackrabbit.oak.plugins.index.lucene.writer;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.lucene.IndexDefinition;
import org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants;
+import org.apache.jackrabbit.oak.plugins.index.lucene.PropertyDefinition;
+import org.apache.jackrabbit.oak.plugins.index.lucene.util.fv.LSHAnalyzer;
import org.apache.jackrabbit.oak.plugins.index.lucene.util.SuggestHelper;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
@@ -51,6 +54,15 @@ public class IndexWriterUtils {
Analyzer definitionAnalyzer = definition.getAnalyzer();
Map<String, Analyzer> analyzers = new HashMap<String, Analyzer>();
analyzers.put(FieldNames.SPELLCHECK, new
ShingleAnalyzerWrapper(LuceneIndexConstants.ANALYZER, 3));
+ for (IndexDefinition.IndexingRule r :
definition.getDefinedRules()) {
+ List<PropertyDefinition> similarityProperties =
r.getSimilarityProperties();
+ for (PropertyDefinition pd : similarityProperties) {
+ if (pd.useInSimilarity) {
+
analyzers.put(FieldNames.createSimilarityFieldName(pd.name), new LSHAnalyzer());
+ }
+ }
+ }
+
if (!definition.isSuggestAnalyzed()) {
analyzers.put(FieldNames.SUGGEST, SuggestHelper.getAnalyzer());
}
Modified:
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
(original)
+++
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
Mon Jun 25 13:30:52 2018
@@ -19,66 +19,17 @@
package org.apache.jackrabbit.oak.plugins.index.lucene;
-import javax.annotation.Nonnull;
-import javax.jcr.PropertyType;
-
-import static com.google.common.collect.ImmutableSet.of;
-import static com.google.common.collect.Lists.newArrayList;
-import static java.util.Arrays.asList;
-import static org.apache.jackrabbit.JcrConstants.JCR_CONTENT;
-import static org.apache.jackrabbit.JcrConstants.JCR_DATA;
-import static org.apache.jackrabbit.JcrConstants.NT_FILE;
-import static org.apache.jackrabbit.oak.api.QueryEngine.NO_BINDINGS;
-import static org.apache.jackrabbit.oak.api.QueryEngine.NO_MAPPINGS;
-import static org.apache.jackrabbit.oak.api.Type.NAMES;
-import static org.apache.jackrabbit.oak.api.Type.STRING;
-import static org.apache.jackrabbit.oak.api.Type.STRINGS;
-import static
org.apache.jackrabbit.oak.plugins.index.IndexConstants.ASYNC_PROPERTY_NAME;
-import static
org.apache.jackrabbit.oak.plugins.index.IndexConstants.DECLARING_NODE_TYPES;
-import static
org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFINITIONS_NAME;
-import static
org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFINITIONS_NODE_TYPE;
-import static
org.apache.jackrabbit.oak.plugins.index.IndexConstants.QUERY_PATHS;
-import static
org.apache.jackrabbit.oak.plugins.index.IndexConstants.REINDEX_PROPERTY_NAME;
-import static
org.apache.jackrabbit.oak.plugins.index.IndexConstants.TYPE_PROPERTY_NAME;
-import static
org.apache.jackrabbit.oak.plugins.index.lucene.IndexDefinition.INDEX_DEFINITION_NODE;
-import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.ANALYZERS;
-import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.INCLUDE_PROPERTY_NAMES;
-import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.ORDERED_PROP_NAMES;
-import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.INDEX_ORIGINAL_TERM;
-import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROPDEF_PROP_NODE_NAME;
-import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_ANALYZED;
-import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NAME;
-import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NODE;
-import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NODE_SCOPE_INDEX;
-import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_PROPERTY_INDEX;
-import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_TYPE;
-import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.TIKA;
-import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexEditorTest.createCal;
-import static org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.child;
-import static
org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.newNodeAggregator;
-import static org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.useV2;
-import static
org.apache.jackrabbit.oak.plugins.index.property.OrderedIndex.OrderDirection;
-import static
org.apache.jackrabbit.oak.plugins.memory.PropertyStates.createProperty;
-import static org.apache.jackrabbit.oak.InitialContentHelper.INITIAL_CONTENT;
-import static
org.apache.jackrabbit.oak.spi.filter.PathFilter.PROP_EXCLUDED_PATHS;
-import static
org.apache.jackrabbit.oak.spi.filter.PathFilter.PROP_INCLUDED_PATHS;
-import static org.hamcrest.CoreMatchers.containsString;
-import static org.hamcrest.CoreMatchers.not;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertThat;
-import static org.junit.Assert.assertTrue;
-
+import java.io.ByteArrayInputStream;
import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.net.URI;
+import java.nio.charset.Charset;
import java.text.ParseException;
import java.util.Calendar;
+import java.util.Collection;
import java.util.Collections;
-import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
@@ -88,18 +39,21 @@ import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
+import javax.annotation.Nonnull;
+import javax.jcr.PropertyType;
+
import com.google.common.base.Charsets;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
-import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.CountingInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.jackrabbit.JcrConstants;
+import org.apache.jackrabbit.oak.InitialContent;
import org.apache.jackrabbit.oak.Oak;
import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.api.CommitFailedException;
@@ -120,13 +74,13 @@ import org.apache.jackrabbit.oak.plugins
import
org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider;
import
org.apache.jackrabbit.oak.plugins.index.lucene.directory.CopyOnReadDirectory;
import
org.apache.jackrabbit.oak.plugins.index.lucene.util.IndexDefinitionBuilder;
+import org.apache.jackrabbit.oak.plugins.index.lucene.util.fv.SimSearchUtils;
import org.apache.jackrabbit.oak.plugins.index.nodetype.NodeTypeIndexProvider;
import
org.apache.jackrabbit.oak.plugins.index.property.PropertyIndexEditorProvider;
import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob;
import org.apache.jackrabbit.oak.plugins.memory.MemoryNodeStore;
import org.apache.jackrabbit.oak.plugins.memory.PropertyStates;
import org.apache.jackrabbit.oak.plugins.nodetype.TypeEditorProvider;
-import org.apache.jackrabbit.oak.InitialContent;
import org.apache.jackrabbit.oak.plugins.nodetype.write.NodeTypeRegistry;
import org.apache.jackrabbit.oak.query.AbstractQueryTest;
import org.apache.jackrabbit.oak.spi.commit.CommitInfo;
@@ -147,6 +101,56 @@ import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
+import static com.google.common.collect.ImmutableSet.of;
+import static com.google.common.collect.Lists.newArrayList;
+import static java.util.Arrays.asList;
+import static org.apache.jackrabbit.JcrConstants.JCR_CONTENT;
+import static org.apache.jackrabbit.JcrConstants.JCR_DATA;
+import static org.apache.jackrabbit.JcrConstants.NT_FILE;
+import static org.apache.jackrabbit.oak.InitialContentHelper.INITIAL_CONTENT;
+import static org.apache.jackrabbit.oak.api.QueryEngine.NO_BINDINGS;
+import static org.apache.jackrabbit.oak.api.QueryEngine.NO_MAPPINGS;
+import static org.apache.jackrabbit.oak.api.Type.NAMES;
+import static org.apache.jackrabbit.oak.api.Type.STRING;
+import static org.apache.jackrabbit.oak.api.Type.STRINGS;
+import static
org.apache.jackrabbit.oak.plugins.index.IndexConstants.ASYNC_PROPERTY_NAME;
+import static
org.apache.jackrabbit.oak.plugins.index.IndexConstants.DECLARING_NODE_TYPES;
+import static
org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFINITIONS_NAME;
+import static
org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFINITIONS_NODE_TYPE;
+import static
org.apache.jackrabbit.oak.plugins.index.IndexConstants.QUERY_PATHS;
+import static
org.apache.jackrabbit.oak.plugins.index.IndexConstants.REINDEX_PROPERTY_NAME;
+import static
org.apache.jackrabbit.oak.plugins.index.IndexConstants.TYPE_PROPERTY_NAME;
+import static
org.apache.jackrabbit.oak.plugins.index.lucene.IndexDefinition.INDEX_DEFINITION_NODE;
+import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.ANALYZERS;
+import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.INCLUDE_PROPERTY_NAMES;
+import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.INDEX_ORIGINAL_TERM;
+import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.ORDERED_PROP_NAMES;
+import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROPDEF_PROP_NODE_NAME;
+import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_ANALYZED;
+import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NAME;
+import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NODE;
+import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NODE_SCOPE_INDEX;
+import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_PROPERTY_INDEX;
+import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_TYPE;
+import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.TIKA;
+import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexEditorTest.createCal;
+import static org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.child;
+import static
org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.newNodeAggregator;
+import static org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.useV2;
+import static
org.apache.jackrabbit.oak.plugins.index.property.OrderedIndex.OrderDirection;
+import static
org.apache.jackrabbit.oak.plugins.memory.PropertyStates.createProperty;
+import static
org.apache.jackrabbit.oak.spi.filter.PathFilter.PROP_EXCLUDED_PATHS;
+import static
org.apache.jackrabbit.oak.spi.filter.PathFilter.PROP_INCLUDED_PATHS;
+import static org.hamcrest.CoreMatchers.containsString;
+import static org.hamcrest.CoreMatchers.not;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
+
@SuppressWarnings("ArraysAsListWithZeroOrOneArgument")
public class LucenePropertyIndexTest extends AbstractQueryTest {
/**
@@ -2946,6 +2950,107 @@ public class LucenePropertyIndexTest ext
"lucene:test1(/oak:index/test1)", asList("/d"));
}
+    /**
+     * Stores the feature vectors from {@code fvs.csv} as binary {@code fv} properties below
+     * {@code /test} and checks that consecutive {@code similar()} queries, one per stored node,
+     * never return the same result list twice in a row.
+     */
+    @Test
+    public void testRepSimilarWithBinaryFeatureVectors() throws Exception {
+
+        IndexDefinitionBuilder idxb = new IndexDefinitionBuilder().noAsync();
+        idxb.indexRule("nt:base").property("fv").useInSimilarity().nodeScopeIndex().propertyIndex();
+
+        Tree idx = root.getTree("/").getChild("oak:index").addChild("test1");
+        idxb.build(idx);
+        root.commit();
+
+        Tree test = root.getTree("/").addChild("test");
+
+        URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
+        File file = new File(uri);
+
+        // read the fixture up front so the file handle is released before indexing starts
+        List<String> lines;
+        try (InputStream in = new FileInputStream(file)) {
+            lines = IOUtils.readLines(in, Charset.defaultCharset());
+        }
+
+        Collection<String> children = new LinkedList<>();
+        for (String line : lines) {
+            String[] split = line.split(",");
+            // column 0 is the node name, every following column is one vector component
+            List<Double> values = new LinkedList<>();
+            for (int i = 1; i < split.length; i++) {
+                values.add(Double.parseDouble(split[i]));
+            }
+
+            // the binary encoding must round-trip without loss
+            byte[] bytes = SimSearchUtils.toByteArray(values);
+            List<Double> actual = SimSearchUtils.toDoubles(bytes);
+            assertEquals(values, actual);
+
+            Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
+            String name = split[0];
+            Tree child = test.addChild(name);
+            child.setProperty("fv", blob, Type.BINARY);
+            // remember the node so that it is actually queried below
+            children.add(child.getPath());
+        }
+        root.commit();
+
+        // without any stored node the loop below would silently verify nothing
+        assertFalse(children.isEmpty());
+
+        // check that similarity changes across different feature vectors
+        List<String> baseline = new LinkedList<>();
+        for (String similarPath : children) {
+            String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";
+
+            Iterator<String> result = executeQuery(query, "JCR-SQL2").iterator();
+            List<String> current = new LinkedList<>();
+            while (result.hasNext()) {
+                current.add(result.next());
+            }
+            assertNotEquals(baseline, current);
+            baseline.clear();
+            baseline.addAll(current);
+        }
+
+    }
+
+    /**
+     * Stores the feature vectors from {@code fvs.csv} as string {@code fv} properties below
+     * {@code /test} and checks that consecutive {@code similar()} queries, one per stored node,
+     * never return the same result list twice in a row.
+     */
+    @Test
+    public void testRepSimilarWithStringFeatureVectors() throws Exception {
+
+        IndexDefinitionBuilder idxb = new IndexDefinitionBuilder().noAsync();
+        idxb.indexRule("nt:base").property("fv").useInSimilarity().nodeScopeIndex().propertyIndex();
+
+        Tree idx = root.getTree("/").getChild("oak:index").addChild("test1");
+        idxb.build(idx);
+        root.commit();
+
+        Tree test = root.getTree("/").addChild("test");
+
+        URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
+        File file = new File(uri);
+
+        // read the fixture up front so the file handle is released before indexing starts
+        List<String> lines;
+        try (InputStream in = new FileInputStream(file)) {
+            lines = IOUtils.readLines(in, Charset.defaultCharset());
+        }
+
+        Collection<String> children = new LinkedList<>();
+        for (String line : lines) {
+            // everything before the first comma is the node name, the rest is the vector
+            int i1 = line.indexOf(',');
+            String name = line.substring(0, i1);
+            String value = line.substring(i1 + 1);
+            Tree child = test.addChild(name);
+            child.setProperty("fv", value, Type.STRING);
+            children.add(child.getPath());
+        }
+        root.commit();
+
+        // without any stored node the loop below would silently verify nothing
+        assertFalse(children.isEmpty());
+
+        // check that similarity changes across different feature vectors
+        List<String> baseline = new LinkedList<>();
+        for (String similarPath : children) {
+            String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";
+
+            Iterator<String> result = executeQuery(query, "JCR-SQL2").iterator();
+            List<String> current = new LinkedList<>();
+            while (result.hasNext()) {
+                current.add(result.next());
+            }
+            assertNotEquals(baseline, current);
+            baseline.clear();
+            baseline.addAll(current);
+        }
+    }
+
private void assertPlanAndQuery(String query, String planExpectation,
List<String> paths) {
assertPlanAndQuery(query, planExpectation, paths, false);
}
Added:
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizerTest.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizerTest.java?rev=1834326&view=auto
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizerTest.java
(added)
+++
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizerTest.java
Mon Jun 25 13:30:52 2018
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util.fv;
+
+import java.io.StringReader;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Tests for {@link FVTokenizer}
+ */
+public class FVTokenizerTest {
+
+    @Test
+    public void testTokenizeWithSpaces() throws Exception {
+        assertFeatureTokens("0.10 0.20 0.30 0.40");
+    }
+
+    @Test
+    public void testTokenizeWithCommas() throws Exception {
+        assertFeatureTokens("0.10,0.20,0.30,0.40");
+    }
+
+    @Test
+    public void testTokenizeWithCommasAndSpaces() throws Exception {
+        assertFeatureTokens("0.10, 0.20, 0.30, 0.40");
+    }
+
+    /**
+     * Runs {@code input} through a {@link FVTokenizer} and asserts that the emitted tokens are
+     * exactly {@code 0.10}, {@code 0.20}, {@code 0.30} and {@code 0.40}, in that order.
+     * Comparing the complete lists also fails the test when too few tokens are produced.
+     *
+     * @param input the text handed to the tokenizer
+     * @throws Exception if the token stream cannot be consumed
+     */
+    private static void assertFeatureTokens(String input) throws Exception {
+        List<String> expectedTokens = new LinkedList<>();
+        expectedTokens.add("0.10");
+        expectedTokens.add("0.20");
+        expectedTokens.add("0.30");
+        expectedTokens.add("0.40");
+
+        List<String> actualTokens = new LinkedList<>();
+        // try-with-resources closes the stream even when consuming it throws
+        try (TokenStream stream = new FVTokenizer(Version.LUCENE_47, new StringReader(input))) {
+            CharTermAttribute charTermAttribute = stream.getAttribute(CharTermAttribute.class);
+            stream.reset();
+            while (stream.incrementToken()) {
+                actualTokens.add(new String(charTermAttribute.buffer(), 0, charTermAttribute.length()));
+            }
+            // complete the consumer workflow: reset, incrementToken, end, close
+            stream.end();
+        }
+        assertEquals(expectedTokens, actualTokens);
+    }
+
+}
\ No newline at end of file
Propchange:
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizerTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilterTest.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilterTest.java?rev=1834326&view=auto
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilterTest.java
(added)
+++
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilterTest.java
Mon Jun 25 13:30:52 2018
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util.fv;
+
+import java.io.StringReader;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Tests for {@link FeaturePositionTokenFilter}
+ */
+public class FeaturePositionTokenFilterTest {
+
+    /**
+     * Feeds four whitespace separated values through the filter and asserts that the emitted
+     * tokens are exactly {@code 1_0.10}, {@code 2_0.20}, {@code 3_0.30} and {@code 4_0.40}.
+     * Comparing the complete lists also fails the test when too few tokens are produced.
+     */
+    @Test
+    public void testFiltering() throws Exception {
+        List<String> expectedTokens = new LinkedList<>();
+        expectedTokens.add("1_0.10");
+        expectedTokens.add("2_0.20");
+        expectedTokens.add("3_0.30");
+        expectedTokens.add("4_0.40");
+
+        List<String> actualTokens = new LinkedList<>();
+        TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_47, new StringReader("0.10 0.20 0.30 0.40"));
+        // try-with-resources closes the filter even when consuming it throws
+        try (FeaturePositionTokenFilter filter = new FeaturePositionTokenFilter(stream)) {
+            CharTermAttribute charTermAttribute = filter.getAttribute(CharTermAttribute.class);
+            filter.reset();
+            while (filter.incrementToken()) {
+                actualTokens.add(new String(charTermAttribute.buffer(), 0, charTermAttribute.length()));
+            }
+            // complete the consumer workflow: reset, incrementToken, end, close
+            filter.end();
+        }
+        assertEquals(expectedTokens, actualTokens);
+    }
+
+}
\ No newline at end of file
Propchange:
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilterTest.java
------------------------------------------------------------------------------
svn:eol-style = native