util...

tommaso Mon, 25 Jun 2018 06:31:02 -0700

Author: tommaso
Date: Mon Jun 25 13:30:52 2018
New Revision: 1834326

URL: http://svn.apache.org/viewvc?rev=1834326&view=rev
Log:
OAK-7575 - Search over similar feature vectors


Added:
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizer.java
   (with props)
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilter.java
   (with props)
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzer.java
   (with props)
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/MinHashFilter.java
   (with props)
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
   (with props)
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/TruncateTokenFilter.java
   (with props)
    
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/
    
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizerTest.java
   (with props)
    
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilterTest.java
   (with props)
    
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzerTest.java
   (with props)
    
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/TruncateTokenFilterTest.java
   (with props)
    
jackrabbit/oak/trunk/oak-lucene/src/test/resources/org/apache/jackrabbit/oak/query/fvs.csv
Modified:
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldNames.java
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/PropertyDefinition.java
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/writer/IndexWriterUtils.java
    
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java
 Mon Jun 25 13:30:52 2018
@@ -16,15 +16,22 @@
  */
 package org.apache.jackrabbit.oak.plugins.index.lucene;
 
+import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collection;
 
 import com.google.common.primitives.Ints;
+import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.api.Type;
 import org.apache.jackrabbit.oak.commons.PathUtils;
+import org.apache.jackrabbit.oak.plugins.index.lucene.binary.BlobByteSource;
+import org.apache.jackrabbit.oak.plugins.index.lucene.util.fv.SimSearchUtils;
 import org.apache.jackrabbit.util.ISO8601;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.IntField;
+import org.apache.lucene.document.StoredField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 
@@ -103,6 +110,34 @@ public final class FieldFactory {
         return new StringField(name, value, NO);
     }
 
+    static Collection<Field> newSimilarityFields(String name, Blob value) 
throws IOException {
+        Collection<Field> fields = new ArrayList<>(1);
+        byte[] bytes = new BlobByteSource(value).read();
+//        fields.add(newBinarySimilarityField(name, bytes));
+        fields.add(newSimilarityField(name, bytes));
+        return fields;
+    }
+
+    static Collection<Field> newSimilarityFields(String name, String value) {
+        Collection<Field> fields = new ArrayList<>(1);
+//        byte[] bytes = SimSearchUtils.toByteArray(value);
+//        fields.add(newBinarySimilarityField(name, bytes));
+        fields.add(newSimilarityField(name, value));
+        return fields;
+    }
+
+    private static Field newSimilarityField(String name, byte[] bytes) {
+        return newSimilarityField(name, SimSearchUtils.toDoubleString(bytes));
+    }
+
+    private static Field newSimilarityField(String name, String value) {
+        return new TextField(FieldNames.createSimilarityFieldName(name), 
value, Field.Store.YES);
+    }
+
+    private static StoredField newBinarySimilarityField(String name, byte[] 
bytes) {
+        return new StoredField(FieldNames.createBinSimilarityFieldName(name), 
bytes);
+    }
+
     public static Field newFulltextField(String value) {
         return newFulltextField(value, false);
     }

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldNames.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldNames.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldNames.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldNames.java
 Mon Jun 25 13:30:52 2018
@@ -70,6 +70,16 @@ public final class FieldNames {
     public static final String ANALYZED_FIELD_PREFIX = "full:";
 
     /**
+     * Prefix for all field names that contain the similarity search indexed tokens.
+     */
+    private static final String SIMILARITY_PREFIX = "sim:";
+
+    /**
+     * Prefix for all field names that contain the similarity search binary 
values.
+     */
+    private static final String SIMILARITY_BINARY_PREFIX = "simbin:";
+
+    /**
      * Prefix used for storing fulltext of relative node
      */
     public static final String FULLTEXT_RELATIVE_NODE = "fullnode:";
@@ -138,4 +148,12 @@ public final class FieldNames {
                 && !field.startsWith(":")
                 && !field.endsWith("_facet");
     }
+
+    public static String createBinSimilarityFieldName(String name) {
+        return SIMILARITY_BINARY_PREFIX + name;
+    }
+
+    public static String createSimilarityFieldName(String name) {
+        return SIMILARITY_PREFIX + name;
+    }
 }

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java
 Mon Jun 25 13:30:52 2018
@@ -911,6 +911,7 @@ public final class IndexDefinition imple
         private final List<PropertyDefinition> notNullCheckEnabledProperties;
         private final List<PropertyDefinition> nodeScopeAnalyzedProps;
         private final List<PropertyDefinition> syncProps;
+        private final List<PropertyDefinition> similarityProperties;
         private final boolean indexesAllNodesOfMatchingType;
         private final boolean nodeNameIndexed;
 
@@ -925,6 +926,7 @@ public final class IndexDefinition imple
         final Aggregate propAggregate;
 
 
+
         IndexingRule(String nodeTypeName, NodeState config) {
             this.nodeTypeName = nodeTypeName;
             this.baseNodeType = nodeTypeName;
@@ -938,9 +940,10 @@ public final class IndexDefinition imple
             List<PropertyDefinition> existentProperties = newArrayList();
             List<PropertyDefinition> nodeScopeAnalyzedProps = newArrayList();
             List<PropertyDefinition> syncProps = newArrayList();
+            List<PropertyDefinition> similarityProperties = newArrayList();
             List<Aggregate.Include> propIncludes = newArrayList();
             this.propConfigs = collectPropConfigs(config, namePatterns, 
propIncludes, nonExistentProperties,
-                    existentProperties, nodeScopeAnalyzedProps, 
functionRestrictions, syncProps);
+                    existentProperties, nodeScopeAnalyzedProps, 
functionRestrictions, syncProps, similarityProperties);
             this.propAggregate = new Aggregate(nodeTypeName, propIncludes);
             this.aggregate = combine(propAggregate, nodeTypeName);
 
@@ -949,6 +952,7 @@ public final class IndexDefinition imple
             this.nullCheckEnabledProperties = 
ImmutableList.copyOf(nonExistentProperties);
             this.functionRestrictions = 
ImmutableList.copyOf(functionRestrictions);
             this.notNullCheckEnabledProperties = 
ImmutableList.copyOf(existentProperties);
+            this.similarityProperties = 
ImmutableList.copyOf(similarityProperties);
             this.fulltextEnabled = aggregate.hasNodeAggregates() || 
hasAnyFullTextEnabledProperty();
             this.nodeFullTextIndexed = aggregate.hasNodeAggregates() || 
anyNodeScopeIndexedProperty();
             this.propertyIndexEnabled = hasAnyPropertyIndexConfigured();
@@ -985,6 +989,7 @@ public final class IndexDefinition imple
             this.indexesAllNodesOfMatchingType = 
areAlMatchingNodeByTypeIndexed();
             this.nodeNameIndexed = original.nodeNameIndexed;
             this.syncProps = original.syncProps;
+            this.similarityProperties = original.similarityProperties;
         }
 
         /**
@@ -1032,6 +1037,10 @@ public final class IndexDefinition imple
             return nodeScopeAnalyzedProps;
         }
 
+        public List<PropertyDefinition> getSimilarityProperties() {
+            return similarityProperties;
+        }
+
         @Override
         public String toString() {
             String str = "IndexRule: "+ nodeTypeName;
@@ -1153,7 +1162,8 @@ public final class IndexDefinition imple
                                                                    
List<PropertyDefinition> existentProperties,
                                                                    
List<PropertyDefinition> nodeScopeAnalyzedProps,
                                                                    
List<PropertyDefinition> functionRestrictions,
-                                                                   
List<PropertyDefinition> syncProps) {
+                                                                   
List<PropertyDefinition> syncProps,
+                                                                   
List<PropertyDefinition> similarityProperties) {
             Map<String, PropertyDefinition> propDefns = newHashMap();
             NodeState propNode = 
config.getChildNode(LuceneIndexConstants.PROP_NODE);
 
@@ -1232,6 +1242,9 @@ public final class IndexDefinition imple
                     if (pd.sync) {
                         syncProps.add(pd);
                     }
+                    if (pd.useInSimilarity) {
+                        similarityProperties.add(pd);
+                    }
                 }
             }
             ensureNodeTypeIndexingIsConsistent(propDefns, syncProps);

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
 Mon Jun 25 13:30:52 2018
@@ -62,6 +62,7 @@ import static org.apache.jackrabbit.oak.
 import static 
org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newFulltextField;
 import static 
org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newPathField;
 import static 
org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newPropertyField;
+import static 
org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newSimilarityFields;
 import static 
org.apache.jackrabbit.oak.plugins.index.lucene.util.ConfigUtil.getPrimaryTypeName;
 
 public class LuceneDocumentMaker {
@@ -253,7 +254,15 @@ public class LuceneDocumentMaker {
         boolean includeTypeForFullText = 
indexingRule.includePropertyType(property.getType().tag());
 
         boolean dirty = false;
-        if (Type.BINARY.tag() == property.getType().tag()
+        if (Type.BINARY.tag() == property.getType().tag() && 
pd.useInSimilarity) {
+            try {
+                log.trace("indexing similarity binaries for {}", pd.name);
+                fields.addAll(newSimilarityFields(pd.name, 
property.getValue(Type.BINARY)));
+                dirty = true;
+            } catch (Exception e) {
+                log.error("could not index similarity field for property {} 
and definition {}", property, pd);
+            }
+        } else if (Type.BINARY.tag() == property.getType().tag()
                 && includeTypeForFullText) {
             fields.addAll(newBinary(property, state, null, path + "@" + 
pname));
             dirty = true;
@@ -285,10 +294,17 @@ public class LuceneDocumentMaker {
                     if (pd.nodeScopeIndex) {
                         Field field = newFulltextField(value);
                         fields.add(field);
+                        if (pd.useInSimilarity) {
+                            log.trace("indexing similarity strings for {}", 
pd.name);
+                            fields.addAll(newSimilarityFields(pd.name, 
value)); // fallback for when feature vectors are written in string typed 
properties
+                       }
                     }
+
+
                     dirty = true;
                 }
             }
+
             if (pd.facet && isFacetingEnabled()) {
                 dirty |= addFacetFields(fields, property, pname, pd);
             }

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java
 Mon Jun 25 13:30:52 2018
@@ -305,6 +305,11 @@ public interface LuceneIndexConstants {
     String PROP_USE_IN_SPELLCHECK = "useInSpellcheck";
 
     /**
+     * Whether to use this property's values for similarity search.
+     */
+    String PROP_USE_IN_SIMILARITY = "useInSimilarity";
+
+    /**
      * Property definition config indicating that null check support should be
      * enabled for this property
      */

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
 Mon Jun 25 13:30:52 2018
@@ -18,10 +18,6 @@
  */
 package org.apache.jackrabbit.oak.plugins.index.lucene;
 
-import javax.annotation.CheckForNull;
-import javax.annotation.Nonnull;
-import javax.annotation.Nullable;
-import javax.jcr.PropertyType;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -33,7 +29,11 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicReference;
-import java.util.stream.Collectors;
+
+import javax.annotation.CheckForNull;
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+import javax.jcr.PropertyType;
 
 import com.google.common.base.Joiner;
 import com.google.common.collect.AbstractIterator;
@@ -52,6 +52,8 @@ import org.apache.jackrabbit.oak.commons
 import org.apache.jackrabbit.oak.commons.PerfLogger;
 import org.apache.jackrabbit.oak.commons.json.JsopBuilder;
 import org.apache.jackrabbit.oak.commons.json.JsopWriter;
+import org.apache.jackrabbit.oak.plugins.index.Cursors;
+import org.apache.jackrabbit.oak.plugins.index.Cursors.PathCursor;
 import 
org.apache.jackrabbit.oak.plugins.index.lucene.IndexDefinition.IndexingRule;
 import org.apache.jackrabbit.oak.plugins.index.lucene.IndexPlanner.PlanResult;
 import 
org.apache.jackrabbit.oak.plugins.index.lucene.IndexPlanner.PropertyIndexResult;
@@ -63,16 +65,9 @@ import org.apache.jackrabbit.oak.plugins
 import 
org.apache.jackrabbit.oak.plugins.index.lucene.util.PathStoredFieldVisitor;
 import org.apache.jackrabbit.oak.plugins.index.lucene.util.SpellcheckHelper;
 import org.apache.jackrabbit.oak.plugins.index.lucene.util.SuggestHelper;
+import org.apache.jackrabbit.oak.plugins.index.lucene.util.fv.SimSearchUtils;
 import org.apache.jackrabbit.oak.plugins.memory.PropertyValues;
-import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextAnd;
-import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextContains;
-import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextExpression;
-import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextOr;
-import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextTerm;
-import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextVisitor;
 import org.apache.jackrabbit.oak.spi.query.Cursor;
-import org.apache.jackrabbit.oak.plugins.index.Cursors;
-import org.apache.jackrabbit.oak.plugins.index.Cursors.PathCursor;
 import org.apache.jackrabbit.oak.spi.query.Filter;
 import org.apache.jackrabbit.oak.spi.query.Filter.PropertyRestriction;
 import org.apache.jackrabbit.oak.spi.query.IndexRow;
@@ -80,6 +75,12 @@ import org.apache.jackrabbit.oak.spi.que
 import org.apache.jackrabbit.oak.spi.query.QueryIndex;
 import 
org.apache.jackrabbit.oak.spi.query.QueryIndex.AdvanceFulltextQueryIndex;
 import org.apache.jackrabbit.oak.spi.query.QueryLimits;
+import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextAnd;
+import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextContains;
+import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextExpression;
+import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextOr;
+import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextTerm;
+import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextVisitor;
 import org.apache.jackrabbit.oak.spi.state.NodeState;
 import org.apache.jackrabbit.oak.spi.state.NodeStateUtils;
 import org.apache.lucene.analysis.Analyzer;
@@ -150,7 +151,9 @@ import static org.apache.jackrabbit.oak.
 import static org.apache.jackrabbit.oak.spi.query.QueryConstants.REP_EXCERPT;
 import static 
org.apache.jackrabbit.oak.spi.query.QueryIndex.AdvancedQueryIndex;
 import static org.apache.jackrabbit.oak.spi.query.QueryIndex.NativeQueryIndex;
-import static org.apache.lucene.search.BooleanClause.Occur.*;
+import static org.apache.lucene.search.BooleanClause.Occur.MUST;
+import static org.apache.lucene.search.BooleanClause.Occur.MUST_NOT;
+import static org.apache.lucene.search.BooleanClause.Occur.SHOULD;
 
 /**
  * Provides a QueryIndex that does lookups against a Lucene-based index
@@ -917,9 +920,20 @@ public class LucenePropertyIndex impleme
             if (query.startsWith("mlt?")) {
                 String mltQueryString = query.replace("mlt?", "");
                 if (reader != null) {
-                    Query moreLikeThis = 
MoreLikeThisHelper.getMoreLikeThis(reader, analyzer, mltQueryString);
-                    if (moreLikeThis != null) {
-                        qs.add(moreLikeThis);
+                    List<PropertyDefinition> sp = new LinkedList<>();
+                    for (IndexingRule r : defn.getDefinedRules()) {
+                        sp.addAll(r.getSimilarityProperties());
+                    }
+                    if (sp.isEmpty()) {
+                        Query moreLikeThis = 
MoreLikeThisHelper.getMoreLikeThis(reader, analyzer, mltQueryString);
+                        if (moreLikeThis != null) {
+                            qs.add(moreLikeThis);
+                        }
+                    } else {
+                        Query similarityQuery = 
SimSearchUtils.getSimilarityQuery(sp, reader, mltQueryString);
+                        if (similarityQuery != null) {
+                            qs.add(similarityQuery);
+                        }
                     }
                 }
             } else if (query.startsWith("spellcheck?")) {

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/PropertyDefinition.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/PropertyDefinition.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/PropertyDefinition.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/PropertyDefinition.java
 Mon Jun 25 13:30:52 2018
@@ -58,7 +58,7 @@ public class PropertyDefinition {
      * property etc then it should be defined via 'name' property in NodeState.
      * In such case NodeState name can be set to anything
      */
-    final String name;
+    public final String name;
 
     private final int propertyType;
     /**
@@ -123,7 +123,9 @@ public class PropertyDefinition {
 
     public final boolean unique;
 
-    public PropertyDefinition(IndexingRule idxDefn, String nodeName, NodeState 
defn) {
+    public boolean useInSimilarity;
+
+  public PropertyDefinition(IndexingRule idxDefn, String nodeName, NodeState 
defn) {
         this.isRegexp = getOptionalValue(defn, PROP_IS_REGEX, false);
         this.name = getName(defn, nodeName);
         this.relative = isRelativeProperty(name);
@@ -151,6 +153,7 @@ public class PropertyDefinition {
         this.propertyType = getPropertyType(idxDefn, nodeName, defn);
         this.useInSuggest = getOptionalValueIfIndexed(defn, 
LuceneIndexConstants.PROP_USE_IN_SUGGEST, false);
         this.useInSpellcheck = getOptionalValueIfIndexed(defn, 
LuceneIndexConstants.PROP_USE_IN_SPELLCHECK, false);
+        this.useInSimilarity = getOptionalValueIfIndexed(defn, 
LuceneIndexConstants.PROP_USE_IN_SIMILARITY, false);
         this.nullCheckEnabled = getOptionalValueIfIndexed(defn, 
LuceneIndexConstants.PROP_NULL_CHECK_ENABLED, false);
         this.notNullCheckEnabled = getOptionalValueIfIndexed(defn, 
LuceneIndexConstants.PROP_NOT_NULL_CHECK_ENABLED, false);
         this.excludeFromAggregate = getOptionalValueIfIndexed(defn, 
LuceneIndexConstants.PROP_EXCLUDE_FROM_AGGREGATE, false);

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java
 Mon Jun 25 13:30:52 2018
@@ -319,6 +319,11 @@ public final class IndexDefinitionBuilde
             return this;
         }
 
+        public PropertyRule useInSimilarity() {
+            propTree.setProperty(LuceneIndexConstants.PROP_USE_IN_SIMILARITY, 
true);
+            return this;
+        }
+
         public PropertyRule type(String type){
             //This would throw an IAE if type is invalid
             PropertyType.valueFromName(type);

Added: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizer.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizer.java?rev=1834326&view=auto
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizer.java
 (added)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizer.java
 Mon Jun 25 13:30:52 2018
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util.fv;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.util.CharTokenizer;
+import org.apache.lucene.util.Version;
+
+class FVTokenizer extends CharTokenizer {
+    FVTokenizer(Version matchVersion, Reader input) {
+      super(matchVersion, input);
+    }
+
+    @Override
+    protected boolean isTokenChar(int c) {
+      char c1 = Character.toChars(c)[0];
+      return c1 != ',' && !Character.isWhitespace(c);
+    }
+  }
\ No newline at end of file

Propchange: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilter.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilter.java?rev=1834326&view=auto
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilter.java
 (added)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilter.java
 Mon Jun 25 13:30:52 2018
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util.fv;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+final class FeaturePositionTokenFilter extends TokenFilter {
+
+    private final CharTermAttribute termAttribute = 
addAttribute(CharTermAttribute.class);
+    private int tokenCount = 0;
+
+    FeaturePositionTokenFilter(TokenStream stream) {
+      super(stream);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        tokenCount++;
+        String token = new String(termAttribute.buffer(), 0, 
termAttribute.length());
+        termAttribute.setEmpty();
+        termAttribute.append(String.valueOf(tokenCount));
+        termAttribute.append("_");
+        termAttribute.append(token);
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    tokenCount = 0;
+  }
+
+  }
\ No newline at end of file

Propchange: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzer.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzer.java?rev=1834326&view=auto
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzer.java
 (added)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzer.java
 Mon Jun 25 13:30:52 2018
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util.fv;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.util.Version;
+
+/**
+ * {@link Analyzer} for LSH search
+ */
+public class LSHAnalyzer extends Analyzer {
+
+  private static final int DEFAULT_SHINGLE_SIZE = 4;
+
+  private final int min;
+  private final int max;
+  private final int hashCount;
+  private final int bucketCount;
+  private final int hashSetSize;
+
+  private LSHAnalyzer(int min, int max, int hashCount, int bucketCount, int 
hashSetSize) {
+    super();
+    this.min = min;
+    this.max = max;
+    this.hashCount = hashCount;
+    this.bucketCount = bucketCount;
+    this.hashSetSize = hashSetSize;
+  }
+
+  public LSHAnalyzer() {
+    this(DEFAULT_SHINGLE_SIZE, DEFAULT_SHINGLE_SIZE, 
MinHashFilter.DEFAULT_HASH_COUNT, MinHashFilter.DEFAULT_BUCKET_COUNT, 
MinHashFilter.DEFAULT_HASH_SET_SIZE);
+  }
+
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName, Reader 
reader) {
+    Tokenizer source = new FVTokenizer(Version.LUCENE_47, reader);
+    TokenFilter truncate = new TruncateTokenFilter(source, 5);
+    TokenFilter featurePos = new FeaturePositionTokenFilter(truncate);
+    ShingleFilter shingleFilter = new ShingleFilter(featurePos, min, max);
+    shingleFilter.setTokenSeparator(" ");
+    shingleFilter.setOutputUnigrams(false);
+    shingleFilter.setOutputUnigramsIfNoShingles(false);
+    TokenStream filter = new MinHashFilter(shingleFilter, hashCount, 
bucketCount, hashSetSize, bucketCount > 1);
+    return new TokenStreamComponents(source, filter);
+  }
+
+}

Propchange: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/MinHashFilter.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/MinHashFilter.java?rev=1834326&view=auto
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/MinHashFilter.java
 (added)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/MinHashFilter.java
 Mon Jun 25 13:30:52 2018
@@ -0,0 +1,514 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util.fv;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.TreeSet;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
+/**
+ * Generate min hash tokens from an incoming stream of tokens. The incoming tokens would typically be 5 word shingles.
+ * <p>
+ * The number of hashes used and the number of minimum values for each hash can be set. You could have 1 hash and keep
+ * the 100 lowest values or 100 hashes and keep the lowest one for each. Hashes can also be bucketed in ranges over the
+ * 128-bit hash space.
+ * <p>
+ * A 128-bit hash is used internally. 5 word shingles from 10e5 words generate 10e25 combinations, so a 64 bit hash
+ * would have collisions (1.8e19).
+ * <p>
+ * When using different hashes 32 bits are used for the hash position leaving scope for 8e28 unique hashes. A single
+ * hash will use all 128 bits.
+ * <p>
+ * The whole input is consumed on the first call to {@link #incrementToken()}; the retained minimum hashes are then
+ * emitted one per call, ordered by hash position and bucket.
+ */
+class MinHashFilter extends TokenFilter {
+  private static final int HASH_CACHE_SIZE = 512;
+
+  /** Pre-computed murmur hashes of the integers {@code 0..HASH_CACHE_SIZE-1}, used to derive hash variants. */
+  private static final LongPair[] cachedIntHashes = new LongPair[HASH_CACHE_SIZE];
+
+  public static final int DEFAULT_HASH_COUNT = 1;
+
+  public static final int DEFAULT_HASH_SET_SIZE = 1;
+
+  public static final int DEFAULT_BUCKET_COUNT = 512;
+
+  private static final String MIN_HASH_TYPE = "MIN_HASH";
+
+  /** Retained minimum hashes, indexed by hash position and then by bucket. */
+  private final List<List<FixedSizeTreeSet<LongPair>>> minHashSets;
+
+  private final int hashSetSize;
+
+  private final int bucketCount;
+
+  private final int hashCount;
+
+  /** Width of one bucket over the 32-bit range used for bucketing, rounded up. */
+  private final long bucketSize;
+
+  private final boolean withRotation;
+
+  private boolean requiresInitialisation = true;
+
+  private State endState;
+
+  private int hashPosition = -1;
+
+  private int bucketPosition = -1;
+
+  private int endOffset;
+
+  private boolean exhausted = false;
+
+  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
+  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);
+
+  static {
+    for (int i = 0; i < HASH_CACHE_SIZE; i++) {
+      cachedIntHashes[i] = new LongPair();
+      murmurhash3_x64_128(getBytes(i), 0, 4, 0, cachedIntHashes[i]);
+    }
+  }
+
+  /** Returns the 4 bytes of {@code i}, most significant byte first. */
+  static byte[] getBytes(int i) {
+    byte[] answer = new byte[4];
+    answer[3] = (byte) (i);
+    answer[2] = (byte) (i >> 8);
+    answer[1] = (byte) (i >> 16);
+    answer[0] = (byte) (i >> 24);
+    return answer;
+  }
+
+  /**
+   * create a MinHash filter
+   *
+   * @param input the token stream
+   * @param hashCount the no. of hashes
+   * @param bucketCount the no. of buckets for hashing
+   * @param hashSetSize the no. of min hashes to keep
+   * @param withRotation whether rotate or not hashes while incrementing tokens
+   * @throws IllegalArgumentException if {@code hashCount}, {@code bucketCount} or {@code hashSetSize} is not
+   *         greater than zero
+   */
+  public MinHashFilter(TokenStream input, int hashCount, int bucketCount, int hashSetSize, boolean withRotation) {
+    super(input);
+    if (hashCount <= 0) {
+      throw new IllegalArgumentException("hashCount must be greater than zero");
+    }
+    if (bucketCount <= 0) {
+      throw new IllegalArgumentException("bucketCount must be greater than zero");
+    }
+    if (hashSetSize <= 0) {
+      throw new IllegalArgumentException("hashSetSize must be greater than zero");
+    }
+    this.hashCount = hashCount;
+    this.bucketCount = bucketCount;
+    this.hashSetSize = hashSetSize;
+    this.withRotation = withRotation;
+    // Round the bucket width up so that every 32-bit value maps to an index below bucketCount.
+    long size = (1L << 32) / bucketCount;
+    if ((1L << 32) % bucketCount != 0) {
+      size++;
+    }
+    this.bucketSize = size;
+    minHashSets = new ArrayList<>(this.hashCount);
+    for (int i = 0; i < this.hashCount; i++) {
+      ArrayList<FixedSizeTreeSet<LongPair>> buckets = new ArrayList<>(this.bucketCount);
+      minHashSets.add(buckets);
+      for (int j = 0; j < this.bucketCount; j++) {
+        FixedSizeTreeSet<LongPair> minSet = new FixedSizeTreeSet<>(this.hashSetSize);
+        buckets.add(minSet);
+      }
+    }
+    doReset();
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    // Pull the underlying stream of tokens
+    // Hash each token found
+    // Generate the required number of variants of this hash
+    // Keep the minimum hash value found so far of each variant
+
+    int positionIncrement = 0;
+    if (requiresInitialisation) {
+      requiresInitialisation = false;
+      boolean found = false;
+      // First time through so we pull and hash everything
+      while (input.incrementToken()) {
+        found = true;
+        String current = new String(termAttribute.buffer(), 0, termAttribute.length());
+
+        // The base hash depends only on the term, so compute it once per token
+        // instead of once per hash variant.
+        byte[] bytes = current.getBytes("UTF-16LE");
+        LongPair hash = new LongPair();
+        murmurhash3_x64_128(bytes, 0, bytes.length, 0, hash);
+
+        for (int i = 0; i < hashCount; i++) {
+          LongPair rehashed = combineOrdered(hash, getIntHash(i));
+          // The upper 32 bits of val2 select the bucket.
+          minHashSets.get(i).get((int) ((rehashed.val2 >>> 32) / bucketSize)).add(rehashed);
+        }
+        endOffset = offsetAttribute.endOffset();
+      }
+      exhausted = true;
+      input.end();
+      // We need the end state so an underlying shingle filter can have its state restored correctly.
+      endState = captureState();
+      if (!found) {
+        return false;
+      }
+
+      // Only the first emitted token advances the position; the rest are stacked on it.
+      positionIncrement = 1;
+      // fix up any wrap around bucket values. ...
+      if (withRotation && (hashSetSize == 1)) {
+        for (int hashLoop = 0; hashLoop < hashCount; hashLoop++) {
+          for (int bucketLoop = 0; bucketLoop < bucketCount; bucketLoop++) {
+            if (minHashSets.get(hashLoop).get(bucketLoop).size() == 0) {
+              // Fill an empty bucket with the minimum of the next non-empty bucket, wrapping around.
+              for (int bucketOffset = 1; bucketOffset < bucketCount; bucketOffset++) {
+                if (minHashSets.get(hashLoop).get((bucketLoop + bucketOffset) % bucketCount).size() > 0) {
+                  LongPair replacementHash = minHashSets.get(hashLoop).get((bucketLoop + bucketOffset) % bucketCount)
+                      .first();
+                  minHashSets.get(hashLoop).get(bucketLoop).add(replacementHash);
+                  break;
+                }
+              }
+            }
+          }
+        }
+      }
+
+    }
+
+    clearAttributes();
+
+    // Resume the (hashPosition, bucketPosition) scan where the previous call stopped; -1 means "not started".
+    while (hashPosition < hashCount) {
+      if (hashPosition == -1) {
+        hashPosition++;
+      } else {
+        while (bucketPosition < bucketCount) {
+          if (bucketPosition == -1) {
+            bucketPosition++;
+          } else {
+            LongPair hash = minHashSets.get(hashPosition).get(bucketPosition).pollFirst();
+            if (hash != null) {
+              // The term is always 8 chars: with several hashes 2 chars of hash position + 96 hash bits,
+              // with a single hash the full 128 hash bits.
+              termAttribute.setEmpty();
+              if (hashCount > 1) {
+                termAttribute.append(int0(hashPosition));
+                termAttribute.append(int1(hashPosition));
+              }
+              long high = hash.val2;
+              termAttribute.append(long0(high));
+              termAttribute.append(long1(high));
+              termAttribute.append(long2(high));
+              termAttribute.append(long3(high));
+              long low = hash.val1;
+              termAttribute.append(long0(low));
+              termAttribute.append(long1(low));
+              if (hashCount == 1) {
+                termAttribute.append(long2(low));
+                termAttribute.append(long3(low));
+              }
+              posIncAttribute.setPositionIncrement(positionIncrement);
+              offsetAttribute.setOffset(0, endOffset);
+              typeAttribute.setType(MIN_HASH_TYPE);
+              posLenAttribute.setPositionLength(1);
+              return true;
+            } else {
+              bucketPosition++;
+            }
+          }
+
+        }
+        bucketPosition = -1;
+        hashPosition++;
+      }
+    }
+    return false;
+  }
+
+  /** Returns the murmur hash of {@code i}, served from the static cache when {@code i} is small enough. */
+  private static LongPair getIntHash(int i) {
+    if (i < HASH_CACHE_SIZE) {
+      return cachedIntHashes[i];
+    } else {
+      LongPair answer = new LongPair();
+      murmurhash3_x64_128(getBytes(i), 0, 4, 0, answer);
+      return answer;
+    }
+  }
+
+  @Override
+  public void end() throws IOException {
+    // input.end() has already been called by incrementToken() once the input was drained.
+    if (!exhausted) {
+      input.end();
+    }
+
+    restoreState(endState);
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    doReset();
+  }
+
+  /** Clears all retained hashes and rewinds the emission cursor so the filter can consume a new stream. */
+  private void doReset() {
+    for (int i = 0; i < hashCount; i++) {
+      for (int j = 0; j < bucketCount; j++) {
+        minHashSets.get(i).get(j).clear();
+      }
+    }
+    endState = null;
+    hashPosition = -1;
+    bucketPosition = -1;
+    requiresInitialisation = true;
+    exhausted = false;
+  }
+
+  // The helpers below slice a long / int into 16-bit chars, most significant first.
+
+  private static char long0(long x) {
+    return (char) (x >> 48);
+  }
+
+  private static char long1(long x) {
+    return (char) (x >> 32);
+  }
+
+  private static char long2(long x) {
+    return (char) (x >> 16);
+  }
+
+  private static char long3(long x) {
+    return (char) (x);
+  }
+
+  private static char int0(int x) {
+    return (char) (x >> 16);
+  }
+
+  private static char int1(int x) {
+    return (char) (x);
+  }
+
+  /** Returns whether {@code n1} is smaller than {@code n2} when both are read as unsigned 64-bit values. */
+  public static boolean isLessThanUnsigned(long n1, long n2) {
+    return (n1 < n2) ^ ((n1 < 0) != (n2 < 0));
+  }
+
+  /**
+   * A {@link TreeSet} that retains at most {@code capacity} elements, keeping the smallest ones.
+   */
+  static class FixedSizeTreeSet<E extends Comparable<E>> extends TreeSet<E> {
+
+    private static final long serialVersionUID = -8237117170340299630L;
+    private final int capacity;
+
+    FixedSizeTreeSet() {
+      this(20);
+    }
+
+    FixedSizeTreeSet(int capacity) {
+      super();
+      this.capacity = capacity;
+    }
+
+    /**
+     * Adds {@code toAdd} if there is room, or if it is smaller than the current largest element (which is then
+     * evicted).
+     *
+     * @return {@code true} if the set changed
+     */
+    @Override
+    public boolean add(final E toAdd) {
+      if (capacity <= size()) {
+        final E lastElm = last();
+        if (toAdd.compareTo(lastElm) >= 0) {
+          return false;
+        }
+        if (contains(toAdd)) {
+          // Already retained: evicting the largest element first would shrink the set,
+          // because the subsequent insert of a duplicate is a no-op.
+          return false;
+        }
+        pollLast();
+      }
+      return super.add(toAdd);
+    }
+  }
+
+  /** Folds the given hashes, in order, into one 128-bit value (each half combined independently). */
+  private static LongPair combineOrdered(LongPair... hashCodes) {
+    LongPair result = new LongPair();
+    for (LongPair hashCode : hashCodes) {
+      result.val1 = result.val1 * 37 + hashCode.val1;
+      result.val2 = result.val2 * 37 + hashCode.val2;
+
+    }
+    return result;
+  }
+
+  /** 128 bits of state */
+  static final class LongPair implements Comparable<LongPair> {
+    public long val1;
+    public long val2;
+
+    /**
+     * Orders pairs as unsigned 128-bit numbers with {@code val2} as the high word and {@code val1} as the low word.
+     */
+    @Override
+    public int compareTo(LongPair other) {
+      if (isLessThanUnsigned(val2, other.val2)) {
+        return -1;
+      } else if (val2 == other.val2) {
+        if (isLessThanUnsigned(val1, other.val1)) {
+          return -1;
+        } else if (val1 == other.val1) {
+          return 0;
+        } else {
+          return 1;
+        }
+      } else {
+        return 1;
+      }
+    }
+
+    @Override
+    public boolean equals(Object o) {
+      if (this == o) return true;
+      if (o == null || getClass() != o.getClass()) return false;
+
+      LongPair longPair = (LongPair) o;
+
+      return val1 == longPair.val1 && val2 == longPair.val2;
+
+    }
+
+    @Override
+    public int hashCode() {
+      int result = (int) (val1 ^ (val1 >>> 32));
+      result = 31 * result + (int) (val2 ^ (val2 >>> 32));
+      return result;
+    }
+  }
+
+  /** Gets a long from a byte buffer in little endian byte order. */
+  private static long getLongLittleEndian(byte[] buf, int offset) {
+    return ((long) buf[offset + 7] << 56) // no mask needed
+        | ((buf[offset + 6] & 0xffL) << 48)
+        | ((buf[offset + 5] & 0xffL) << 40)
+        | ((buf[offset + 4] & 0xffL) << 32)
+        | ((buf[offset + 3] & 0xffL) << 24)
+        | ((buf[offset + 2] & 0xffL) << 16)
+        | ((buf[offset + 1] & 0xffL) << 8)
+        | ((buf[offset] & 0xffL)); // no shift needed
+  }
+
+  /** Returns the MurmurHash3_x64_128 hash, placing the result in "out". */
+  @SuppressWarnings("fallthrough") // the huge switch is designed to use fall through into cases!
+  static void murmurhash3_x64_128(byte[] key, int offset, int len, int seed, LongPair out) {
+    // The original algorithm does have a 32 bit unsigned seed.
+    // We have to mask to match the behavior of the unsigned types and prevent sign extension.
+    long h1 = seed & 0x00000000FFFFFFFFL;
+    long h2 = seed & 0x00000000FFFFFFFFL;
+
+    final long c1 = 0x87c37b91114253d5L;
+    final long c2 = 0x4cf5ad432745937fL;
+
+    int roundedEnd = offset + (len & 0xFFFFFFF0); // round down to 16 byte block
+    for (int i = offset; i < roundedEnd; i += 16) {
+      long k1 = getLongLittleEndian(key, i);
+      long k2 = getLongLittleEndian(key, i + 8);
+      k1 *= c1;
+      k1 = Long.rotateLeft(k1, 31);
+      k1 *= c2;
+      h1 ^= k1;
+      h1 = Long.rotateLeft(h1, 27);
+      h1 += h2;
+      h1 = h1 * 5 + 0x52dce729;
+      k2 *= c2;
+      k2 = Long.rotateLeft(k2, 33);
+      k2 *= c1;
+      h2 ^= k2;
+      h2 = Long.rotateLeft(h2, 31);
+      h2 += h1;
+      h2 = h2 * 5 + 0x38495ab5;
+    }
+
+    long k1 = 0;
+    long k2 = 0;
+
+    // Tail: the remaining 0..15 bytes, folded in via deliberate fall-through.
+    switch (len & 15) {
+      case 15:
+        k2 = (key[roundedEnd + 14] & 0xffL) << 48;
+      case 14:
+        k2 |= (key[roundedEnd + 13] & 0xffL) << 40;
+      case 13:
+        k2 |= (key[roundedEnd + 12] & 0xffL) << 32;
+      case 12:
+        k2 |= (key[roundedEnd + 11] & 0xffL) << 24;
+      case 11:
+        k2 |= (key[roundedEnd + 10] & 0xffL) << 16;
+      case 10:
+        k2 |= (key[roundedEnd + 9] & 0xffL) << 8;
+      case 9:
+        k2 |= (key[roundedEnd + 8] & 0xffL);
+        k2 *= c2;
+        k2 = Long.rotateLeft(k2, 33);
+        k2 *= c1;
+        h2 ^= k2;
+      case 8:
+        k1 = ((long) key[roundedEnd + 7]) << 56;
+      case 7:
+        k1 |= (key[roundedEnd + 6] & 0xffL) << 48;
+      case 6:
+        k1 |= (key[roundedEnd + 5] & 0xffL) << 40;
+      case 5:
+        k1 |= (key[roundedEnd + 4] & 0xffL) << 32;
+      case 4:
+        k1 |= (key[roundedEnd + 3] & 0xffL) << 24;
+      case 3:
+        k1 |= (key[roundedEnd + 2] & 0xffL) << 16;
+      case 2:
+        k1 |= (key[roundedEnd + 1] & 0xffL) << 8;
+      case 1:
+        k1 |= (key[roundedEnd] & 0xffL);
+        k1 *= c1;
+        k1 = Long.rotateLeft(k1, 31);
+        k1 *= c2;
+        h1 ^= k1;
+    }
+
+    // ----------
+    // finalization
+
+    h1 ^= len;
+    h2 ^= len;
+
+    h1 += h2;
+    h2 += h1;
+
+    h1 = fmix64(h1);
+    h2 = fmix64(h2);
+
+    h1 += h2;
+    h2 += h1;
+
+    out.val1 = h1;
+    out.val2 = h2;
+  }
+
+  /** Final 64-bit mixing step used by {@link #murmurhash3_x64_128}. */
+  private static long fmix64(long k) {
+    k ^= k >>> 33;
+    k *= 0xff51afd7ed558ccdL;
+    k ^= k >>> 33;
+    k *= 0xc4ceb9fe1a85ec53L;
+    k ^= k >>> 33;
+    return k;
+  }
+}

Propchange: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/MinHashFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java?rev=1834326&view=auto
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
 (added)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
 Mon Jun 25 13:30:52 2018
@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util.fv;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames;
+import org.apache.jackrabbit.oak.plugins.index.lucene.PropertyDefinition;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static org.apache.lucene.search.BooleanClause.Occur.SHOULD;
+
+/**
+ * Utility methods for indexing and searching for similar feature vectors
+ */
+public class SimSearchUtils {
+
+    private static final Logger log = LoggerFactory.getLogger(SimSearchUtils.class);
+
+    /** Number of bytes used to encode one double. */
+    private static final int DOUBLE_BYTES = Double.SIZE / Byte.SIZE;
+
+    /**
+     * Decodes a byte array of consecutive doubles into a space separated string.
+     *
+     * @param bytes the encoded doubles, see {@link #toDoubles(byte[])}
+     * @return the decoded values joined with a single space
+     */
+    public static String toDoubleString(byte[] bytes) {
+        StringBuilder builder = new StringBuilder();
+        for (Double d : toDoubles(bytes)) {
+            if (builder.length() > 0) {
+                builder.append(' ');
+            }
+            builder.append(d);
+        }
+        return builder.toString();
+    }
+
+    /**
+     * Decodes a byte array into doubles, reading 8 bytes per value in {@link ByteBuffer}'s default byte order.
+     * Trailing bytes that do not make up a full double are ignored.
+     *
+     * @param array the encoded doubles
+     * @return the decoded values, in order
+     */
+    public static List<Double> toDoubles(byte[] array) {
+        ByteBuffer wrap = ByteBuffer.wrap(array);
+        int capacity = array.length / DOUBLE_BYTES;
+        List<Double> doubles = new ArrayList<>(capacity);
+        for (int i = 0; i < capacity; i++) {
+            doubles.add(wrap.getDouble(i * DOUBLE_BYTES));
+        }
+        return doubles;
+    }
+
+    /**
+     * Runs the analyzer over the given text and collects the produced terms.
+     * The token stream is closed even if analysis fails.
+     */
+    private static Collection<String> getTokens(Analyzer analyzer, String field, String sampleTextString) throws IOException {
+        Collection<String> tokens = new ArrayList<>();
+        try (TokenStream ts = analyzer.tokenStream(field, sampleTextString)) {
+            CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
+            ts.reset();
+            while (ts.incrementToken()) {
+                tokens.add(new String(charTermAttribute.buffer(), 0, charTermAttribute.length()));
+            }
+            ts.end();
+        }
+        return tokens;
+    }
+
+    /**
+     * Builds a query with one constant-score SHOULD clause per token the analyzer produces for {@code text};
+     * at least 3 of the clauses must match.
+     */
+    static BooleanQuery getSimQuery(Analyzer analyzer, String fieldName, String text) throws IOException {
+        Collection<String> tokens = getTokens(analyzer, fieldName, text);
+        BooleanQuery booleanQuery = new BooleanQuery(true);
+        booleanQuery.setMinimumNumberShouldMatch(3);
+        for (String token : tokens) {
+            booleanQuery.add(new ConstantScoreQuery(new TermQuery(new Term(fieldName, token))), BooleanClause.Occur.SHOULD);
+        }
+        return booleanQuery;
+    }
+
+
+    /**
+     * Encodes the given doubles as consecutive 8-byte values; the inverse of {@link #toDoubles(byte[])}.
+     *
+     * @param values the values to encode
+     * @return a byte array of length {@code values.size() * 8}
+     */
+    public static byte[] toByteArray(List<Double> values) {
+        // Iterate rather than index so that non random-access lists are encoded in linear time.
+        ByteBuffer buffer = ByteBuffer.allocate(values.size() * DOUBLE_BYTES);
+        for (Double value : values) {
+            buffer.putDouble(value);
+        }
+        return buffer.array();
+    }
+
+    /**
+     * Parses a comma separated list of doubles and encodes it, see {@link #toByteArray(List)}.
+     *
+     * @param value comma separated double values
+     * @return the encoded values
+     * @throws NumberFormatException if an entry is not a parsable double
+     */
+    public static byte[] toByteArray(String value) {
+        List<Double> doubles = new ArrayList<>();
+        for (String dv : value.split(",")) {
+            doubles.add(Double.parseDouble(dv));
+        }
+        return toByteArray(doubles);
+    }
+
+    /**
+     * Builds a similarity query from a native query string of {@code key=value} pairs separated by {@code &}.
+     * The value of {@code stream.body} is looked up as a term of the {@link FieldNames#PATH} field; for the first
+     * matching document, the stored similarity field of each given property is turned into a min hash query.
+     *
+     * @param sp the properties whose similarity fields should be queried
+     * @param reader the index reader used to load the reference document
+     * @param queryString the native query string
+     * @return the similarity query, or {@code null} if no clause could be generated
+     * @throws RuntimeException if the query string cannot be parsed or the index cannot be read
+     */
+    public static Query getSimilarityQuery(List<PropertyDefinition> sp, IndexReader reader, String queryString) {
+        try {
+            log.debug("parsing similarity query on {}", queryString);
+            Query similarityQuery = null;
+            String text = null;
+            for (String param : queryString.split("&")) {
+                String[] keyValuePair = param.split("=");
+                if (keyValuePair.length != 2 || keyValuePair[0] == null || keyValuePair[1] == null) {
+                    throw new RuntimeException("Unparsable native Lucene query for fv similarity: " + queryString);
+                } else {
+                    if ("stream.body".equals(keyValuePair[0])) {
+                        text = keyValuePair[1];
+                        break;
+                    }
+                }
+            }
+
+            if (text != null && !sp.isEmpty()) {
+                log.debug("generating similarity query for {}", text);
+                BooleanQuery booleanQuery = new BooleanQuery(true);
+                LSHAnalyzer analyzer = new LSHAnalyzer();
+                IndexSearcher searcher = new IndexSearcher(reader);
+                TermQuery q = new TermQuery(new Term(FieldNames.PATH, text));
+                TopDocs top = searcher.search(q, 1);
+                if (top.totalHits > 0) {
+                    ScoreDoc d = top.scoreDocs[0];
+                    Document doc = reader.document(d.doc);
+                    for (PropertyDefinition pd : sp) {
+                        log.debug("adding similarity clause for property {}", pd.name);
+                        String fieldName = FieldNames.createSimilarityFieldName(pd.name);
+                        String fvString = doc.get(fieldName);
+                        if (fvString != null && fvString.trim().length() > 0) {
+                            log.trace("generating sim query on field {} and text {}", fieldName, fvString);
+                            BooleanQuery simQuery = SimSearchUtils.getSimQuery(analyzer, fieldName, fvString);
+                            booleanQuery.add(new BooleanClause(simQuery, SHOULD));
+                            log.trace("similarity query generated for {}", pd.name);
+                        }
+                    }
+                }
+                if (booleanQuery.clauses().size() > 0) {
+                    similarityQuery = booleanQuery;
+                    log.trace("final similarity query is {}", similarityQuery);
+                }
+            }
+
+            return similarityQuery;
+        } catch (Exception e) {
+            // Keep the original failure as the cause so it is not lost to callers and logs.
+            throw new RuntimeException("could not handle similarity query " + queryString, e);
+        }
+    }
+
+}

Propchange: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/TruncateTokenFilter.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/TruncateTokenFilter.java?rev=1834326&view=auto
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/TruncateTokenFilter.java
 (added)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/TruncateTokenFilter.java
 Mon Jun 25 13:30:52 2018
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util.fv;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+class TruncateTokenFilter extends TokenFilter {
+
+    private final CharTermAttribute termAttribute = 
addAttribute(CharTermAttribute.class);
+    private final KeywordAttribute keywordAttr = 
addAttribute(KeywordAttribute.class);
+
+    private final int length;
+
+    TruncateTokenFilter(TokenStream input, int length) {
+      super(input);
+      if (length < 1) {
+        throw new IllegalArgumentException("length parameter must be a 
positive number: " + length);
+      }
+      this.length = length;
+    }
+
+    @Override
+    public final boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        if (!keywordAttr.isKeyword() && termAttribute.length() > length) {
+          termAttribute.setLength(length);
+        }
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
\ No newline at end of file

Propchange: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/TruncateTokenFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/writer/IndexWriterUtils.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/writer/IndexWriterUtils.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/writer/IndexWriterUtils.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/writer/IndexWriterUtils.java
 Mon Jun 25 13:30:52 2018
@@ -20,12 +20,15 @@
 package org.apache.jackrabbit.oak.plugins.index.lucene.writer;
 
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 
 
 import org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames;
 import org.apache.jackrabbit.oak.plugins.index.lucene.IndexDefinition;
 import org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants;
+import org.apache.jackrabbit.oak.plugins.index.lucene.PropertyDefinition;
+import org.apache.jackrabbit.oak.plugins.index.lucene.util.fv.LSHAnalyzer;
 import org.apache.jackrabbit.oak.plugins.index.lucene.util.SuggestHelper;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
@@ -51,6 +54,15 @@ public class IndexWriterUtils {
             Analyzer definitionAnalyzer = definition.getAnalyzer();
             Map<String, Analyzer> analyzers = new HashMap<String, Analyzer>();
             analyzers.put(FieldNames.SPELLCHECK, new 
ShingleAnalyzerWrapper(LuceneIndexConstants.ANALYZER, 3));
+            for (IndexDefinition.IndexingRule r : 
definition.getDefinedRules()) {
+                List<PropertyDefinition> similarityProperties = 
r.getSimilarityProperties();
+                for (PropertyDefinition pd : similarityProperties) {
+                    if (pd.useInSimilarity) {
+                        
analyzers.put(FieldNames.createSimilarityFieldName(pd.name), new LSHAnalyzer());
+                    }
+                }
+            }
+
             if (!definition.isSuggestAnalyzed()) {
                 analyzers.put(FieldNames.SUGGEST, SuggestHelper.getAnalyzer());
             }

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java?rev=1834326&r1=1834325&r2=1834326&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
 Mon Jun 25 13:30:52 2018
@@ -19,66 +19,17 @@
 
 package org.apache.jackrabbit.oak.plugins.index.lucene;
 
-import javax.annotation.Nonnull;
-import javax.jcr.PropertyType;
-
-import static com.google.common.collect.ImmutableSet.of;
-import static com.google.common.collect.Lists.newArrayList;
-import static java.util.Arrays.asList;
-import static org.apache.jackrabbit.JcrConstants.JCR_CONTENT;
-import static org.apache.jackrabbit.JcrConstants.JCR_DATA;
-import static org.apache.jackrabbit.JcrConstants.NT_FILE;
-import static org.apache.jackrabbit.oak.api.QueryEngine.NO_BINDINGS;
-import static org.apache.jackrabbit.oak.api.QueryEngine.NO_MAPPINGS;
-import static org.apache.jackrabbit.oak.api.Type.NAMES;
-import static org.apache.jackrabbit.oak.api.Type.STRING;
-import static org.apache.jackrabbit.oak.api.Type.STRINGS;
-import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.ASYNC_PROPERTY_NAME;
-import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.DECLARING_NODE_TYPES;
-import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFINITIONS_NAME;
-import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFINITIONS_NODE_TYPE;
-import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.QUERY_PATHS;
-import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.REINDEX_PROPERTY_NAME;
-import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.TYPE_PROPERTY_NAME;
-import static 
org.apache.jackrabbit.oak.plugins.index.lucene.IndexDefinition.INDEX_DEFINITION_NODE;
-import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.ANALYZERS;
-import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.INCLUDE_PROPERTY_NAMES;
-import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.ORDERED_PROP_NAMES;
-import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.INDEX_ORIGINAL_TERM;
-import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROPDEF_PROP_NODE_NAME;
-import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_ANALYZED;
-import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NAME;
-import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NODE;
-import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NODE_SCOPE_INDEX;
-import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_PROPERTY_INDEX;
-import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_TYPE;
-import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.TIKA;
-import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexEditorTest.createCal;
-import static org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.child;
-import static 
org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.newNodeAggregator;
-import static org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.useV2;
-import static 
org.apache.jackrabbit.oak.plugins.index.property.OrderedIndex.OrderDirection;
-import static 
org.apache.jackrabbit.oak.plugins.memory.PropertyStates.createProperty;
-import static org.apache.jackrabbit.oak.InitialContentHelper.INITIAL_CONTENT;
-import static 
org.apache.jackrabbit.oak.spi.filter.PathFilter.PROP_EXCLUDED_PATHS;
-import static 
org.apache.jackrabbit.oak.spi.filter.PathFilter.PROP_INCLUDED_PATHS;
-import static org.hamcrest.CoreMatchers.containsString;
-import static org.hamcrest.CoreMatchers.not;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertThat;
-import static org.junit.Assert.assertTrue;
-
+import java.io.ByteArrayInputStream;
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.URI;
+import java.nio.charset.Charset;
 import java.text.ParseException;
 import java.util.Calendar;
+import java.util.Collection;
 import java.util.Collections;
-import java.util.Comparator;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
@@ -88,18 +39,21 @@ import java.util.Set;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 
+import javax.annotation.Nonnull;
+import javax.jcr.PropertyType;
+
 import com.google.common.base.Charsets;
 import com.google.common.collect.ComparisonChain;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Iterables;
-import com.google.common.collect.Iterators;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import com.google.common.io.CountingInputStream;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.jackrabbit.JcrConstants;
+import org.apache.jackrabbit.oak.InitialContent;
 import org.apache.jackrabbit.oak.Oak;
 import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.api.CommitFailedException;
@@ -120,13 +74,13 @@ import org.apache.jackrabbit.oak.plugins
 import 
org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider;
 import 
org.apache.jackrabbit.oak.plugins.index.lucene.directory.CopyOnReadDirectory;
 import 
org.apache.jackrabbit.oak.plugins.index.lucene.util.IndexDefinitionBuilder;
+import org.apache.jackrabbit.oak.plugins.index.lucene.util.fv.SimSearchUtils;
 import org.apache.jackrabbit.oak.plugins.index.nodetype.NodeTypeIndexProvider;
 import 
org.apache.jackrabbit.oak.plugins.index.property.PropertyIndexEditorProvider;
 import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob;
 import org.apache.jackrabbit.oak.plugins.memory.MemoryNodeStore;
 import org.apache.jackrabbit.oak.plugins.memory.PropertyStates;
 import org.apache.jackrabbit.oak.plugins.nodetype.TypeEditorProvider;
-import org.apache.jackrabbit.oak.InitialContent;
 import org.apache.jackrabbit.oak.plugins.nodetype.write.NodeTypeRegistry;
 import org.apache.jackrabbit.oak.query.AbstractQueryTest;
 import org.apache.jackrabbit.oak.spi.commit.CommitInfo;
@@ -147,6 +101,56 @@ import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.TemporaryFolder;
 
+import static com.google.common.collect.ImmutableSet.of;
+import static com.google.common.collect.Lists.newArrayList;
+import static java.util.Arrays.asList;
+import static org.apache.jackrabbit.JcrConstants.JCR_CONTENT;
+import static org.apache.jackrabbit.JcrConstants.JCR_DATA;
+import static org.apache.jackrabbit.JcrConstants.NT_FILE;
+import static org.apache.jackrabbit.oak.InitialContentHelper.INITIAL_CONTENT;
+import static org.apache.jackrabbit.oak.api.QueryEngine.NO_BINDINGS;
+import static org.apache.jackrabbit.oak.api.QueryEngine.NO_MAPPINGS;
+import static org.apache.jackrabbit.oak.api.Type.NAMES;
+import static org.apache.jackrabbit.oak.api.Type.STRING;
+import static org.apache.jackrabbit.oak.api.Type.STRINGS;
+import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.ASYNC_PROPERTY_NAME;
+import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.DECLARING_NODE_TYPES;
+import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFINITIONS_NAME;
+import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFINITIONS_NODE_TYPE;
+import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.QUERY_PATHS;
+import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.REINDEX_PROPERTY_NAME;
+import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.TYPE_PROPERTY_NAME;
+import static 
org.apache.jackrabbit.oak.plugins.index.lucene.IndexDefinition.INDEX_DEFINITION_NODE;
+import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.ANALYZERS;
+import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.INCLUDE_PROPERTY_NAMES;
+import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.INDEX_ORIGINAL_TERM;
+import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.ORDERED_PROP_NAMES;
+import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROPDEF_PROP_NODE_NAME;
+import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_ANALYZED;
+import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NAME;
+import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NODE;
+import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NODE_SCOPE_INDEX;
+import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_PROPERTY_INDEX;
+import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_TYPE;
+import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.TIKA;
+import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexEditorTest.createCal;
+import static org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.child;
+import static 
org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.newNodeAggregator;
+import static org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.useV2;
+import static 
org.apache.jackrabbit.oak.plugins.index.property.OrderedIndex.OrderDirection;
+import static 
org.apache.jackrabbit.oak.plugins.memory.PropertyStates.createProperty;
+import static 
org.apache.jackrabbit.oak.spi.filter.PathFilter.PROP_EXCLUDED_PATHS;
+import static 
org.apache.jackrabbit.oak.spi.filter.PathFilter.PROP_INCLUDED_PATHS;
+import static org.hamcrest.CoreMatchers.containsString;
+import static org.hamcrest.CoreMatchers.not;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
+
 @SuppressWarnings("ArraysAsListWithZeroOrOneArgument")
 public class LucenePropertyIndexTest extends AbstractQueryTest {
     /**
@@ -2946,6 +2950,107 @@ public class LucenePropertyIndexTest ext
                 "lucene:test1(/oak:index/test1)", asList("/d"));
     }
 
+    /**
+     * Stores every feature vector of {@code fvs.csv} as a binary {@code fv} property and checks
+     * that a {@code similar()} query yields a different result list from one node to the next.
+     */
+    @Test
+    public void testRepSimilarWithBinaryFeatureVectors() throws Exception {
+
+        IndexDefinitionBuilder idxb = new IndexDefinitionBuilder().noAsync();
+        idxb.indexRule("nt:base").property("fv").useInSimilarity().nodeScopeIndex().propertyIndex();
+
+        Tree idx = root.getTree("/").getChild("oak:index").addChild("test1");
+        idxb.build(idx);
+        root.commit();
+
+        Tree test = root.getTree("/").addChild("test");
+
+        URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
+        File file = new File(uri);
+
+        Collection<String> children = new LinkedList<>();
+        // try-with-resources: the file handle is released even when an assertion below fails
+        try (InputStream in = new FileInputStream(file)) {
+            // every line is "<node name>,<v1>,<v2>,..."; the charset is pinned so the test
+            // does not depend on the platform default
+            for (String line : IOUtils.readLines(in, Charsets.UTF_8)) {
+                String[] split = line.split(",");
+                List<Double> values = new LinkedList<>();
+                // column 0 is the node name, the remaining columns are the vector
+                for (int i = 1; i < split.length; i++) {
+                    values.add(Double.parseDouble(split[i]));
+                }
+
+                // the binary encoding has to round-trip before it is stored
+                byte[] bytes = SimSearchUtils.toByteArray(values);
+                List<Double> actual = SimSearchUtils.toDoubles(bytes);
+                assertEquals(values, actual);
+
+                Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
+                Tree child = test.addChild(split[0]);
+                child.setProperty("fv", blob, Type.BINARY);
+                // record the path: without it the query loop below never runs and the test
+                // passes without checking anything
+                children.add(child.getPath());
+            }
+        }
+        root.commit();
+
+        // guard against an empty fixture silently disabling the checks below
+        assertFalse(children.isEmpty());
+
+        // check that similarity changes across different feature vectors
+        List<String> baseline = new LinkedList<>();
+        for (String similarPath : children) {
+            String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";
+
+            Iterator<String> result = executeQuery(query, "JCR-SQL2").iterator();
+            List<String> current = new LinkedList<>();
+            while (result.hasNext()) {
+                String next = result.next();
+                current.add(next);
+            }
+            assertNotEquals(baseline, current);
+            baseline.clear();
+            baseline.addAll(current);
+        }
+
+    }
+
+    /**
+     * Stores every feature vector of {@code fvs.csv} as a string {@code fv} property and checks
+     * that a {@code similar()} query yields a different result list from one node to the next.
+     */
+    @Test
+    public void testRepSimilarWithStringFeatureVectors() throws Exception {
+
+        IndexDefinitionBuilder idxb = new IndexDefinitionBuilder().noAsync();
+        idxb.indexRule("nt:base").property("fv").useInSimilarity().nodeScopeIndex().propertyIndex();
+
+        Tree idx = root.getTree("/").getChild("oak:index").addChild("test1");
+        idxb.build(idx);
+        root.commit();
+
+
+        Tree test = root.getTree("/").addChild("test");
+
+        URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
+        File file = new File(uri);
+
+        Collection<String> children = new LinkedList<>();
+
+        // try-with-resources: the file handle is released even when the loop body throws
+        try (InputStream in = new FileInputStream(file)) {
+            // every line is "<node name>,<comma separated vector>"; the charset is pinned so
+            // the test does not depend on the platform default
+            for (String line : IOUtils.readLines(in, Charsets.UTF_8)) {
+                int i1 = line.indexOf(',');
+                String name = line.substring(0, i1);
+                // everything after the first comma is stored verbatim as the vector
+                String value = line.substring(i1 + 1);
+                Tree child = test.addChild(name);
+                child.setProperty("fv", value, Type.STRING);
+                children.add(child.getPath());
+            }
+        }
+        root.commit();
+
+        // guard against an empty fixture silently disabling the checks below
+        assertFalse(children.isEmpty());
+
+        // check that similarity changes across different feature vectors
+        List<String> baseline = new LinkedList<>();
+        for (String similarPath : children) {
+            String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";
+
+            Iterator<String> result = executeQuery(query, "JCR-SQL2").iterator();
+            List<String> current = new LinkedList<>();
+            while (result.hasNext()) {
+                String next = result.next();
+                current.add(next);
+            }
+            assertNotEquals(baseline, current);
+            baseline.clear();
+            baseline.addAll(current);
+        }
+    }
+
     private void assertPlanAndQuery(String query, String planExpectation, 
List<String> paths) {
         assertPlanAndQuery(query, planExpectation, paths, false);
     }

Added: 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizerTest.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizerTest.java?rev=1834326&view=auto
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizerTest.java
 (added)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizerTest.java
 Mon Jun 25 13:30:52 2018
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util.fv;
+
+import java.io.StringReader;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Tests for {@link FVTokenizer}
+ */
+public class FVTokenizerTest {
+
+  /** Values separated by blanks only. */
+  @Test
+  public void testTokenizeWithSpaces() throws Exception {
+      assertTokens("0.10 0.20 0.30 0.40", "0.10", "0.20", "0.30", "0.40");
+  }
+
+  /** Values separated by commas only. */
+  @Test
+  public void testTokenizeWithCommas() throws Exception {
+      assertTokens("0.10,0.20,0.30,0.40", "0.10", "0.20", "0.30", "0.40");
+  }
+
+  /** Values separated by a comma followed by a blank. */
+  @Test
+  public void testTokenizeWithCommasAndSpaces() throws Exception {
+      assertTokens("0.10, 0.20, 0.30, 0.40", "0.10", "0.20", "0.30", "0.40");
+  }
+
+  /**
+   * Runs {@code input} through a {@link FVTokenizer} and verifies that exactly the
+   * {@code expected} terms are emitted, in that order.
+   *
+   * @param input    the text handed to the tokenizer
+   * @param expected the terms the tokenizer has to produce
+   */
+  private static void assertTokens(String input, String... expected) throws Exception {
+      List<String> expectedTokens = new LinkedList<>();
+      for (String token : expected) {
+          expectedTokens.add(token);
+      }
+
+      List<String> actualTokens = new LinkedList<>();
+      // try-with-resources closes the stream even when tokenization throws
+      try (TokenStream stream = new FVTokenizer(Version.LUCENE_47, new StringReader(input))) {
+          stream.reset();
+          CharTermAttribute charTermAttribute = stream.getAttribute(CharTermAttribute.class);
+          while (stream.incrementToken()) {
+              actualTokens.add(charTermAttribute.toString());
+          }
+          // complete the consumer workflow before the stream is closed
+          stream.end();
+      }
+
+      // comparing the whole lists also fails when too few (or no) tokens were emitted,
+      // which a per-token comparison inside the loop cannot detect
+      assertEquals(expectedTokens, actualTokens);
+  }
+
+}
\ No newline at end of file

Propchange: 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FVTokenizerTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilterTest.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilterTest.java?rev=1834326&view=auto
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilterTest.java
 (added)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilterTest.java
 Mon Jun 25 13:30:52 2018
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util.fv;
+
+import java.io.StringReader;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Tests for {@link FeaturePositionTokenFilter}
+ */
+public class FeaturePositionTokenFilterTest {
+
+    /**
+     * Feeds four whitespace separated values through the filter and expects each term to come
+     * back prefixed with its 1-based position and an underscore.
+     */
+    @Test
+    public void testFiltering() throws Exception {
+        TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_47, new StringReader("0.10 0.20 0.30 0.40"));
+        FeaturePositionTokenFilter filter = new FeaturePositionTokenFilter(stream);
+
+        List<String> expectedTokens = new LinkedList<>();
+        expectedTokens.add("1_0.10");
+        expectedTokens.add("2_0.20");
+        expectedTokens.add("3_0.30");
+        expectedTokens.add("4_0.40");
+
+        List<String> actualTokens = new LinkedList<>();
+        try {
+            filter.reset();
+            CharTermAttribute charTermAttribute = filter.getAttribute(CharTermAttribute.class);
+            while (filter.incrementToken()) {
+                actualTokens.add(charTermAttribute.toString());
+            }
+            // complete the consumer workflow before the stream is closed
+            filter.end();
+        } finally {
+            // release the filter (and the wrapped tokenizer) even when the loop throws
+            filter.close();
+        }
+
+        // comparing the whole lists also fails when too few (or no) tokens were emitted,
+        // which a per-token comparison inside the loop cannot detect
+        assertEquals(expectedTokens, actualTokens);
+    }
+
+}
\ No newline at end of file

Propchange: 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/FeaturePositionTokenFilterTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

svn commit: r1834326 [1/3] - in /jackrabbit/oak/trunk/oak-lucene/src: main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/ main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util...

Reply via email to