Author: ngupta
Date: Tue Jun 30 10:01:56 2020
New Revision: 1879359

URL: http://svn.apache.org/viewvc?rev=1879359&view=rev
Log:
OAK-9127 | Introduce Similarity Search (Text) support in oak-search-elastic

Added:
    
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java
   (with props)
    
jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
   (with props)
    
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/MoreLikeThisHelperUtil.java
   (with props)
Modified:
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/MoreLikeThisHelper.java
    
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexWriter.java
    
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/MoreLikeThisHelper.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/MoreLikeThisHelper.java?rev=1879359&r1=1879358&r2=1879359&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/MoreLikeThisHelper.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/MoreLikeThisHelper.java
 Tue Jun 30 10:01:56 2020
@@ -19,8 +19,10 @@ package org.apache.jackrabbit.oak.plugin
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map;
 
 import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
+import org.apache.jackrabbit.oak.plugins.index.search.MoreLikeThisHelperUtil;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.IndexReader;
@@ -43,40 +45,38 @@ public class MoreLikeThisHelper {
         MoreLikeThis mlt = new MoreLikeThis(reader);
         mlt.setAnalyzer(analyzer);
         try {
+            Map<String, String> paramMap = 
MoreLikeThisHelperUtil.getParamMapFromMltQuery(mltQueryString);
             String text = null;
             String[] fields = {};
-            for (String param : mltQueryString.split("&")) {
-                String[] keyValuePair = param.split("=");
-                if (keyValuePair.length != 2 || keyValuePair[0] == null || 
keyValuePair[1] == null) {
-                    throw new RuntimeException("Unparsable native Lucene MLT 
query: " + mltQueryString);
-                } else {
-                    if ("stream.body".equals(keyValuePair[0])) {
-                        text = keyValuePair[1];
-                    } else if ("mlt.fl".equals(keyValuePair[0])) {
-                        fields = keyValuePair[1].split(",");
-                    } else if ("mlt.mindf".equals(keyValuePair[0])) {
-                        mlt.setMinDocFreq(Integer.parseInt(keyValuePair[1]));
-                    } else if ("mlt.mintf".equals(keyValuePair[0])) {
-                        mlt.setMinTermFreq(Integer.parseInt(keyValuePair[1]));
-                    } else if ("mlt.boost".equals(keyValuePair[0])) {
-                        mlt.setBoost(Boolean.parseBoolean(keyValuePair[1]));
-                    } else if ("mlt.qf".equals(keyValuePair[0])) {
-                        mlt.setBoostFactor(Float.parseFloat(keyValuePair[1]));
-                    } else if ("mlt.maxdf".equals(keyValuePair[0])) {
-                        mlt.setMaxDocFreq(Integer.parseInt(keyValuePair[1]));
-                    } else if ("mlt.maxdfp".equals(keyValuePair[0])) {
-                        
mlt.setMaxDocFreqPct(Integer.parseInt(keyValuePair[1]));
-                    } else if ("mlt.maxntp".equals(keyValuePair[0])) {
-                        
mlt.setMaxNumTokensParsed(Integer.parseInt(keyValuePair[1]));
-                    } else if ("mlt.maxqt".equals(keyValuePair[0])) {
-                        
mlt.setMaxQueryTerms(Integer.parseInt(keyValuePair[1]));
-                    } else if ("mlt.maxwl".equals(keyValuePair[0])) {
-                        mlt.setMaxWordLen(Integer.parseInt(keyValuePair[1]));
-                    } else if ("mlt.minwl".equals(keyValuePair[0])) {
-                        mlt.setMinWordLen(Integer.parseInt(keyValuePair[1]));
-                    }
+            for (String key : paramMap.keySet()) {
+                String value = paramMap.get(key);
+                if (MoreLikeThisHelperUtil.MLT_STREAM_BODY.equals(key)) {
+                    text = value;
+                } else if (MoreLikeThisHelperUtil.MLT_FILED.equals(key)) {
+                    fields = value.split(",");
+                } else if 
(MoreLikeThisHelperUtil.MLT_MIN_DOC_FREQ.equals(key)) {
+                    mlt.setMinDocFreq(Integer.parseInt(value));
+                } else if 
(MoreLikeThisHelperUtil.MLT_MIN_TERM_FREQ.equals(key)) {
+                    mlt.setMinTermFreq(Integer.parseInt(value));
+                } else if (MoreLikeThisHelperUtil.MLT_BOOST.equals(key)) {
+                    mlt.setBoost(Boolean.parseBoolean(value));
+                } else if 
(MoreLikeThisHelperUtil.MLT_BOOST_FACTOR.equals(key)) {
+                    mlt.setBoostFactor(Float.parseFloat(value));
+                } else if 
(MoreLikeThisHelperUtil.MLT_MAX_DOC_FREQ.equals(key)) {
+                    mlt.setMaxDocFreq(Integer.parseInt(value));
+                } else if 
(MoreLikeThisHelperUtil.MLT_MAX_DOC_FREQ_PCT.equals(key)) {
+                    mlt.setMaxDocFreqPct(Integer.parseInt(value));
+                } else if 
(MoreLikeThisHelperUtil.MLT_MAX_NUM_TOKENS_PARSED.equals(key)) {
+                    mlt.setMaxNumTokensParsed(Integer.parseInt(value));
+                } else if 
(MoreLikeThisHelperUtil.MLT_MAX_QUERY_TERMS.equals(key)) {
+                    mlt.setMaxQueryTerms(Integer.parseInt(value));
+                } else if 
(MoreLikeThisHelperUtil.MLT_MAX_WORD_LENGTH.equals(key)) {
+                    mlt.setMaxWordLen(Integer.parseInt(value));
+                } else if 
(MoreLikeThisHelperUtil.MLT_MIN_WORD_LENGTH.equals(key)) {
+                    mlt.setMinWordLen(Integer.parseInt(value));
                 }
             }
+
             if (text != null) {
                 if (FieldNames.PATH.equals(fields[0])) {
                     IndexSearcher searcher = new IndexSearcher(reader);

Modified: 
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexWriter.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexWriter.java?rev=1879359&r1=1879358&r2=1879359&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexWriter.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexWriter.java
 Tue Jun 30 10:01:56 2020
@@ -18,6 +18,7 @@ package org.apache.jackrabbit.oak.plugin
 
 import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticConnection;
 import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
+import org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils;
 import 
org.apache.jackrabbit.oak.plugins.index.search.spi.editor.FulltextIndexWriter;
 import org.elasticsearch.ElasticsearchStatusException;
 import org.elasticsearch.action.DocWriteRequest;
@@ -115,7 +116,7 @@ class ElasticIndexWriter implements Full
     @Override
     public void updateDocument(String path, ElasticDocument doc) {
         IndexRequest request = new 
IndexRequest(indexDefinition.getRemoteIndexAlias())
-                .id(idFromPath(path))
+                .id(ElasticIndexUtils.idFromPath(path))
                 .source(doc.build(), XContentType.JSON);
         bulkProcessor.add(request);
     }
@@ -123,7 +124,7 @@ class ElasticIndexWriter implements Full
     @Override
     public void deleteDocuments(String path) {
         DeleteRequest request = new 
DeleteRequest(indexDefinition.getRemoteIndexAlias())
-                .id(idFromPath(path));
+                .id(ElasticIndexUtils.idFromPath(path));
         bulkProcessor.add(request);
     }
 
@@ -275,25 +276,4 @@ class ElasticIndexWriter implements Full
         }
     }
 
-    /**
-     * Transforms a path into an _id compatible with Elasticsearch 
specification. The path cannot be larger than 512
-     * bytes. For performance reasons paths that are already compatible are 
returned untouched. Otherwise, SHA-256
-     * algorithm is used to return a transformed path (32 bytes max).
-     *
-     * @param path the document path
-     * @return the Elasticsearch compatible path
-     * @see <a 
href="https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-id-field.html">
-     *     Mapping _id field</a>
-     */
-    private static String idFromPath(@NotNull String path) {
-        byte[] pathBytes = path.getBytes(StandardCharsets.UTF_8);
-        if (pathBytes.length > 512) {
-            try {
-                return new 
String(MessageDigest.getInstance("SHA-256").digest(pathBytes));
-            } catch (NoSuchAlgorithmException e) {
-                throw new IllegalStateException(e);
-            }
-        }
-        return path;
-    }
 }

Modified: 
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java?rev=1879359&r1=1879358&r2=1879359&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
 Tue Jun 30 10:01:56 2020
@@ -20,8 +20,10 @@ import org.apache.jackrabbit.oak.api.Typ
 import org.apache.jackrabbit.oak.commons.PathUtils;
 import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
 import 
org.apache.jackrabbit.oak.plugins.index.elastic.query.async.facets.ElasticFacetProvider;
+import org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils;
 import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
 import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
+import org.apache.jackrabbit.oak.plugins.index.search.MoreLikeThisHelperUtil;
 import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition;
 import org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndex;
 import 
org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndexPlanner;
@@ -38,6 +40,7 @@ import org.apache.jackrabbit.oak.spi.que
 import org.apache.lucene.search.WildcardQuery;
 import org.elasticsearch.index.query.BoolQueryBuilder;
 import org.elasticsearch.index.query.MatchPhraseQueryBuilder;
+import org.elasticsearch.index.query.MoreLikeThisQueryBuilder;
 import org.elasticsearch.index.query.MultiMatchQueryBuilder;
 import org.elasticsearch.index.query.Operator;
 import org.elasticsearch.index.query.QueryBuilder;
@@ -50,15 +53,16 @@ import org.elasticsearch.search.suggest.
 import org.elasticsearch.search.suggest.phrase.PhraseSuggestionBuilder;
 import org.jetbrains.annotations.NotNull;
 import org.jetbrains.annotations.Nullable;
-
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import javax.jcr.PropertyType;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.BiPredicate;
 import java.util.stream.Stream;
 import java.util.stream.StreamSupport;
-
 import static org.apache.jackrabbit.JcrConstants.JCR_MIXINTYPES;
 import static org.apache.jackrabbit.JcrConstants.JCR_PRIMARYTYPE;
 import static org.apache.jackrabbit.oak.commons.PathUtils.denotesRoot;
@@ -82,12 +86,14 @@ import static org.elasticsearch.index.qu
 import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
 import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery;
 import static org.elasticsearch.index.query.QueryBuilders.termQuery;
+import static org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item;
 
 /**
  * Class to map query plans into Elastic request objects.
  */
 public class ElasticRequestHandler {
 
+    private static final Logger LOG = 
LoggerFactory.getLogger(ElasticRequestHandler.class);
     private final static String SPELLCHECK_PREFIX = "spellcheck?term=";
     private static final String ES_TRIGRAM_SUFFIX = ".trigram";
 
@@ -122,9 +128,19 @@ public class ElasticRequestHandler {
         }
 
         if (propertyRestrictionQuery != null) {
-            boolQuery.must(queryStringQuery(propertyRestrictionQuery));
+            if (propertyRestrictionQuery.startsWith("mlt?")) {
+                // SimilarityImpl in oak-core sets property restriction for 
sim search and the query is something like
+                // mlt?mlt.fl=:path&mlt.mindf=0&stream.body=<path> . We need 
to parse this query string and turn it into a query
+                // elastic can understand.
+                String mltQueryString = 
propertyRestrictionQuery.replace("mlt?", "");
+                boolQuery.must(moreLikeThisQuery(mltQueryString));
+
+            } else {
+                boolQuery.must(queryStringQuery(propertyRestrictionQuery));
+            }
+
         } else if (planResult.evaluateNonFullTextConstraints()) {
-            for (QueryBuilder constraint: nonFullTextConstraints(indexPlan, 
planResult)) {
+            for (QueryBuilder constraint : nonFullTextConstraints(indexPlan, 
planResult)) {
                 boolQuery.filter(constraint);
             }
         }
@@ -189,6 +205,81 @@ public class ElasticRequestHandler {
                 .map(pd -> pd.name);
     }
 
+    /*
+    Generates mlt query builder from the given mltQueryString
+    There could be 2 cases here -
+    1) select [jcr:path] from [nt:base] where similar(., '/test/a') [Return 
nodes with similar content to /test/a]
+    Xpath variant - //element(*, nt:base)[rep:similar(., '/test/a')]
+    In this case org.apache.jackrabbit.oak.query.ast.SimilarImpl creates the 
mltQueryString as
+    mlt?mlt.fl=:path&mlt.mindf=0&stream.body=/test/a
+    2) select [jcr:path] from [nt:base] where " +
+       "native('elastic-sim', 
'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0')
+       In this case the exact mlt query passed above is passed to this 
method. This can be useful if we want to
+       fine tune the various default parameters.
+       The function name passed to native func ('elastic-sim') needs to be 
defined on index def
+       Refer 
https://jackrabbit.apache.org/oak/docs/query/lucene.html#native-query
+       TODO : Docs for writing a native mlt query with the various parameters 
that can be tuned
+       (The above is important since this is not a one-size-fits-all situation 
and the default values might not
+       be useful in every situation based on the type of content)
+     */
+    private QueryBuilder moreLikeThisQuery(String mltQueryString) {
+        MoreLikeThisQueryBuilder mlt;
+        Map<String, String> paramMap = 
MoreLikeThisHelperUtil.getParamMapFromMltQuery(mltQueryString);
+        String text = paramMap.get(MoreLikeThisHelperUtil.MLT_STREAM_BODY);
+        String fields = paramMap.get(MoreLikeThisHelperUtil.MLT_FILED);
+
+        if (text != null) {
+            // It's expected the text here to be the path of the doc
+            // In case the path of a node is greater than 512 bytes,
+            // we hash it before storing it as the _id for the elastic doc
+            text = ElasticIndexUtils.idFromPath(text);
+            if (FieldNames.PATH.equals(fields) || fields == null) {
+                // Handle the case 1) where default query sent by SimilarImpl 
(No Custom fields)
+                // We just need to specify the doc (Item) whose similar 
content we need to find
+                // We store path as the _id so no need to do anything extra 
here
+                // We expect Similar impl to send a query where text would 
have evaluated to node path.
+                mlt = new MoreLikeThisQueryBuilder(null, new Item[]{new 
Item(null, text)});
+            } else {
+                // This is for native queries if someone sends additional 
fields via mlt.fl=field1,field2
+                String[] fieldsArray = fields.split(",");
+                mlt = new MoreLikeThisQueryBuilder(fieldsArray, null, new 
Item[]{new Item(null, text)});
+            }
+            // TODO : See if we might want to support like Text here (passed 
as null in above constructors)
+            // It is not supported in our lucene implementation.
+        } else {
+            throw new RuntimeException("Missing required field stream.body in  
MLT query: " + mltQueryString);
+        }
+
+        for (String key : paramMap.keySet()) {
+            String val = paramMap.get(key);
+            if (MoreLikeThisHelperUtil.MLT_MIN_DOC_FREQ.equals(key)) {
+                mlt.minDocFreq(Integer.parseInt(val));
+            } else if (MoreLikeThisHelperUtil.MLT_MIN_TERM_FREQ.equals(key)) {
+                mlt.minTermFreq(Integer.parseInt(val));
+            } else if (MoreLikeThisHelperUtil.MLT_BOOST_FACTOR.equals(key)) {
+                mlt.boost(Float.parseFloat(val));
+            } else if (MoreLikeThisHelperUtil.MLT_MAX_DOC_FREQ.equals(key)) {
+                mlt.maxDocFreq(Integer.parseInt(val));
+            } else if (MoreLikeThisHelperUtil.MLT_MAX_QUERY_TERMS.equals(key)) 
{
+                mlt.maxQueryTerms(Integer.parseInt(val));
+            } else if (MoreLikeThisHelperUtil.MLT_MAX_WORD_LENGTH.equals(key)) 
{
+                mlt.maxWordLength(Integer.parseInt(val));
+            } else if (MoreLikeThisHelperUtil.MLT_MIN_WORD_LENGTH.equals(key)) 
{
+                mlt.minWordLength(Integer.parseInt(val));
+            } else if 
(MoreLikeThisHelperUtil.MLT_MIN_SHOULD_MATCH.equals(key)) {
+                mlt.minimumShouldMatch(val);
+            } else if (MoreLikeThisHelperUtil.MLT_STOP_WORDS.equals(key)) {
+                // TODO : Read this from a stopwords text file, configured via 
index defn maybe ?
+                String[] stopWords = val.split(",");
+                mlt.stopWords(stopWords);
+            } else {
+                LOG.warn("Unrecognized param {} in the mlt query {}", key, 
mltQueryString);
+            }
+        }
+
+        return mlt;
+    }
+
     public PhraseSuggestionBuilder suggestQuery(String field, String 
spellCheckQuery) {
         BoolQueryBuilder query = boolQuery()
                 .must(new MatchPhraseQueryBuilder(field, "{{suggestion}}"));

Added: 
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java?rev=1879359&view=auto
==============================================================================
--- 
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java
 (added)
+++ 
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java
 Tue Jun 30 10:01:56 2020
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.elastic.util;
+
+
+import org.jetbrains.annotations.NotNull;
+
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+
+public class ElasticIndexUtils {
+
+    /**
+     * Transforms a path into an _id compatible with Elasticsearch 
specification. The path cannot be larger than 512
+     * bytes. For performance reasons paths that are already compatible are 
returned untouched. Otherwise, SHA-256
+     * algorithm is used to return a transformed path (32 bytes max).
+     *
+     * @param path the document path
+     * @return the Elasticsearch compatible path
+     * @see <a 
href="https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-id-field.html">
+     * Mapping _id field</a>
+     */
+    public static String idFromPath(@NotNull String path) {
+        byte[] pathBytes = path.getBytes(StandardCharsets.UTF_8);
+        if (pathBytes.length > 512) {
+            try {
+                return new 
String(MessageDigest.getInstance("SHA-256").digest(pathBytes));
+            } catch (NoSuchAlgorithmException e) {
+                throw new IllegalStateException(e);
+            }
+        }
+        return path;
+    }
+}

Propchange: 
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java?rev=1879359&view=auto
==============================================================================
--- 
jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
 (added)
+++ 
jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
 Tue Jun 30 10:01:56 2020
@@ -0,0 +1,198 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.elastic;
+
+
+import org.apache.jackrabbit.oak.api.Tree;
+import org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants;
+import 
org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.UUID;
+
+
+public class ElasticSimilarQueryTest extends ElasticAbstractQueryTest {
+
+    /*
+    This test mirrors the test 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexQueryTest#testRepSimilarAsNativeQuery
+    Exact same test data, to test out for feature parity
+    The only difference is the same query in lucene returns the doc itself 
(the one that we need similar docs of) as part of search results
+    whereas in elastic, it doesn't.
+     */
+    @Test
+    public void testRepSimilarAsNativeQuery() throws Exception {
+
+        createIndex(true);
+
+        String nativeQueryString = "select [jcr:path] from [nt:base] where " +
+                "native('elastic-sim', 
'mlt?stream.body=/test/c&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0')";
+        Tree test = root.getTree("/").addChild("test");
+        test.addChild("a").setProperty("text", "Hello World");
+        test.addChild("b").setProperty("text", "He said Hello and then the 
world said Hello as well.");
+        test.addChild("c").setProperty("text", "He said Hi.");
+        root.commit();
+
+        assertEventually(() -> assertQuery(nativeQueryString,
+                Arrays.asList("/test/b")));
+    }
+
+
+    /*
+    This test mirrors the test 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexQueryTest#testRepSimilarQuery
+    Exact same test data, to test out for feature parity
+    The only difference is the same query in lucene returns the doc itself 
(the one that we need similar docs of) as part of search results
+    whereas in elastic, it doesn't.
+     */
+    @Test
+    public void testRepSimilarQuery() throws Exception {
+        createIndex(false);
+
+        String query = "select [jcr:path] from [nt:base] where similar(., 
'/test/a')";
+        Tree test = root.getTree("/").addChild("test");
+        test.addChild("a").setProperty("text", "Hello World Hello World");
+        test.addChild("b").setProperty("text", "Hello World");
+        test.addChild("c").setProperty("text", "World");
+        test.addChild("d").setProperty("text", "Hello");
+        test.addChild("e").setProperty("text", "Bye Bye");
+        test.addChild("f").setProperty("text", "Hello");
+        test.addChild("g").setProperty("text", "World");
+        test.addChild("h").setProperty("text", "Hello");
+        root.commit();
+
+        assertEventually(() -> assertQuery(query,
+                Arrays.asList("/test/b", "/test/c", "/test/d", "/test/f", 
"/test/g", "/test/h")));
+    }
+
+    /*
+    This test mirrors the test 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexQueryTest#testRepSimilarXPathQuery
+    Exact same test data, to test out for feature parity
+    The only difference is the same query in lucene returns the doc itself 
(the one that we need similar docs of) as part of search results
+    whereas in elastic, it doesn't.
+     */
+    @Test
+    public void testRepSimilarXPathQuery() throws Exception {
+        createIndex(false);
+
+        String query = "//element(*, nt:base)[rep:similar(., '/test/a')]";
+        Tree test = root.getTree("/").addChild("test");
+        test.addChild("a").setProperty("text", "Hello World Hello World");
+        test.addChild("b").setProperty("text", "Hello World");
+        test.addChild("c").setProperty("text", "World");
+        test.addChild("d").setProperty("text", "Hello");
+        test.addChild("e").setProperty("text", "Bye Bye");
+        test.addChild("f").setProperty("text", "Hello");
+        test.addChild("g").setProperty("text", "World");
+        test.addChild("h").setProperty("text", "Hello");
+        root.commit();
+        assertEventually(() -> assertQuery(query, XPATH,
+                Arrays.asList("/test/b", "/test/c", "/test/d", "/test/f", 
"/test/g", "/test/h")));
+    }
+
+
+    @Test
+    public void testRepSimilarWithStopWords() throws Exception {
+        createIndex(true);
+
+        String nativeQueryStringWithStopWords = "select [jcr:path] from 
[nt:base] where " +
+                "native('elastic-sim', 
'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0&mlt.stopwords=Hello,bye')";
+
+        String nativeQueryStringWithouStopWords =  "select [jcr:path] from 
[nt:base] where " +
+                "native('elastic-sim', 
'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0&mlt.minshouldmatch=20%')";
+
+        Tree test = root.getTree("/").addChild("test");
+        test.addChild("a").setProperty("text", "Hello World. Ok Bye Bye now. 
See you tomorrow.");
+        test.addChild("b").setProperty("text", "He said Hello and then the she 
said Hello as well.");
+        test.addChild("c").setProperty("text", "He said Bye.");
+        test.addChild("d").setProperty("text", "Bye Bye World.");
+        test.addChild("e").setProperty("text", "See you Tomorrow");
+        test.addChild("f").setProperty("text", "Hello Mr X. Let's catch up 
tomorrow. Bye Bye");
+        test.addChild("g").setProperty("text", "Random text");
+        root.commit();
+
+        // Matches due to terms Hello or bye should be ignored
+        assertEventually(() -> assertQuery(nativeQueryStringWithStopWords,
+                Arrays.asList("/test/e", "/test/f")));
+
+        assertEventually(() -> assertQuery(nativeQueryStringWithouStopWords,
+                Arrays.asList("/test/b", "/test/c", "/test/d", "/test/e", 
"/test/f")));
+    }
+
+    @Test
+    public void testRepSimilarWithMinWordLength() throws Exception {
+        createIndex(true);
+        String nativeQueryStringWithMinWordLength = "select [jcr:path] from 
[nt:base] where " +
+                "native('elastic-sim', 
'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0&mlt.minwl=6')";
+
+        String nativeQueryStringWithoutMinWordLength = "select [jcr:path] from 
[nt:base] where " +
+                "native('elastic-sim', 
'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0')";
+
+        Tree test = root.getTree("/").addChild("test");
+        test.addChild("a").setProperty("text", "Hello Worlds.");
+        test.addChild("b").setProperty("text", "He said Hello and then the 
world said Hello as well.");
+        test.addChild("c").setProperty("text", "War of the worlds is a good 
movie");
+        test.addChild("d").setProperty("text", "Hello. How are you? Worlds");
+        root.commit();
+
+        // Matches because of term Hello should be ignored since wl <6 (so 
/test/b should NOT be in the match list)
+        // /test/d should be in match list (because of Worlds term)
+        assertEventually(() -> assertQuery(nativeQueryStringWithMinWordLength,
+                Arrays.asList("/test/c", "/test/d")));
+
+        assertEventually(() -> 
assertQuery(nativeQueryStringWithoutMinWordLength,
+                Arrays.asList("/test/b", "/test/c", "/test/d")));
+
+    }
+
+
+    @Test
+    public void testRepSimilarQueryWithLongPath() throws Exception {
+        createIndex(false);
+        Tree test = root.getTree("/").addChild("test");
+        Tree longPath = test.addChild("a");
+        for (int i = 0; i < 258; i ++) {
+            longPath = longPath.addChild("a"+i);
+        }
+        longPath.setProperty("text", "Hello World Hello World");
+        test.addChild("b").setProperty("text", "Hello World");
+        test.addChild("c").setProperty("text", "World");
+        test.addChild("d").setProperty("text", "Hello");
+        test.addChild("e").setProperty("text", "Bye Bye");
+        test.addChild("f").setProperty("text", "Hello");
+        test.addChild("g").setProperty("text", "World");
+        test.addChild("h").setProperty("text", "Hello");
+        root.commit();
+
+        String query = "select [jcr:path] from [nt:base] where similar(., 
'"+longPath.getPath()+"')";
+
+        assertEventually(() -> assertQuery(query,
+                Arrays.asList("/test/b", "/test/c", "/test/d", "/test/f", 
"/test/g", "/test/h")));
+    }
+
+
+    private void createIndex(boolean nativeQuery) throws Exception {
+        IndexDefinitionBuilder builder = createIndex("text");
+        if (nativeQuery) {
+            
builder.getBuilderTree().setProperty(FulltextIndexConstants.FUNC_NAME, 
"elastic-sim");
+        }
+        builder.indexRule("nt:base").property("text").analyzed();
+        String indexId = UUID.randomUUID().toString();
+        setIndex(indexId, builder);
+        root.commit();
+    }
+
+}

Propchange: 
jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/MoreLikeThisHelperUtil.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/MoreLikeThisHelperUtil.java?rev=1879359&view=auto
==============================================================================
--- 
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/MoreLikeThisHelperUtil.java
 (added)
+++ 
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/MoreLikeThisHelperUtil.java
 Tue Jun 30 10:01:56 2020
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.search;
+
+
+import java.util.HashMap;
+import java.util.Map;
+
+/*
+Helper class to assist with mlt query formation for elastic and lucene
+ */
+public class MoreLikeThisHelperUtil {
+
+    /*
+        A list of fields to fetch and analyze the text from.
+        Default analyzes all the indexed fields.
+     */
+    public static final String MLT_FILED = "mlt.fl";
+
+    /*
+        The minimum document frequency for a term below which the terms will 
be ignored from the input document.
+        Defaults to 5
+     */
+    public static final String MLT_MIN_DOC_FREQ = "mlt.mindf";
+
+    /*
+        The maximum document frequency above which the terms will be ignored 
from the input document.
+        This could be useful in order to ignore highly frequent words such as 
stop words. Defaults to INTEGER.MAX
+     */
+    public static final String MLT_MAX_DOC_FREQ = "mlt.maxdf";
+
+    /*
+        The minimum term frequency (Number of times the term occurs in the 
input doc)
+        below which the terms will be ignored from the input document.
+        Defaults to 2
+     */
+    public static final String MLT_MIN_TERM_FREQ = "mlt.mintf";
+
+    /*
+        Bool value if boost should be supported or not. Only valid for lucene
+        Not available in elastic.
+     */
+    public static final String MLT_BOOST = "mlt.boost";
+
+    /*
+        Sets the boost value of the whole query. Defaults to 1.0.
+     */
+    public static final String MLT_BOOST_FACTOR = "mlt.qf";
+
+    /*
+        Only For Lucene
+     */
+    public static final String MLT_MAX_DOC_FREQ_PCT = "mlt.maxdfp";
+
+    /*
+        Only For Lucene
+     */
+    public static final String MLT_MAX_NUM_TOKENS_PARSED = "mlt.maxntp";
+
+    /*
+        The maximum number of query terms that will be selected.
+        Increasing this value gives greater accuracy at the expense of query 
execution speed. Defaults to 25.
+     */
+    public static final String MLT_MAX_QUERY_TERMS = "mlt.maxqt";
+
+    /*
+        The maximum word length above which the terms will be ignored. The old 
name max_word_len is deprecated.
+        Defaults to unbounded
+     */
+    public static final String MLT_MAX_WORD_LENGTH = "mlt.maxwl";
+
+    /*
+        The minimum word length below which the terms will be ignored.
+        Defaults to 0
+     */
+    public static final String MLT_MIN_WORD_LENGTH = "mlt.minwl";
+
+    /*
+        An array of stop words.
+        Any word in this set is considered "uninteresting" and ignored.
+        Only applicable for ELASTIC
+     */
+    public static final String MLT_STOP_WORDS = "mlt.stopwords";
+
+    /*
+        This should have either the id to the doc whose similar docs need to 
be searched or the complete body of the doc.
+        Defautls to ID (via the rep:similar query).
+     */
+    public static final String MLT_STREAM_BODY = "stream.body";
+
+    /*
+        After the disjunctive query has been formed,
+        this parameter controls the number of terms that must match.
+        (Defaults to "30%").
+        Only applicable for ELASTIC
+     */
+    public static final String MLT_MIN_SHOULD_MATCH = "mlt.minshouldmatch";
+
+
+
+    /*
+    Returns param map for a query string of type 
mlt.fl=:path&mlt.mindf=0&stream.body=/test/a
+     */
+    public static Map<String, String> getParamMapFromMltQuery(String 
mltQueryString) {
+        Map<String, String> paramMap = new HashMap();
+        try {
+            for (String param : mltQueryString.split("&")) {
+                String[] keyValuePair = param.split("=");
+                if (keyValuePair.length != 2 || keyValuePair[0] == null || 
keyValuePair[1] == null) {
+                    throw new RuntimeException("Unparsable native MLT query: " 
+ mltQueryString);
+                } else {
+                    paramMap.put(keyValuePair[0], keyValuePair[1]);
+                }
+            }
+        } catch (Exception e) {
+            throw new RuntimeException("Error while parsing native MLT query: 
" + mltQueryString);
+        }
+
+        if (paramMap.size() == 0) {
+            throw new RuntimeException("No params found while parsing the MLT 
query : " + mltQueryString);
+        }
+
+        return paramMap;
+    }
+
+
+}

Propchange: 
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/MoreLikeThisHelperUtil.java
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to