Author: ngupta
Date: Tue Jun 30 10:01:56 2020
New Revision: 1879359
URL: http://svn.apache.org/viewvc?rev=1879359&view=rev
Log:
OAK-9127 | Introduce Similarity Search (Text) support in oak-search-elastic
Added:
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java
(with props)
jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
(with props)
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/MoreLikeThisHelperUtil.java
(with props)
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/MoreLikeThisHelper.java
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexWriter.java
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/MoreLikeThisHelper.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/MoreLikeThisHelper.java?rev=1879359&r1=1879358&r2=1879359&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/MoreLikeThisHelper.java
(original)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/MoreLikeThisHelper.java
Tue Jun 30 10:01:56 2020
@@ -19,8 +19,10 @@ package org.apache.jackrabbit.oak.plugin
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map;
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
+import org.apache.jackrabbit.oak.plugins.index.search.MoreLikeThisHelperUtil;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
@@ -43,40 +45,38 @@ public class MoreLikeThisHelper {
MoreLikeThis mlt = new MoreLikeThis(reader);
mlt.setAnalyzer(analyzer);
try {
+ Map<String, String> paramMap =
MoreLikeThisHelperUtil.getParamMapFromMltQuery(mltQueryString);
String text = null;
String[] fields = {};
- for (String param : mltQueryString.split("&")) {
- String[] keyValuePair = param.split("=");
- if (keyValuePair.length != 2 || keyValuePair[0] == null ||
keyValuePair[1] == null) {
- throw new RuntimeException("Unparsable native Lucene MLT
query: " + mltQueryString);
- } else {
- if ("stream.body".equals(keyValuePair[0])) {
- text = keyValuePair[1];
- } else if ("mlt.fl".equals(keyValuePair[0])) {
- fields = keyValuePair[1].split(",");
- } else if ("mlt.mindf".equals(keyValuePair[0])) {
- mlt.setMinDocFreq(Integer.parseInt(keyValuePair[1]));
- } else if ("mlt.mintf".equals(keyValuePair[0])) {
- mlt.setMinTermFreq(Integer.parseInt(keyValuePair[1]));
- } else if ("mlt.boost".equals(keyValuePair[0])) {
- mlt.setBoost(Boolean.parseBoolean(keyValuePair[1]));
- } else if ("mlt.qf".equals(keyValuePair[0])) {
- mlt.setBoostFactor(Float.parseFloat(keyValuePair[1]));
- } else if ("mlt.maxdf".equals(keyValuePair[0])) {
- mlt.setMaxDocFreq(Integer.parseInt(keyValuePair[1]));
- } else if ("mlt.maxdfp".equals(keyValuePair[0])) {
-
mlt.setMaxDocFreqPct(Integer.parseInt(keyValuePair[1]));
- } else if ("mlt.maxntp".equals(keyValuePair[0])) {
-
mlt.setMaxNumTokensParsed(Integer.parseInt(keyValuePair[1]));
- } else if ("mlt.maxqt".equals(keyValuePair[0])) {
-
mlt.setMaxQueryTerms(Integer.parseInt(keyValuePair[1]));
- } else if ("mlt.maxwl".equals(keyValuePair[0])) {
- mlt.setMaxWordLen(Integer.parseInt(keyValuePair[1]));
- } else if ("mlt.minwl".equals(keyValuePair[0])) {
- mlt.setMinWordLen(Integer.parseInt(keyValuePair[1]));
- }
+ for (String key : paramMap.keySet()) {
+ String value = paramMap.get(key);
+ if (MoreLikeThisHelperUtil.MLT_STREAM_BODY.equals(key)) {
+ text = value;
+ } else if (MoreLikeThisHelperUtil.MLT_FILED.equals(key)) {
+ fields = value.split(",");
+ } else if
(MoreLikeThisHelperUtil.MLT_MIN_DOC_FREQ.equals(key)) {
+ mlt.setMinDocFreq(Integer.parseInt(value));
+ } else if
(MoreLikeThisHelperUtil.MLT_MIN_TERM_FREQ.equals(key)) {
+ mlt.setMinTermFreq(Integer.parseInt(value));
+ } else if (MoreLikeThisHelperUtil.MLT_BOOST.equals(key)) {
+ mlt.setBoost(Boolean.parseBoolean(value));
+ } else if
(MoreLikeThisHelperUtil.MLT_BOOST_FACTOR.equals(key)) {
+ mlt.setBoostFactor(Float.parseFloat(value));
+ } else if
(MoreLikeThisHelperUtil.MLT_MAX_DOC_FREQ.equals(key)) {
+ mlt.setMaxDocFreq(Integer.parseInt(value));
+ } else if
(MoreLikeThisHelperUtil.MLT_MAX_DOC_FREQ_PCT.equals(key)) {
+ mlt.setMaxDocFreqPct(Integer.parseInt(value));
+ } else if
(MoreLikeThisHelperUtil.MLT_MAX_NUM_TOKENS_PARSED.equals(key)) {
+ mlt.setMaxNumTokensParsed(Integer.parseInt(value));
+ } else if
(MoreLikeThisHelperUtil.MLT_MAX_QUERY_TERMS.equals(key)) {
+ mlt.setMaxQueryTerms(Integer.parseInt(value));
+ } else if
(MoreLikeThisHelperUtil.MLT_MAX_WORD_LENGTH.equals(key)) {
+ mlt.setMaxWordLen(Integer.parseInt(value));
+ } else if
(MoreLikeThisHelperUtil.MLT_MIN_WORD_LENGTH.equals(key)) {
+ mlt.setMinWordLen(Integer.parseInt(value));
}
}
+
if (text != null) {
if (FieldNames.PATH.equals(fields[0])) {
IndexSearcher searcher = new IndexSearcher(reader);
Modified:
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexWriter.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexWriter.java?rev=1879359&r1=1879358&r2=1879359&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexWriter.java
(original)
+++
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexWriter.java
Tue Jun 30 10:01:56 2020
@@ -18,6 +18,7 @@ package org.apache.jackrabbit.oak.plugin
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticConnection;
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
+import org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils;
import
org.apache.jackrabbit.oak.plugins.index.search.spi.editor.FulltextIndexWriter;
import org.elasticsearch.ElasticsearchStatusException;
import org.elasticsearch.action.DocWriteRequest;
@@ -115,7 +116,7 @@ class ElasticIndexWriter implements Full
@Override
public void updateDocument(String path, ElasticDocument doc) {
IndexRequest request = new
IndexRequest(indexDefinition.getRemoteIndexAlias())
- .id(idFromPath(path))
+ .id(ElasticIndexUtils.idFromPath(path))
.source(doc.build(), XContentType.JSON);
bulkProcessor.add(request);
}
@@ -123,7 +124,7 @@ class ElasticIndexWriter implements Full
@Override
public void deleteDocuments(String path) {
DeleteRequest request = new
DeleteRequest(indexDefinition.getRemoteIndexAlias())
- .id(idFromPath(path));
+ .id(ElasticIndexUtils.idFromPath(path));
bulkProcessor.add(request);
}
@@ -275,25 +276,4 @@ class ElasticIndexWriter implements Full
}
}
- /**
- * Transforms a path into an _id compatible with Elasticsearch
specification. The path cannot be larger than 512
- * bytes. For performance reasons paths that are already compatible are
returned untouched. Otherwise, SHA-256
- * algorithm is used to return a transformed path (32 bytes max).
- *
- * @param path the document path
- * @return the Elasticsearch compatible path
- * @see <a
href="https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-id-field.html">
- * Mapping _id field</a>
- */
- private static String idFromPath(@NotNull String path) {
- byte[] pathBytes = path.getBytes(StandardCharsets.UTF_8);
- if (pathBytes.length > 512) {
- try {
- return new
String(MessageDigest.getInstance("SHA-256").digest(pathBytes));
- } catch (NoSuchAlgorithmException e) {
- throw new IllegalStateException(e);
- }
- }
- return path;
- }
}
Modified:
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java?rev=1879359&r1=1879358&r2=1879359&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
(original)
+++
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
Tue Jun 30 10:01:56 2020
@@ -20,8 +20,10 @@ import org.apache.jackrabbit.oak.api.Typ
import org.apache.jackrabbit.oak.commons.PathUtils;
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
import
org.apache.jackrabbit.oak.plugins.index.elastic.query.async.facets.ElasticFacetProvider;
+import org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils;
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
+import org.apache.jackrabbit.oak.plugins.index.search.MoreLikeThisHelperUtil;
import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition;
import org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndex;
import
org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndexPlanner;
@@ -38,6 +40,7 @@ import org.apache.jackrabbit.oak.spi.que
import org.apache.lucene.search.WildcardQuery;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MatchPhraseQueryBuilder;
+import org.elasticsearch.index.query.MoreLikeThisQueryBuilder;
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
import org.elasticsearch.index.query.Operator;
import org.elasticsearch.index.query.QueryBuilder;
@@ -50,15 +53,16 @@ import org.elasticsearch.search.suggest.
import org.elasticsearch.search.suggest.phrase.PhraseSuggestionBuilder;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
-
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import javax.jcr.PropertyType;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.BiPredicate;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
-
import static org.apache.jackrabbit.JcrConstants.JCR_MIXINTYPES;
import static org.apache.jackrabbit.JcrConstants.JCR_PRIMARYTYPE;
import static org.apache.jackrabbit.oak.commons.PathUtils.denotesRoot;
@@ -82,12 +86,14 @@ import static org.elasticsearch.index.qu
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery;
import static org.elasticsearch.index.query.QueryBuilders.termQuery;
+import static org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item;
/**
* Class to map query plans into Elastic request objects.
*/
public class ElasticRequestHandler {
+ private static final Logger LOG =
LoggerFactory.getLogger(ElasticRequestHandler.class);
private final static String SPELLCHECK_PREFIX = "spellcheck?term=";
private static final String ES_TRIGRAM_SUFFIX = ".trigram";
@@ -122,9 +128,19 @@ public class ElasticRequestHandler {
}
if (propertyRestrictionQuery != null) {
- boolQuery.must(queryStringQuery(propertyRestrictionQuery));
+ if (propertyRestrictionQuery.startsWith("mlt?")) {
+ // SimilarImpl in oak-core sets the property restriction for a
similarity search, and the query is something like
+ // mlt?mlt.fl=:path&mlt.mindf=0&stream.body=<path> . We need to
parse this query string and turn it into a query
+ // that elastic can understand.
+ String mltQueryString =
propertyRestrictionQuery.replace("mlt?", "");
+ boolQuery.must(moreLikeThisQuery(mltQueryString));
+
+ } else {
+ boolQuery.must(queryStringQuery(propertyRestrictionQuery));
+ }
+
} else if (planResult.evaluateNonFullTextConstraints()) {
- for (QueryBuilder constraint: nonFullTextConstraints(indexPlan,
planResult)) {
+ for (QueryBuilder constraint : nonFullTextConstraints(indexPlan,
planResult)) {
boolQuery.filter(constraint);
}
}
@@ -189,6 +205,81 @@ public class ElasticRequestHandler {
.map(pd -> pd.name);
}
+ /*
+ Generates mlt query builder from the given mltQueryString
+ There could be 2 cases here -
+ 1) select [jcr:path] from [nt:base] where similar(., '/test/a') [Return
nodes with similar content to /test/a]
+ Xpath variant - //element(*, nt:base)[rep:similar(., '/test/a')]
+ In this case org.apache.jackrabbit.oak.query.ast.SimilarImpl creates the
mltQueryString as
+ mlt?mlt.fl=:path&mlt.mindf=0&stream.body=/test/a
+ 2) select [jcr:path] from [nt:base] where " +
+ "native('elastic-sim',
'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0')
+ In this case the exact mlt query passed above is passed to this
method. This can be useful if we want to
+ fine tune the various default parameters.
+ The function name passed to native func ('elastic-sim') needs to be
defined on index def
+ Refer
https://jackrabbit.apache.org/oak/docs/query/lucene.html#native-query
+ TODO : Docs for writing a native mlt query with the various parameters
that can be tuned
+ (The above is important since this is not a one-size-fits-all situation
and the default values might not
+ be useful in every situation based on the type of content)
+ */
+ private QueryBuilder moreLikeThisQuery(String mltQueryString) {
+ MoreLikeThisQueryBuilder mlt;
+ Map<String, String> paramMap =
MoreLikeThisHelperUtil.getParamMapFromMltQuery(mltQueryString);
+ String text = paramMap.get(MoreLikeThisHelperUtil.MLT_STREAM_BODY);
+ String fields = paramMap.get(MoreLikeThisHelperUtil.MLT_FILED);
+
+ if (text != null) {
+ // It's expected the text here to be the path of the doc
+ // In case the path of a node is greater than 512 bytes,
+ // we hash it before storing it as the _id for the elastic doc
+ text = ElasticIndexUtils.idFromPath(text);
+ if (FieldNames.PATH.equals(fields) || fields == null) {
+ // Handle the case 1) where default query sent by SimilarImpl
(No Custom fields)
+ // We just need to specify the doc (Item) whose similar
content we need to find
+ // We store path as the _id so no need to do anything extra
here
+ // We expect Similar impl to send a query where text would
have evaluated to node path.
+ mlt = new MoreLikeThisQueryBuilder(null, new Item[]{new
Item(null, text)});
+ } else {
+ // This is for native queries if someone sends additional
fields via mlt.fl=field1,field2
+ String[] fieldsArray = fields.split(",");
+ mlt = new MoreLikeThisQueryBuilder(fieldsArray, null, new
Item[]{new Item(null, text)});
+ }
+ // TODO : See if we might want to support "like" text here (passed
as null in the above constructors)
+ // It is not supported in our lucene implementation.
+ } else {
+ throw new RuntimeException("Missing required field stream.body in
MLT query: " + mltQueryString);
+ }
+
+ for (String key : paramMap.keySet()) {
+ String val = paramMap.get(key);
+ if (MoreLikeThisHelperUtil.MLT_MIN_DOC_FREQ.equals(key)) {
+ mlt.minDocFreq(Integer.parseInt(val));
+ } else if (MoreLikeThisHelperUtil.MLT_MIN_TERM_FREQ.equals(key)) {
+ mlt.minTermFreq(Integer.parseInt(val));
+ } else if (MoreLikeThisHelperUtil.MLT_BOOST_FACTOR.equals(key)) {
+ mlt.boost(Float.parseFloat(val));
+ } else if (MoreLikeThisHelperUtil.MLT_MAX_DOC_FREQ.equals(key)) {
+ mlt.maxDocFreq(Integer.parseInt(val));
+ } else if (MoreLikeThisHelperUtil.MLT_MAX_QUERY_TERMS.equals(key))
{
+ mlt.maxQueryTerms(Integer.parseInt(val));
+ } else if (MoreLikeThisHelperUtil.MLT_MAX_WORD_LENGTH.equals(key))
{
+ mlt.maxWordLength(Integer.parseInt(val));
+ } else if (MoreLikeThisHelperUtil.MLT_MIN_WORD_LENGTH.equals(key))
{
+ mlt.minWordLength(Integer.parseInt(val));
+ } else if
(MoreLikeThisHelperUtil.MLT_MIN_SHOULD_MATCH.equals(key)) {
+ mlt.minimumShouldMatch(val);
+ } else if (MoreLikeThisHelperUtil.MLT_STOP_WORDS.equals(key)) {
+ // TODO : Read this from a stopwords text file, configured via
index defn maybe ?
+ String[] stopWords = val.split(",");
+ mlt.stopWords(stopWords);
+ } else {
+ LOG.warn("Unrecognized param {} in the mlt query {}", key,
mltQueryString);
+ }
+ }
+
+ return mlt;
+ }
+
public PhraseSuggestionBuilder suggestQuery(String field, String
spellCheckQuery) {
BoolQueryBuilder query = boolQuery()
.must(new MatchPhraseQueryBuilder(field, "{{suggestion}}"));
Added:
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java?rev=1879359&view=auto
==============================================================================
---
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java
(added)
+++
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java
Tue Jun 30 10:01:56 2020
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.elastic.util;
+
+
+import org.jetbrains.annotations.NotNull;
+
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+
+public class ElasticIndexUtils {
+
+ /**
+ * Transforms a path into an _id compatible with Elasticsearch
specification. The path cannot be larger than 512
+ * bytes. For performance reasons paths that are already compatible are
returned untouched. Otherwise, SHA-256
+ * algorithm is used to return a transformed path (32 bytes max).
+ *
+ * @param path the document path
+ * @return the Elasticsearch compatible path
+ * @see <a
href="https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-id-field.html">
+ * Mapping _id field</a>
+ */
+ public static String idFromPath(@NotNull String path) {
+ byte[] pathBytes = path.getBytes(StandardCharsets.UTF_8);
+ if (pathBytes.length > 512) {
+ try {
+ return new
String(MessageDigest.getInstance("SHA-256").digest(pathBytes));
+ } catch (NoSuchAlgorithmException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+ return path;
+ }
+}
Propchange:
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java?rev=1879359&view=auto
==============================================================================
---
jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
(added)
+++
jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
Tue Jun 30 10:01:56 2020
@@ -0,0 +1,198 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.elastic;
+
+
+import org.apache.jackrabbit.oak.api.Tree;
+import org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants;
+import
org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.UUID;
+
+
+public class ElasticSimilarQueryTest extends ElasticAbstractQueryTest {
+
+ /*
+ This test mirrors the test
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexQueryTest#testRepSimilarAsNativeQuery
+ Exact same test data, to test out for feature parity
+ The only difference is the same query in lucene returns the doc itself
(the one that we need similar docs of) as part of search results
+ whereas in elastic, it doesn't.
+ */
+ @Test
+ public void testRepSimilarAsNativeQuery() throws Exception {
+
+ createIndex(true);
+
+ String nativeQueryString = "select [jcr:path] from [nt:base] where " +
+ "native('elastic-sim',
'mlt?stream.body=/test/c&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0')";
+ Tree test = root.getTree("/").addChild("test");
+ test.addChild("a").setProperty("text", "Hello World");
+ test.addChild("b").setProperty("text", "He said Hello and then the
world said Hello as well.");
+ test.addChild("c").setProperty("text", "He said Hi.");
+ root.commit();
+
+ assertEventually(() -> assertQuery(nativeQueryString,
+ Arrays.asList("/test/b")));
+ }
+
+
+ /*
+ This test mirrors the test
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexQueryTest#testRepSimilarQuery
+ Exact same test data, to test out for feature parity
+ The only difference is the same query in lucene returns the doc itself
(the one that we need similar docs of) as part of search results
+ whereas in elastic, it doesn't.
+ */
+ @Test
+ public void testRepSimilarQuery() throws Exception {
+ createIndex(false);
+
+ String query = "select [jcr:path] from [nt:base] where similar(.,
'/test/a')";
+ Tree test = root.getTree("/").addChild("test");
+ test.addChild("a").setProperty("text", "Hello World Hello World");
+ test.addChild("b").setProperty("text", "Hello World");
+ test.addChild("c").setProperty("text", "World");
+ test.addChild("d").setProperty("text", "Hello");
+ test.addChild("e").setProperty("text", "Bye Bye");
+ test.addChild("f").setProperty("text", "Hello");
+ test.addChild("g").setProperty("text", "World");
+ test.addChild("h").setProperty("text", "Hello");
+ root.commit();
+
+ assertEventually(() -> assertQuery(query,
+ Arrays.asList("/test/b", "/test/c", "/test/d", "/test/f",
"/test/g", "/test/h")));
+ }
+
+ /*
+ This test mirrors the test
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexQueryTest#testRepSimilarXPathQuery
+ Exact same test data, to test out for feature parity
+ The only difference is the same query in lucene returns the doc itself
(the one that we need similar docs of) as part of search results
+ whereas in elastic, it doesn't.
+ */
+ @Test
+ public void testRepSimilarXPathQuery() throws Exception {
+ createIndex(false);
+
+ String query = "//element(*, nt:base)[rep:similar(., '/test/a')]";
+ Tree test = root.getTree("/").addChild("test");
+ test.addChild("a").setProperty("text", "Hello World Hello World");
+ test.addChild("b").setProperty("text", "Hello World");
+ test.addChild("c").setProperty("text", "World");
+ test.addChild("d").setProperty("text", "Hello");
+ test.addChild("e").setProperty("text", "Bye Bye");
+ test.addChild("f").setProperty("text", "Hello");
+ test.addChild("g").setProperty("text", "World");
+ test.addChild("h").setProperty("text", "Hello");
+ root.commit();
+ assertEventually(() -> assertQuery(query, XPATH,
+ Arrays.asList("/test/b", "/test/c", "/test/d", "/test/f",
"/test/g", "/test/h")));
+ }
+
+
+ @Test
+ public void testRepSimilarWithStopWords() throws Exception {
+ createIndex(true);
+
+ String nativeQueryStringWithStopWords = "select [jcr:path] from
[nt:base] where " +
+ "native('elastic-sim',
'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0&mlt.stopwords=Hello,bye')";
+
+ String nativeQueryStringWithouStopWords = "select [jcr:path] from
[nt:base] where " +
+ "native('elastic-sim',
'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0&mlt.minshouldmatch=20%')";
+
+ Tree test = root.getTree("/").addChild("test");
+ test.addChild("a").setProperty("text", "Hello World. Ok Bye Bye now.
See you tomorrow.");
+ test.addChild("b").setProperty("text", "He said Hello and then the she
said Hello as well.");
+ test.addChild("c").setProperty("text", "He said Bye.");
+ test.addChild("d").setProperty("text", "Bye Bye World.");
+ test.addChild("e").setProperty("text", "See you Tomorrow");
+ test.addChild("f").setProperty("text", "Hello Mr X. Let's catch up
tomorrow. Bye Bye");
+ test.addChild("g").setProperty("text", "Random text");
+ root.commit();
+
+ // Matches due to terms Hello or bye should be ignored
+ assertEventually(() -> assertQuery(nativeQueryStringWithStopWords,
+ Arrays.asList("/test/e", "/test/f")));
+
+ assertEventually(() -> assertQuery(nativeQueryStringWithouStopWords,
+ Arrays.asList("/test/b", "/test/c", "/test/d", "/test/e",
"/test/f")));
+ }
+
+ @Test
+ public void testRepSimilarWithMinWordLength() throws Exception {
+ createIndex(true);
+ String nativeQueryStringWithMinWordLength = "select [jcr:path] from
[nt:base] where " +
+ "native('elastic-sim',
'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0&mlt.minwl=6')";
+
+ String nativeQueryStringWithoutMinWordLength = "select [jcr:path] from
[nt:base] where " +
+ "native('elastic-sim',
'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0')";
+
+ Tree test = root.getTree("/").addChild("test");
+ test.addChild("a").setProperty("text", "Hello Worlds.");
+ test.addChild("b").setProperty("text", "He said Hello and then the
world said Hello as well.");
+ test.addChild("c").setProperty("text", "War of the worlds is a good
movie");
+ test.addChild("d").setProperty("text", "Hello. How are you? Worlds");
+ root.commit();
+
+ // Matches because of term Hello should be ignored since wl <6 (so
/test/b should NOT be in the match list)
+ // /test/d should be in match list (because of Worlds term)
+ assertEventually(() -> assertQuery(nativeQueryStringWithMinWordLength,
+ Arrays.asList("/test/c", "/test/d")));
+
+ assertEventually(() ->
assertQuery(nativeQueryStringWithoutMinWordLength,
+ Arrays.asList("/test/b", "/test/c", "/test/d")));
+
+ }
+
+
+ @Test
+ public void testRepSimilarQueryWithLongPath() throws Exception {
+ createIndex(false);
+ Tree test = root.getTree("/").addChild("test");
+ Tree longPath = test.addChild("a");
+ for (int i = 0; i < 258; i ++) {
+ longPath = longPath.addChild("a"+i);
+ }
+ longPath.setProperty("text", "Hello World Hello World");
+ test.addChild("b").setProperty("text", "Hello World");
+ test.addChild("c").setProperty("text", "World");
+ test.addChild("d").setProperty("text", "Hello");
+ test.addChild("e").setProperty("text", "Bye Bye");
+ test.addChild("f").setProperty("text", "Hello");
+ test.addChild("g").setProperty("text", "World");
+ test.addChild("h").setProperty("text", "Hello");
+ root.commit();
+
+ String query = "select [jcr:path] from [nt:base] where similar(.,
'"+longPath.getPath()+"')";
+
+ assertEventually(() -> assertQuery(query,
+ Arrays.asList("/test/b", "/test/c", "/test/d", "/test/f",
"/test/g", "/test/h")));
+ }
+
+
+ private void createIndex(boolean nativeQuery) throws Exception {
+ IndexDefinitionBuilder builder = createIndex("text");
+ if (nativeQuery) {
+
builder.getBuilderTree().setProperty(FulltextIndexConstants.FUNC_NAME,
"elastic-sim");
+ }
+ builder.indexRule("nt:base").property("text").analyzed();
+ String indexId = UUID.randomUUID().toString();
+ setIndex(indexId, builder);
+ root.commit();
+ }
+
+}
Propchange:
jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/MoreLikeThisHelperUtil.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/MoreLikeThisHelperUtil.java?rev=1879359&view=auto
==============================================================================
---
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/MoreLikeThisHelperUtil.java
(added)
+++
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/MoreLikeThisHelperUtil.java
Tue Jun 30 10:01:56 2020
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.search;
+
+
+import java.util.HashMap;
+import java.util.Map;
+
+/*
+Helper class to assist with mlt query formation for elastic and lucene
+ */
+public class MoreLikeThisHelperUtil {
+
+ /*
+ A list of fields to fetch and analyze the text from.
+ Default analyzes all the indexed fields.
+ */
+ public static final String MLT_FILED = "mlt.fl";
+
+ /*
+ The minimum document frequency for a term below which the terms will
be ignored from the input document.
+ Defaults to 5
+ */
+ public static final String MLT_MIN_DOC_FREQ = "mlt.mindf";
+
+ /*
+ The maximum document frequency above which the terms will be ignored
from the input document.
+ This could be useful in order to ignore highly frequent words such as
stop words. Defaults to Integer.MAX_VALUE
+ */
+ public static final String MLT_MAX_DOC_FREQ = "mlt.maxdf";
+
+ /*
+ The minimum term frequency (Number of times the term occurs in the
input doc)
+ below which the terms will be ignored from the input document.
+ Defaults to 2
+ */
+ public static final String MLT_MIN_TERM_FREQ = "mlt.mintf";
+
+ /*
+ Bool value if boost should be supported or not. Only valid for lucene
+ Not available in elastic.
+ */
+ public static final String MLT_BOOST = "mlt.boost";
+
+ /*
+ Sets the boost value of the whole query. Defaults to 1.0.
+ */
+ public static final String MLT_BOOST_FACTOR = "mlt.qf";
+
+ /*
+ Only For Lucene
+ */
+ public static final String MLT_MAX_DOC_FREQ_PCT = "mlt.maxdfp";
+
+ /*
+ Only For Lucene
+ */
+ public static final String MLT_MAX_NUM_TOKENS_PARSED = "mlt.maxntp";
+
+ /*
+ The maximum number of query terms that will be selected.
+ Increasing this value gives greater accuracy at the expense of query
execution speed. Defaults to 25.
+ */
+ public static final String MLT_MAX_QUERY_TERMS = "mlt.maxqt";
+
+ /*
+ The maximum word length above which the terms will be ignored. The old
name max_word_len is deprecated.
+ Defaults to unbounded
+ */
+ public static final String MLT_MAX_WORD_LENGTH = "mlt.maxwl";
+
+ /*
+ The minimum word length below which the terms will be ignored.
+ Defaults to 0
+ */
+ public static final String MLT_MIN_WORD_LENGTH = "mlt.minwl";
+
+ /*
+ An array of stop words.
+ Any word in this set is considered "uninteresting" and ignored.
+ Only applicable for ELASTIC
+ */
+ public static final String MLT_STOP_WORDS = "mlt.stopwords";
+
+ /*
+ This should have either the id of the doc whose similar docs need to
be searched or the complete body of the doc.
+ Defaults to ID (via the rep:similar query).
+ */
+ public static final String MLT_STREAM_BODY = "stream.body";
+
+ /*
+ After the disjunctive query has been formed,
+ this parameter controls the number of terms that must match.
+ (Defaults to "30%").
+ Only applicable for ELASTIC
+ */
+ public static final String MLT_MIN_SHOULD_MATCH = "mlt.minshouldmatch";
+
+
+
+ /*
+ Returns param map for a query string of type
mlt.fl=:path&mlt.mindf=0&stream.body=/test/a
+ */
+ public static Map<String, String> getParamMapFromMltQuery(String
mltQueryString) {
+ Map<String, String> paramMap = new HashMap();
+ try {
+ for (String param : mltQueryString.split("&")) {
+ String[] keyValuePair = param.split("=");
+ if (keyValuePair.length != 2 || keyValuePair[0] == null ||
keyValuePair[1] == null) {
+ throw new RuntimeException("Unparsable native MLT query: "
+ mltQueryString);
+ } else {
+ paramMap.put(keyValuePair[0], keyValuePair[1]);
+ }
+ }
+ } catch (Exception e) {
+ throw new RuntimeException("Error while parsing native MLT query:
" + mltQueryString);
+ }
+
+ if (paramMap.size() == 0) {
+ throw new RuntimeException("No params found while parsing the MLT
query : " + mltQueryString);
+ }
+
+ return paramMap;
+ }
+
+
+}
Propchange:
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/MoreLikeThisHelperUtil.java
------------------------------------------------------------------------------
svn:eol-style = native