[incubator-pinot] branch hotfix-text-operator updated: Make default operator for multi-term and phrase text search queries configurable (#6251)

siddteotia Tue, 17 Nov 2020 22:37:05 -0800

This is an automated email from the ASF dual-hosted git repository.

siddteotia pushed a commit to branch hotfix-text-operator
in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git



The following commit(s) were added to refs/heads/hotfix-text-operator by this push:
     new 8db3d94  Make default operator for multi-term and phrase text search queries configurable (#6251)
8db3d94 is described below

commit 8db3d94094b881db5c0391261e18b31b10e32de3
Author: Sidd <[email protected]>
AuthorDate: Mon Nov 9 19:25:11 2020 -0800

    Make default operator for multi-term and phrase text search queries configurable (#6251)
    
    * Make default operator for multi-term and phrase text
    index queries configurable
    
    * cleanup
    
    Co-authored-by: Siddharth Teotia <[email protected]>
---
 .../segment/index/loader/IndexLoadingConfig.java   |  4 ++
 .../index/readers/text/LuceneTextIndexReader.java  |  8 ++++
 .../pinot/queries/TextSearchQueriesTest.java       | 49 ++++++++++++++++++++--
 .../apache/pinot/spi/config/table/FieldConfig.java |  1 +
 4 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/IndexLoadingConfig.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/IndexLoadingConfig.java
index a6817a0..04d91d1 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/IndexLoadingConfig.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/IndexLoadingConfig.java
@@ -230,6 +230,10 @@ public class IndexLoadingConfig {
     return _columnProperties;
   }
 
+  public void setColumnProperties(Map<String, Map<String, String>> columnProperties) {
+    _columnProperties = columnProperties;
+  }
+
   /**
    * Used in two places:
    * (1) In {@link org.apache.pinot.core.segment.index.column.PhysicalColumnIndexContainer}
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/readers/text/LuceneTextIndexReader.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/readers/text/LuceneTextIndexReader.java
index 3a3a2fa..38a6025 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/readers/text/LuceneTextIndexReader.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/readers/text/LuceneTextIndexReader.java
@@ -57,6 +57,7 @@ public class LuceneTextIndexReader implements TextIndexReader {
   private final String _column;
   private final DocIdTranslator _docIdTranslator;
   private final StandardAnalyzer _standardAnalyzer;
+  private boolean _useANDForMultiTermQueries = false;
 
   public static final String LUCENE_TEXT_INDEX_DOCID_MAPPING_FILE_EXTENSION = ".lucene.mapping";
 
@@ -83,6 +84,10 @@ public class LuceneTextIndexReader implements TextIndexReader {
         // repeated queries, on the downside it cause heap issues.
         _indexSearcher.setQueryCache(null);
       }
+      if (textIndexProperties != null && Boolean
+          .parseBoolean(textIndexProperties.get(FieldConfig.TEXT_INDEX_USE_AND_FOR_MULTI_TERM_QUERIES))) {
+        _useANDForMultiTermQueries = true;
+      }
       // TODO: consider using a threshold of num docs per segment to decide between building
       // mapping file upfront on segment load v/s on-the-fly during query processing
       _docIdTranslator = new DocIdTranslator(indexDir, _column, numDocs, _indexSearcher);
@@ -125,6 +130,9 @@ public class LuceneTextIndexReader implements TextIndexReader {
       // be instantiated per query. Analyzer on the other hand is stateless
       // and can be created upfront.
       QueryParser parser = new QueryParser(_column, _standardAnalyzer);
+      if (_useANDForMultiTermQueries) {
+        parser.setDefaultOperator(QueryParser.Operator.AND);
+      }
       Query query = parser.parse(searchQuery);
       _indexSearcher.search(query, docIDCollector);
       return docIds;
diff --git a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
index 1f76cc0..64610f9 100644
--- a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
+++ b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
@@ -27,8 +27,10 @@ import java.io.Serializable;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
 import java.util.Random;
 import java.util.Set;
 import org.apache.commons.io.FileUtils;
@@ -91,8 +93,10 @@ public class TextSearchQueriesTest extends BaseQueriesTest { 
 private static fin
   private static final String QUERY_LOG_TEXT_COL_NAME = "QUERY_LOG_TEXT_COL";
   private static final String SKILLS_TEXT_COL_NAME = "SKILLS_TEXT_COL";
   private static final String SKILLS_TEXT_COL_DICT_NAME = 
"SKILLS_TEXT_COL_DICT";
+  private static final String SKILLS_COPY_TEXT_COL_NAME = "SKILLS_TEXT_COL_1";
   private static final String INT_COL_NAME = "INT_COL";
-  private static final List<String> RAW_TEXT_INDEX_COLUMNS = 
Arrays.asList(QUERY_LOG_TEXT_COL_NAME, SKILLS_TEXT_COL_NAME);
+  private static final List<String> RAW_TEXT_INDEX_COLUMNS =
+      Arrays.asList(QUERY_LOG_TEXT_COL_NAME, SKILLS_TEXT_COL_NAME, 
SKILLS_COPY_TEXT_COL_NAME);
   private static final List<String> DICT_TEXT_INDEX_COLUMNS = 
Arrays.asList(SKILLS_TEXT_COL_DICT_NAME);
   private static final int INT_BASE_VALUE = 1000;
 
@@ -128,6 +132,11 @@ public class TextSearchQueriesTest extends BaseQueriesTest 
{  private static fin
     textIndexColumns.addAll(DICT_TEXT_INDEX_COLUMNS);
     indexLoadingConfig.setTextIndexColumns(textIndexColumns);
     indexLoadingConfig.setInvertedIndexColumns(new 
HashSet<>(DICT_TEXT_INDEX_COLUMNS));
+    Map<String, Map<String, String>> columnProperties = new HashMap<>();
+    Map<String, String> props = new HashMap<>();
+    props.put(FieldConfig.TEXT_INDEX_USE_AND_FOR_MULTI_TERM_QUERIES, "true");
+    columnProperties.put(SKILLS_COPY_TEXT_COL_NAME, props);
+    indexLoadingConfig.setColumnProperties(columnProperties);
     ImmutableSegment immutableSegment =
         ImmutableSegmentLoader.load(new File(INDEX_DIR, SEGMENT_NAME), 
indexLoadingConfig);
     _indexSegment = immutableSegment;
@@ -160,6 +169,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest 
{  private static fin
         .addSingleValueDimension(QUERY_LOG_TEXT_COL_NAME, 
FieldSpec.DataType.STRING)
         .addSingleValueDimension(SKILLS_TEXT_COL_NAME, 
FieldSpec.DataType.STRING)
         .addSingleValueDimension(SKILLS_TEXT_COL_DICT_NAME, 
FieldSpec.DataType.STRING)
+        .addSingleValueDimension(SKILLS_COPY_TEXT_COL_NAME, 
FieldSpec.DataType.STRING)
         .addMetric(INT_COL_NAME, FieldSpec.DataType.INT).build();
     SegmentGeneratorConfig config = new SegmentGeneratorConfig(tableConfig, 
schema);
     config.setOutDir(INDEX_DIR.getPath());
@@ -204,9 +214,11 @@ public class TextSearchQueriesTest extends BaseQueriesTest 
{  private static fin
         if (counter >= skillCount) {
           row.putField(SKILLS_TEXT_COL_NAME, "software engineering");
           row.putField(SKILLS_TEXT_COL_DICT_NAME, "software engineering");
+          row.putField(SKILLS_COPY_TEXT_COL_NAME, "software engineering");
         } else {
           row.putField(SKILLS_TEXT_COL_NAME, skills[counter]);
           row.putField(SKILLS_TEXT_COL_DICT_NAME, skills[counter]);
+          row.putField(SKILLS_COPY_TEXT_COL_NAME, skills[counter]);
         }
         rows.add(row);
         counter++;
@@ -542,10 +554,20 @@ public class TextSearchQueriesTest extends 
BaseQueriesTest {  private static fin
 
     query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE 
TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\" AND Java AND C++') LIMIT 
50000";
     testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
     query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 
'\"distributed systems\" AND Java AND C++') LIMIT 50000";
     testTextSearchAggregationQueryHelper(query, expected.size());
 
+    // test for the index configured to use AND as the default
+    // conjunction operator
+    query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE 
TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" Java C++') LIMIT 50000";
+    testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
+    query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1, 
'\"distributed systems\" Java C++') LIMIT 50000";
+    testTextSearchAggregationQueryHelper(query, expected.size());
+    query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE 
TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" AND Java AND C++') LIMIT 
50000";
+    testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
+    query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1, 
'\"distributed systems\" AND Java AND C++') LIMIT 50000";
+    testTextSearchAggregationQueryHelper(query, expected.size());
+
     // TEST 22: composite phrase and term query using boolean operator OR
     // Search in SKILLS_TEXT_COL column to look for documents where each 
document MUST contain ANY of the following skills:
     // phrase "distributed systems" as is, term 'Java', term 'C++'. Note: OR 
operator is implicit when we don't specify
@@ -569,10 +591,16 @@ public class TextSearchQueriesTest extends 
BaseQueriesTest {  private static fin
 
     query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE 
TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\" Java C++') LIMIT 50000";
     testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
     query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 
'\"distributed systems\" Java C++') LIMIT 50000";
     testTextSearchAggregationQueryHelper(query, expected.size());
 
+    // test for the index configured to use AND as the default
+    // conjunction operator
+    query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE 
TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" OR Java OR C++') LIMIT 
50000";
+    testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
+    query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1, 
'\"distributed systems\" OR Java OR C++') LIMIT 50000";
+    testTextSearchAggregationQueryHelper(query, expected.size());
+
     // TEST 23: composite phrase and term query using both AND and OR
     // Search in SKILLS_TEXT_COL column to look for documents where each 
document MUST contain phrase "distributed systems"
     // as is and any of the following terms 'Java' or 'C++'. The expected 
result table was built by doing
@@ -585,10 +613,23 @@ public class TextSearchQueriesTest extends 
BaseQueriesTest {  private static fin
 
     query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE 
TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\" AND (Java C++)') LIMIT 
50000";
     testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
     query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 
'\"distributed systems\" AND (Java C++)') LIMIT 50000";
     testTextSearchAggregationQueryHelper(query, expected.size());
 
+    // test for the index configured to use AND as the default
+    // conjunction operator
+    query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE 
TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" AND (Java OR C++)') 
LIMIT 50000";
+    testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
+    query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1, 
'\"distributed systems\" AND (Java OR C++)') LIMIT 50000";
+    testTextSearchAggregationQueryHelper(query, expected.size());
+    expected = new ArrayList<>();
+    expected.add(new Serializable[]{1005, "Distributed systems, Java, C++, Go, 
distributed query engines for analytics and data warehouses, Machine learning, 
spark, Kubernetes, transaction processing"});
+    expected.add(new Serializable[]{1017, "Distributed systems, Apache Kafka, 
publish-subscribe, building and deploying large scale production systems, 
concurrency, multi-threading, C++, CPU processing, Java"});
+    query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE 
TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" AND (Java C++)') LIMIT 
50000";
+    testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
+    query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1, 
'\"distributed systems\" AND (Java C++)') LIMIT 50000";
+    testTextSearchAggregationQueryHelper(query, expected.size());
+
     // TEST 24: prefix query
     // Search in SKILLS_TEXT_COL column to look for documents that have 
stream* -- stream, streaming, streams etc.
     // The expected result table was built by doing grep -n -i -E 'stream' 
skills.txt
diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
index 9c9bc1b..aecc25a 100644
--- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
@@ -43,6 +43,7 @@ public class FieldConfig extends BaseJsonConfig {
   // Lucene creates a query result cache if this option is enabled
   // the cache improves performance of repeatable queries
   public static String TEXT_INDEX_ENABLE_QUERY_CACHE = "enableQueryCacheForTextIndex";
+  public static String TEXT_INDEX_USE_AND_FOR_MULTI_TERM_QUERIES = "useANDForMultiTermTextIndexQueries";
 
   @JsonCreator
   public FieldConfig(@JsonProperty(value = "name", required = true) String name,


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[incubator-pinot] branch hotfix-text-operator updated: Make default operator for multi-term and phrase text search queries configurable (#6251)

Reply via email to