This is an automated email from the ASF dual-hosted git repository.
siddteotia pushed a commit to branch hotfix-text-operator
in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git
The following commit(s) were added to refs/heads/hotfix-text-operator by this
push:
new 8db3d94 Make default operator for multi-term and phrase text search
queries configurable (#6251)
8db3d94 is described below
commit 8db3d94094b881db5c0391261e18b31b10e32de3
Author: Sidd <[email protected]>
AuthorDate: Mon Nov 9 19:25:11 2020 -0800
Make default operator for multi-term and phrase text search queries
configurable (#6251)
* Make default operator for multi-term and phrase text
index queries configurable
* cleanup
Co-authored-by: Siddharth Teotia <[email protected]>
---
.../segment/index/loader/IndexLoadingConfig.java | 4 ++
.../index/readers/text/LuceneTextIndexReader.java | 8 ++++
.../pinot/queries/TextSearchQueriesTest.java | 49 ++++++++++++++++++++--
.../apache/pinot/spi/config/table/FieldConfig.java | 1 +
4 files changed, 58 insertions(+), 4 deletions(-)
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/IndexLoadingConfig.java
b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/IndexLoadingConfig.java
index a6817a0..04d91d1 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/IndexLoadingConfig.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/IndexLoadingConfig.java
@@ -230,6 +230,10 @@ public class IndexLoadingConfig {
return _columnProperties;
}
+ public void setColumnProperties(Map<String, Map<String, String>>
columnProperties) {
+ _columnProperties = columnProperties;
+ }
+
/**
* Used in two places:
* (1) In {@link
org.apache.pinot.core.segment.index.column.PhysicalColumnIndexContainer}
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/readers/text/LuceneTextIndexReader.java
b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/readers/text/LuceneTextIndexReader.java
index 3a3a2fa..38a6025 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/readers/text/LuceneTextIndexReader.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/readers/text/LuceneTextIndexReader.java
@@ -57,6 +57,7 @@ public class LuceneTextIndexReader implements TextIndexReader
{
private final String _column;
private final DocIdTranslator _docIdTranslator;
private final StandardAnalyzer _standardAnalyzer;
+ private boolean _useANDForMultiTermQueries = false;
public static final String LUCENE_TEXT_INDEX_DOCID_MAPPING_FILE_EXTENSION =
".lucene.mapping";
@@ -83,6 +84,10 @@ public class LuceneTextIndexReader implements
TextIndexReader {
// repeated queries, on the downside it cause heap issues.
_indexSearcher.setQueryCache(null);
}
+ if (textIndexProperties != null && Boolean
+
.parseBoolean(textIndexProperties.get(FieldConfig.TEXT_INDEX_USE_AND_FOR_MULTI_TERM_QUERIES)))
{
+ _useANDForMultiTermQueries = true;
+ }
// TODO: consider using a threshold of num docs per segment to decide
between building
// mapping file upfront on segment load v/s on-the-fly during query
processing
_docIdTranslator = new DocIdTranslator(indexDir, _column, numDocs,
_indexSearcher);
@@ -125,6 +130,9 @@ public class LuceneTextIndexReader implements
TextIndexReader {
// be instantiated per query. Analyzer on the other hand is stateless
// and can be created upfront.
QueryParser parser = new QueryParser(_column, _standardAnalyzer);
+ if (_useANDForMultiTermQueries) {
+ parser.setDefaultOperator(QueryParser.Operator.AND);
+ }
Query query = parser.parse(searchQuery);
_indexSearcher.search(query, docIDCollector);
return docIds;
diff --git
a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
index 1f76cc0..64610f9 100644
---
a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
+++
b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
@@ -27,8 +27,10 @@ import java.io.Serializable;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
+import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.apache.commons.io.FileUtils;
@@ -91,8 +93,10 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
private static fin
private static final String QUERY_LOG_TEXT_COL_NAME = "QUERY_LOG_TEXT_COL";
private static final String SKILLS_TEXT_COL_NAME = "SKILLS_TEXT_COL";
private static final String SKILLS_TEXT_COL_DICT_NAME =
"SKILLS_TEXT_COL_DICT";
+ private static final String SKILLS_COPY_TEXT_COL_NAME = "SKILLS_TEXT_COL_1";
private static final String INT_COL_NAME = "INT_COL";
- private static final List<String> RAW_TEXT_INDEX_COLUMNS =
Arrays.asList(QUERY_LOG_TEXT_COL_NAME, SKILLS_TEXT_COL_NAME);
+ private static final List<String> RAW_TEXT_INDEX_COLUMNS =
+ Arrays.asList(QUERY_LOG_TEXT_COL_NAME, SKILLS_TEXT_COL_NAME,
SKILLS_COPY_TEXT_COL_NAME);
private static final List<String> DICT_TEXT_INDEX_COLUMNS =
Arrays.asList(SKILLS_TEXT_COL_DICT_NAME);
private static final int INT_BASE_VALUE = 1000;
@@ -128,6 +132,11 @@ public class TextSearchQueriesTest extends BaseQueriesTest
{ private static fin
textIndexColumns.addAll(DICT_TEXT_INDEX_COLUMNS);
indexLoadingConfig.setTextIndexColumns(textIndexColumns);
indexLoadingConfig.setInvertedIndexColumns(new
HashSet<>(DICT_TEXT_INDEX_COLUMNS));
+ Map<String, Map<String, String>> columnProperties = new HashMap<>();
+ Map<String, String> props = new HashMap<>();
+ props.put(FieldConfig.TEXT_INDEX_USE_AND_FOR_MULTI_TERM_QUERIES, "true");
+ columnProperties.put(SKILLS_COPY_TEXT_COL_NAME, props);
+ indexLoadingConfig.setColumnProperties(columnProperties);
ImmutableSegment immutableSegment =
ImmutableSegmentLoader.load(new File(INDEX_DIR, SEGMENT_NAME),
indexLoadingConfig);
_indexSegment = immutableSegment;
@@ -160,6 +169,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest
{ private static fin
.addSingleValueDimension(QUERY_LOG_TEXT_COL_NAME,
FieldSpec.DataType.STRING)
.addSingleValueDimension(SKILLS_TEXT_COL_NAME,
FieldSpec.DataType.STRING)
.addSingleValueDimension(SKILLS_TEXT_COL_DICT_NAME,
FieldSpec.DataType.STRING)
+ .addSingleValueDimension(SKILLS_COPY_TEXT_COL_NAME,
FieldSpec.DataType.STRING)
.addMetric(INT_COL_NAME, FieldSpec.DataType.INT).build();
SegmentGeneratorConfig config = new SegmentGeneratorConfig(tableConfig,
schema);
config.setOutDir(INDEX_DIR.getPath());
@@ -204,9 +214,11 @@ public class TextSearchQueriesTest extends BaseQueriesTest
{ private static fin
if (counter >= skillCount) {
row.putField(SKILLS_TEXT_COL_NAME, "software engineering");
row.putField(SKILLS_TEXT_COL_DICT_NAME, "software engineering");
+ row.putField(SKILLS_COPY_TEXT_COL_NAME, "software engineering");
} else {
row.putField(SKILLS_TEXT_COL_NAME, skills[counter]);
row.putField(SKILLS_TEXT_COL_DICT_NAME, skills[counter]);
+ row.putField(SKILLS_COPY_TEXT_COL_NAME, skills[counter]);
}
rows.add(row);
counter++;
@@ -542,10 +554,20 @@ public class TextSearchQueriesTest extends
BaseQueriesTest { private static fin
query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE
TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\" AND Java AND C++') LIMIT
50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL,
'\"distributed systems\" AND Java AND C++') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
+ // test for the index configured to use AND as the default
+ // conjunction operator
+ query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE
TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" Java C++') LIMIT 50000";
+ testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
+ query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1,
'\"distributed systems\" Java C++') LIMIT 50000";
+ testTextSearchAggregationQueryHelper(query, expected.size());
+ query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE
TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" AND Java AND C++') LIMIT
50000";
+ testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
+ query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1,
'\"distributed systems\" AND Java AND C++') LIMIT 50000";
+ testTextSearchAggregationQueryHelper(query, expected.size());
+
// TEST 22: composite phrase and term query using boolean operator OR
// Search in SKILLS_TEXT_COL column to look for documents where each
document MUST contain ANY of the following skills:
// phrase "distributed systems" as is, term 'Java', term 'C++'. Note: OR
operator is implicit when we don't specify
@@ -569,10 +591,16 @@ public class TextSearchQueriesTest extends
BaseQueriesTest { private static fin
query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE
TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\" Java C++') LIMIT 50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL,
'\"distributed systems\" Java C++') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
+ // test for the index configured to use AND as the default
+ // conjunction operator
+ query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE
TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" OR Java OR C++') LIMIT
50000";
+ testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
+ query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1,
'\"distributed systems\" OR Java OR C++') LIMIT 50000";
+ testTextSearchAggregationQueryHelper(query, expected.size());
+
// TEST 23: composite phrase and term query using both AND and OR
// Search in SKILLS_TEXT_COL column to look for documents where each
document MUST contain phrase "distributed systems"
// as is and any of the following terms 'Java' or 'C++'. The expected
result table was built by doing
@@ -585,10 +613,23 @@ public class TextSearchQueriesTest extends
BaseQueriesTest { private static fin
query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE
TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed systems\" AND (Java C++)') LIMIT
50000";
testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
-
query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL,
'\"distributed systems\" AND (Java C++)') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
+ // test for the index configured to use AND as the default
+ // conjunction operator
+ query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE
TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" AND (Java OR C++)')
LIMIT 50000";
+ testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
+ query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1,
'\"distributed systems\" AND (Java OR C++)') LIMIT 50000";
+ testTextSearchAggregationQueryHelper(query, expected.size());
+ expected = new ArrayList<>();
+ expected.add(new Serializable[]{1005, "Distributed systems, Java, C++, Go,
distributed query engines for analytics and data warehouses, Machine learning,
spark, Kubernetes, transaction processing"});
+ expected.add(new Serializable[]{1017, "Distributed systems, Apache Kafka,
publish-subscribe, building and deploying large scale production systems,
concurrency, multi-threading, C++, CPU processing, Java"});
+ query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE
TEXT_MATCH(SKILLS_TEXT_COL_1, '\"distributed systems\" AND (Java C++)') LIMIT
50000";
+ testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
+ query = "SELECT COUNT(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL_1,
'\"distributed systems\" AND (Java C++)') LIMIT 50000";
+ testTextSearchAggregationQueryHelper(query, expected.size());
+
// TEST 24: prefix query
// Search in SKILLS_TEXT_COL column to look for documents that have
stream* -- stream, streaming, streams etc.
// The expected result table was built by doing grep -n -i -E 'stream'
skills.txt
diff --git
a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
index 9c9bc1b..aecc25a 100644
--- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
@@ -43,6 +43,7 @@ public class FieldConfig extends BaseJsonConfig {
// Lucene creates a query result cache if this option is enabled
// the cache improves performance of repeatable queries
public static String TEXT_INDEX_ENABLE_QUERY_CACHE =
"enableQueryCacheForTextIndex";
+ public static String TEXT_INDEX_USE_AND_FOR_MULTI_TERM_QUERIES =
"useANDForMultiTermTextIndexQueries";
@JsonCreator
public FieldConfig(@JsonProperty(value = "name", required = true) String
name,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]