This is an automated email from the ASF dual-hosted git repository.
tingchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new b657699b7c [Feature] Support configurable Lucene analyzer with args and configurable query parser (#13003)
b657699b7c is described below
commit b657699b7c4a3320154131cd89d24c6b73b4ce06
Author: Jack Luo <[email protected]>
AuthorDate: Fri Aug 2 08:19:29 2024 +0800
[Feature] Support configurable Lucene analyzer with args and configurable query parser (#13003)
* Support custom Lucene analyzer with arguments and custom query parser
* Add Apache license header to CsvParserTest.java
* Adjusted dependency import order for javax.annotation.Nullable in CsvParser.java
* Applied mvn spotless:apply on CsvParserTest.java
* Corrected a spelling mistake in the comments in RealtimeLuceneTextIndex.java
* Adjusted parsing method naming to improve clarity in TextIndexUtils.java
* Removed redundant reflection logic in TextIndexUtils.java
* Removed redundant string replacement streaming pipeline in CsvParser.java
* Broke up a long line (over 120 chars) into two lines in TextIndexUtils.java
* Adjusted the scenarios in which default Analyzers with custom arguments should be used
* Emit Class.getName() instead of Class.getCanonicalName() in logs within RealtimeLuceneTextIndex.java
* Moved `||` operator in if condition to a new line to resolve style issue
* Addressed code review concerns from @Bill-hbrhbr
* Fix style error
* Fix unit test
* Run maven spotless:apply
* Patched argument parsing bug
* Updated PR with latest code base
* Addressed comments from @chenboat
* Fix build issues resulting from merge conflicts
* Removed trailing white space
* Fix style-check errors
* Fix default stop words initialization values
* Refactored builder code in unit test inside LuceneMutableTextIndexTest.java
Signed-off-by: Jack Luo <[email protected]>
* Fix style bug
Signed-off-by: Jack Luo <[email protected]>
---------
Signed-off-by: Jack Luo <[email protected]>
---
.../invertedindex/RealtimeLuceneTextIndex.java | 41 ++++-
.../creator/impl/text/LuceneTextIndexCreator.java | 10 +-
.../index/readers/text/LuceneTextIndexReader.java | 27 ++--
.../segment/index/text/TextIndexConfigBuilder.java | 22 +++
.../local/segment/store/TextIndexUtils.java | 174 ++++++++++++++++++++-
.../converter/RealtimeSegmentConverterTest.java | 9 +-
.../invertedindex/LuceneMutableTextIndexTest.java | 143 ++++++++++++++++-
.../NativeAndLuceneMutableTextIndexTest.java | 7 +-
.../segment/store/FilePerIndexDirectoryTest.java | 8 +-
.../store/SingleFileIndexDirectoryTest.java | 8 +-
.../pinot/segment/spi/index/TextIndexConfig.java | 93 ++++++++++-
.../apache/pinot/segment/spi/utils/CsvParser.java | 83 ++++++++++
.../pinot/segment/spi/utils/CsvParserTest.java | 49 ++++++
.../apache/pinot/spi/config/table/FieldConfig.java | 5 +
14 files changed, 627 insertions(+), 52 deletions(-)
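
For readers who want to try the new options programmatically, here is a minimal sketch using the builder touched by this change. The analyzer FQCN, argument values, and argument types are illustrative placeholders (the analyzer is assumed to have a matching (String, int) constructor); only the builder methods themselves come from this diff:

    import org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
    import org.apache.pinot.segment.spi.index.TextIndexConfig;

    public class TextIndexConfigExample {
      public static void main(String[] args) {
        TextIndexConfig config = new TextIndexConfigBuilder()
            // Analyzer must extend org.apache.lucene.analysis.Analyzer (placeholder FQCN below)
            .withLuceneAnalyzerClass("com.example.MyAnalyzer")
            // Constructor args as CSV; a literal comma inside a value can be escaped as "\,"
            .withLuceneAnalyzerClassArgs("stopwords.txt,3")
            // Matching argument types; primitives use the <Boxed>.TYPE spelling, e.g. java.lang.Integer.TYPE
            .withLuceneAnalyzerClassArgTypes("java.lang.String,java.lang.Integer.TYPE")
            // Query parser must extend org.apache.lucene.queryparser.classic.QueryParserBase
            .withLuceneQueryParserClass("org.apache.lucene.queryparser.classic.QueryParser")
            .build();
        System.out.println(config.getLuceneQueryParserClass());
      }
    }
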
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java
index 961f028629..0c854e9aca 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java
@@ -19,18 +19,20 @@
package org.apache.pinot.segment.local.realtime.impl.invertedindex;
import java.io.File;
+import java.lang.reflect.Constructor;
import java.util.concurrent.Callable;
import java.util.concurrent.Future;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SearcherManager;
import org.apache.pinot.common.utils.LLCSegmentName;
import org.apache.pinot.segment.local.indexsegment.mutable.MutableSegmentImpl;
import org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
+import org.apache.pinot.segment.local.segment.store.TextIndexUtils;
import org.apache.pinot.segment.local.utils.LuceneTextIndexUtils;
import org.apache.pinot.segment.spi.index.TextIndexConfig;
import org.apache.pinot.segment.spi.index.mutable.MutableTextIndex;
@@ -53,6 +55,7 @@ public class RealtimeLuceneTextIndex implements MutableTextIndex {
private final LuceneTextIndexCreator _indexCreator;
private SearcherManager _searcherManager;
private Analyzer _analyzer;
+ private Constructor<QueryParserBase> _queryParserClassConstructor;
private final String _column;
private final String _segmentName;
private final boolean _reuseMutableIndex;
@@ -91,6 +94,8 @@ public class RealtimeLuceneTextIndex implements MutableTextIndex {
llcSegmentName.getPartitionGroupId(), _indexCreator::getNumDocs);
_searcherManager.addListener(_refreshListener);
_analyzer = _indexCreator.getIndexWriter().getConfig().getAnalyzer();
+ _queryParserClassConstructor =
+ TextIndexUtils.getQueryParserWithStringAndAnalyzerTypeConstructor(config.getLuceneQueryParserClass());
_enablePrefixSuffixMatchingInPhraseQueries =
config.isEnablePrefixSuffixMatchingInPhraseQueries();
_reuseMutableIndex = config.isReuseMutableIndex();
@@ -137,12 +142,23 @@ public class RealtimeLuceneTextIndex implements MutableTextIndex {
Callable<MutableRoaringBitmap> searchCallable = () -> {
IndexSearcher indexSearcher = null;
try {
- QueryParser parser = new QueryParser(_column, _analyzer);
+ // Lucene query parsers are generally stateful and a new instance must be created per query.
+ QueryParserBase parser = _queryParserClassConstructor.newInstance(_column, _analyzer);
if (_enablePrefixSuffixMatchingInPhraseQueries) {
+ // Note: Lucene's built-in QueryParser has limited wildcard functionality in phrase queries. It does not use
+ // the provided analyzer when wildcards are present, defaulting to the default analyzer for tokenization.
+ // Additionally, it does not support wildcards that span across terms.
+ // For more details, see: https://github.com/elastic/elasticsearch/issues/22540
+ // Workaround: Use a custom query parser that correctly implements wildcard searches.
parser.setAllowLeadingWildcard(true);
}
Query query = parser.parse(searchQuery);
if (_enablePrefixSuffixMatchingInPhraseQueries) {
+ // Note: Lucene's built-in QueryParser has limited wildcard functionality in phrase queries. It does not use
+ // the provided analyzer when wildcards are present, defaulting to the default analyzer for tokenization.
+ // Additionally, it does not support wildcards that span across terms.
+ // For more details, see: https://github.com/elastic/elasticsearch/issues/22540
+ // Workaround: Use a custom query parser that correctly implements wildcard searches.
query = LuceneTextIndexUtils.convertToMultiTermSpanQuery(query);
}
indexSearcher = _searcherManager.acquire();
@@ -195,6 +211,27 @@ public class RealtimeLuceneTextIndex implements MutableTextIndex {
return actualDocIDs;
}
+ private Constructor<QueryParserBase> getQueryParserWithStringAndAnalyzerTypeConstructor(String queryParserClassName)
+ throws ReflectiveOperationException {
+ // Fail-fast if the specified query parser class is not a QueryParserBase class
+ final Class<?> queryParserClass = Class.forName(queryParserClassName);
+ if (!QueryParserBase.class.isAssignableFrom(queryParserClass)) {
+ throw new ReflectiveOperationException("The specified lucene query parser class " + queryParserClassName
+ + " is not assignable from " + QueryParserBase.class.getName());
+ }
+ // Fail-fast if the query parser does not have the required constructor used by this class
+ try {
+ queryParserClass.getConstructor(String.class, Analyzer.class);
+ } catch (NoSuchMethodException ex) {
+ throw new NoSuchMethodException("The specified lucene query parser class " + queryParserClassName
+ + " is not assignable because the class does not have the required constructor method with parameter "
+ + "type [String.class, Analyzer.class]"
+ );
+ }
+
+ return (Constructor<QueryParserBase>) queryParserClass.getConstructor(String.class, Analyzer.class);
+ }
+
@Override
public void commit() {
if (!_reuseMutableIndex) {
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
index a5262b489f..d38e2c04fe 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
@@ -27,7 +27,6 @@ import javax.annotation.Nullable;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
@@ -119,14 +118,7 @@ public class LuceneTextIndexCreator extends AbstractTextIndexCreator {
// to V3 if segmentVersion is set to V3 in SegmentGeneratorConfig.
_indexFile = getV1TextIndexFile(segmentIndexDir);
- Analyzer luceneAnalyzer;
- if (luceneAnalyzerClass.isEmpty() || luceneAnalyzerClass.equals(StandardAnalyzer.class.getName())) {
- luceneAnalyzer = TextIndexUtils.getStandardAnalyzerWithCustomizedStopWords(config.getStopWordsInclude(),
- config.getStopWordsExclude());
- } else {
- luceneAnalyzer = TextIndexUtils.getAnalyzerFromClassName(luceneAnalyzerClass);
- }
-
+ Analyzer luceneAnalyzer = TextIndexUtils.getAnalyzer(config);
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(luceneAnalyzer);
indexWriterConfig.setRAMBufferSizeMB(config.getLuceneMaxBufferSizeMB());
indexWriterConfig.setCommitOnClose(commit);
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java
index ed027903b9..ff5690d043 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java
@@ -21,15 +21,16 @@ package org.apache.pinot.segment.local.segment.index.readers.text;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
+import java.lang.reflect.Constructor;
import java.nio.ByteOrder;
import java.util.Map;
import javax.annotation.Nullable;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
@@ -67,6 +68,8 @@ public class LuceneTextIndexReader implements TextIndexReader {
private final DocIdTranslator _docIdTranslator;
private final Analyzer _analyzer;
private boolean _useANDForMultiTermQueries = false;
+ private final String _queryParserClass;
+ private Constructor<QueryParserBase> _queryParserClassConstructor;
private boolean _enablePrefixSuffixMatchingInPhraseQueries = false;
public LuceneTextIndexReader(String column, File indexDir, int numDocs, TextIndexConfig config) {
@@ -90,10 +93,10 @@ public class LuceneTextIndexReader implements TextIndexReader {
// TODO: consider using a threshold of num docs per segment to decide between building
// mapping file upfront on segment load v/s on-the-fly during query processing
_docIdTranslator = new DocIdTranslator(indexDir, _column, numDocs, _indexSearcher);
- String luceneAnalyzerClass = config.getLuceneAnalyzerClass();
- _analyzer = luceneAnalyzerClass.equals(StandardAnalyzer.class.getName())
- ? TextIndexUtils.getStandardAnalyzerWithCustomizedStopWords(config.getStopWordsInclude(),
- config.getStopWordsExclude()) : TextIndexUtils.getAnalyzerFromClassName(luceneAnalyzerClass);
+ _analyzer = TextIndexUtils.getAnalyzer(config);
+ _queryParserClass = config.getLuceneQueryParserClass();
+ _queryParserClassConstructor =
+ TextIndexUtils.getQueryParserWithStringAndAnalyzerTypeConstructor(_queryParserClass);
LOGGER.info("Successfully read lucene index for {} from {}", _column, indexDir);
} catch (Exception e) {
LOGGER.error("Failed to instantiate Lucene text index reader for column {}, exception {}", column,
@@ -151,20 +154,20 @@ public class LuceneTextIndexReader implements TextIndexReader {
MutableRoaringBitmap docIds = new MutableRoaringBitmap();
Collector docIDCollector = new LuceneDocIdCollector(docIds, _docIdTranslator);
try {
- // Lucene Query Parser is JavaCC based. It is stateful and should
- // be instantiated per query. Analyzer on the other hand is stateless
- // and can be created upfront.
- QueryParser parser = new QueryParser(_column, _analyzer);
+ // Lucene query parsers are generally stateful and a new instance must be created per query.
+ QueryParserBase parser = _queryParserClassConstructor.newInstance(_column, _analyzer);
// Phrase search with prefix/suffix matching may have leading *. E.g., `*pache pinot` which can be stripped by
// the query parser. To support the feature, we need to explicitly set the config to be true.
- if (_enablePrefixSuffixMatchingInPhraseQueries) {
+ if (_queryParserClass.equals("org.apache.lucene.queryparser.classic.QueryParser")
+ && _enablePrefixSuffixMatchingInPhraseQueries) {
parser.setAllowLeadingWildcard(true);
}
- if (_useANDForMultiTermQueries) {
+ if (_queryParserClass.equals("org.apache.lucene.queryparser.classic.QueryParser") && _useANDForMultiTermQueries) {
parser.setDefaultOperator(QueryParser.Operator.AND);
}
Query query = parser.parse(searchQuery);
- if (_enablePrefixSuffixMatchingInPhraseQueries) {
+ if (_queryParserClass.equals("org.apache.lucene.queryparser.classic.QueryParser")
+ && _enablePrefixSuffixMatchingInPhraseQueries) {
query = LuceneTextIndexUtils.convertToMultiTermSpanQuery(query);
}
_indexSearcher.search(query, docIDCollector);
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/TextIndexConfigBuilder.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/TextIndexConfigBuilder.java
index 5d07fb788d..99516fe511 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/TextIndexConfigBuilder.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/TextIndexConfigBuilder.java
@@ -23,11 +23,16 @@ import java.util.Map;
import javax.annotation.Nullable;
import org.apache.pinot.segment.local.segment.store.TextIndexUtils;
import org.apache.pinot.segment.spi.index.TextIndexConfig;
+import org.apache.pinot.segment.spi.utils.CsvParser;
import org.apache.pinot.spi.config.table.FSTType;
import org.apache.pinot.spi.config.table.FieldConfig;
public class TextIndexConfigBuilder extends TextIndexConfig.AbstractBuilder {
+ public TextIndexConfigBuilder() {
+ super((FSTType) null);
+ }
+
public TextIndexConfigBuilder(@Nullable FSTType fstType) {
super(fstType);
}
@@ -67,6 +72,23 @@ public class TextIndexConfigBuilder extends TextIndexConfig.AbstractBuilder {
_luceneAnalyzerClass = textIndexProperties.get(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS);
}
+ // Note that we cannot depend on jackson's default behavior to automatically coerce the comma delimited args to
+ // List<String>. This is because the args may contain comma and other special characters such as space. Therefore,
+ // we use our own csv parser to parse the values directly.
+ if (textIndexProperties.get(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARGS) != null) {
+ _luceneAnalyzerClassArgs = CsvParser.parse(
+ textIndexProperties.get(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARGS), true, false);
+ }
+
+ if (textIndexProperties.get(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARG_TYPES) != null) {
+ _luceneAnalyzerClassArgTypes = CsvParser.parse(
+ textIndexProperties.get(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARG_TYPES), false, true);
+ }
+
+ if (textIndexProperties.get(FieldConfig.TEXT_INDEX_LUCENE_QUERY_PARSER_CLASS) != null) {
+ _luceneQueryParserClass = textIndexProperties.get(FieldConfig.TEXT_INDEX_LUCENE_QUERY_PARSER_CLASS);
+ }
+
for (Map.Entry<String, String> entry : textIndexProperties.entrySet()) {
if (entry.getKey().equalsIgnoreCase(FieldConfig.TEXT_FST_TYPE)) {
_fstType = FSTType.NATIVE;
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
index 0c2369bdd8..2d8abe40f4 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
@@ -19,6 +19,8 @@
package org.apache.pinot.segment.local.segment.store;
import java.io.File;
+import java.lang.reflect.Constructor;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
@@ -30,14 +32,19 @@ import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
import org.apache.pinot.segment.spi.V1Constants.Indexes;
+import org.apache.pinot.segment.spi.index.TextIndexConfig;
import org.apache.pinot.segment.spi.store.SegmentDirectoryPaths;
import org.apache.pinot.spi.config.table.FSTType;
import org.apache.pinot.spi.config.table.FieldConfig;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class TextIndexUtils {
+ private static final Logger LOGGER = LoggerFactory.getLogger(TextIndexUtils.class);
private TextIndexUtils() {
}
@@ -111,10 +118,148 @@ public class TextIndexUtils {
.collect(Collectors.toList());
}
- public static Analyzer getAnalyzerFromClassName(String luceneAnalyzerClass)
- throws ReflectiveOperationException {
- // Support instantiation with default constructor for now unless customized
- return (Analyzer) Class.forName(luceneAnalyzerClass).getConstructor().newInstance();
+ /**
+ * Retrieves the Lucene Analyzer class instance via reflection from the fully qualified class name of the text config.
+ * If the class name is not specified in the config, the default StandardAnalyzer is instantiated.
+ *
+ * @param config Pinot TextIndexConfig to fetch the configuration from
+ * @return Lucene Analyzer class instance
+ * @throws ReflectiveOperationException if instantiation via reflection fails
+ */
+ public static Analyzer getAnalyzer(TextIndexConfig config) throws ReflectiveOperationException {
+ String luceneAnalyzerClassName = config.getLuceneAnalyzerClass();
+ List<String> luceneAnalyzerClassArgs = config.getLuceneAnalyzerClassArgs();
+ List<String> luceneAnalyzerClassArgTypes = config.getLuceneAnalyzerClassArgTypes();
+
+ if (null == luceneAnalyzerClassName || luceneAnalyzerClassName.isEmpty()
+ || (luceneAnalyzerClassName.equals(StandardAnalyzer.class.getName())
+ && luceneAnalyzerClassArgs.isEmpty() && luceneAnalyzerClassArgTypes.isEmpty())) {
+ // When there is no analyzer defined, or when StandardAnalyzer (default) is used without arguments,
+ // use existing logic to obtain an instance of StandardAnalyzer with customized stop words
+ return TextIndexUtils.getStandardAnalyzerWithCustomizedStopWords(
+ config.getStopWordsInclude(), config.getStopWordsExclude());
+ }
+
+ // Custom analyzer + custom configs via reflection
+ if (luceneAnalyzerClassArgs.size() != luceneAnalyzerClassArgTypes.size()) {
+ throw new ReflectiveOperationException("Mismatch of the number of analyzer arguments and arguments types.");
+ }
+
+ // Generate args type list
+ List<Class<?>> argClasses = new ArrayList<>();
+ for (String argType : luceneAnalyzerClassArgTypes) {
+ argClasses.add(parseSupportedTypes(argType));
+ }
+
+ // Best effort coercion to the analyzer argument type
+ // Note only a subset of class types is supported, unsupported ones can be added in the future
+ List<Object> argValues = new ArrayList<>();
+ for (int i = 0; i < luceneAnalyzerClassArgs.size(); i++) {
+ argValues.add(parseSupportedTypeValues(luceneAnalyzerClassArgs.get(i), argClasses.get(i)));
+ }
+
+ // Initialize the custom analyzer class with custom analyzer args
+ Class<?> luceneAnalyzerClass = Class.forName(luceneAnalyzerClassName);
+ if (!Analyzer.class.isAssignableFrom(luceneAnalyzerClass)) {
+ String exceptionMessage = "Custom analyzer must be a child of " + Analyzer.class.getCanonicalName();
+ LOGGER.error(exceptionMessage);
+ throw new ReflectiveOperationException(exceptionMessage);
+ }
+
+ // Return a new instance of custom lucene analyzer class
+ return (Analyzer) luceneAnalyzerClass.getConstructor(argClasses.toArray(new Class<?>[0]))
+ .newInstance(argValues.toArray(new Object[0]));
+ }
+
+ /**
+ * Parse the Java value type specified in the type string
+ * @param valueTypeString FQCN of the value type class or the name of the primitive value type
+ * @return Class object of the value type
+ * @throws ClassNotFoundException when the value type is not supported
+ */
+ public static Class<?> parseSupportedTypes(String valueTypeString) throws ClassNotFoundException {
+ try {
+ // Support both primitive types + class
+ switch (valueTypeString) {
+ case "java.lang.Byte.TYPE":
+ return Byte.TYPE;
+ case "java.lang.Short.TYPE":
+ return Short.TYPE;
+ case "java.lang.Integer.TYPE":
+ return Integer.TYPE;
+ case "java.lang.Long.TYPE":
+ return Long.TYPE;
+ case "java.lang.Float.TYPE":
+ return Float.TYPE;
+ case "java.lang.Double.TYPE":
+ return Double.TYPE;
+ case "java.lang.Boolean.TYPE":
+ return Boolean.TYPE;
+ case "java.lang.Character.TYPE":
+ return Character.TYPE;
+ default:
+ return Class.forName(valueTypeString);
+ }
+ } catch (ClassNotFoundException ex) {
+ LOGGER.error("Analyzer argument class type not found: " + valueTypeString);
+ throw ex;
+ }
+ }
+
+ /**
+ * Attempt to coerce string into supported value type
+ * @param stringValue string representation of the value
+ * @param clazz of the value
+ * @return class object of the value, auto-boxed if it is a primitive type
+ * @throws ReflectiveOperationException if value cannot be coerced without ambiguity or encountered unsupported type
+ */
+ public static Object parseSupportedTypeValues(String stringValue, Class<?> clazz)
+ throws ReflectiveOperationException {
+ try {
+ if (clazz.equals(String.class)) {
+ return stringValue;
+ } else if (clazz.equals(Byte.class) || clazz.equals(Byte.TYPE)) {
+ return Byte.parseByte(stringValue);
+ } else if (clazz.equals(Short.class) || clazz.equals(Short.TYPE)) {
+ return Short.parseShort(stringValue);
+ } else if (clazz.equals(Integer.class) || clazz.equals(Integer.TYPE)) {
+ return Integer.parseInt(stringValue);
+ } else if (clazz.equals(Long.class) || clazz.equals(Long.TYPE)) {
+ return Long.parseLong(stringValue);
+ } else if (clazz.equals(Float.class) || clazz.equals(Float.TYPE)) {
+ return Float.parseFloat(stringValue);
+ } else if (clazz.equals(Double.class) || clazz.equals(Double.TYPE)) {
+ return Double.parseDouble(stringValue);
+ } else if (clazz.equals(Boolean.class) || clazz.equals(Boolean.TYPE)) {
+ // Note we cannot use Boolean.parseBoolean here because it treats "abc" as false which
+ // introduces unexpected parsing results. We should validate the input by accepting only
+ // true|false in a case-insensitive manner; for all other values, throw an exception.
+ String lowerCaseStringValue = stringValue.toLowerCase();
+ if (lowerCaseStringValue.equals("true")) {
+ return true;
+ } else if (lowerCaseStringValue.equals("false")) {
+ return false;
+ }
+ throw new ReflectiveOperationException();
+ } else if (clazz.equals(Character.class) || clazz.equals(Character.TYPE)) {
+ if (stringValue.length() == 1) {
+ return stringValue.charAt(0);
+ }
+ throw new ReflectiveOperationException();
+ } else {
+ throw new UnsupportedOperationException();
+ }
+ } catch (NumberFormatException | ReflectiveOperationException ex) {
+ String exceptionMessage = "Custom analyzer argument cannot be coerced from "
+ + stringValue + " to " + clazz.getName() + " type";
+ LOGGER.error(exceptionMessage);
+ throw new ReflectiveOperationException(exceptionMessage);
+ } catch (UnsupportedOperationException ex) {
+ // In the future, consider adding more common serdes for common complex types used within Lucene
+ String exceptionMessage = "Custom analyzer argument does not support " + clazz.getName() + " type";
+ LOGGER.error(exceptionMessage);
+ throw new ReflectiveOperationException(exceptionMessage);
+ }
}
public static StandardAnalyzer getStandardAnalyzerWithCustomizedStopWords(@Nullable List<String> stopWordsInclude,
@@ -128,4 +273,25 @@ public class TextIndexUtils {
}
return new StandardAnalyzer(new CharArraySet(stopWordSet, true));
}
+
+ public static Constructor<QueryParserBase> getQueryParserWithStringAndAnalyzerTypeConstructor(
+ String queryParserClassName) throws ReflectiveOperationException {
+ // Fail-fast if the specified query parser class is not a QueryParserBase class
+ final Class<?> queryParserClass = Class.forName(queryParserClassName);
+ if (!QueryParserBase.class.isAssignableFrom(queryParserClass)) {
+ throw new ReflectiveOperationException("The specified lucene query parser class " + queryParserClassName
+ + " is not assignable from " + QueryParserBase.class.getName());
+ }
+ // Fail-fast if the query parser does not have the required constructor used by this class
+ try {
+ queryParserClass.getConstructor(String.class, Analyzer.class);
+ } catch (NoSuchMethodException ex) {
+ throw new NoSuchMethodException("The specified lucene query parser class " + queryParserClassName
+ + " is not assignable because the class does not have the required constructor method with parameter type "
+ + "[String.class, Analyzer.class]"
+ );
+ }
+
+ return (Constructor<QueryParserBase>) queryParserClass.getConstructor(String.class, Analyzer.class);
+ }
}
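
As a quick illustration of the coercion helpers above (the class name and values below are hypothetical, not part of the patch), parseSupportedTypes resolves the primitive-type spelling to a Class object and parseSupportedTypeValues coerces the matching CSV string value before the analyzer constructor lookup:

    import org.apache.pinot.segment.local.segment.store.TextIndexUtils;

    public class AnalyzerArgCoercionExample {
      public static void main(String[] args) throws Exception {
        // "java.lang.Integer.TYPE" resolves to the primitive class int
        Class<?> intType = TextIndexUtils.parseSupportedTypes("java.lang.Integer.TYPE");
        // the string "42" is then coerced (and auto-boxed) to the Integer 42
        Object value = TextIndexUtils.parseSupportedTypeValues("42", intType);
        System.out.println(intType + " -> " + value);  // prints: int -> 42
      }
    }
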
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
index 84254060b3..2618930bf6 100644
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
@@ -42,6 +42,7 @@ import
org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeLucene
import
org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeLuceneTextIndexSearcherPool;
import
org.apache.pinot.segment.local.segment.index.column.PhysicalColumnIndexContainer;
import org.apache.pinot.segment.local.segment.index.loader.IndexLoadingConfig;
+import
org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.local.segment.store.SegmentLocalFSDirectory;
import
org.apache.pinot.segment.local.segment.virtualcolumn.VirtualColumnProviderFactory;
import org.apache.pinot.segment.spi.ColumnMetadata;
@@ -480,9 +481,11 @@ public class RealtimeSegmentConverterTest {
String tableNameWithType = tableConfig.getTableName();
String segmentName = "testTable__0__0__123456";
IndexingConfig indexingConfig = tableConfig.getIndexingConfig();
- TextIndexConfig textIndexConfig =
- new TextIndexConfig(false, null, null, false, false, Collections.emptyList(), Collections.emptyList(), false,
- 500, null, false, reuseMutableIndex, luceneNRTCachingDirectoryMaxBufferSizeMB);
+ TextIndexConfig textIndexConfig = new TextIndexConfigBuilder()
+ .withUseANDForMultiTermQueries(false)
+ .withReuseMutableIndex(reuseMutableIndex)
+ .withLuceneNRTCachingDirectoryMaxBufferSizeMB(luceneNRTCachingDirectoryMaxBufferSizeMB)
+ .build();
RealtimeSegmentConfig.Builder realtimeSegmentConfigBuilder =
new
RealtimeSegmentConfig.Builder().setTableNameWithType(tableNameWithType).setSegmentName(segmentName)
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/LuceneMutableTextIndexTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/LuceneMutableTextIndexTest.java
index cc46486606..d88c134b62 100644
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/LuceneMutableTextIndexTest.java
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/LuceneMutableTextIndexTest.java
@@ -24,8 +24,13 @@ import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
+import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.io.FileUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.pinot.common.metrics.ServerMetrics;
+import
org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.spi.index.TextIndexConfig;
import org.apache.pinot.util.TestUtils;
import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
@@ -39,11 +44,13 @@ import static org.testng.Assert.assertEquals;
public class LuceneMutableTextIndexTest {
+ private static final AtomicInteger SEGMENT_NAME_SUFFIX_COUNTER = new AtomicInteger(0);
private static final File INDEX_DIR = new File(FileUtils.getTempDirectory(), "LuceneMutableIndexTest");
private static final String TEXT_COLUMN_NAME = "testColumnName";
+ private static final String CUSTOM_ANALYZER_FQCN = CustomAnalyzer.class.getName();
+ private static final String CUSTOM_QUERY_PARSER_FQCN = CustomQueryParser.class.getName();
private static final RealtimeLuceneTextIndexSearcherPool SEARCHER_POOL = RealtimeLuceneTextIndexSearcherPool.init(1);
-
private RealtimeLuceneTextIndex _realtimeLuceneTextIndex;
public LuceneMutableTextIndexTest() {
@@ -51,6 +58,96 @@ public class LuceneMutableTextIndexTest {
ServerMetrics.register(mock(ServerMetrics.class));
}
+ @Test
+ public void testDefaultAnalyzerAndDefaultQueryParser() {
+ // Test queries with standard analyzer with default configurations used by
Pinot
+ configureIndex(null, null, null, null);
+ assertEquals(_realtimeLuceneTextIndex.getDocIds("stream"),
ImmutableRoaringBitmap.bitmapOf(0));
+ assertEquals(_realtimeLuceneTextIndex.getDocIds("/.*house.*/"),
ImmutableRoaringBitmap.bitmapOf(1));
+ assertEquals(_realtimeLuceneTextIndex.getDocIds("invalid"),
ImmutableRoaringBitmap.bitmapOf());
+ }
+
+ @Test
+ public void testCustomAnalyzerWithNoArgsAndDefaultQueryParser() {
+ // Test query with CustomKeywordAnalyzer without any args
+ configureIndex(CUSTOM_ANALYZER_FQCN, null, null, null);
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "/.*processing for data ware.*/"),
ImmutableRoaringBitmap.bitmapOf(1));
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "columnar processing for data warehouses"),
ImmutableRoaringBitmap.bitmapOf(1));
+ }
+
+ @Test
+ public void testCustomAnalyzerWithNoArgsAndCustomQueryParser() {
+ // Test queries with CustomKeywordAnalyzer without any args and
CustomQueryParser
+ configureIndex(CUSTOM_ANALYZER_FQCN, null, null, CUSTOM_QUERY_PARSER_FQCN);
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "/.*processing for data ware.*/"),
ImmutableRoaringBitmap.bitmapOf(1));
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "columnar processing for data warehouses"),
ImmutableRoaringBitmap.bitmapOf(1));
+ }
+
+ @Test
+ public void testCustomAnalyzerWithTwoStringArgsAndCustomQueryParser() {
+ // Test queries with CustomKeywordAnalyzer with two java.lang.String args
and CustomQueryParser
+ configureIndex(CUSTOM_ANALYZER_FQCN,
+ "a,b", "java.lang.String, java.lang.String",
CUSTOM_QUERY_PARSER_FQCN);
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "/.*processing for data ware.*/"),
ImmutableRoaringBitmap.bitmapOf(1));
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "columnar processing for data warehouses"),
ImmutableRoaringBitmap.bitmapOf(1));
+ }
+
+ @Test
+ public void
testCustomAnalyzerWithOneStringOneIntegerParametersAndCustomQueryParser() {
+ // Test queries with CustomKeywordAnalyzer w/ two String.class args and
ExtendedQueryParser
+ configureIndex(CUSTOM_ANALYZER_FQCN,
+ "a,123", "java.lang.String,java.lang.Integer",
CUSTOM_QUERY_PARSER_FQCN);
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "/.*processing for data ware.*/"),
ImmutableRoaringBitmap.bitmapOf(1));
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "columnar processing for data warehouses"),
ImmutableRoaringBitmap.bitmapOf(1));
+ }
+
+ @Test
+ public void
testCustomAnalyzerWithOnePrimitiveIntParametersAndCustomQueryParser() {
+ // Test queries with CustomKeywordAnalyzer w/ two String.class args and
ExtendedQueryParser
+ configureIndex(CUSTOM_ANALYZER_FQCN,
+ "123", "java.lang.Integer.TYPE", CUSTOM_QUERY_PARSER_FQCN);
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "/.*processing for data ware.*/"),
ImmutableRoaringBitmap.bitmapOf(1));
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "columnar processing for data warehouses"),
ImmutableRoaringBitmap.bitmapOf(1));
+ }
+
+ private static class CustomQueryParser extends QueryParser {
+ public CustomQueryParser(String field, Analyzer analyzer) {
+ super(field, analyzer);
+ }
+ }
+
+ public static class CustomAnalyzer extends Analyzer {
+ public CustomAnalyzer() {
+ super();
+ }
+
+ public CustomAnalyzer(String stringArg1, String stringArg2) {
+ super();
+ }
+
+ public CustomAnalyzer(String stringArg1, Integer integerArg2) {
+ super();
+ }
+
+ public CustomAnalyzer(int intArg2) {
+ super();
+ }
+
+ protected Analyzer.TokenStreamComponents createComponents(String
fieldName) {
+ return new Analyzer.TokenStreamComponents(new KeywordTokenizer());
+ }
+ }
+
private String[][] getTextData() {
return new String[][]{
{"realtime stream processing"}, {"publish subscribe", "columnar
processing for data warehouses", "concurrency"}
@@ -63,13 +160,27 @@ public class LuceneMutableTextIndexTest {
};
}
- @BeforeClass
- public void setUp()
- throws Exception {
- TextIndexConfig config =
- new TextIndexConfig(false, null, null, false, false, null, null, true,
500, null, false, false, 0);
- _realtimeLuceneTextIndex =
- new RealtimeLuceneTextIndex(TEXT_COLUMN_NAME, INDEX_DIR,
"table__0__1__20240602T0014Z", config);
+ private void configureIndex(String analyzerClass, String analyzerClassArgs, String analyzerClassArgTypes,
+ String queryParserClass) {
+ TextIndexConfigBuilder builder = new TextIndexConfigBuilder();
+ if (null != analyzerClass) {
+ builder.withLuceneAnalyzerClass(analyzerClass);
+ }
+ if (null != analyzerClassArgs) {
+ builder.withLuceneAnalyzerClassArgs(analyzerClassArgs);
+ }
+ if (null != analyzerClassArgTypes) {
+ builder.withLuceneAnalyzerClassArgTypes(analyzerClassArgTypes);
+ }
+ if (null != queryParserClass) {
+ builder.withLuceneQueryParserClass(queryParserClass);
+ }
+ TextIndexConfig config = builder.withUseANDForMultiTermQueries(false).build();
+
+ // Note that segment name must be unique on each query setup, otherwise `testQueryCancellationIsSuccessful` method
+ // will cause unit test to fail due to inability to release a lock.
+ _realtimeLuceneTextIndex = new RealtimeLuceneTextIndex(TEXT_COLUMN_NAME, INDEX_DIR,
+ "table__0__1__20240601T1818Z" + SEGMENT_NAME_SUFFIX_COUNTER.getAndIncrement(), config);
String[][] documents = getTextData();
String[][] repeatedDocuments = getRepeatedData();
@@ -82,6 +193,22 @@ public class LuceneMutableTextIndexTest {
_realtimeLuceneTextIndex.add(row);
}
}
+
+ // ensure searches work after .commit() is called
+ _realtimeLuceneTextIndex.commit();
+
+ // sleep for index refresh
+ try {
+ Thread.sleep(100);
+ } catch (Exception e) {
+ // no-op
+ }
+ }
+
+ @BeforeClass
+ public void setUp()
+ throws Exception {
+ RealtimeLuceneIndexRefreshManager.getInstance().reset();
}
@AfterClass
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeAndLuceneMutableTextIndexTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeAndLuceneMutableTextIndexTest.java
index 84765353b0..4a463ebc30 100644
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeAndLuceneMutableTextIndexTest.java
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeAndLuceneMutableTextIndexTest.java
@@ -24,6 +24,7 @@ import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.search.SearcherManager;
import org.apache.pinot.common.metrics.ServerMetrics;
+import
org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.spi.index.TextIndexConfig;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
@@ -75,8 +76,10 @@ public class NativeAndLuceneMutableTextIndexTest {
throws Exception {
RealtimeLuceneIndexRefreshManager.init(1, 10);
ServerMetrics.register(mock(ServerMetrics.class));
- TextIndexConfig config =
- new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, false, false, 0);
+ TextIndexConfig config = new TextIndexConfigBuilder()
+ .withUseANDForMultiTermQueries(false)
+ .build();
+
_realtimeLuceneTextIndex =
new RealtimeLuceneTextIndex(TEXT_COLUMN_NAME, INDEX_DIR, "table__0__1__20240602T0014Z", config);
_nativeMutableTextIndex = new NativeMutableTextIndex(TEXT_COLUMN_NAME);
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/FilePerIndexDirectoryTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/FilePerIndexDirectoryTest.java
index 65a1621dfc..68f4cc6cbe 100644
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/FilePerIndexDirectoryTest.java
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/FilePerIndexDirectoryTest.java
@@ -201,8 +201,8 @@ public class FilePerIndexDirectoryTest {
@Test
public void testRemoveTextIndices()
throws IOException {
- TextIndexConfig config =
- new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, false, false, 0);
+ TextIndexConfig config = new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, null,
+ null, null, false, false, 0);
try (FilePerIndexDirectory fpi = new FilePerIndexDirectory(TEMP_DIR,
_segmentMetadata, ReadMode.mmap);
LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo",
TEMP_DIR, true, false, null, null,
config);
@@ -265,8 +265,8 @@ public class FilePerIndexDirectoryTest {
@Test
public void testGetColumnIndices()
throws IOException {
- TextIndexConfig config =
- new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, false, false, 0);
+ TextIndexConfig config = new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, null,
+ null, null, false, false, 0);
// Write sth to buffers and flush them to index files on disk
try (FilePerIndexDirectory fpi = new FilePerIndexDirectory(TEMP_DIR,
_segmentMetadata, ReadMode.mmap);
LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo",
TEMP_DIR, true, false, null, null,
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/SingleFileIndexDirectoryTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/SingleFileIndexDirectoryTest.java
index 7e9933d777..88c58e2672 100644
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/SingleFileIndexDirectoryTest.java
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/SingleFileIndexDirectoryTest.java
@@ -234,8 +234,8 @@ public class SingleFileIndexDirectoryTest {
@Test
public void testRemoveTextIndices()
throws IOException, ConfigurationException {
- TextIndexConfig config =
- new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, false, false, 0);
+ TextIndexConfig config = new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, null,
+ null, null, false, false, 0);
try (SingleFileIndexDirectory sfd = new SingleFileIndexDirectory(TEMP_DIR,
_segmentMetadata, ReadMode.mmap);
LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo",
TEMP_DIR, true, false, null, null,
config);
@@ -342,8 +342,8 @@ public class SingleFileIndexDirectoryTest {
@Test
public void testGetColumnIndices()
throws Exception {
- TextIndexConfig config =
- new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, false, false, 0);
+ TextIndexConfig config = new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, null,
+ null, null, false, false, 0);
try (SingleFileIndexDirectory sfd = new SingleFileIndexDirectory(TEMP_DIR,
_segmentMetadata, ReadMode.mmap);
LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo",
TEMP_DIR, true, false, null, null,
config);
diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java
index 522fb7bbf2..6f3a768837 100644
--- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java
+++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java
@@ -27,6 +27,7 @@ import java.util.List;
import java.util.Map;
import java.util.Objects;
import javax.annotation.Nullable;
+import org.apache.pinot.segment.spi.utils.CsvParser;
import org.apache.pinot.spi.config.table.FSTType;
import org.apache.pinot.spi.config.table.FieldConfig;
import org.apache.pinot.spi.config.table.IndexConfig;
@@ -38,9 +39,11 @@ public class TextIndexConfig extends IndexConfig {
private static final boolean LUCENE_INDEX_ENABLE_PREFIX_SUFFIX_MATCH_IN_PHRASE_SEARCH = false;
private static final boolean LUCENE_INDEX_REUSE_MUTABLE_INDEX = false;
private static final int LUCENE_INDEX_NRT_CACHING_DIRECTORY_MAX_BUFFER_SIZE_MB = 0;
+
public static final TextIndexConfig DISABLED =
new TextIndexConfig(true, null, null, false, false, Collections.emptyList(), Collections.emptyList(), false,
- LUCENE_INDEX_DEFAULT_MAX_BUFFER_SIZE_MB, null, false, false, 0);
+ LUCENE_INDEX_DEFAULT_MAX_BUFFER_SIZE_MB, null, null, null, null, false, false, 0);
+
private final FSTType _fstType;
@Nullable
private final Object _rawValueForTextIndex;
@@ -51,6 +54,9 @@ public class TextIndexConfig extends IndexConfig {
private final boolean _luceneUseCompoundFile;
private final int _luceneMaxBufferSizeMB;
private final String _luceneAnalyzerClass;
+ private final List<String> _luceneAnalyzerClassArgs;
+ private final List<String> _luceneAnalyzerClassArgTypes;
+ private final String _luceneQueryParserClass;
private final boolean _enablePrefixSuffixMatchingInPhraseQueries;
private final boolean _reuseMutableIndex;
private final int _luceneNRTCachingDirectoryMaxBufferSizeMB;
@@ -65,6 +71,9 @@ public class TextIndexConfig extends IndexConfig {
@JsonProperty("luceneUseCompoundFile") Boolean luceneUseCompoundFile,
@JsonProperty("luceneMaxBufferSizeMB") Integer luceneMaxBufferSizeMB,
@JsonProperty("luceneAnalyzerClass") String luceneAnalyzerClass,
+ @JsonProperty("luceneAnalyzerClassArgs") String luceneAnalyzerClassArgs,
+ @JsonProperty("luceneAnalyzerClassArgTypes") String luceneAnalyzerClassArgTypes,
+ @JsonProperty("luceneQueryParserClass") String luceneQueryParserClass,
@JsonProperty("enablePrefixSuffixMatchingInPhraseQueries") Boolean enablePrefixSuffixMatchingInPhraseQueries,
@JsonProperty("reuseMutableIndex") Boolean reuseMutableIndex,
@JsonProperty("luceneNRTCachingDirectoryMaxBufferSizeMB") Integer luceneNRTCachingDirectoryMaxBufferSizeMB) {
@@ -81,6 +90,14 @@ public class TextIndexConfig extends IndexConfig {
luceneMaxBufferSizeMB == null ?
LUCENE_INDEX_DEFAULT_MAX_BUFFER_SIZE_MB : luceneMaxBufferSizeMB;
_luceneAnalyzerClass = (luceneAnalyzerClass == null ||
luceneAnalyzerClass.isEmpty())
? FieldConfig.TEXT_INDEX_DEFAULT_LUCENE_ANALYZER_CLASS :
luceneAnalyzerClass;
+
+ // Note that we cannot depend on jackson's default behavior to automatically coerce the comma delimited args to
+ // List<String>. This is because the args may contain comma and other special characters such as space. Therefore,
+ // we use our own csv parser to parse the values directly.
+ _luceneAnalyzerClassArgs = CsvParser.parse(luceneAnalyzerClassArgs, true, false);
+ _luceneAnalyzerClassArgTypes = CsvParser.parse(luceneAnalyzerClassArgTypes, false, true);
+ _luceneQueryParserClass = luceneQueryParserClass == null
+ ? FieldConfig.TEXT_INDEX_DEFAULT_LUCENE_QUERY_PARSER_CLASS : luceneQueryParserClass;
_enablePrefixSuffixMatchingInPhraseQueries =
enablePrefixSuffixMatchingInPhraseQueries == null ?
LUCENE_INDEX_ENABLE_PREFIX_SUFFIX_MATCH_IN_PHRASE_SEARCH
: enablePrefixSuffixMatchingInPhraseQueries;
@@ -141,6 +158,29 @@ public class TextIndexConfig extends IndexConfig {
return _luceneAnalyzerClass;
}
+ /**
+ * Lucene analyzer arguments in String type. At runtime, the string representations are best-effort coerced into the
+ * proper type with the fully-qualified value type specified in luceneAnalyzerClassArgTypes
+ */
+ public List<String> getLuceneAnalyzerClassArgs() {
+ return _luceneAnalyzerClassArgs;
+ }
+
+ /**
+ * Lucene analyzer fully qualified argument value types for each argument. At runtime, the values specified in the
+ * luceneAnalyzerClassArgs (string representation) are best-effort coerced into the specified value type.
+ */
+ public List<String> getLuceneAnalyzerClassArgTypes() {
+ return _luceneAnalyzerClassArgTypes;
+ }
+
+ /**
+ * Lucene query parser fully qualified class name specifying which lucene query parser class to use for query parsing
+ */
+ public String getLuceneQueryParserClass() {
+ return _luceneQueryParserClass;
+ }
+
/**
* Whether to enable prefix and suffix wildcard term matching (i.e.,
.*value for prefix and value.* for suffix
* term matching) in a phrase query. By default, Pinot today treats .* in a
phrase query like ".*value str1 value.*"
@@ -171,6 +211,9 @@ public class TextIndexConfig extends IndexConfig {
protected boolean _luceneUseCompoundFile =
LUCENE_INDEX_DEFAULT_USE_COMPOUND_FILE;
protected int _luceneMaxBufferSizeMB =
LUCENE_INDEX_DEFAULT_MAX_BUFFER_SIZE_MB;
protected String _luceneAnalyzerClass =
FieldConfig.TEXT_INDEX_DEFAULT_LUCENE_ANALYZER_CLASS;
+ protected List<String> _luceneAnalyzerClassArgs = new ArrayList<>();
+ protected List<String> _luceneAnalyzerClassArgTypes = new ArrayList<>();
+ protected String _luceneQueryParserClass =
FieldConfig.TEXT_INDEX_DEFAULT_LUCENE_QUERY_PARSER_CLASS;
protected boolean _enablePrefixSuffixMatchingInPhraseQueries =
LUCENE_INDEX_ENABLE_PREFIX_SUFFIX_MATCH_IN_PHRASE_SEARCH;
protected boolean _reuseMutableIndex = LUCENE_INDEX_REUSE_MUTABLE_INDEX;
@@ -189,6 +232,9 @@ public class TextIndexConfig extends IndexConfig {
_luceneUseCompoundFile = other._luceneUseCompoundFile;
_luceneMaxBufferSizeMB = other._luceneMaxBufferSizeMB;
_luceneAnalyzerClass = other._luceneAnalyzerClass;
+ _luceneAnalyzerClassArgs = other._luceneAnalyzerClassArgs;
+ _luceneAnalyzerClassArgTypes = other._luceneAnalyzerClassArgTypes;
+ _luceneQueryParserClass = other._luceneQueryParserClass;
_enablePrefixSuffixMatchingInPhraseQueries =
other._enablePrefixSuffixMatchingInPhraseQueries;
_reuseMutableIndex = other._reuseMutableIndex;
_luceneNRTCachingDirectoryMaxBufferSizeMB =
other._luceneNRTCachingDirectoryMaxBufferSizeMB;
@@ -197,7 +243,10 @@ public class TextIndexConfig extends IndexConfig {
public TextIndexConfig build() {
return new TextIndexConfig(false, _fstType, _rawValueForTextIndex,
_enableQueryCache, _useANDForMultiTermQueries,
_stopWordsInclude, _stopWordsExclude, _luceneUseCompoundFile,
_luceneMaxBufferSizeMB, _luceneAnalyzerClass,
- _enablePrefixSuffixMatchingInPhraseQueries, _reuseMutableIndex,
_luceneNRTCachingDirectoryMaxBufferSizeMB);
+ CsvParser.serialize(_luceneAnalyzerClassArgs, true, false),
+ CsvParser.serialize(_luceneAnalyzerClassArgTypes, true, false),
+ _luceneQueryParserClass, _enablePrefixSuffixMatchingInPhraseQueries,
_reuseMutableIndex,
+ _luceneNRTCachingDirectoryMaxBufferSizeMB);
}
public abstract AbstractBuilder withProperties(@Nullable Map<String,
String> textIndexProperties);
@@ -207,6 +256,11 @@ public class TextIndexConfig extends IndexConfig {
return this;
}
+ public AbstractBuilder withUseANDForMultiTermQueries(boolean
useANDForMultiTermQueries) {
+ _useANDForMultiTermQueries = useANDForMultiTermQueries;
+ return this;
+ }
+
public AbstractBuilder withStopWordsInclude(List<String> stopWordsInclude)
{
_stopWordsInclude = stopWordsInclude;
return this;
@@ -232,6 +286,31 @@ public class TextIndexConfig extends IndexConfig {
return this;
}
+ public AbstractBuilder withLuceneAnalyzerClassArgs(String
luceneAnalyzerClassArgs) {
+ _luceneAnalyzerClassArgs = CsvParser.parse(luceneAnalyzerClassArgs,
true, false);
+ return this;
+ }
+
+ public AbstractBuilder withLuceneAnalyzerClassArgs(List<String>
luceneAnalyzerClassArgs) {
+ _luceneAnalyzerClassArgs = luceneAnalyzerClassArgs;
+ return this;
+ }
+
+ public AbstractBuilder withLuceneAnalyzerClassArgTypes(String
luceneAnalyzerClassArgTypes) {
+ _luceneAnalyzerClassArgTypes =
CsvParser.parse(luceneAnalyzerClassArgTypes, false, true);
+ return this;
+ }
+
+ public AbstractBuilder withLuceneAnalyzerClassArgTypes(List<String>
luceneAnalyzerClassArgTypes) {
+ _luceneAnalyzerClassArgTypes = luceneAnalyzerClassArgTypes;
+ return this;
+ }
+
+ public AbstractBuilder withLuceneQueryParserClass(String
luceneQueryParserClass) {
+ _luceneQueryParserClass = luceneQueryParserClass;
+ return this;
+ }
+
public AbstractBuilder withEnablePrefixSuffixMatchingInPhraseQueries(
boolean enablePrefixSuffixMatchingInPhraseQueries) {
_enablePrefixSuffixMatchingInPhraseQueries =
enablePrefixSuffixMatchingInPhraseQueries;
@@ -269,14 +348,20 @@ public class TextIndexConfig extends IndexConfig {
&& _luceneNRTCachingDirectoryMaxBufferSizeMB ==
that._luceneNRTCachingDirectoryMaxBufferSizeMB
&& _fstType == that._fstType && Objects.equals(_rawValueForTextIndex,
that._rawValueForTextIndex)
&& Objects.equals(_stopWordsInclude, that._stopWordsInclude) &&
Objects.equals(_stopWordsExclude,
- that._stopWordsExclude) && Objects.equals(_luceneAnalyzerClass,
that._luceneAnalyzerClass);
+ that._stopWordsExclude) && _luceneUseCompoundFile ==
that._luceneUseCompoundFile
+ && _luceneMaxBufferSizeMB == that._luceneMaxBufferSizeMB
+ && Objects.equals(_luceneAnalyzerClass, that._luceneAnalyzerClass)
+ && Objects.equals(_luceneAnalyzerClassArgs,
that._luceneAnalyzerClassArgs)
+ && Objects.equals(_luceneAnalyzerClassArgTypes,
that._luceneAnalyzerClassArgTypes)
+ && Objects.equals(_luceneQueryParserClass,
that._luceneQueryParserClass);
}
@Override
public int hashCode() {
return Objects.hash(super.hashCode(), _fstType, _rawValueForTextIndex,
_enableQueryCache,
_useANDForMultiTermQueries, _stopWordsInclude, _stopWordsExclude,
_luceneUseCompoundFile,
- _luceneMaxBufferSizeMB, _luceneAnalyzerClass,
_enablePrefixSuffixMatchingInPhraseQueries, _reuseMutableIndex,
+ _luceneMaxBufferSizeMB, _luceneAnalyzerClass,
_luceneAnalyzerClassArgs, _luceneAnalyzerClassArgTypes,
+ _luceneQueryParserClass, _enablePrefixSuffixMatchingInPhraseQueries,
_reuseMutableIndex,
_luceneNRTCachingDirectoryMaxBufferSizeMB);
}
}
diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/utils/CsvParser.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/utils/CsvParser.java
new file mode 100644
index 0000000000..b4c2aeacfb
--- /dev/null
+++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/utils/CsvParser.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.spi.utils;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import javax.annotation.Nullable;
+
+public class CsvParser {
+ private CsvParser() {
+ // Hide utility class default constructor
+ }
+
+ /**
+ * Parse the input csv string with customizable parsing behavior. Sometimes the individual values may contain comma
+ * and other white space characters. These characters are sometimes expected to be part of the actual argument.
+ *
+ * @param input string to split on comma
+ * @param escapeComma if true, we don't split on escaped commas, and we replace "\," with "," after the split
+ * @param trim whether we should trim each tokenized terms
+ * @return a list of values, empty list if input is empty or null
+ */
+ public static List<String> parse(@Nullable String input, boolean escapeComma, boolean trim) {
+ if (null == input || input.isEmpty()) {
+ return Collections.emptyList();
+ }
+
+ Stream<String> tokenStream;
+ if (escapeComma) {
+ // Use regular expression to split on "," unless it is "\,"
+ // Use a non-positive limit to apply the replacement as many times as possible and to ensure trailing empty
+ // strings shall not be discarded
+ tokenStream = Arrays.stream(input.split("(?<!\\\\),", -1))
+ .map(s -> s.replace("\\,", ","));
+ } else {
+ tokenStream = Arrays.stream(input.split(","));
+ }
+
+ if (trim) {
+ tokenStream = tokenStream.map(String::trim);
+ }
+
+ return tokenStream.collect(Collectors.toList());
+ }
+
+ /**
+ * Serialize the input list of strings with customized serialization behavior.
+ * @param input containing a list of string to be serialized
+ * @param escapeComma if true, escape commas by replacing "," with "\," before the join
+ * @param trim whether we should trim each tokenized terms before serialization
+ * @return serialized string representing the input list of string
+ */
+ public static String serialize(List<String> input, boolean escapeComma, boolean trim) {
+ Stream<String> tokenStream = input.stream();
+ if (escapeComma) {
+ tokenStream = tokenStream.map(s -> s.replaceAll(",", Matcher.quoteReplacement("\\,")));
+ }
+ if (trim) {
+ tokenStream = tokenStream.map(String::trim);
+ }
+ return tokenStream.collect(Collectors.joining(","));
+ }
+}
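
A small, self-contained sketch of the expected CsvParser round trip (the class name and sample values below are illustrative, based on the parse/serialize behavior defined above):

    import java.util.List;
    import org.apache.pinot.segment.spi.utils.CsvParser;

    public class CsvParserExample {
      public static void main(String[] args) {
        // An escaped comma stays inside one token; trim strips surrounding whitespace
        List<String> tokens = CsvParser.parse("hello\\, world, 42", true, true);
        System.out.println(tokens.size() + " tokens: " + tokens);     // 2 tokens: [hello, world, 42]
        // serialize() re-escapes commas so the tokens can be joined back into one CSV value
        System.out.println(CsvParser.serialize(tokens, true, false)); // hello\, world,42
      }
    }
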
diff --git a/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/utils/CsvParserTest.java b/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/utils/CsvParserTest.java
new file mode 100644
index 0000000000..b3fd5427f4
--- /dev/null
+++ b/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/utils/CsvParserTest.java
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.spi.utils;
+
+import java.util.Arrays;
+import java.util.List;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+public class CsvParserTest {
+ @Test
+ public void testEscapeTrueTrimFalse() {
+ String input = " \\,.\n\t()[]{}\"':=-_$\\?@&|#+/,:=[]$@&|#";
+ List<String> actualParsedOutput = CsvParser.parse(input, true, false);
+ List<String> expectedParsedOutput = Arrays.asList("
,.\n\t()[]{}\"':=-_$\\?@&|#+/", ":=[]$@&|#");
+ Assert.assertEquals(actualParsedOutput, expectedParsedOutput);
+ Assert.assertEquals(CsvParser.serialize(actualParsedOutput, true,
false), input);
+ }
+
+ @Test
+ public void testEscapeTrueTrimTrue() {
+ String input = " \\,.\n\t()[]{}\"':=-_$\\?@&|#+/,:=[]$@&|#";
+ List<String> expectedOutput =
Arrays.asList(",.\n\t()[]{}\"':=-_$\\?@&|#+/", ":=[]$@&|#");
+ Assert.assertEquals(CsvParser.parse(input, true, true),
expectedOutput);
+ }
+
+ @Test
+ public void testEscapeFalseTrimTrue() {
+ String input = "abc\\,def.ghi, abc.def.ghi\n";
+ List<String> expectedOutput = Arrays.asList("abc\\", "def.ghi",
"abc.def.ghi");
+ Assert.assertEquals(CsvParser.parse(input, false, true),
expectedOutput);
+ }
+}
diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
index 9f6f6b88f8..f6e6f3f99d 100644
--- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
@@ -52,8 +52,13 @@ public class FieldConfig extends BaseJsonConfig {
public static final String TEXT_INDEX_LUCENE_USE_COMPOUND_FILE = "luceneUseCompoundFile";
public static final String TEXT_INDEX_LUCENE_MAX_BUFFER_SIZE_MB = "luceneMaxBufferSizeMB";
public static final String TEXT_INDEX_LUCENE_ANALYZER_CLASS = "luceneAnalyzerClass";
+ public static final String TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARGS = "luceneAnalyzerClassArgs";
+ public static final String TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARG_TYPES = "luceneAnalyzerClassArgTypes";
+ public static final String TEXT_INDEX_LUCENE_QUERY_PARSER_CLASS = "luceneQueryParserClass";
public static final String TEXT_INDEX_DEFAULT_LUCENE_ANALYZER_CLASS =
"org.apache.lucene.analysis.standard.StandardAnalyzer";
+ public static final String TEXT_INDEX_DEFAULT_LUCENE_QUERY_PARSER_CLASS =
+ "org.apache.lucene.queryparser.classic.QueryParser";
public static final String TEXT_INDEX_STOP_WORD_SEPERATOR = ",";
// "native" for native, default is Lucene
public static final String TEXT_FST_TYPE = "fstType";
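
For completeness, the same settings can also be supplied as text-index fieldConfig properties keyed by the constants added above. A hedged sketch (placeholder analyzer FQCN and argument values) using TextIndexConfigBuilder.withProperties:

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
    import org.apache.pinot.segment.spi.index.TextIndexConfig;
    import org.apache.pinot.spi.config.table.FieldConfig;

    public class TextIndexPropertiesExample {
      public static void main(String[] args) {
        Map<String, String> props = new HashMap<>();
        // Keys are the FieldConfig constants added in this diff; values are placeholders
        props.put(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS, "com.example.MyAnalyzer");
        props.put(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARGS, "stopwords.txt,3");
        props.put(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARG_TYPES, "java.lang.String,java.lang.Integer.TYPE");
        props.put(FieldConfig.TEXT_INDEX_LUCENE_QUERY_PARSER_CLASS, "org.apache.lucene.queryparser.classic.QueryParser");
        TextIndexConfig config = new TextIndexConfigBuilder().withProperties(props).build();
        System.out.println(config.getLuceneAnalyzerClassArgs());
      }
    }
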
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]