This is an automated email from the ASF dual-hosted git repository.
tingchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new b657699b7c [Feature] Support configurable Lucene analyzer with args and configurable query parser (#13003)
b657699b7c is described below
commit b657699b7c4a3320154131cd89d24c6b73b4ce06
Author: Jack Luo <[email protected]>
AuthorDate: Fri Aug 2 08:19:29 2024 +0800
[Feature] Support configurable Lucene analyzer with args and configurable query parser (#13003)
* Support custom Lucene analyzer with arguments and custom query parser
* Add Apache license header to CsvParserTest.java
* Adjusted dependency import order for javax.annotation.Nullable in CsvParser.java
* Applied mvn spotless:apply on CsvParserTest.java
* Corrected a spelling mistake in the comments in RealtimeLuceneTextIndex.java
* Adjusted parsing method naming to improve clarity in TextIndexUtils.java
* Removed redundant reflection logic in TextIndexUtils.java
* Removed redundant string replacement streaming pipeline in CsvParser.java
* Broke up a long line (over 120 chars) into two lines in TextIndexUtils.java
* Adjusted the scenarios in which default Analyzers with custom arguments should be used
* Emit Class.getName() instead of Class.getCanonicalName() in logs within RealtimeLuceneTextIndex.java
* Moved `||` operator in if condition to a new line to resolve style issue
* Addressed code review concerns from @Bill-hbrhbr
* Fix style error
* Fix unit test
* Run maven spotless:apply
* Patched argument parsing bug
* Updated PR with latest code base
* Addressed comments from @chenboat
* Fix build issues resulting from merge conflicts
* Removed trailing white space
* Fix style-check errors
* Fix default stop words initialization values
* Refactored builder code in unit test inside LuceneMutableTextIndexTest.java
Signed-off-by: Jack Luo <[email protected]>
* Fix style bug
Signed-off-by: Jack Luo <[email protected]>
---------
Signed-off-by: Jack Luo <[email protected]>
---
.../invertedindex/RealtimeLuceneTextIndex.java | 41 ++++-
.../creator/impl/text/LuceneTextIndexCreator.java | 10 +-
.../index/readers/text/LuceneTextIndexReader.java | 27 ++--
.../segment/index/text/TextIndexConfigBuilder.java | 22 +++
.../local/segment/store/TextIndexUtils.java | 174 ++++++++++++++++++++-
.../converter/RealtimeSegmentConverterTest.java | 9 +-
.../invertedindex/LuceneMutableTextIndexTest.java | 143 ++++++++++++++++-
.../NativeAndLuceneMutableTextIndexTest.java | 7 +-
.../segment/store/FilePerIndexDirectoryTest.java | 8 +-
.../store/SingleFileIndexDirectoryTest.java | 8 +-
.../pinot/segment/spi/index/TextIndexConfig.java | 93 ++++++++++-
.../apache/pinot/segment/spi/utils/CsvParser.java | 83 ++++++++++
.../pinot/segment/spi/utils/CsvParserTest.java | 49 ++++++
.../apache/pinot/spi/config/table/FieldConfig.java | 5 +
14 files changed, 627 insertions(+), 52 deletions(-)
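
For readers who want to try the new options programmatically, here is a minimal sketch using the builder touched by this change. The analyzer FQCN, argument values, and argument types are illustrative placeholders (the analyzer is assumed to have a matching (String, int) constructor); only the builder methods themselves come from this diff:

    import org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
    import org.apache.pinot.segment.spi.index.TextIndexConfig;

    public class TextIndexConfigExample {
      public static void main(String[] args) {
        TextIndexConfig config = new TextIndexConfigBuilder()
            // Analyzer must extend org.apache.lucene.analysis.Analyzer (placeholder FQCN below)
            .withLuceneAnalyzerClass("com.example.MyAnalyzer")
            // Constructor args as CSV; a literal comma inside a value can be escaped as "\,"
            .withLuceneAnalyzerClassArgs("stopwords.txt,3")
            // Matching argument types; primitives use the <Boxed>.TYPE spelling, e.g. java.lang.Integer.TYPE
            .withLuceneAnalyzerClassArgTypes("java.lang.String,java.lang.Integer.TYPE")
            // Query parser must extend org.apache.lucene.queryparser.classic.QueryParserBase
            .withLuceneQueryParserClass("org.apache.lucene.queryparser.classic.QueryParser")
            .build();
        System.out.println(config.getLuceneQueryParserClass());
      }
    }
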
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java
index 961f028629..0c854e9aca 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java
@@ -19,18 +19,20 @@
package org.apache.pinot.segment.local.realtime.impl.invertedindex;
import java.io.File;
+import java.lang.reflect.Constructor;
import java.util.concurrent.Callable;
import java.util.concurrent.Future;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SearcherManager;
import org.apache.pinot.common.utils.LLCSegmentName;
import org.apache.pinot.segment.local.indexsegment.mutable.MutableSegmentImpl;
import org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
+import org.apache.pinot.segment.local.segment.store.TextIndexUtils;
import org.apache.pinot.segment.local.utils.LuceneTextIndexUtils;
import org.apache.pinot.segment.spi.index.TextIndexConfig;
import org.apache.pinot.segment.spi.index.mutable.MutableTextIndex;
@@ -53,6 +55,7 @@ public class RealtimeLuceneTextIndex implements MutableTextIndex {
private final LuceneTextIndexCreator _indexCreator;
private SearcherManager _searcherManager;
private Analyzer _analyzer;
+ private Constructor<QueryParserBase> _queryParserClassConstructor;
private final String _column;
private final String _segmentName;
private final boolean _reuseMutableIndex;
@@ -91,6 +94,8 @@ public class RealtimeLuceneTextIndex implements MutableTextIndex {
llcSegmentName.getPartitionGroupId(), _indexCreator::getNumDocs);
_searcherManager.addListener(_refreshListener);
_analyzer = _indexCreator.getIndexWriter().getConfig().getAnalyzer();
+ _queryParserClassConstructor =
+ TextIndexUtils.getQueryParserWithStringAndAnalyzerTypeConstructor(config.getLuceneQueryParserClass());
_enablePrefixSuffixMatchingInPhraseQueries =
config.isEnablePrefixSuffixMatchingInPhraseQueries();
_reuseMutableIndex = config.isReuseMutableIndex();
@@ -137,12 +142,23 @@ public class RealtimeLuceneTextIndex implements MutableTextIndex {
Callable<MutableRoaringBitmap> searchCallable = () -> {
IndexSearcher indexSearcher = null;
try {
- QueryParser parser = new QueryParser(_column, _analyzer);
+ // Lucene query parsers are generally stateful and a new instance must be created per query.
+ QueryParserBase parser = _queryParserClassConstructor.newInstance(_column, _analyzer);
if (_enablePrefixSuffixMatchingInPhraseQueries) {
+ // Note: Lucene's built-in QueryParser has limited wildcard functionality in phrase queries. It does not use
+ // the provided analyzer when wildcards are present, defaulting to the default analyzer for tokenization.
+ // Additionally, it does not support wildcards that span across terms.
+ // For more details, see: https://github.com/elastic/elasticsearch/issues/22540
+ // Workaround: Use a custom query parser that correctly implements wildcard searches.
parser.setAllowLeadingWildcard(true);
}
Query query = parser.parse(searchQuery);
if (_enablePrefixSuffixMatchingInPhraseQueries) {
+ // Note: Lucene's built-in QueryParser has limited wildcard functionality in phrase queries. It does not use
+ // the provided analyzer when wildcards are present, defaulting to the default analyzer for tokenization.
+ // Additionally, it does not support wildcards that span across terms.
+ // For more details, see: https://github.com/elastic/elasticsearch/issues/22540
+ // Workaround: Use a custom query parser that correctly implements wildcard searches.
query = LuceneTextIndexUtils.convertToMultiTermSpanQuery(query);
}
indexSearcher = _searcherManager.acquire();
@@ -195,6 +211,27 @@ public class RealtimeLuceneTextIndex implements MutableTextIndex {
return actualDocIDs;
}
+ private Constructor<QueryParserBase> getQueryParserWithStringAndAnalyzerTypeConstructor(String queryParserClassName)
+ throws ReflectiveOperationException {
+ // Fail-fast if the specified query parser class is not a QueryParserBase class
+ final Class<?> queryParserClass = Class.forName(queryParserClassName);
+ if (!QueryParserBase.class.isAssignableFrom(queryParserClass)) {
+ throw new ReflectiveOperationException("The specified lucene query parser class " + queryParserClassName
+ + " is not assignable from " + QueryParserBase.class.getName());
+ }
+ // Fail-fast if the query parser does not have the required constructor used by this class
+ try {
+ queryParserClass.getConstructor(String.class, Analyzer.class);
+ } catch (NoSuchMethodException ex) {
+ throw new NoSuchMethodException("The specified lucene query parser class " + queryParserClassName
+ + " is not assignable because the class does not have the required constructor method with parameter "
+ + "type [String.class, Analyzer.class]"
+ );
+ }
+
+ return (Constructor<QueryParserBase>) queryParserClass.getConstructor(String.class, Analyzer.class);
+ }
+
@Override
public void commit() {
if (!_reuseMutableIndex) {
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
index a5262b489f..d38e2c04fe 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
@@ -27,7 +27,6 @@ import javax.annotation.Nullable;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
@@ -119,14 +118,7 @@ public class LuceneTextIndexCreator extends AbstractTextIndexCreator {
// to V3 if segmentVersion is set to V3 in SegmentGeneratorConfig.
_indexFile = getV1TextIndexFile(segmentIndexDir);
- Analyzer luceneAnalyzer;
- if (luceneAnalyzerClass.isEmpty() || luceneAnalyzerClass.equals(StandardAnalyzer.class.getName())) {
- luceneAnalyzer = TextIndexUtils.getStandardAnalyzerWithCustomizedStopWords(config.getStopWordsInclude(),
- config.getStopWordsExclude());
- } else {
- luceneAnalyzer = TextIndexUtils.getAnalyzerFromClassName(luceneAnalyzerClass);
- }
-
+ Analyzer luceneAnalyzer = TextIndexUtils.getAnalyzer(config);
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(luceneAnalyzer);
indexWriterConfig.setRAMBufferSizeMB(config.getLuceneMaxBufferSizeMB());
indexWriterConfig.setCommitOnClose(commit);
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java
index ed027903b9..ff5690d043 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java
@@ -21,15 +21,16 @@ package org.apache.pinot.segment.local.segment.index.readers.text;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
+import java.lang.reflect.Constructor;
import java.nio.ByteOrder;
import java.util.Map;
import javax.annotation.Nullable;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
@@ -67,6 +68,8 @@ public class LuceneTextIndexReader implements TextIndexReader {
private final DocIdTranslator _docIdTranslator;
private final Analyzer _analyzer;
private boolean _useANDForMultiTermQueries = false;
+ private final String _queryParserClass;
+ private Constructor<QueryParserBase> _queryParserClassConstructor;
private boolean _enablePrefixSuffixMatchingInPhraseQueries = false;
public LuceneTextIndexReader(String column, File indexDir, int numDocs, TextIndexConfig config) {
@@ -90,10 +93,10 @@ public class LuceneTextIndexReader implements TextIndexReader {
// TODO: consider using a threshold of num docs per segment to decide between building
// mapping file upfront on segment load v/s on-the-fly during query processing
_docIdTranslator = new DocIdTranslator(indexDir, _column, numDocs, _indexSearcher);
- String luceneAnalyzerClass = config.getLuceneAnalyzerClass();
- _analyzer = luceneAnalyzerClass.equals(StandardAnalyzer.class.getName())
- ? TextIndexUtils.getStandardAnalyzerWithCustomizedStopWords(config.getStopWordsInclude(),
- config.getStopWordsExclude()) : TextIndexUtils.getAnalyzerFromClassName(luceneAnalyzerClass);
+ _analyzer = TextIndexUtils.getAnalyzer(config);
+ _queryParserClass = config.getLuceneQueryParserClass();
+ _queryParserClassConstructor =
+ TextIndexUtils.getQueryParserWithStringAndAnalyzerTypeConstructor(_queryParserClass);
LOGGER.info("Successfully read lucene index for {} from {}", _column, indexDir);
} catch (Exception e) {
LOGGER.error("Failed to instantiate Lucene text index reader for column {}, exception {}", column,
@@ -151,20 +154,20 @@ public class LuceneTextIndexReader implements TextIndexReader {
MutableRoaringBitmap docIds = new MutableRoaringBitmap();
Collector docIDCollector = new LuceneDocIdCollector(docIds, _docIdTranslator);
try {
- // Lucene Query Parser is JavaCC based. It is stateful and should
- // be instantiated per query. Analyzer on the other hand is stateless
- // and can be created upfront.
- QueryParser parser = new QueryParser(_column, _analyzer);
+ // Lucene query parsers are generally stateful and a new instance must be created per query.
+ QueryParserBase parser = _queryParserClassConstructor.newInstance(_column, _analyzer);
// Phrase search with prefix/suffix matching may have leading *. E.g., `*pache pinot` which can be stripped by
// the query parser. To support the feature, we need to explicitly set the config to be true.
- if (_enablePrefixSuffixMatchingInPhraseQueries) {
+ if (_queryParserClass.equals("org.apache.lucene.queryparser.classic.QueryParser")
+ && _enablePrefixSuffixMatchingInPhraseQueries) {
parser.setAllowLeadingWildcard(true);
}
- if (_useANDForMultiTermQueries) {
+ if (_queryParserClass.equals("org.apache.lucene.queryparser.classic.QueryParser") && _useANDForMultiTermQueries) {
parser.setDefaultOperator(QueryParser.Operator.AND);
}
Query query = parser.parse(searchQuery);
- if (_enablePrefixSuffixMatchingInPhraseQueries) {
+ if (_queryParserClass.equals("org.apache.lucene.queryparser.classic.QueryParser")
+ && _enablePrefixSuffixMatchingInPhraseQueries) {
query = LuceneTextIndexUtils.convertToMultiTermSpanQuery(query);
}
_indexSearcher.search(query, docIDCollector);
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/TextIndexConfigBuilder.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/TextIndexConfigBuilder.java
index 5d07fb788d..99516fe511 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/TextIndexConfigBuilder.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/TextIndexConfigBuilder.java
@@ -23,11 +23,16 @@ import java.util.Map;
import javax.annotation.Nullable;
import org.apache.pinot.segment.local.segment.store.TextIndexUtils;
import org.apache.pinot.segment.spi.index.TextIndexConfig;
+import org.apache.pinot.segment.spi.utils.CsvParser;
import org.apache.pinot.spi.config.table.FSTType;
import org.apache.pinot.spi.config.table.FieldConfig;
public class TextIndexConfigBuilder extends TextIndexConfig.AbstractBuilder {
+ public TextIndexConfigBuilder() {
+ super((FSTType) null);
+ }
+
public TextIndexConfigBuilder(@Nullable FSTType fstType) {
super(fstType);
}
@@ -67,6 +72,23 @@ public class TextIndexConfigBuilder extends TextIndexConfig.AbstractBuilder {
_luceneAnalyzerClass = textIndexProperties.get(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS);
}
+ // Note that we cannot depend on jackson's default behavior to automatically coerce the comma delimited args to
+ // List<String>. This is because the args may contain comma and other special characters such as space. Therefore,
+ // we use our own csv parser to parse the values directly.
+ if (textIndexProperties.get(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARGS) != null) {
+ _luceneAnalyzerClassArgs = CsvParser.parse(
+ textIndexProperties.get(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARGS), true, false);
+ }
+
+ if (textIndexProperties.get(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARG_TYPES) != null) {
+ _luceneAnalyzerClassArgTypes = CsvParser.parse(
+ textIndexProperties.get(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARG_TYPES), false, true);
+ }
+
+ if (textIndexProperties.get(FieldConfig.TEXT_INDEX_LUCENE_QUERY_PARSER_CLASS) != null) {
+ _luceneQueryParserClass = textIndexProperties.get(FieldConfig.TEXT_INDEX_LUCENE_QUERY_PARSER_CLASS);
+ }
+
for (Map.Entry<String, String> entry : textIndexProperties.entrySet()) {
if (entry.getKey().equalsIgnoreCase(FieldConfig.TEXT_FST_TYPE)) {
_fstType = FSTType.NATIVE;
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
index 0c2369bdd8..2d8abe40f4 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
@@ -19,6 +19,8 @@
package org.apache.pinot.segment.local.segment.store;
import java.io.File;
+import java.lang.reflect.Constructor;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
@@ -30,14 +32,19 @@ import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
import org.apache.pinot.segment.spi.V1Constants.Indexes;
+import org.apache.pinot.segment.spi.index.TextIndexConfig;
import org.apache.pinot.segment.spi.store.SegmentDirectoryPaths;
import org.apache.pinot.spi.config.table.FSTType;
import org.apache.pinot.spi.config.table.FieldConfig;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class TextIndexUtils {
+ private static final Logger LOGGER = LoggerFactory.getLogger(TextIndexUtils.class);
private TextIndexUtils() {
}
@@ -111,10 +118,148 @@ public class TextIndexUtils {
.collect(Collectors.toList());
}
- public static Analyzer getAnalyzerFromClassName(String luceneAnalyzerClass)
- throws ReflectiveOperationException {
- // Support instantiation with default constructor for now unless customized
- return (Analyzer) Class.forName(luceneAnalyzerClass).getConstructor().newInstance();
+ /**
+ * Retrieves the Lucene Analyzer class instance via reflection from the fully qualified class name of the text config.
+ * If the class name is not specified in the config, the default StandardAnalyzer is instantiated.
+ *
+ * @param config Pinot TextIndexConfig to fetch the configuration from
+ * @return Lucene Analyzer class instance
+ * @throws ReflectiveOperationException if instantiation via reflection fails
+ */
+ public static Analyzer getAnalyzer(TextIndexConfig config) throws ReflectiveOperationException {
+ String luceneAnalyzerClassName = config.getLuceneAnalyzerClass();
+ List<String> luceneAnalyzerClassArgs = config.getLuceneAnalyzerClassArgs();
+ List<String> luceneAnalyzerClassArgTypes = config.getLuceneAnalyzerClassArgTypes();
+
+ if (null == luceneAnalyzerClassName || luceneAnalyzerClassName.isEmpty()
+ || (luceneAnalyzerClassName.equals(StandardAnalyzer.class.getName())
+ && luceneAnalyzerClassArgs.isEmpty() && luceneAnalyzerClassArgTypes.isEmpty())) {
+ // When there is no analyzer defined, or when StandardAnalyzer (default) is used without arguments,
+ // use existing logic to obtain an instance of StandardAnalyzer with customized stop words
+ return TextIndexUtils.getStandardAnalyzerWithCustomizedStopWords(
+ config.getStopWordsInclude(), config.getStopWordsExclude());
+ }
+
+ // Custom analyzer + custom configs via reflection
+ if (luceneAnalyzerClassArgs.size() != luceneAnalyzerClassArgTypes.size()) {
+ throw new ReflectiveOperationException("Mismatch of the number of analyzer arguments and arguments types.");
+ }
+
+ // Generate args type list
+ List<Class<?>> argClasses = new ArrayList<>();
+ for (String argType : luceneAnalyzerClassArgTypes) {
+ argClasses.add(parseSupportedTypes(argType));
+ }
+
+ // Best effort coercion to the analyzer argument type
+ // Note only a subset of class types is supported, unsupported ones can be added in the future
+ List<Object> argValues = new ArrayList<>();
+ for (int i = 0; i < luceneAnalyzerClassArgs.size(); i++) {
+ argValues.add(parseSupportedTypeValues(luceneAnalyzerClassArgs.get(i), argClasses.get(i)));
+ }
+
+ // Initialize the custom analyzer class with custom analyzer args
+ Class<?> luceneAnalyzerClass = Class.forName(luceneAnalyzerClassName);
+ if (!Analyzer.class.isAssignableFrom(luceneAnalyzerClass)) {
+ String exceptionMessage = "Custom analyzer must be a child of " + Analyzer.class.getCanonicalName();
+ LOGGER.error(exceptionMessage);
+ throw new ReflectiveOperationException(exceptionMessage);
+ }
+
+ // Return a new instance of custom lucene analyzer class
+ return (Analyzer) luceneAnalyzerClass.getConstructor(argClasses.toArray(new Class<?>[0]))
+ .newInstance(argValues.toArray(new Object[0]));
+ }
+
+ /**
+ * Parse the Java value type specified in the type string
+ * @param valueTypeString FQCN of the value type class or the name of the primitive value type
+ * @return Class object of the value type
+ * @throws ClassNotFoundException when the value type is not supported
+ */
+ public static Class<?> parseSupportedTypes(String valueTypeString) throws ClassNotFoundException {
+ try {
+ // Support both primitive types + class
+ switch (valueTypeString) {
+ case "java.lang.Byte.TYPE":
+ return Byte.TYPE;
+ case "java.lang.Short.TYPE":
+ return Short.TYPE;
+ case "java.lang.Integer.TYPE":
+ return Integer.TYPE;
+ case "java.lang.Long.TYPE":
+ return Long.TYPE;
+ case "java.lang.Float.TYPE":
+ return Float.TYPE;
+ case "java.lang.Double.TYPE":
+ return Double.TYPE;
+ case "java.lang.Boolean.TYPE":
+ return Boolean.TYPE;
+ case "java.lang.Character.TYPE":
+ return Character.TYPE;
+ default:
+ return Class.forName(valueTypeString);
+ }
+ } catch (ClassNotFoundException ex) {
+ LOGGER.error("Analyzer argument class type not found: " + valueTypeString);
+ throw ex;
+ }
+ }
+
+ /**
+ * Attempt to coerce string into supported value type
+ * @param stringValue string representation of the value
+ * @param clazz of the value
+ * @return class object of the value, auto-boxed if it is a primitive type
+ * @throws ReflectiveOperationException if value cannot be coerced without ambiguity or encountered unsupported type
+ */
+ public static Object parseSupportedTypeValues(String stringValue, Class<?> clazz)
+ throws ReflectiveOperationException {
+ try {
+ if (clazz.equals(String.class)) {
+ return stringValue;
+ } else if (clazz.equals(Byte.class) || clazz.equals(Byte.TYPE)) {
+ return Byte.parseByte(stringValue);
+ } else if (clazz.equals(Short.class) || clazz.equals(Short.TYPE)) {
+ return Short.parseShort(stringValue);
+ } else if (clazz.equals(Integer.class) || clazz.equals(Integer.TYPE)) {
+ return Integer.parseInt(stringValue);
+ } else if (clazz.equals(Long.class) || clazz.equals(Long.TYPE)) {
+ return Long.parseLong(stringValue);
+ } else if (clazz.equals(Float.class) || clazz.equals(Float.TYPE)) {
+ return Float.parseFloat(stringValue);
+ } else if (clazz.equals(Double.class) || clazz.equals(Double.TYPE)) {
+ return Double.parseDouble(stringValue);
+ } else if (clazz.equals(Boolean.class) || clazz.equals(Boolean.TYPE)) {
+ // Note we cannot use Boolean.parseBoolean here because it treats "abc" as false which
+ // introduces unexpected parsing results. We should validate the input by accepting only
+ // true|false in a case-insensitive manner; for all other values, throw an exception.
+ String lowerCaseStringValue = stringValue.toLowerCase();
+ if (lowerCaseStringValue.equals("true")) {
+ return true;
+ } else if (lowerCaseStringValue.equals("false")) {
+ return false;
+ }
+ throw new ReflectiveOperationException();
+ } else if (clazz.equals(Character.class) || clazz.equals(Character.TYPE)) {
+ if (stringValue.length() == 1) {
+ return stringValue.charAt(0);
+ }
+ throw new ReflectiveOperationException();
+ } else {
+ throw new UnsupportedOperationException();
+ }
+ } catch (NumberFormatException | ReflectiveOperationException ex) {
+ String exceptionMessage = "Custom analyzer argument cannot be coerced from "
+ + stringValue + " to " + clazz.getName() + " type";
+ LOGGER.error(exceptionMessage);
+ throw new ReflectiveOperationException(exceptionMessage);
+ } catch (UnsupportedOperationException ex) {
+ // In the future, consider adding more common serdes for common complex types used within Lucene
+ String exceptionMessage = "Custom analyzer argument does not support " + clazz.getName() + " type";
+ LOGGER.error(exceptionMessage);
+ throw new ReflectiveOperationException(exceptionMessage);
+ }
}
public static StandardAnalyzer getStandardAnalyzerWithCustomizedStopWords(@Nullable List<String> stopWordsInclude,
@@ -128,4 +273,25 @@ public class TextIndexUtils {
}
return new StandardAnalyzer(new CharArraySet(stopWordSet, true));
}
+
+ public static Constructor<QueryParserBase> getQueryParserWithStringAndAnalyzerTypeConstructor(
+ String queryParserClassName) throws ReflectiveOperationException {
+ // Fail-fast if the specified query parser class is not a QueryParserBase class
+ final Class<?> queryParserClass = Class.forName(queryParserClassName);
+ if (!QueryParserBase.class.isAssignableFrom(queryParserClass)) {
+ throw new ReflectiveOperationException("The specified lucene query parser class " + queryParserClassName
+ + " is not assignable from " + QueryParserBase.class.getName());
+ }
+ // Fail-fast if the query parser does not have the required constructor used by this class
+ try {
+ queryParserClass.getConstructor(String.class, Analyzer.class);
+ } catch (NoSuchMethodException ex) {
+ throw new NoSuchMethodException("The specified lucene query parser class " + queryParserClassName
+ + " is not assignable because the class does not have the required constructor method with parameter type "
+ + "[String.class, Analyzer.class]"
+ );
+ }
+
+ return (Constructor<QueryParserBase>) queryParserClass.getConstructor(String.class, Analyzer.class);
+ }
}
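
As a quick illustration of the coercion helpers above (the class name and values below are hypothetical, not part of the patch), parseSupportedTypes resolves the primitive-type spelling to a Class object and parseSupportedTypeValues coerces the matching CSV string value before the analyzer constructor lookup:

    import org.apache.pinot.segment.local.segment.store.TextIndexUtils;

    public class AnalyzerArgCoercionExample {
      public static void main(String[] args) throws Exception {
        // "java.lang.Integer.TYPE" resolves to the primitive class int
        Class<?> intType = TextIndexUtils.parseSupportedTypes("java.lang.Integer.TYPE");
        // the string "42" is then coerced (and auto-boxed) to the Integer 42
        Object value = TextIndexUtils.parseSupportedTypeValues("42", intType);
        System.out.println(intType + " -> " + value);  // prints: int -> 42
      }
    }
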
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
index 84254060b3..2618930bf6 100644
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/RealtimeSegmentConverterTest.java
@@ -42,6 +42,7 @@ import
org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeLucene
import
org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeLuceneTextIndexSearcherPool;
import
org.apache.pinot.segment.local.segment.index.column.PhysicalColumnIndexContainer;
import org.apache.pinot.segment.local.segment.index.loader.IndexLoadingConfig;
+import
org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.local.segment.store.SegmentLocalFSDirectory;
import
org.apache.pinot.segment.local.segment.virtualcolumn.VirtualColumnProviderFactory;
import org.apache.pinot.segment.spi.ColumnMetadata;
@@ -480,9 +481,11 @@ public class RealtimeSegmentConverterTest {
String tableNameWithType = tableConfig.getTableName();
String segmentName = "testTable__0__0__123456";
IndexingConfig indexingConfig = tableConfig.getIndexingConfig();
- TextIndexConfig textIndexConfig =
- new TextIndexConfig(false, null, null, false, false, Collections.emptyList(), Collections.emptyList(), false,
- 500, null, false, reuseMutableIndex, luceneNRTCachingDirectoryMaxBufferSizeMB);
+ TextIndexConfig textIndexConfig = new TextIndexConfigBuilder()
+ .withUseANDForMultiTermQueries(false)
+ .withReuseMutableIndex(reuseMutableIndex)
+ .withLuceneNRTCachingDirectoryMaxBufferSizeMB(luceneNRTCachingDirectoryMaxBufferSizeMB)
+ .build();
RealtimeSegmentConfig.Builder realtimeSegmentConfigBuilder =
new
RealtimeSegmentConfig.Builder().setTableNameWithType(tableNameWithType).setSegmentName(segmentName)
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/LuceneMutableTextIndexTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/LuceneMutableTextIndexTest.java
index cc46486606..d88c134b62 100644
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/LuceneMutableTextIndexTest.java
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/LuceneMutableTextIndexTest.java
@@ -24,8 +24,13 @@ import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
+import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.io.FileUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.pinot.common.metrics.ServerMetrics;
+import
org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.spi.index.TextIndexConfig;
import org.apache.pinot.util.TestUtils;
import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
@@ -39,11 +44,13 @@ import static org.testng.Assert.assertEquals;
public class LuceneMutableTextIndexTest {
+ private static final AtomicInteger SEGMENT_NAME_SUFFIX_COUNTER = new AtomicInteger(0);
private static final File INDEX_DIR = new File(FileUtils.getTempDirectory(), "LuceneMutableIndexTest");
private static final String TEXT_COLUMN_NAME = "testColumnName";
+ private static final String CUSTOM_ANALYZER_FQCN = CustomAnalyzer.class.getName();
+ private static final String CUSTOM_QUERY_PARSER_FQCN = CustomQueryParser.class.getName();
private static final RealtimeLuceneTextIndexSearcherPool SEARCHER_POOL = RealtimeLuceneTextIndexSearcherPool.init(1);
-
private RealtimeLuceneTextIndex _realtimeLuceneTextIndex;
public LuceneMutableTextIndexTest() {
@@ -51,6 +58,96 @@ public class LuceneMutableTextIndexTest {
ServerMetrics.register(mock(ServerMetrics.class));
}
+ @Test
+ public void testDefaultAnalyzerAndDefaultQueryParser() {
+ // Test queries with standard analyzer with default configurations used by
Pinot
+ configureIndex(null, null, null, null);
+ assertEquals(_realtimeLuceneTextIndex.getDocIds("stream"),
ImmutableRoaringBitmap.bitmapOf(0));
+ assertEquals(_realtimeLuceneTextIndex.getDocIds("/.*house.*/"),
ImmutableRoaringBitmap.bitmapOf(1));
+ assertEquals(_realtimeLuceneTextIndex.getDocIds("invalid"),
ImmutableRoaringBitmap.bitmapOf());
+ }
+
+ @Test
+ public void testCustomAnalyzerWithNoArgsAndDefaultQueryParser() {
+ // Test query with CustomKeywordAnalyzer without any args
+ configureIndex(CUSTOM_ANALYZER_FQCN, null, null, null);
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "/.*processing for data ware.*/"),
ImmutableRoaringBitmap.bitmapOf(1));
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "columnar processing for data warehouses"),
ImmutableRoaringBitmap.bitmapOf(1));
+ }
+
+ @Test
+ public void testCustomAnalyzerWithNoArgsAndCustomQueryParser() {
+ // Test queries with CustomKeywordAnalyzer without any args and
CustomQueryParser
+ configureIndex(CUSTOM_ANALYZER_FQCN, null, null, CUSTOM_QUERY_PARSER_FQCN);
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "/.*processing for data ware.*/"),
ImmutableRoaringBitmap.bitmapOf(1));
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "columnar processing for data warehouses"),
ImmutableRoaringBitmap.bitmapOf(1));
+ }
+
+ @Test
+ public void testCustomAnalyzerWithTwoStringArgsAndCustomQueryParser() {
+ // Test queries with CustomKeywordAnalyzer with two java.lang.String args
and CustomQueryParser
+ configureIndex(CUSTOM_ANALYZER_FQCN,
+ "a,b", "java.lang.String, java.lang.String",
CUSTOM_QUERY_PARSER_FQCN);
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "/.*processing for data ware.*/"),
ImmutableRoaringBitmap.bitmapOf(1));
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "columnar processing for data warehouses"),
ImmutableRoaringBitmap.bitmapOf(1));
+ }
+
+ @Test
+ public void
testCustomAnalyzerWithOneStringOneIntegerParametersAndCustomQueryParser() {
+ // Test queries with CustomKeywordAnalyzer w/ two String.class args and
ExtendedQueryParser
+ configureIndex(CUSTOM_ANALYZER_FQCN,
+ "a,123", "java.lang.String,java.lang.Integer",
CUSTOM_QUERY_PARSER_FQCN);
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "/.*processing for data ware.*/"),
ImmutableRoaringBitmap.bitmapOf(1));
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "columnar processing for data warehouses"),
ImmutableRoaringBitmap.bitmapOf(1));
+ }
+
+ @Test
+ public void
testCustomAnalyzerWithOnePrimitiveIntParametersAndCustomQueryParser() {
+ // Test queries with CustomKeywordAnalyzer w/ two String.class args and
ExtendedQueryParser
+ configureIndex(CUSTOM_ANALYZER_FQCN,
+ "123", "java.lang.Integer.TYPE", CUSTOM_QUERY_PARSER_FQCN);
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "/.*processing for data ware.*/"),
ImmutableRoaringBitmap.bitmapOf(1));
+ assertEquals(_realtimeLuceneTextIndex.getDocIds(
+ "columnar processing for data warehouses"),
ImmutableRoaringBitmap.bitmapOf(1));
+ }
+
+ private static class CustomQueryParser extends QueryParser {
+ public CustomQueryParser(String field, Analyzer analyzer) {
+ super(field, analyzer);
+ }
+ }
+
+ public static class CustomAnalyzer extends Analyzer {
+ public CustomAnalyzer() {
+ super();
+ }
+
+ public CustomAnalyzer(String stringArg1, String stringArg2) {
+ super();
+ }
+
+ public CustomAnalyzer(String stringArg1, Integer integerArg2) {
+ super();
+ }
+
+ public CustomAnalyzer(int intArg2) {
+ super();
+ }
+
+ protected Analyzer.TokenStreamComponents createComponents(String
fieldName) {
+ return new Analyzer.TokenStreamComponents(new KeywordTokenizer());
+ }
+ }
+
private String[][] getTextData() {
return new String[][]{
{"realtime stream processing"}, {"publish subscribe", "columnar
processing for data warehouses", "concurrency"}
@@ -63,13 +160,27 @@ public class LuceneMutableTextIndexTest {
};
}
- @BeforeClass
- public void setUp()
- throws Exception {
- TextIndexConfig config =
- new TextIndexConfig(false, null, null, false, false, null, null, true,
500, null, false, false, 0);
- _realtimeLuceneTextIndex =
- new RealtimeLuceneTextIndex(TEXT_COLUMN_NAME, INDEX_DIR,
"table__0__1__20240602T0014Z", config);
+ private void configureIndex(String analyzerClass, String analyzerClassArgs, String analyzerClassArgTypes,
+ String queryParserClass) {
+ TextIndexConfigBuilder builder = new TextIndexConfigBuilder();
+ if (null != analyzerClass) {
+ builder.withLuceneAnalyzerClass(analyzerClass);
+ }
+ if (null != analyzerClassArgs) {
+ builder.withLuceneAnalyzerClassArgs(analyzerClassArgs);
+ }
+ if (null != analyzerClassArgTypes) {
+ builder.withLuceneAnalyzerClassArgTypes(analyzerClassArgTypes);
+ }
+ if (null != queryParserClass) {
+ builder.withLuceneQueryParserClass(queryParserClass);
+ }
+ TextIndexConfig config = builder.withUseANDForMultiTermQueries(false).build();
+
+ // Note that segment name must be unique on each query setup, otherwise `testQueryCancellationIsSuccessful` method
+ // will cause unit test to fail due to inability to release a lock.
+ _realtimeLuceneTextIndex = new RealtimeLuceneTextIndex(TEXT_COLUMN_NAME, INDEX_DIR,
+ "table__0__1__20240601T1818Z" + SEGMENT_NAME_SUFFIX_COUNTER.getAndIncrement(), config);
String[][] documents = getTextData();
String[][] repeatedDocuments = getRepeatedData();
@@ -82,6 +193,22 @@ public class LuceneMutableTextIndexTest {
_realtimeLuceneTextIndex.add(row);
}
}
+
+ // ensure searches work after .commit() is called
+ _realtimeLuceneTextIndex.commit();
+
+ // sleep for index refresh
+ try {
+ Thread.sleep(100);
+ } catch (Exception e) {
+ // no-op
+ }
+ }
+
+ @BeforeClass
+ public void setUp()
+ throws Exception {
+ RealtimeLuceneIndexRefreshManager.getInstance().reset();
}
@AfterClass
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeAndLuceneMutableTextIndexTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeAndLuceneMutableTextIndexTest.java
index 84765353b0..4a463ebc30 100644
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeAndLuceneMutableTextIndexTest.java
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeAndLuceneMutableTextIndexTest.java
@@ -24,6 +24,7 @@ import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.search.SearcherManager;
import org.apache.pinot.common.metrics.ServerMetrics;
+import
org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.spi.index.TextIndexConfig;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
@@ -75,8 +76,10 @@ public class NativeAndLuceneMutableTextIndexTest {
throws Exception {
RealtimeLuceneIndexRefreshManager.init(1, 10);
ServerMetrics.register(mock(ServerMetrics.class));
- TextIndexConfig config =
- new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, false, false, 0);
+ TextIndexConfig config = new TextIndexConfigBuilder()
+ .withUseANDForMultiTermQueries(false)
+ .build();
+
_realtimeLuceneTextIndex =
new RealtimeLuceneTextIndex(TEXT_COLUMN_NAME, INDEX_DIR, "table__0__1__20240602T0014Z", config);
_nativeMutableTextIndex = new NativeMutableTextIndex(TEXT_COLUMN_NAME);
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/FilePerIndexDirectoryTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/FilePerIndexDirectoryTest.java
index 65a1621dfc..68f4cc6cbe 100644
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/FilePerIndexDirectoryTest.java
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/FilePerIndexDirectoryTest.java
@@ -201,8 +201,8 @@ public class FilePerIndexDirectoryTest {
@Test
public void testRemoveTextIndices()
throws IOException {
- TextIndexConfig config =
- new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, false, false, 0);
+ TextIndexConfig config = new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, null,
+ null, null, false, false, 0);
try (FilePerIndexDirectory fpi = new FilePerIndexDirectory(TEMP_DIR,
_segmentMetadata, ReadMode.mmap);
LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo",
TEMP_DIR, true, false, null, null,
config);
@@ -265,8 +265,8 @@ public class FilePerIndexDirectoryTest {
@Test
public void testGetColumnIndices()
throws IOException {
- TextIndexConfig config =
- new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, false, false, 0);
+ TextIndexConfig config = new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, null,
+ null, null, false, false, 0);
// Write sth to buffers and flush them to index files on disk
try (FilePerIndexDirectory fpi = new FilePerIndexDirectory(TEMP_DIR,
_segmentMetadata, ReadMode.mmap);
LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo",
TEMP_DIR, true, false, null, null,
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/SingleFileIndexDirectoryTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/SingleFileIndexDirectoryTest.java
index 7e9933d777..88c58e2672 100644
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/SingleFileIndexDirectoryTest.java
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/SingleFileIndexDirectoryTest.java
@@ -234,8 +234,8 @@ public class SingleFileIndexDirectoryTest {
@Test
public void testRemoveTextIndices()
throws IOException, ConfigurationException {
- TextIndexConfig config =
- new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, false, false, 0);
+ TextIndexConfig config = new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, null,
+ null, null, false, false, 0);
try (SingleFileIndexDirectory sfd = new SingleFileIndexDirectory(TEMP_DIR,
_segmentMetadata, ReadMode.mmap);
LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo",
TEMP_DIR, true, false, null, null,
config);
@@ -342,8 +342,8 @@ public class SingleFileIndexDirectoryTest {
@Test
public void testGetColumnIndices()
throws Exception {
- TextIndexConfig config =
- new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, false, false, 0);
+ TextIndexConfig config = new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, null,
+ null, null, false, false, 0);
try (SingleFileIndexDirectory sfd = new SingleFileIndexDirectory(TEMP_DIR,
_segmentMetadata, ReadMode.mmap);
LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo",
TEMP_DIR, true, false, null, null,
config);
diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java
index 522fb7bbf2..6f3a768837 100644
--- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java
+++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java
@@ -27,6 +27,7 @@ import java.util.List;
import java.util.Map;
import java.util.Objects;
import javax.annotation.Nullable;
+import org.apache.pinot.segment.spi.utils.CsvParser;
import org.apache.pinot.spi.config.table.FSTType;
import org.apache.pinot.spi.config.table.FieldConfig;
import org.apache.pinot.spi.config.table.IndexConfig;
@@ -38,9 +39,11 @@ public class TextIndexConfig extends IndexConfig {
private static final boolean LUCENE_INDEX_ENABLE_PREFIX_SUFFIX_MATCH_IN_PHRASE_SEARCH = false;
private static final boolean LUCENE_INDEX_REUSE_MUTABLE_INDEX = false;
private static final int LUCENE_INDEX_NRT_CACHING_DIRECTORY_MAX_BUFFER_SIZE_MB = 0;
+
public static final TextIndexConfig DISABLED =
new TextIndexConfig(true, null, null, false, false, Collections.emptyList(), Collections.emptyList(), false,
- LUCENE_INDEX_DEFAULT_MAX_BUFFER_SIZE_MB, null, false, false, 0);
+ LUCENE_INDEX_DEFAULT_MAX_BUFFER_SIZE_MB, null, null, null, null, false, false, 0);
+
private final FSTType _fstType;
@Nullable
private final Object _rawValueForTextIndex;
@@ -51,6 +54,9 @@ public class TextIndexConfig extends IndexConfig {
private final boolean _luceneUseCompoundFile;
private final int _luceneMaxBufferSizeMB;
private final String _luceneAnalyzerClass;
+ private final List<String> _luceneAnalyzerClassArgs;
+ private final List<String> _luceneAnalyzerClassArgTypes;
+ private final String _luceneQueryParserClass;
private final boolean _enablePrefixSuffixMatchingInPhraseQueries;
private final boolean _reuseMutableIndex;
private final int _luceneNRTCachingDirectoryMaxBufferSizeMB;
@@ -65,6 +71,9 @@ public class TextIndexConfig extends IndexConfig {
@JsonProperty("luceneUseCompoundFile") Boolean luceneUseCompoundFile,
@JsonProperty("luceneMaxBufferSizeMB") Integer luceneMaxBufferSizeMB,
@JsonProperty("luceneAnalyzerClass") String luceneAnalyzerClass,
+ @JsonProperty("luceneAnalyzerClassArgs") String luceneAnalyzerClassArgs,
+ @JsonProperty("luceneAnalyzerClassArgTypes") String luceneAnalyzerClassArgTypes,
+ @JsonProperty("luceneQueryParserClass") String luceneQueryParserClass,
@JsonProperty("enablePrefixSuffixMatchingInPhraseQueries") Boolean enablePrefixSuffixMatchingInPhraseQueries,
@JsonProperty("reuseMutableIndex") Boolean reuseMutableIndex,
@JsonProperty("luceneNRTCachingDirectoryMaxBufferSizeMB") Integer luceneNRTCachingDirectoryMaxBufferSizeMB) {
@@ -81,6 +90,14 @@ public class TextIndexConfig extends IndexConfig {
luceneMaxBufferSizeMB == null ?
LUCENE_INDEX_DEFAULT_MAX_BUFFER_SIZE_MB : luceneMaxBufferSizeMB;
_luceneAnalyzerClass = (luceneAnalyzerClass == null ||
luceneAnalyzerClass.isEmpty())
? FieldConfig.TEXT_INDEX_DEFAULT_LUCENE_ANALYZER_CLASS :
luceneAnalyzerClass;
+
+ // Note that we cannot depend on jackson's default behavior to automatically coerce the comma delimited args to
+ // List<String>. This is because the args may contain comma and other special characters such as space. Therefore,
+ // we use our own csv parser to parse the values directly.
+ _luceneAnalyzerClassArgs = CsvParser.parse(luceneAnalyzerClassArgs, true, false);
+ _luceneAnalyzerClassArgTypes = CsvParser.parse(luceneAnalyzerClassArgTypes, false, true);
+ _luceneQueryParserClass = luceneQueryParserClass == null
+ ? FieldConfig.TEXT_INDEX_DEFAULT_LUCENE_QUERY_PARSER_CLASS : luceneQueryParserClass;
_enablePrefixSuffixMatchingInPhraseQueries =
enablePrefixSuffixMatchingInPhraseQueries == null ?
LUCENE_INDEX_ENABLE_PREFIX_SUFFIX_MATCH_IN_PHRASE_SEARCH
: enablePrefixSuffixMatchingInPhraseQueries;
@@ -141,6 +158,29 @@ public class TextIndexConfig extends IndexConfig {
return _luceneAnalyzerClass;
}
+ /**
+ * Lucene analyzer arguments in String type. At runtime, the string representations are best-effort coerced into the
+ * proper type with the fully-qualified value type specified in luceneAnalyzerClassArgTypes
+ */
+ public List<String> getLuceneAnalyzerClassArgs() {
+ return _luceneAnalyzerClassArgs;
+ }
+
+ /**
+ * Lucene analyzer fully qualified argument value types for each argument. At runtime, the values specified in the
+ * luceneAnalyzerClassArgs (string representation) are best-effort coerced into the specified value type.
+ */
+ public List<String> getLuceneAnalyzerClassArgTypes() {
+ return _luceneAnalyzerClassArgTypes;
+ }
+
+ /**
+ * Lucene query parser fully qualified class name specifying which lucene query parser class to use for query parsing
+ */
+ public String getLuceneQueryParserClass() {
+ return _luceneQueryParserClass;
+ }
+
/**
* Whether to enable prefix and suffix wildcard term matching (i.e.,
.*value for prefix and value.* for suffix
* term matching) in a phrase query. By default, Pinot today treats .* in a
phrase query like ".*value str1 value.*"
@@ -171,6 +211,9 @@ public class TextIndexConfig extends IndexConfig {
protected boolean _luceneUseCompoundFile =
LUCENE_INDEX_DEFAULT_USE_COMPOUND_FILE;
protected int _luceneMaxBufferSizeMB =
LUCENE_INDEX_DEFAULT_MAX_BUFFER_SIZE_MB;
protected String _luceneAnalyzerClass =
FieldConfig.TEXT_INDEX_DEFAULT_LUCENE_ANALYZER_CLASS;
+ protected List<String> _luceneAnalyzerClassArgs = new ArrayList<>();
+ protected List<String> _luceneAnalyzerClassArgTypes = new ArrayList<>();
+ protected String _luceneQueryParserClass =
FieldConfig.TEXT_INDEX_DEFAULT_LUCENE_QUERY_PARSER_CLASS;
protected boolean _enablePrefixSuffixMatchingInPhraseQueries =
LUCENE_INDEX_ENABLE_PREFIX_SUFFIX_MATCH_IN_PHRASE_SEARCH;
protected boolean _reuseMutableIndex = LUCENE_INDEX_REUSE_MUTABLE_INDEX;
@@ -189,6 +232,9 @@ public class TextIndexConfig extends IndexConfig {
_luceneUseCompoundFile = other._luceneUseCompoundFile;
_luceneMaxBufferSizeMB = other._luceneMaxBufferSizeMB;
_luceneAnalyzerClass = other._luceneAnalyzerClass;
+ _luceneAnalyzerClassArgs = other._luceneAnalyzerClassArgs;
+ _luceneAnalyzerClassArgTypes = other._luceneAnalyzerClassArgTypes;
+ _luceneQueryParserClass = other._luceneQueryParserClass;
_enablePrefixSuffixMatchingInPhraseQueries =
other._enablePrefixSuffixMatchingInPhraseQueries;
_reuseMutableIndex = other._reuseMutableIndex;
_luceneNRTCachingDirectoryMaxBufferSizeMB =
other._luceneNRTCachingDirectoryMaxBufferSizeMB;
@@ -197,7 +243,10 @@ public class TextIndexConfig extends IndexConfig {
public TextIndexConfig build() {
return new TextIndexConfig(false, _fstType, _rawValueForTextIndex,
_enableQueryCache, _useANDForMultiTermQueries,
_stopWordsInclude, _stopWordsExclude, _luceneUseCompoundFile,
_luceneMaxBufferSizeMB, _luceneAnalyzerClass,
- _enablePrefixSuffixMatchingInPhraseQueries, _reuseMutableIndex,
_luceneNRTCachingDirectoryMaxBufferSizeMB);
+ CsvParser.serialize(_luceneAnalyzerClassArgs, true, false),
+ CsvParser.serialize(_luceneAnalyzerClassArgTypes, true, false),
+ _luceneQueryParserClass, _enablePrefixSuffixMatchingInPhraseQueries,
_reuseMutableIndex,
+ _luceneNRTCachingDirectoryMaxBufferSizeMB);
}
public abstract AbstractBuilder withProperties(@Nullable Map<String,
String> textIndexProperties);
@@ -207,6 +256,11 @@ public class TextIndexConfig extends IndexConfig {
return this;
}
+ public AbstractBuilder withUseANDForMultiTermQueries(boolean
useANDForMultiTermQueries) {
+ _useANDForMultiTermQueries = useANDForMultiTermQueries;
+ return this;
+ }
+
public AbstractBuilder withStopWordsInclude(List<String> stopWordsInclude)
{
_stopWordsInclude = stopWordsInclude;
return this;
@@ -232,6 +286,31 @@ public class TextIndexConfig extends IndexConfig {
return this;
}
+ public AbstractBuilder withLuceneAnalyzerClassArgs(String
luceneAnalyzerClassArgs) {
+ _luceneAnalyzerClassArgs = CsvParser.parse(luceneAnalyzerClassArgs,
true, false);
+ return this;
+ }
+
+ public AbstractBuilder withLuceneAnalyzerClassArgs(List<String>
luceneAnalyzerClassArgs) {
+ _luceneAnalyzerClassArgs = luceneAnalyzerClassArgs;
+ return this;
+ }
+
+ public AbstractBuilder withLuceneAnalyzerClassArgTypes(String
luceneAnalyzerClassArgTypes) {
+ _luceneAnalyzerClassArgTypes =
CsvParser.parse(luceneAnalyzerClassArgTypes, false, true);
+ return this;
+ }
+
+ public AbstractBuilder withLuceneAnalyzerClassArgTypes(List<String>
luceneAnalyzerClassArgTypes) {
+ _luceneAnalyzerClassArgTypes = luceneAnalyzerClassArgTypes;
+ return this;
+ }
+
+ public AbstractBuilder withLuceneQueryParserClass(String
luceneQueryParserClass) {
+ _luceneQueryParserClass = luceneQueryParserClass;
+ return this;
+ }
+
public AbstractBuilder withEnablePrefixSuffixMatchingInPhraseQueries(
boolean enablePrefixSuffixMatchingInPhraseQueries) {
_enablePrefixSuffixMatchingInPhraseQueries =
enablePrefixSuffixMatchingInPhraseQueries;
@@ -269,14 +348,20 @@ public class TextIndexConfig extends IndexConfig {
&& _luceneNRTCachingDirectoryMaxBufferSizeMB ==
that._luceneNRTCachingDirectoryMaxBufferSizeMB
&& _fstType == that._fstType && Objects.equals(_rawValueForTextIndex,
that._rawValueForTextIndex)
&& Objects.equals(_stopWordsInclude, that._stopWordsInclude) &&
Objects.equals(_stopWordsExclude,
- that._stopWordsExclude) && Objects.equals(_luceneAnalyzerClass,
that._luceneAnalyzerClass);
+ that._stopWordsExclude) && _luceneUseCompoundFile ==
that._luceneUseCompoundFile
+ && _luceneMaxBufferSizeMB == that._luceneMaxBufferSizeMB
+ && Objects.equals(_luceneAnalyzerClass, that._luceneAnalyzerClass)
+ && Objects.equals(_luceneAnalyzerClassArgs,
that._luceneAnalyzerClassArgs)
+ && Objects.equals(_luceneAnalyzerClassArgTypes,
that._luceneAnalyzerClassArgTypes)
+ && Objects.equals(_luceneQueryParserClass,
that._luceneQueryParserClass);
}
@Override
public int hashCode() {
return Objects.hash(super.hashCode(), _fstType, _rawValueForTextIndex,
_enableQueryCache,
_useANDForMultiTermQueries, _stopWordsInclude, _stopWordsExclude,
_luceneUseCompoundFile,
- _luceneMaxBufferSizeMB, _luceneAnalyzerClass,
_enablePrefixSuffixMatchingInPhraseQueries, _reuseMutableIndex,
+ _luceneMaxBufferSizeMB, _luceneAnalyzerClass,
_luceneAnalyzerClassArgs, _luceneAnalyzerClassArgTypes,
+ _luceneQueryParserClass, _enablePrefixSuffixMatchingInPhraseQueries,
_reuseMutableIndex,
_luceneNRTCachingDirectoryMaxBufferSizeMB);
}
}
diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/utils/CsvParser.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/utils/CsvParser.java
new file mode 100644
index 0000000000..b4c2aeacfb
--- /dev/null
+++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/utils/CsvParser.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.spi.utils;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import javax.annotation.Nullable;
+
+public class CsvParser {
+ private CsvParser() {
+ // Hide utility class default constructor
+ }
+
+ /**
+ * Parse the input csv string with customizable parsing behavior. Sometimes the individual values may contain comma
+ * and other white space characters. These characters are sometimes expected to be part of the actual argument.
+ *
+ * @param input string to split on comma
+ * @param escapeComma if true, we don't split on escaped commas, and we replace "\," with "," after the split
+ * @param trim whether we should trim each tokenized terms
+ * @return a list of values, empty list if input is empty or null
+ */
+ public static List<String> parse(@Nullable String input, boolean escapeComma, boolean trim) {
+ if (null == input || input.isEmpty()) {
+ return Collections.emptyList();
+ }
+
+ Stream<String> tokenStream;
+ if (escapeComma) {
+ // Use regular expression to split on "," unless it is "\,"
+ // Use a non-positive limit to apply the replacement as many times as possible and to ensure trailing empty
+ // strings shall not be discarded
+ tokenStream = Arrays.stream(input.split("(?<!\\\\),", -1))
+ .map(s -> s.replace("\\,", ","));
+ } else {
+ tokenStream = Arrays.stream(input.split(","));
+ }
+
+ if (trim) {
+ tokenStream = tokenStream.map(String::trim);
+ }
+
+ return tokenStream.collect(Collectors.toList());
+ }
+
+ /**
+ * Serialize the input list of strings with customized serialization behavior.
+ * @param input containing a list of string to be serialized
+ * @param escapeComma if true, escape commas by replacing "," with "\," before the join
+ * @param trim whether we should trim each tokenized terms before serialization
+ * @return serialized string representing the input list of string
+ */
+ public static String serialize(List<String> input, boolean escapeComma, boolean trim) {
+ Stream<String> tokenStream = input.stream();
+ if (escapeComma) {
+ tokenStream = tokenStream.map(s -> s.replaceAll(",", Matcher.quoteReplacement("\\,")));
+ }
+ if (trim) {
+ tokenStream = tokenStream.map(String::trim);
+ }
+ return tokenStream.collect(Collectors.joining(","));
+ }
+}
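
A small, self-contained sketch of the expected CsvParser round trip (the class name and sample values below are illustrative, based on the parse/serialize behavior defined above):

    import java.util.List;
    import org.apache.pinot.segment.spi.utils.CsvParser;

    public class CsvParserExample {
      public static void main(String[] args) {
        // An escaped comma stays inside one token; trim strips surrounding whitespace
        List<String> tokens = CsvParser.parse("hello\\, world, 42", true, true);
        System.out.println(tokens.size() + " tokens: " + tokens);     // 2 tokens: [hello, world, 42]
        // serialize() re-escapes commas so the tokens can be joined back into one CSV value
        System.out.println(CsvParser.serialize(tokens, true, false)); // hello\, world,42
      }
    }
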
diff --git a/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/utils/CsvParserTest.java b/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/utils/CsvParserTest.java
new file mode 100644
index 0000000000..b3fd5427f4
--- /dev/null
+++ b/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/utils/CsvParserTest.java
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.spi.utils;
+
+import java.util.Arrays;
+import java.util.List;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+public class CsvParserTest {
+ @Test
+ public void testEscapeTrueTrimFalse() {
+ String input = " \\,.\n\t()[]{}\"':=-_$\\?@&|#+/,:=[]$@&|#";
+ List<String> actualParsedOutput = CsvParser.parse(input, true, false);
+ List<String> expectedParsedOutput = Arrays.asList("
,.\n\t()[]{}\"':=-_$\\?@&|#+/", ":=[]$@&|#");
+ Assert.assertEquals(actualParsedOutput, expectedParsedOutput);
+ Assert.assertEquals(CsvParser.serialize(actualParsedOutput, true,
false), input);
+ }
+
+ @Test
+ public void testEscapeTrueTrimTrue() {
+ String input = " \\,.\n\t()[]{}\"':=-_$\\?@&|#+/,:=[]$@&|#";
+ List<String> expectedOutput =
Arrays.asList(",.\n\t()[]{}\"':=-_$\\?@&|#+/", ":=[]$@&|#");
+ Assert.assertEquals(CsvParser.parse(input, true, true),
expectedOutput);
+ }
+
+ @Test
+ public void testEscapeFalseTrimTrue() {
+ String input = "abc\\,def.ghi, abc.def.ghi\n";
+ List<String> expectedOutput = Arrays.asList("abc\\", "def.ghi",
"abc.def.ghi");
+ Assert.assertEquals(CsvParser.parse(input, false, true),
expectedOutput);
+ }
+}
diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
index 9f6f6b88f8..f6e6f3f99d 100644
--- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java
@@ -52,8 +52,13 @@ public class FieldConfig extends BaseJsonConfig {
public static final String TEXT_INDEX_LUCENE_USE_COMPOUND_FILE = "luceneUseCompoundFile";
public static final String TEXT_INDEX_LUCENE_MAX_BUFFER_SIZE_MB = "luceneMaxBufferSizeMB";
public static final String TEXT_INDEX_LUCENE_ANALYZER_CLASS = "luceneAnalyzerClass";
+ public static final String TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARGS = "luceneAnalyzerClassArgs";
+ public static final String TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARG_TYPES = "luceneAnalyzerClassArgTypes";
+ public static final String TEXT_INDEX_LUCENE_QUERY_PARSER_CLASS = "luceneQueryParserClass";
public static final String TEXT_INDEX_DEFAULT_LUCENE_ANALYZER_CLASS =
"org.apache.lucene.analysis.standard.StandardAnalyzer";
+ public static final String TEXT_INDEX_DEFAULT_LUCENE_QUERY_PARSER_CLASS =
+ "org.apache.lucene.queryparser.classic.QueryParser";
public static final String TEXT_INDEX_STOP_WORD_SEPERATOR = ",";
// "native" for native, default is Lucene
public static final String TEXT_FST_TYPE = "fstType";
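
For completeness, the same settings can also be supplied as text-index fieldConfig properties keyed by the constants added above. A hedged sketch (placeholder analyzer FQCN and argument values) using TextIndexConfigBuilder.withProperties:

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
    import org.apache.pinot.segment.spi.index.TextIndexConfig;
    import org.apache.pinot.spi.config.table.FieldConfig;

    public class TextIndexPropertiesExample {
      public static void main(String[] args) {
        Map<String, String> props = new HashMap<>();
        // Keys are the FieldConfig constants added in this diff; values are placeholders
        props.put(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS, "com.example.MyAnalyzer");
        props.put(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARGS, "stopwords.txt,3");
        props.put(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARG_TYPES, "java.lang.String,java.lang.Integer.TYPE");
        props.put(FieldConfig.TEXT_INDEX_LUCENE_QUERY_PARSER_CLASS, "org.apache.lucene.queryparser.classic.QueryParser");
        TextIndexConfig config = new TextIndexConfigBuilder().withProperties(props).build();
        System.out.println(config.getLuceneAnalyzerClassArgs());
      }
    }
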
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]