This is an automated email from the ASF dual-hosted git repository.

xiangfu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 278e5d6d6f9 Add support for minimum should match in lucene text search 
(#16650)
278e5d6d6f9 is described below

commit 278e5d6d6f9a2e20f97cfbe8bf5e1741b0b70005
Author: RAGHVENDRA KUMAR YADAV <[email protected]>
AuthorDate: Wed Sep 3 06:33:02 2025 -0700

    Add support for minimum should match in lucene text search (#16650)
    
    * Adding MinimumShouldMatchQueryParser for text index.
    
    * Adding unit test and Integration test for minimum should match phrase.
    
    * Minimum should match parser.
    
    * incorporating the review comments.
    
    * Update 
pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/MinimumShouldMatchQueryParserTest.java
    
    Co-authored-by: Copilot <[email protected]>
    
    * Update 
pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/MinimumShouldMatchQueryParserTest.java
    
    Co-authored-by: Copilot <[email protected]>
    
    * Update 
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/MatchQueryParser.java
    
    Co-authored-by: Copilot <[email protected]>
    
    ---------
    
    Co-authored-by: Xiang Fu <[email protected]>
    Co-authored-by: Copilot <[email protected]>
---
 .../pinot/queries/TextSearchQueriesTest.java       |  65 ++++
 .../text/lucene/parsers/MatchQueryParser.java      | 335 +++++++++++++++++++++
 .../segment/local/utils/LuceneTextIndexUtils.java  |  11 +
 .../parsers/MinimumShouldMatchQueryParserTest.java | 266 ++++++++++++++++
 4 files changed, 677 insertions(+)

diff --git 
a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java 
b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
index b88276963a9..5861b3fec10 100644
--- 
a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
+++ 
b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
@@ -2340,4 +2340,69 @@ public class TextSearchQueriesTest extends 
BaseQueriesTest {
             "Failed while searching the text index"),
         "Expected error related to leading wildcard or text search failure, 
got: " + errorMsg);
   }
+
+  @Test
+  public void testTextSearchWithMinimumShouldMatchParser()
+      throws Exception {
+    // Test 1: Require at least 2 out of 3 terms (minimumShouldMatch=2) - AWS 
hadoop big
+    List<Object[]> expectedMin2Of3 = new ArrayList<>();
+    expectedMin2Of3.add(new Object[]{
+        1008, "Amazon EC2, AWS, hadoop, big data, spark, building high 
performance scalable systems, building and "
+        + "deploying large scale production systems, concurrency, 
multi-threading, Java, C++, CPU processing"
+    });
+
+    String queryMin2Of3 =
+        "SELECT INT_COL, SKILLS_TEXT_COL FROM " + TABLE_NAME + " WHERE 
TEXT_MATCH(" + SKILLS_TEXT_COL_NAME
+            + ", 'AWS hadoop big', 'parser=MATCH,minimumShouldMatch=2') LIMIT 
50000";
+    testTextSearchSelectQueryHelper(queryMin2Of3, expectedMin2Of3.size(), 
false, expectedMin2Of3);
+
+    // Test 2: Percentage minimum_should_match - require at least 80% (2.4, 
rounded down to 2 out of 3 terms)
+    String queryMin80Percent =
+        "SELECT INT_COL, SKILLS_TEXT_COL FROM " + TABLE_NAME + " WHERE 
TEXT_MATCH(" + SKILLS_TEXT_COL_NAME
+            + ", 'AWS hadoop big', 'parser=MATCH,minimumShouldMatch=80%') 
LIMIT 50000";
+    testTextSearchSelectQueryHelper(queryMin80Percent, expectedMin2Of3.size(), 
false, expectedMin2Of3);
+
+    // Test 3: Require at least 1 out of 2 terms (minimumShouldMatch=1) - 
Stanford Tensor
+    List<Object[]> expectedMin1Of2 = new ArrayList<>();
+    expectedMin1Of2.add(new Object[]{
+        1004, "Machine learning, Tensor flow, Java, Stanford university,"
+    });
+    expectedMin1Of2.add(new Object[]{
+        1007, "C++, Python, Tensor flow, database kernel, storage, indexing 
and transaction processing, building "
+        + "large scale systems, Machine learning"
+    });
+    expectedMin1Of2.add(new Object[]{
+        1016, "CUDA, GPU processing, Tensor flow, Pandas, Python, Jupyter 
notebook, spark, Machine learning, building"
+        + " high performance scalable systems"
+    });
+
+    String queryMin1Of2 =
+        "SELECT INT_COL, SKILLS_TEXT_COL FROM " + TABLE_NAME + " WHERE 
TEXT_MATCH(" + SKILLS_TEXT_COL_NAME
+            + ", 'Stanford Tensor', 'parser=MATCH,minimumShouldMatch=1') LIMIT 
50000";
+    testTextSearchSelectQueryHelper(queryMin1Of2, expectedMin1Of2.size(), 
false, expectedMin1Of2);
+
+    // Test 4: Require at least 3 out of 4 terms (minimumShouldMatch=3) - 
Apache Kafka publish subscribe
+    List<Object[]> expectedMin3Of4 = new ArrayList<>();
+    expectedMin3Of4.add(new Object[]{
+        1017, "Distributed systems, Apache Kafka, publish-subscribe, building 
and deploying large scale production "
+        + "systems, concurrency, multi-threading, C++, CPU processing, Java"
+    });
+
+    String queryMin3Of4 =
+        "SELECT INT_COL, SKILLS_TEXT_COL FROM " + TABLE_NAME + " WHERE 
TEXT_MATCH(" + SKILLS_TEXT_COL_NAME
+            + ", 'Apache Kafka publish subscribe', 
'parser=MATCH,minimumShouldMatch=3') LIMIT 50000";
+    testTextSearchSelectQueryHelper(queryMin3Of4, expectedMin3Of4.size(), 
false, expectedMin3Of4);
+
+    // Test 5: Require all 3 terms (minimumShouldMatch=3) - AWS hadoop spark
+    List<Object[]> expectedMin3Of3 = new ArrayList<>();
+    expectedMin3Of3.add(new Object[]{
+        1008, "Amazon EC2, AWS, hadoop, big data, spark, building high 
performance scalable systems, building and "
+        + "deploying large scale production systems, concurrency, 
multi-threading, Java, C++, CPU processing"
+    });
+
+    String queryMin3Of3 =
+        "SELECT INT_COL, SKILLS_TEXT_COL FROM " + TABLE_NAME + " WHERE 
TEXT_MATCH(" + SKILLS_TEXT_COL_NAME
+            + ", 'AWS hadoop spark', 'parser=MATCH,minimumShouldMatch=3') 
LIMIT 50000";
+    testTextSearchSelectQueryHelper(queryMin3Of3, expectedMin3Of3.size(), 
false, expectedMin3Of3);
+  }
 }
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/MatchQueryParser.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/MatchQueryParser.java
new file mode 100644
index 00000000000..e945bfa2abd
--- /dev/null
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/MatchQueryParser.java
@@ -0,0 +1,335 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.segment.index.text.lucene.parsers;
+
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.queryparser.charstream.CharStream;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.queryparser.classic.QueryParserBase;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+
+
+/**
+ * A custom query parser that implements minimum_should_match behavior.
+ * This parser creates Boolean queries with should clauses and enforces a 
minimum
+ * number of matches.
+ *
+ * <p>This parser supports the following minimum_should_match formats:</p>
+ * <ul>
+ *   <li><strong>Positive integer:</strong> "3" - at least 3 should clauses 
must match</li>
+ *   <li><strong>Negative integer:</strong> "-2" - at most 2 should clauses 
can be missing</li>
+ *   <li><strong>Positive percentage:</strong> "80%" - at least 80% of should 
clauses must match</li>
+ *   <li><strong>Negative percentage:</strong> "-20%" - at most 20% of should 
clauses can be missing</li>
+ * </ul>
+ *
+ * <p><strong>Example usage:</strong></p>
+ * <ul>
+ *   <li>Input: 'java OR python OR scala' with minimumShouldMatch=2
+ *       <br>Output: BooleanQuery with 3 should clauses, requiring at least 2 
matches</li>
+ *   <li>Input: '"machine learning" OR "deep learning" OR "neural networks"' with 
minimumShouldMatch="80%"
+ *       <br>Output: BooleanQuery with 3 should clauses, requiring at least 2 
matches (80% of 3 = 2.4, rounded down
+ *       to 2)</li>
+ *   <li>Input: 'error OR warning OR critical' with minimumShouldMatch="-1"
+ *       <br>Output: BooleanQuery with 3 should clauses, allowing at most 1 to 
be missing (requiring at least 2
+ *       matches)</li>
+ * </ul>
+ *
+ * <p><strong>Behavior:</strong></p>
+ * <ul>
+ *   <li>Single term queries: Returns a single-clause BooleanQuery with 
minimum match 1 (setting is ignored)</li>
+ *   <li>Multiple term queries: Returns BooleanQuery with should clauses and 
minimum match requirement</li>
+ *   <li>Null/empty queries: Throws ParseException</li>
+ * </ul>
+ *
+ * <p>This parser extends Lucene's QueryParserBase and implements the required 
abstract methods.
+ * It uses the provided Analyzer for tokenization and creates appropriate 
Lucene Boolean queries.</p>
+ */
+public class MatchQueryParser extends QueryParserBase {
+  /** The field name to search in */
+  private final String _field;
+
+  /** The analyzer used for tokenizing the query */
+  private final Analyzer _analyzer;
+
+  /** The minimum should match specification (stored as string for dynamic 
calculation) */
+  private String _minimumShouldMatch = "1";
+
+  /** The default operator for combining terms */
+  private BooleanClause.Occur _defaultOperator = BooleanClause.Occur.SHOULD;
+
+  /** Pattern for parsing percentage values */
+  private static final Pattern PERCENTAGE_PATTERN = 
Pattern.compile("^(-?\\d+)%$");
+
+  /**
+   * Constructs a new MatchQueryParser with the specified field 
and analyzer.
+   *
+   * @param field the field name to search in (must not be null)
+   * @param analyzer the analyzer to use for tokenizing queries (must not be 
null)
+   *     Note: nullness of field and analyzer is not checked by this constructor.
+   */
+  public MatchQueryParser(String field, Analyzer analyzer) {
+    super();
+    _field = field;
+    _analyzer = analyzer;
+  }
+
+  /**
+   * Validates the minimum should match specification.
+   *
+   * <p>This method validates the format and range of the minimum_should_match 
value:</p>
+   * <ul>
+   *   <li><strong>Positive integer:</strong> "3" - at least 3 should clauses 
must match</li>
+   *   <li><strong>Negative integer:</strong> "-2" - at most 2 should clauses 
can be missing</li>
+   *   <li><strong>Positive percentage:</strong> "80%" - at least 80% of 
should clauses must match</li>
+   *   <li><strong>Negative percentage:</strong> "-20%" - at most 20% of 
should clauses can be missing</li>
+   * </ul>
+   *
+   * @param minimumShouldMatch the minimum should match specification to 
validate
+   * @return the validated and trimmed value, or "1" when the input is null or blank
+   * @throws IllegalArgumentException if the format is invalid or value is out 
of range
+   */
+  private String validateMinimumShouldMatch(String minimumShouldMatch) {
+    if (minimumShouldMatch == null || minimumShouldMatch.trim().isEmpty()) {
+      return "1";
+    }
+
+    String value = minimumShouldMatch.trim();
+    Matcher matcher = PERCENTAGE_PATTERN.matcher(value);
+    if (matcher.matches()) {
+      int percentage = Integer.parseInt(matcher.group(1));
+      if (percentage < -100 || percentage > 100) {
+        throw new IllegalArgumentException("Percentage must be between -100 
and 100: " + percentage);
+      }
+      return value;
+    } else {
+      try {
+        Integer.parseInt(value);
+        return value;
+      } catch (NumberFormatException e) {
+        throw new IllegalArgumentException("Invalid minimum_should_match 
format: " + value
+            + ". Expected integer or percentage (e.g., '3', '-2', '80%', 
'-20%')");
+      }
+    }
+  }
+
+  /**
+   * Sets the minimum number of should clauses that must match.
+   *
+   * <p>This method supports the same formats as OpenSearch's 
minimum_should_match:</p>
+   * <ul>
+   *   <li><strong>Positive integer:</strong> "3" - at least 3 should clauses 
must match</li>
+   *   <li><strong>Negative integer:</strong> "-2" - at most 2 should clauses 
can be missing</li>
+   *   <li><strong>Positive percentage:</strong> "80%" - at least 80% of 
should clauses must match</li>
+   *   <li><strong>Negative percentage:</strong> "-20%" - at most 20% of 
should clauses can be missing</li>
+   * </ul>
+   *
+   * <p>Examples:</p>
+   * <ul>
+   *   <li>setMinimumShouldMatch("3") - requires at least 3 matches</li>
+   *   <li>setMinimumShouldMatch("-1") - allows at most 1 to be missing</li>
+   *   <li>setMinimumShouldMatch("80%") - requires at least 80% matches</li>
+   *   <li>setMinimumShouldMatch("-20%") - allows at most 20% to be 
missing</li>
+   * </ul>
+   *
+   * @param minimumShouldMatch the minimum should match specification (integer 
or percentage)
+   * @throws IllegalArgumentException if the format is invalid or value is out 
of range
+   */
+  public void setMinimumShouldMatch(String minimumShouldMatch) {
+    _minimumShouldMatch = validateMinimumShouldMatch(minimumShouldMatch);
+  }
+
+  /**
+   * Sets the default operator for combining terms. NOTE: parse(String) does not currently read this value.
+   *
+   * @param defaultOperator the default operator (MUST for AND, SHOULD for OR)
+   */
+  public void setDefaultOperator(BooleanClause.Occur defaultOperator) {
+    _defaultOperator = defaultOperator;
+  }
+
+  /**
+   * Parses the given query string and returns an appropriate Lucene Query.
+   *
+   * <p>This method performs the following steps:</p>
+   * <ol>
+   *   <li>Validates the input query (null, empty, whitespace-only)</li>
+   *   <li>Parses the query using Lucene's QueryParser</li>
+   *   <li>Applies minimum_should_match behavior to Boolean queries</li>
+   * </ol>
+   *
+   * @param query the query string to parse (must not be null or empty)
+   * @return a Lucene Query object representing the parsed query
+   * @throws ParseException if the query is null, empty, or parsing fails
+   */
+  @Override
+  public Query parse(String query)
+      throws ParseException {
+    if (query == null) {
+      throw new ParseException("Query cannot be null");
+    }
+
+    if (query.trim().isEmpty()) {
+      throw new ParseException("Query cannot be empty");
+    }
+
+    // Parse the query using Lucene's QueryParser
+    QueryParser parser = new QueryParser(_field, _analyzer);
+    Query parsedQuery = parser.parse(query);
+
+    // If it's a Boolean query, apply minimum_should_match behavior
+    if (parsedQuery instanceof BooleanQuery) {
+      return applyMinimumShouldMatch((BooleanQuery) parsedQuery);
+    }
+
+    // For single term queries, convert to Boolean query with SHOULD clause
+    // For single terms, minimum_should_match should always be 1
+    if (parsedQuery instanceof TermQuery) {
+      BooleanQuery.Builder builder = new BooleanQuery.Builder();
+      builder.add(parsedQuery, BooleanClause.Occur.SHOULD);
+      builder.setMinimumNumberShouldMatch(1);
+      return builder.build();
+    }
+
+    // All the other queries are returned as is
+    return parsedQuery;
+  }
+
+  /**
+   * Applies minimum_should_match behavior to a BooleanQuery.
+   *
+   * @param booleanQuery the BooleanQuery to modify
+   * @return the modified BooleanQuery with minimum_should_match applied
+   */
+  private Query applyMinimumShouldMatch(BooleanQuery booleanQuery) {
+    return applyMinimumShouldMatch(booleanQuery, new HashSet<>());
+  }
+
+  /**
+   * Applies minimum_should_match behavior to a BooleanQuery with infinite 
loop protection.
+   *
+   * @param booleanQuery the BooleanQuery to modify
+   * @param visitedQueries set of already visited BooleanQueries to prevent 
infinite loops
+   * @return the modified BooleanQuery with minimum_should_match applied
+   */
+  private Query applyMinimumShouldMatch(BooleanQuery booleanQuery, 
Set<BooleanQuery> visitedQueries) {
+    if (visitedQueries.contains(booleanQuery)) {
+      return booleanQuery;
+    }
+    visitedQueries.add(booleanQuery);
+
+    BooleanQuery.Builder builder = new BooleanQuery.Builder();
+
+    for (BooleanClause clause : booleanQuery.clauses()) {
+      Query processedQuery = clause.getQuery();
+
+      // Recursively apply minimum_should_match if this clause's query is 
another BooleanQuery
+      if (processedQuery instanceof BooleanQuery) {
+        processedQuery = applyMinimumShouldMatch((BooleanQuery) 
processedQuery, visitedQueries);
+      }
+
+      builder.add(processedQuery, clause.getOccur());
+    }
+
+    // After processing clauses, apply minimum_should_match at this level if 
there are SHOULD clauses
+    int shouldClauseCount = 0;
+    for (BooleanClause clause : builder.build().clauses()) {
+      if (clause.getOccur() == BooleanClause.Occur.SHOULD) {
+        shouldClauseCount++;
+      }
+    }
+    if (shouldClauseCount > 0 && _minimumShouldMatch != null && 
!_minimumShouldMatch.trim().isEmpty()) {
+      int minimumShouldMatch = calculateMinimumShouldMatch(shouldClauseCount, 
_minimumShouldMatch);
+      builder.setMinimumNumberShouldMatch(minimumShouldMatch);
+    }
+
+    return builder.build();
+  }
+
+  /**
+   * Calculates the actual minimum should match value based on the number of 
SHOULD clauses and the specified value.
+   *
+   * @param totalTokens the number of SHOULD clauses at the current query level
+   * @param minimumShouldMatchValue the minimum should match specification
+   * @return the calculated minimum should match value
+   */
+  private int calculateMinimumShouldMatch(int totalTokens, String 
minimumShouldMatchValue) {
+    String value = minimumShouldMatchValue.trim();
+
+    // Check if it's a percentage
+    Matcher matcher = PERCENTAGE_PATTERN.matcher(value);
+    if (matcher.matches()) {
+      int percentage = Integer.parseInt(matcher.group(1));
+      if (percentage > 0) {
+        int minimumMatches = (totalTokens * percentage) / 100;
+        return Math.max(0, minimumMatches);
+      } else {
+        int minimumMatches = (totalTokens * (100 + percentage)) / 100;
+        return Math.max(0, minimumMatches);
+      }
+    } else {
+      int intValue = Integer.parseInt(value);
+      if (intValue > 0) {
+        return Math.min(intValue, totalTokens);
+      } else {
+        int minimumMatches = totalTokens + intValue;
+        return Math.max(0, minimumMatches);
+      }
+    }
+  }
+
+  /**
+   * Reinitializes the parser with a new CharStream.
+   *
+   * <p>This method is required by QueryParserBase but is not used in this 
implementation
+   * since we override the parse(String) method directly. The method is left 
as a no-op.</p>
+   *
+   * @param input the CharStream to reinitialize with (ignored in this 
implementation)
+   */
+  @Override
+  public void ReInit(CharStream input) {
+    // This method is required by QueryParserBase but not used in our 
implementation
+    // since we override parse(String) directly
+  }
+
+  /**
+   * Creates a top-level query for the specified field.
+   *
+   * <p>This method is required by QueryParserBase but is not supported in 
this implementation.
+   * Use the parse(String) method instead for query parsing.</p>
+   *
+   * @param field the field name (ignored in this implementation)
+   * @return never returns (always throws UnsupportedOperationException)
+   * @throws ParseException never thrown (method always throws 
UnsupportedOperationException)
+   * @throws UnsupportedOperationException always thrown, indicating this 
method is not supported
+   */
+  @Override
+  public Query TopLevelQuery(String field)
+      throws ParseException {
+    throw new UnsupportedOperationException(
+        "TopLevelQuery is not supported in MinimumShouldMatchQueryParser. Use 
parse(String) method instead.");
+  }
+}
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtils.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtils.java
index d366789d9c7..9cefb8e551b 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtils.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtils.java
@@ -49,6 +49,7 @@ public class LuceneTextIndexUtils {
   public static final String PARSER_STANDARD = "STANDARD";
   public static final String PARSER_COMPLEX = "COMPLEX";
   public static final String PARSER_MATCHPHRASE = "MATCHPHRASE";
+  public static final String PARSER_MATCH = "MATCH";
 
   // Default operator constants
   public static final String DEFAULT_OPERATOR_AND = "AND";
@@ -80,6 +81,7 @@ public class LuceneTextIndexUtils {
     public static final String SLOP = "slop";
     public static final String IN_ORDER = "inOrder";
     public static final String ENABLE_PREFIX_MATCH = "enablePrefixMatch";
+    public static final String MINIMUM_SHOULD_MATCH = "minimumShouldMatch";
   }
 
   // Parser class names
@@ -90,6 +92,8 @@ public class LuceneTextIndexUtils {
   public static final String CLASSIC_QUERY_PARSER = 
"org.apache.lucene.queryparser.classic.QueryParser";
   public static final String MATCHPHRASE_QUERY_PARSER_CLASS =
       
"org.apache.pinot.segment.local.segment.index.text.lucene.parsers.PrefixPhraseQueryParser";
+  public static final String MATCH_QUERY_PARSER_CLASS =
+      
"org.apache.pinot.segment.local.segment.index.text.lucene.parsers.MatchQueryParser";
 
   private LuceneTextIndexUtils() {
   }
@@ -156,6 +160,9 @@ public class LuceneTextIndexUtils {
       case PARSER_MATCHPHRASE:
         parserClassName = MATCHPHRASE_QUERY_PARSER_CLASS;
         break;
+      case PARSER_MATCH:
+        parserClassName = MATCH_QUERY_PARSER_CLASS;
+        break;
       default:
         parserClassName = CLASSIC_QUERY_PARSER;
         break;
@@ -353,6 +360,10 @@ public class LuceneTextIndexUtils {
     public boolean isEnablePrefixMatch() {
       return 
Boolean.parseBoolean(_options.getOrDefault(OptionKey.ENABLE_PREFIX_MATCH, 
"false"));
     }
+
+    public String getMinimumShouldMatch() {
+      return _options.getOrDefault(OptionKey.MINIMUM_SHOULD_MATCH, null);
+    }
   }
 
   /**
diff --git 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/MinimumShouldMatchQueryParserTest.java
 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/MinimumShouldMatchQueryParserTest.java
new file mode 100644
index 00000000000..827f0e32ff7
--- /dev/null
+++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/MinimumShouldMatchQueryParserTest.java
@@ -0,0 +1,266 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.segment.index.text.lucene.parsers;
+
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+
+public class MinimumShouldMatchQueryParserTest {
+
+  private static final String FIELD_NAME = "content";
+
+  /**
+   * Helper method to parse query with minimum_should_match option and return 
the result.
+   *
+   * @param query the query string to parse
+   * @param minimumShouldMatch the minimum_should_match value (can be null)
+   * @return the parsed Query
+   * @throws ParseException if parsing fails
+   */
+  private Query parseQueryWithMinimumShouldMatch(String query, String 
minimumShouldMatch)
+      throws ParseException {
+    MatchQueryParser parser = new MatchQueryParser(FIELD_NAME, new 
StandardAnalyzer());
+    if (minimumShouldMatch != null) {
+      parser.setMinimumShouldMatch(minimumShouldMatch);
+    }
+    return parser.parse(query);
+  }
+
+  @Test
+  public void testPositiveCases()
+      throws ParseException {
+    // Test 1: MUST_SHOULD_80_percent - OpenSearch AND (one OR two OR three OR 
four) with minimumShouldMatch=80%
+    Query result1 = parseQueryWithMinimumShouldMatch("OpenSearch AND (one OR 
two OR three OR four)", "80%");
+    Assert.assertTrue(result1 instanceof BooleanQuery);
+    BooleanQuery booleanQuery1 = (BooleanQuery) result1;
+    // Should have 2 clauses: MUST(OpenSearch) and MUST(nested BooleanQuery)
+    Assert.assertEquals(booleanQuery1.clauses().size(), 2);
+    // The nested BooleanQuery should have minimumShouldMatch=3 (80% of 4 = 
3.2, rounded down to 3)
+    BooleanQuery nestedQuery1 = (BooleanQuery) 
booleanQuery1.clauses().get(1).getQuery();
+    Assert.assertEquals(nestedQuery1.getMinimumNumberShouldMatch(), 3);
+
+    // Test 2: MUST_SHOULD_negative_20_percent - OpenSearch AND (one OR two OR 
three OR four) with
+    // minimumShouldMatch=-20%
+    Query result2 = parseQueryWithMinimumShouldMatch("OpenSearch AND (one OR 
two OR three OR four)", "-20%");
+    Assert.assertTrue(result2 instanceof BooleanQuery);
+    BooleanQuery booleanQuery2 = (BooleanQuery) result2;
+    Assert.assertEquals(booleanQuery2.clauses().size(), 2);
+    // The nested BooleanQuery should have minimumShouldMatch=3 (100+(-20)=80% 
of 4 = 3.2, rounded down to 3)
+    BooleanQuery nestedQuery2 = (BooleanQuery) 
booleanQuery2.clauses().get(1).getQuery();
+    Assert.assertEquals(nestedQuery2.getMinimumNumberShouldMatch(), 3);
+
+    // Test 3: SHOULD_only_default_one - one OR two OR three OR four without 
minimumShouldMatch
+    Query result3 = parseQueryWithMinimumShouldMatch("one OR two OR three OR 
four", null);
+    Assert.assertTrue(result3 instanceof BooleanQuery);
+    BooleanQuery booleanQuery3 = (BooleanQuery) result3;
+    Assert.assertEquals(booleanQuery3.clauses().size(), 4);
+    // Default minimumShouldMatch should be 1 for SHOULD-only queries
+    Assert.assertEquals(booleanQuery3.getMinimumNumberShouldMatch(), 1);
+
+    // Test 4: SHOULD_minimum_2 - one OR two OR three OR four with 
minimumShouldMatch=2
+    Query result4 = parseQueryWithMinimumShouldMatch("one OR two OR three OR 
four", "2");
+    Assert.assertTrue(result4 instanceof BooleanQuery);
+    BooleanQuery booleanQuery4 = (BooleanQuery) result4;
+    Assert.assertEquals(booleanQuery4.clauses().size(), 4);
+    Assert.assertEquals(booleanQuery4.getMinimumNumberShouldMatch(), 2);
+
+    // Test 5: SHOULD_75_percent - one OR two OR three OR four with 
minimumShouldMatch=75%
+    Query result5 = parseQueryWithMinimumShouldMatch("one OR two OR three OR 
four", "75%");
+    Assert.assertTrue(result5 instanceof BooleanQuery);
+    BooleanQuery booleanQuery5 = (BooleanQuery) result5;
+    Assert.assertEquals(booleanQuery5.clauses().size(), 4);
+    // 75% of 4 = 3 matches required
+    Assert.assertEquals(booleanQuery5.getMinimumNumberShouldMatch(), 3);
+
+    // Test 6: SHOULD_100_percent - one OR two OR three OR four with 
minimumShouldMatch=100%
+    Query result6 = parseQueryWithMinimumShouldMatch("one OR two OR three OR 
four", "100%");
+    Assert.assertTrue(result6 instanceof BooleanQuery);
+    BooleanQuery booleanQuery6 = (BooleanQuery) result6;
+    Assert.assertEquals(booleanQuery6.clauses().size(), 4);
+    // 100% of 4 = 4 matches required
+    Assert.assertEquals(booleanQuery6.getMinimumNumberShouldMatch(), 4);
+
+    // Test 7: SHOULD_25_percent - one OR two OR three OR four with 
minimumShouldMatch=25%
+    Query result7 = parseQueryWithMinimumShouldMatch("one OR two OR three OR 
four", "25%");
+    Assert.assertTrue(result7 instanceof BooleanQuery);
+    BooleanQuery booleanQuery7 = (BooleanQuery) result7;
+    Assert.assertEquals(booleanQuery7.clauses().size(), 4);
+    // 25% of 4 = 1 match required
+    Assert.assertEquals(booleanQuery7.getMinimumNumberShouldMatch(), 1);
+
+    // Test 8: single_term_query - OpenSearch with minimumShouldMatch=1
+    Query result8 = parseQueryWithMinimumShouldMatch("OpenSearch", "1");
+    Assert.assertTrue(result8 instanceof BooleanQuery);
+    BooleanQuery booleanQuery8 = (BooleanQuery) result8;
+    Assert.assertEquals(booleanQuery8.clauses().size(), 1);
+    Assert.assertEquals(booleanQuery8.getMinimumNumberShouldMatch(), 1);
+
+    // Test 9: SHOULD_negative_50_percent - one OR two OR three OR four with 
minimumShouldMatch=-50%
+    Query result9 = parseQueryWithMinimumShouldMatch("one OR two OR three OR 
four", "-50%");
+    Assert.assertTrue(result9 instanceof BooleanQuery);
+    BooleanQuery booleanQuery9 = (BooleanQuery) result9;
+    Assert.assertEquals(booleanQuery9.clauses().size(), 4);
+    // -50% means 50% must match, so 50% of 4 = 2 matches required
+    Assert.assertEquals(booleanQuery9.getMinimumNumberShouldMatch(), 2);
+
+    // Test 10: Deep nested query - OpenSearch AND ((one OR two) AND (three OR 
four OR five)) with
+    // minimumShouldMatch=60%
+    Query result10 = parseQueryWithMinimumShouldMatch(
+        "OpenSearch AND ((one OR two) AND (three OR four OR five))", "60%");
+    Assert.assertTrue(result10 instanceof BooleanQuery);
+    BooleanQuery booleanQuery10 = (BooleanQuery) result10;
+    Assert.assertEquals(booleanQuery10.clauses().size(), 2);
+
+    // Get the nested BooleanQuery: ((one OR two) AND (three OR four OR five))
+    BooleanQuery nestedQuery10 = (BooleanQuery) 
booleanQuery10.clauses().get(1).getQuery();
+    Assert.assertEquals(nestedQuery10.clauses().size(), 2);
+
+    // Get the first sub-nested BooleanQuery: (one OR two)
+    BooleanQuery subNested1 = (BooleanQuery) 
nestedQuery10.clauses().get(0).getQuery();
+    Assert.assertEquals(subNested1.clauses().size(), 2);
+    // 60% of 2 = 1.2, rounded down to 1
+    Assert.assertEquals(subNested1.getMinimumNumberShouldMatch(), 1);
+
+    // Get the second sub-nested BooleanQuery: (three OR four OR five)
+    BooleanQuery subNested2 = (BooleanQuery) 
nestedQuery10.clauses().get(1).getQuery();
+    Assert.assertEquals(subNested2.clauses().size(), 3);
+    // 60% of 3 = 1.8, rounded down to 1
+    Assert.assertEquals(subNested2.getMinimumNumberShouldMatch(), 1);
+  }
+
+  @Test
+  public void testNegativeCases()
+      throws ParseException {
+    // Case 1: Invalid percentage value (> 100%)
+    try {
+      parseQueryWithMinimumShouldMatch("java OR python", "101%");
+      Assert.fail("Should throw IllegalArgumentException for invalid 
percentage");
+    } catch (IllegalArgumentException e) {
+      // Expected
+    } catch (ParseException e) {
+      Assert.fail("Should throw IllegalArgumentException, not ParseException");
+    }
+
+    // Case 2: Invalid negative percentage value (< -100%)
+    try {
+      parseQueryWithMinimumShouldMatch("java OR python", "-101%");
+      Assert.fail("Should throw IllegalArgumentException for invalid negative 
percentage");
+    } catch (IllegalArgumentException e) {
+      // Expected
+    } catch (ParseException e) {
+      Assert.fail("Should throw IllegalArgumentException, not ParseException");
+    }
+
+    // Case 3: Invalid format (not integer or percentage)
+    try {
+      parseQueryWithMinimumShouldMatch("java OR python", "abc");
+      Assert.fail("Should throw IllegalArgumentException for invalid format");
+    } catch (IllegalArgumentException e) {
+      // Expected
+    } catch (ParseException e) {
+      Assert.fail("Should throw IllegalArgumentException, not ParseException");
+    }
+
+    // Case 4: Invalid decimal percentage
+    try {
+      parseQueryWithMinimumShouldMatch("java OR python", "50.5%");
+      Assert.fail("Should throw IllegalArgumentException for invalid decimal 
percentage");
+    } catch (IllegalArgumentException e) {
+      // Expected
+    } catch (ParseException e) {
+      Assert.fail("Should throw IllegalArgumentException, not ParseException");
+    }
+
+    // Case 5: Null query
+    try {
+      parseQueryWithMinimumShouldMatch(null, null);
+      Assert.fail("Should throw ParseException for null query");
+    } catch (ParseException e) {
+      // Expected
+    }
+
+    // Case 6: Empty query
+    try {
+      parseQueryWithMinimumShouldMatch("", null);
+      Assert.fail("Should throw ParseException for empty query");
+    } catch (ParseException e) {
+      // Expected
+    }
+
+    // Case 7: Whitespace-only query
+    try {
+      parseQueryWithMinimumShouldMatch("   ", null);
+      Assert.fail("Should throw ParseException for whitespace-only query");
+    } catch (ParseException e) {
+      // Expected
+    }
+    // Case 9: Non-Boolean query (phrase query)
+    parseQueryWithMinimumShouldMatch("\"java programming\"", null);
+    parseQueryWithMinimumShouldMatch("java*", null);
+  }
+
+  /**
+   * Boundary checks for the effective minimum-should-match value computed on flat OR queries.
+   *
+   * @throws ParseException if any of the queries cannot be parsed
+   */
+  @Test
+  public void testEdgeCases()
+      throws ParseException {
+    // Case 1: absolute value above the number of should clauses is capped at the clause count (2)
+    assertEdgeCase("java OR python", "5", 2, 2);
+
+    // Case 2: negative absolute value larger than the clause count (2 - 5 = -3) is floored at 0
+    assertEdgeCase("java OR python", "-5", 2, 0);
+
+    // Case 3: -80% means 20% must match; 20% of 2 = 0.4, which rounds down to 0
+    assertEdgeCase("java OR python", "-80%", 2, 0);
+
+    // Case 4: value equal to the number of should clauses requires all 3 of them to match
+    assertEdgeCase("java OR python OR scala", "3", 3, 3);
+
+    // Case 5: 75% of 4 = 3.0, so 3 clauses are required
+    assertEdgeCase("java OR python OR scala OR kotlin", "75%", 4, 3);
+  }
+
+  /**
+   * Parses {@code queryText} with the given {@code minimumShouldMatch} value and asserts that the result is a
+   * {@link BooleanQuery} with the expected clause count and the expected minimum number of should matches.
+   *
+   * @param queryText query string to parse
+   * @param minimumShouldMatch minimum-should-match value passed to the parse helper
+   * @param expectedClauseCount expected number of clauses in the parsed query
+   * @param expectedMinimum expected value of {@code getMinimumNumberShouldMatch()}
+   * @throws ParseException if the query cannot be parsed
+   */
+  private void assertEdgeCase(String queryText, String minimumShouldMatch, int expectedClauseCount,
+      int expectedMinimum)
+      throws ParseException {
+    Query parsed = parseQueryWithMinimumShouldMatch(queryText, minimumShouldMatch);
+    Assert.assertTrue(parsed instanceof BooleanQuery);
+    BooleanQuery parsedBoolean = (BooleanQuery) parsed;
+    Assert.assertEquals(parsedBoolean.clauses().size(), expectedClauseCount);
+    Assert.assertEquals(parsedBoolean.getMinimumNumberShouldMatch(), expectedMinimum);
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to