This is an automated email from the ASF dual-hosted git repository.
xiangfu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 278e5d6d6f9 Add support for minimum should match in lucene text search
(#16650)
278e5d6d6f9 is described below
commit 278e5d6d6f9a2e20f97cfbe8bf5e1741b0b70005
Author: RAGHVENDRA KUMAR YADAV <[email protected]>
AuthorDate: Wed Sep 3 06:33:02 2025 -0700
Add support for minimum should match in lucene text search (#16650)
* Adding MinimumShouldMatchQueryParser for text index.
* Adding unit test and Integration test for minimum should match phrase.
* Minimum should match parser.
* incorporating the review comments.
* Update
pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/MinimumShouldMatchQueryParserTest.java
Co-authored-by: Copilot <[email protected]>
* Update
pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/MinimumShouldMatchQueryParserTest.java
Co-authored-by: Copilot <[email protected]>
* Update
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/MatchQueryParser.java
Co-authored-by: Copilot <[email protected]>
---------
Co-authored-by: Xiang Fu <[email protected]>
Co-authored-by: Copilot <[email protected]>
---
.../pinot/queries/TextSearchQueriesTest.java | 65 ++++
.../text/lucene/parsers/MatchQueryParser.java | 335 +++++++++++++++++++++
.../segment/local/utils/LuceneTextIndexUtils.java | 11 +
.../parsers/MinimumShouldMatchQueryParserTest.java | 266 ++++++++++++++++
4 files changed, 677 insertions(+)
diff --git
a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
index b88276963a9..5861b3fec10 100644
---
a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
+++
b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
@@ -2340,4 +2340,69 @@ public class TextSearchQueriesTest extends
BaseQueriesTest {
"Failed while searching the text index"),
"Expected error related to leading wildcard or text search failure,
got: " + errorMsg);
}
+
+ @Test
+ public void testTextSearchWithMinimumShouldMatchParser()
+ throws Exception {
+ // Test 1: Require at least 2 out of 3 terms (minimumShouldMatch=2) - AWS
hadoop big
+ List<Object[]> expectedMin2Of3 = new ArrayList<>();
+ expectedMin2Of3.add(new Object[]{
+ 1008, "Amazon EC2, AWS, hadoop, big data, spark, building high
performance scalable systems, building and "
+ + "deploying large scale production systems, concurrency,
multi-threading, Java, C++, CPU processing"
+ });
+
+ String queryMin2Of3 =
+ "SELECT INT_COL, SKILLS_TEXT_COL FROM " + TABLE_NAME + " WHERE
TEXT_MATCH(" + SKILLS_TEXT_COL_NAME
+ + ", 'AWS hadoop big', 'parser=MATCH,minimumShouldMatch=2') LIMIT
50000";
+ testTextSearchSelectQueryHelper(queryMin2Of3, expectedMin2Of3.size(),
false, expectedMin2Of3);
+
+ // Test 2: Percentage minimum_should_match - require at least 80% (2.4,
rounded down to 2 out of 3 terms)
+ String queryMin80Percent =
+ "SELECT INT_COL, SKILLS_TEXT_COL FROM " + TABLE_NAME + " WHERE
TEXT_MATCH(" + SKILLS_TEXT_COL_NAME
+ + ", 'AWS hadoop big', 'parser=MATCH,minimumShouldMatch=80%')
LIMIT 50000";
+ testTextSearchSelectQueryHelper(queryMin80Percent, expectedMin2Of3.size(),
false, expectedMin2Of3);
+
+ // Test 3: Require at least 1 out of 2 terms (minimumShouldMatch=1) -
Stanford Tensor
+ List<Object[]> expectedMin1Of2 = new ArrayList<>();
+ expectedMin1Of2.add(new Object[]{
+ 1004, "Machine learning, Tensor flow, Java, Stanford university,"
+ });
+ expectedMin1Of2.add(new Object[]{
+ 1007, "C++, Python, Tensor flow, database kernel, storage, indexing
and transaction processing, building "
+ + "large scale systems, Machine learning"
+ });
+ expectedMin1Of2.add(new Object[]{
+ 1016, "CUDA, GPU processing, Tensor flow, Pandas, Python, Jupyter
notebook, spark, Machine learning, building"
+ + " high performance scalable systems"
+ });
+
+ String queryMin1Of2 =
+ "SELECT INT_COL, SKILLS_TEXT_COL FROM " + TABLE_NAME + " WHERE
TEXT_MATCH(" + SKILLS_TEXT_COL_NAME
+ + ", 'Stanford Tensor', 'parser=MATCH,minimumShouldMatch=1') LIMIT
50000";
+ testTextSearchSelectQueryHelper(queryMin1Of2, expectedMin1Of2.size(),
false, expectedMin1Of2);
+
+ // Test 4: Require at least 3 out of 4 terms (minimumShouldMatch=3) -
Apache Kafka publish subscribe
+ List<Object[]> expectedMin3Of4 = new ArrayList<>();
+ expectedMin3Of4.add(new Object[]{
+ 1017, "Distributed systems, Apache Kafka, publish-subscribe, building
and deploying large scale production "
+ + "systems, concurrency, multi-threading, C++, CPU processing, Java"
+ });
+
+ String queryMin3Of4 =
+ "SELECT INT_COL, SKILLS_TEXT_COL FROM " + TABLE_NAME + " WHERE
TEXT_MATCH(" + SKILLS_TEXT_COL_NAME
+ + ", 'Apache Kafka publish subscribe',
'parser=MATCH,minimumShouldMatch=3') LIMIT 50000";
+ testTextSearchSelectQueryHelper(queryMin3Of4, expectedMin3Of4.size(),
false, expectedMin3Of4);
+
+ // Test 5: Require all 3 terms (minimumShouldMatch=3) - AWS hadoop spark
+ List<Object[]> expectedMin3Of3 = new ArrayList<>();
+ expectedMin3Of3.add(new Object[]{
+ 1008, "Amazon EC2, AWS, hadoop, big data, spark, building high
performance scalable systems, building and "
+ + "deploying large scale production systems, concurrency,
multi-threading, Java, C++, CPU processing"
+ });
+
+ String queryMin3Of3 =
+ "SELECT INT_COL, SKILLS_TEXT_COL FROM " + TABLE_NAME + " WHERE
TEXT_MATCH(" + SKILLS_TEXT_COL_NAME
+ + ", 'AWS hadoop spark', 'parser=MATCH,minimumShouldMatch=3')
LIMIT 50000";
+ testTextSearchSelectQueryHelper(queryMin3Of3, expectedMin3Of3.size(),
false, expectedMin3Of3);
+ }
}
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/MatchQueryParser.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/MatchQueryParser.java
new file mode 100644
index 00000000000..e945bfa2abd
--- /dev/null
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/MatchQueryParser.java
@@ -0,0 +1,335 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.segment.index.text.lucene.parsers;
+
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.queryparser.charstream.CharStream;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.queryparser.classic.QueryParserBase;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+
+
+/**
+ * A custom query parser that implements minimum_should_match behavior.
+ * This parser creates Boolean queries with should clauses and enforces a
minimum
+ * number of matches.
+ *
+ * <p>This parser supports the following minimum_should_match formats:</p>
+ * <ul>
+ * <li><strong>Positive integer:</strong> "3" - at least 3 should clauses
must match</li>
+ * <li><strong>Negative integer:</strong> "-2" - at most 2 should clauses
can be missing</li>
+ * <li><strong>Positive percentage:</strong> "80%" - at least 80% of should
clauses must match</li>
+ * <li><strong>Negative percentage:</strong> "-20%" - at most 20% of should
clauses can be missing</li>
+ * </ul>
+ *
+ * <p><strong>Example usage:</strong></p>
+ * <ul>
+ * <li>Input: 'java OR python OR scala' with minimumShouldMatch=2
+ * <br>Output: BooleanQuery with 3 should clauses, requiring at least 2
matches</li>
+ * <li>Input: 'machine learning OR deep learning OR neural networks' with
minimumShouldMatch="80%"
+ * <br>Output: BooleanQuery with 3 should clauses, requiring at least 2
matches (80% of 3 = 2.4, rounded down
+ * to 2)</li>
+ * <li>Input: 'error OR warning OR critical' with minimumShouldMatch="-1"
+ * <br>Output: BooleanQuery with 3 should clauses, allowing at most 1 to
be missing (requiring at least 2
+ * matches)</li>
+ * </ul>
+ *
+ * <p><strong>Behavior:</strong></p>
+ * <ul>
+ * <li>Single term queries: Returns a BooleanQuery wrapping the term as one
should clause, with the minimum fixed at 1</li>
+ * <li>Multiple term queries: Returns BooleanQuery with should clauses and
minimum match requirement</li>
+ * <li>Null/empty queries: Throws ParseException</li>
+ * </ul>
+ *
+ * <p>This parser extends Lucene's QueryParserBase and implements the required
abstract methods.
+ * It uses the provided Analyzer for tokenization and creates appropriate
Lucene Boolean queries.</p>
+ */
+public class MatchQueryParser extends QueryParserBase {
+ /** The field name to search in */
+ private final String _field;
+
+ /** The analyzer used for tokenizing the query */
+ private final Analyzer _analyzer;
+
+ /** The minimum should match specification (stored as string for dynamic
calculation) */
+ private String _minimumShouldMatch = "1";
+
+ /** The default operator for combining terms */
+ private BooleanClause.Occur _defaultOperator = BooleanClause.Occur.SHOULD;
+
+ /** Pattern for parsing percentage values */
+ private static final Pattern PERCENTAGE_PATTERN =
Pattern.compile("^(-?\\d+)%$");
+
+ /**
+ * Constructs a new MatchQueryParser with the specified field
and analyzer.
+ *
+ * @param field the field name to search in (must not be null)
+ * @param analyzer the analyzer to use for tokenizing queries (must not be
null)
+ * Note: this constructor does not check either argument for null.
+ */
+ public MatchQueryParser(String field, Analyzer analyzer) {
+ super();
+ _field = field;
+ _analyzer = analyzer;
+ }
+
+ /**
+ * Validates the minimum should match specification.
+ *
+ * <p>This method validates the format and range of the minimum_should_match
value:</p>
+ * <ul>
+ * <li><strong>Positive integer:</strong> "3" - at least 3 should clauses
must match</li>
+ * <li><strong>Negative integer:</strong> "-2" - at most 2 should clauses
can be missing</li>
+ * <li><strong>Positive percentage:</strong> "80%" - at least 80% of
should clauses must match</li>
+ * <li><strong>Negative percentage:</strong> "-20%" - at most 20% of
should clauses can be missing</li>
+ * </ul>
+ *
+ * @param minimumShouldMatch the minimum should match specification to
validate
+ * @return the validated and trimmed value
+ * @throws IllegalArgumentException if the format is invalid or value is out
of range
+ */
+ private String validateMinimumShouldMatch(String minimumShouldMatch) {
+ if (minimumShouldMatch == null || minimumShouldMatch.trim().isEmpty()) {
+ return "1";
+ }
+
+ String value = minimumShouldMatch.trim();
+ Matcher matcher = PERCENTAGE_PATTERN.matcher(value);
+ if (matcher.matches()) {
+ int percentage = Integer.parseInt(matcher.group(1));
+ if (percentage < -100 || percentage > 100) {
+ throw new IllegalArgumentException("Percentage must be between -100
and 100: " + percentage);
+ }
+ return value;
+ } else {
+ try {
+ Integer.parseInt(value);
+ return value;
+ } catch (NumberFormatException e) {
+ throw new IllegalArgumentException("Invalid minimum_should_match
format: " + value
+ + ". Expected integer or percentage (e.g., '3', '-2', '80%',
'-20%')");
+ }
+ }
+ }
+
+ /**
+ * Sets the minimum number of should clauses that must match.
+ *
+ * <p>This method supports the same formats as OpenSearch's
minimum_should_match:</p>
+ * <ul>
+ * <li><strong>Positive integer:</strong> "3" - at least 3 should clauses
must match</li>
+ * <li><strong>Negative integer:</strong> "-2" - at most 2 should clauses
can be missing</li>
+ * <li><strong>Positive percentage:</strong> "80%" - at least 80% of
should clauses must match</li>
+ * <li><strong>Negative percentage:</strong> "-20%" - at most 20% of
should clauses can be missing</li>
+ * </ul>
+ *
+ * <p>Examples:</p>
+ * <ul>
+ * <li>setMinimumShouldMatch("3") - requires at least 3 matches</li>
+ * <li>setMinimumShouldMatch("-1") - allows at most 1 to be missing</li>
+ * <li>setMinimumShouldMatch("80%") - requires at least 80% matches</li>
+ * <li>setMinimumShouldMatch("-20%") - allows at most 20% to be
missing</li>
+ * </ul>
+ *
+ * @param minimumShouldMatch the minimum should match specification (integer
or percentage)
+ * @throws IllegalArgumentException if the format is invalid or value is out
of range
+ */
+ public void setMinimumShouldMatch(String minimumShouldMatch) {
+ _minimumShouldMatch = validateMinimumShouldMatch(minimumShouldMatch);
+ }
+
+ /**
+ * Sets the default operator for combining terms.
+ *
+ * @param defaultOperator the default operator (MUST for AND, SHOULD for OR)
+ */
+ public void setDefaultOperator(BooleanClause.Occur defaultOperator) {
+ _defaultOperator = defaultOperator;
+ }
+
+ /**
+ * Parses the given query string and returns an appropriate Lucene Query.
+ *
+ * <p>This method performs the following steps:</p>
+ * <ol>
+ * <li>Validates the input query (null, empty, whitespace-only)</li>
+ * <li>Parses the query using Lucene's QueryParser</li>
+ * <li>Applies minimum_should_match behavior to Boolean queries</li>
+ * </ol>
+ *
+ * @param query the query string to parse (must not be null or empty)
+ * @return a Lucene Query object representing the parsed query
+ * @throws ParseException if the query is null, empty, or parsing fails
+ */
+ @Override
+ public Query parse(String query)
+ throws ParseException {
+ if (query == null) {
+ throw new ParseException("Query cannot be null");
+ }
+
+ if (query.trim().isEmpty()) {
+ throw new ParseException("Query cannot be empty");
+ }
+
+ // Parse the query using Lucene's QueryParser
+ QueryParser parser = new QueryParser(_field, _analyzer);
+ Query parsedQuery = parser.parse(query);
+
+ // If it's a Boolean query, apply minimum_should_match behavior
+ if (parsedQuery instanceof BooleanQuery) {
+ return applyMinimumShouldMatch((BooleanQuery) parsedQuery);
+ }
+
+ // For single term queries, convert to Boolean query with SHOULD clause
+ // For single terms, minimum_should_match should always be 1
+ if (parsedQuery instanceof TermQuery) {
+ BooleanQuery.Builder builder = new BooleanQuery.Builder();
+ builder.add(parsedQuery, BooleanClause.Occur.SHOULD);
+ builder.setMinimumNumberShouldMatch(1);
+ return builder.build();
+ }
+
+ // All the other queries are returned as is
+ return parsedQuery;
+ }
+
+ /**
+ * Applies minimum_should_match behavior to a BooleanQuery.
+ *
+ * @param booleanQuery the BooleanQuery to modify
+ * @return the modified BooleanQuery with minimum_should_match applied
+ */
+ private Query applyMinimumShouldMatch(BooleanQuery booleanQuery) {
+ return applyMinimumShouldMatch(booleanQuery, new HashSet<>());
+ }
+
+ /**
+ * Applies minimum_should_match behavior to a BooleanQuery with infinite
loop protection.
+ *
+ * @param booleanQuery the BooleanQuery to modify
+ * @param visitedQueries set of already visited BooleanQueries to prevent
infinite loops
+ * @return the modified BooleanQuery with minimum_should_match applied
+ */
+ private Query applyMinimumShouldMatch(BooleanQuery booleanQuery,
Set<BooleanQuery> visitedQueries) {
+ if (visitedQueries.contains(booleanQuery)) {
+ return booleanQuery;
+ }
+ visitedQueries.add(booleanQuery);
+
+ BooleanQuery.Builder builder = new BooleanQuery.Builder();
+
+ for (BooleanClause clause : booleanQuery.clauses()) {
+ Query processedQuery = clause.getQuery();
+
+ // Recursively apply minimum_should_match if this clause's query is
another BooleanQuery
+ if (processedQuery instanceof BooleanQuery) {
+ processedQuery = applyMinimumShouldMatch((BooleanQuery)
processedQuery, visitedQueries);
+ }
+
+ builder.add(processedQuery, clause.getOccur());
+ }
+
+ // After processing clauses, apply minimum_should_match at this level if
there are SHOULD clauses
+ int shouldClauseCount = 0;
+ for (BooleanClause clause : builder.build().clauses()) {
+ if (clause.getOccur() == BooleanClause.Occur.SHOULD) {
+ shouldClauseCount++;
+ }
+ }
+ if (shouldClauseCount > 0 && _minimumShouldMatch != null &&
!_minimumShouldMatch.trim().isEmpty()) {
+ int minimumShouldMatch = calculateMinimumShouldMatch(shouldClauseCount,
_minimumShouldMatch);
+ builder.setMinimumNumberShouldMatch(minimumShouldMatch);
+ }
+
+ return builder.build();
+ }
+
+ /**
+ * Calculates the actual minimum should match value based on the number of
SHOULD clauses and the specified value.
+ *
+ * @param totalTokens the number of SHOULD clauses at the current query level
+ * @param minimumShouldMatchValue the minimum should match specification
+ * @return the calculated minimum should match value; note that "0" and "0%"
take the non-positive branch and therefore resolve to totalTokens
+ */
+ private int calculateMinimumShouldMatch(int totalTokens, String
minimumShouldMatchValue) {
+ String value = minimumShouldMatchValue.trim();
+
+ // Check if it's a percentage
+ Matcher matcher = PERCENTAGE_PATTERN.matcher(value);
+ if (matcher.matches()) {
+ int percentage = Integer.parseInt(matcher.group(1));
+ if (percentage > 0) {
+ int minimumMatches = (totalTokens * percentage) / 100;
+ return Math.max(0, minimumMatches);
+ } else {
+ int minimumMatches = (totalTokens * (100 + percentage)) / 100;
+ return Math.max(0, minimumMatches);
+ }
+ } else {
+ int intValue = Integer.parseInt(value);
+ if (intValue > 0) {
+ return Math.min(intValue, totalTokens);
+ } else {
+ int minimumMatches = totalTokens + intValue;
+ return Math.max(0, minimumMatches);
+ }
+ }
+ }
+
+ /**
+ * Reinitializes the parser with a new CharStream.
+ *
+ * <p>This method is required by QueryParserBase but is not used in this
implementation
+ * since we override the parse(String) method directly. The method is left
as a no-op.</p>
+ *
+ * @param input the CharStream to reinitialize with (ignored in this
implementation)
+ */
+ @Override
+ public void ReInit(CharStream input) {
+ // This method is required by QueryParserBase but not used in our
implementation
+ // since we override parse(String) directly
+ }
+
+ /**
+ * Creates a top-level query for the specified field.
+ *
+ * <p>This method is required by QueryParserBase but is not supported in
this implementation.
+ * Use the parse(String) method instead for query parsing.</p>
+ *
+ * @param field the field name (ignored in this implementation)
+ * @return never returns (always throws UnsupportedOperationException)
+ * @throws ParseException never thrown (method always throws
UnsupportedOperationException)
+ * @throws UnsupportedOperationException always thrown, indicating this
method is not supported
+ */
+ @Override
+ public Query TopLevelQuery(String field)
+ throws ParseException {
+ throw new UnsupportedOperationException(
+ "TopLevelQuery is not supported in MinimumShouldMatchQueryParser. Use
parse(String) method instead.");
+ }
+}
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtils.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtils.java
index d366789d9c7..9cefb8e551b 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtils.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtils.java
@@ -49,6 +49,7 @@ public class LuceneTextIndexUtils {
public static final String PARSER_STANDARD = "STANDARD";
public static final String PARSER_COMPLEX = "COMPLEX";
public static final String PARSER_MATCHPHRASE = "MATCHPHRASE";
+ public static final String PARSER_MATCH = "MATCH";
// Default operator constants
public static final String DEFAULT_OPERATOR_AND = "AND";
@@ -80,6 +81,7 @@ public class LuceneTextIndexUtils {
public static final String SLOP = "slop";
public static final String IN_ORDER = "inOrder";
public static final String ENABLE_PREFIX_MATCH = "enablePrefixMatch";
+ public static final String MINIMUM_SHOULD_MATCH = "minimumShouldMatch";
}
// Parser class names
@@ -90,6 +92,8 @@ public class LuceneTextIndexUtils {
public static final String CLASSIC_QUERY_PARSER =
"org.apache.lucene.queryparser.classic.QueryParser";
public static final String MATCHPHRASE_QUERY_PARSER_CLASS =
"org.apache.pinot.segment.local.segment.index.text.lucene.parsers.PrefixPhraseQueryParser";
+ public static final String MATCH_QUERY_PARSER_CLASS =
+
"org.apache.pinot.segment.local.segment.index.text.lucene.parsers.MatchQueryParser";
private LuceneTextIndexUtils() {
}
@@ -156,6 +160,9 @@ public class LuceneTextIndexUtils {
case PARSER_MATCHPHRASE:
parserClassName = MATCHPHRASE_QUERY_PARSER_CLASS;
break;
+ case PARSER_MATCH:
+ parserClassName = MATCH_QUERY_PARSER_CLASS;
+ break;
default:
parserClassName = CLASSIC_QUERY_PARSER;
break;
@@ -353,6 +360,10 @@ public class LuceneTextIndexUtils {
public boolean isEnablePrefixMatch() {
return
Boolean.parseBoolean(_options.getOrDefault(OptionKey.ENABLE_PREFIX_MATCH,
"false"));
}
+
+ public String getMinimumShouldMatch() {
+ return _options.getOrDefault(OptionKey.MINIMUM_SHOULD_MATCH, null);
+ }
}
/**
diff --git
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/MinimumShouldMatchQueryParserTest.java
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/MinimumShouldMatchQueryParserTest.java
new file mode 100644
index 00000000000..827f0e32ff7
--- /dev/null
+++
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/MinimumShouldMatchQueryParserTest.java
@@ -0,0 +1,266 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.segment.index.text.lucene.parsers;
+
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+
+public class MinimumShouldMatchQueryParserTest {
+
+ private static final String FIELD_NAME = "content";
+
+ /**
+ * Helper method to parse query with minimum_should_match option and return
the result.
+ *
+ * @param query the query string to parse
+ * @param minimumShouldMatch the minimum_should_match value (can be null)
+ * @return the parsed Query
+ * @throws ParseException if parsing fails
+ */
+ private Query parseQueryWithMinimumShouldMatch(String query, String
minimumShouldMatch)
+ throws ParseException {
+ MatchQueryParser parser = new MatchQueryParser(FIELD_NAME, new
StandardAnalyzer());
+ if (minimumShouldMatch != null) {
+ parser.setMinimumShouldMatch(minimumShouldMatch);
+ }
+ return parser.parse(query);
+ }
+
+ @Test
+ public void testPositiveCases()
+ throws ParseException {
+ // Test 1: MUST_SHOULD_80_percent - OpenSearch AND (one OR two OR three OR
four) with minimumShouldMatch=80%
+ Query result1 = parseQueryWithMinimumShouldMatch("OpenSearch AND (one OR
two OR three OR four)", "80%");
+ Assert.assertTrue(result1 instanceof BooleanQuery);
+ BooleanQuery booleanQuery1 = (BooleanQuery) result1;
+ // Should have 2 clauses: MUST(OpenSearch) and MUST(nested BooleanQuery)
+ Assert.assertEquals(booleanQuery1.clauses().size(), 2);
+ // The nested BooleanQuery should have minimumShouldMatch=3 (80% of 4 =
3.2, rounded down to 3)
+ BooleanQuery nestedQuery1 = (BooleanQuery)
booleanQuery1.clauses().get(1).getQuery();
+ Assert.assertEquals(nestedQuery1.getMinimumNumberShouldMatch(), 3);
+
+ // Test 2: MUST_SHOULD_negative_20_percent - OpenSearch AND (one OR two OR
three OR four) with
+ // minimumShouldMatch=-20%
+ Query result2 = parseQueryWithMinimumShouldMatch("OpenSearch AND (one OR
two OR three OR four)", "-20%");
+ Assert.assertTrue(result2 instanceof BooleanQuery);
+ BooleanQuery booleanQuery2 = (BooleanQuery) result2;
+ Assert.assertEquals(booleanQuery2.clauses().size(), 2);
+ // The nested BooleanQuery should have minimumShouldMatch=3 (100+(-20)=80%
of 4 = 3.2, rounded down to 3)
+ BooleanQuery nestedQuery2 = (BooleanQuery)
booleanQuery2.clauses().get(1).getQuery();
+ Assert.assertEquals(nestedQuery2.getMinimumNumberShouldMatch(), 3);
+
+ // Test 3: SHOULD_only_default_one - one OR two OR three OR four without
minimumShouldMatch
+ Query result3 = parseQueryWithMinimumShouldMatch("one OR two OR three OR
four", null);
+ Assert.assertTrue(result3 instanceof BooleanQuery);
+ BooleanQuery booleanQuery3 = (BooleanQuery) result3;
+ Assert.assertEquals(booleanQuery3.clauses().size(), 4);
+ // Default minimumShouldMatch should be 1 for SHOULD-only queries
+ Assert.assertEquals(booleanQuery3.getMinimumNumberShouldMatch(), 1);
+
+ // Test 4: SHOULD_minimum_2 - one OR two OR three OR four with
minimumShouldMatch=2
+ Query result4 = parseQueryWithMinimumShouldMatch("one OR two OR three OR
four", "2");
+ Assert.assertTrue(result4 instanceof BooleanQuery);
+ BooleanQuery booleanQuery4 = (BooleanQuery) result4;
+ Assert.assertEquals(booleanQuery4.clauses().size(), 4);
+ Assert.assertEquals(booleanQuery4.getMinimumNumberShouldMatch(), 2);
+
+ // Test 5: SHOULD_75_percent - one OR two OR three OR four with
minimumShouldMatch=75%
+ Query result5 = parseQueryWithMinimumShouldMatch("one OR two OR three OR
four", "75%");
+ Assert.assertTrue(result5 instanceof BooleanQuery);
+ BooleanQuery booleanQuery5 = (BooleanQuery) result5;
+ Assert.assertEquals(booleanQuery5.clauses().size(), 4);
+ // 75% of 4 = 3 matches required
+ Assert.assertEquals(booleanQuery5.getMinimumNumberShouldMatch(), 3);
+
+ // Test 6: SHOULD_100_percent - one OR two OR three OR four with
minimumShouldMatch=100%
+ Query result6 = parseQueryWithMinimumShouldMatch("one OR two OR three OR
four", "100%");
+ Assert.assertTrue(result6 instanceof BooleanQuery);
+ BooleanQuery booleanQuery6 = (BooleanQuery) result6;
+ Assert.assertEquals(booleanQuery6.clauses().size(), 4);
+ // 100% of 4 = 4 matches required
+ Assert.assertEquals(booleanQuery6.getMinimumNumberShouldMatch(), 4);
+
+ // Test 7: SHOULD_25_percent - one OR two OR three OR four with
minimumShouldMatch=25%
+ Query result7 = parseQueryWithMinimumShouldMatch("one OR two OR three OR
four", "25%");
+ Assert.assertTrue(result7 instanceof BooleanQuery);
+ BooleanQuery booleanQuery7 = (BooleanQuery) result7;
+ Assert.assertEquals(booleanQuery7.clauses().size(), 4);
+ // 25% of 4 = 1 match required
+ Assert.assertEquals(booleanQuery7.getMinimumNumberShouldMatch(), 1);
+
+ // Test 8: single_term_query - OpenSearch with minimumShouldMatch=1
+ Query result8 = parseQueryWithMinimumShouldMatch("OpenSearch", "1");
+ Assert.assertTrue(result8 instanceof BooleanQuery);
+ BooleanQuery booleanQuery8 = (BooleanQuery) result8;
+ Assert.assertEquals(booleanQuery8.clauses().size(), 1);
+ Assert.assertEquals(booleanQuery8.getMinimumNumberShouldMatch(), 1);
+
+ // Test 9: SHOULD_negative_50_percent - one OR two OR three OR four with
minimumShouldMatch=-50%
+ Query result9 = parseQueryWithMinimumShouldMatch("one OR two OR three OR
four", "-50%");
+ Assert.assertTrue(result9 instanceof BooleanQuery);
+ BooleanQuery booleanQuery9 = (BooleanQuery) result9;
+ Assert.assertEquals(booleanQuery9.clauses().size(), 4);
+ // -50% means 50% must match, so 50% of 4 = 2 matches required
+ Assert.assertEquals(booleanQuery9.getMinimumNumberShouldMatch(), 2);
+
+ // Test 10: Deep nested query - OpenSearch AND ((one OR two) AND (three OR
four OR five)) with
+ // minimumShouldMatch=60%
+ Query result10 = parseQueryWithMinimumShouldMatch(
+ "OpenSearch AND ((one OR two) AND (three OR four OR five))", "60%");
+ Assert.assertTrue(result10 instanceof BooleanQuery);
+ BooleanQuery booleanQuery10 = (BooleanQuery) result10;
+ Assert.assertEquals(booleanQuery10.clauses().size(), 2);
+
+ // Get the nested BooleanQuery: ((one OR two) AND (three OR four OR five))
+ BooleanQuery nestedQuery10 = (BooleanQuery)
booleanQuery10.clauses().get(1).getQuery();
+ Assert.assertEquals(nestedQuery10.clauses().size(), 2);
+
+ // Get the first sub-nested BooleanQuery: (one OR two)
+ BooleanQuery subNested1 = (BooleanQuery)
nestedQuery10.clauses().get(0).getQuery();
+ Assert.assertEquals(subNested1.clauses().size(), 2);
+ // 60% of 2 = 1.2, rounded down to 1
+ Assert.assertEquals(subNested1.getMinimumNumberShouldMatch(), 1);
+
+ // Get the second sub-nested BooleanQuery: (three OR four OR five)
+ BooleanQuery subNested2 = (BooleanQuery)
nestedQuery10.clauses().get(1).getQuery();
+ Assert.assertEquals(subNested2.clauses().size(), 3);
+ // 60% of 3 = 1.8, rounded down to 1
+ Assert.assertEquals(subNested2.getMinimumNumberShouldMatch(), 1);
+ }
+
+ @Test
+ public void testNegativeCases()
+ throws ParseException {
+ // Case 1: Invalid percentage value (> 100%)
+ try {
+ parseQueryWithMinimumShouldMatch("java OR python", "101%");
+ Assert.fail("Should throw IllegalArgumentException for invalid
percentage");
+ } catch (IllegalArgumentException e) {
+ // Expected
+ } catch (ParseException e) {
+ Assert.fail("Should throw IllegalArgumentException, not ParseException");
+ }
+
+ // Case 2: Invalid negative percentage value (< -100%)
+ try {
+ parseQueryWithMinimumShouldMatch("java OR python", "-101%");
+ Assert.fail("Should throw IllegalArgumentException for invalid negative
percentage");
+ } catch (IllegalArgumentException e) {
+ // Expected
+ } catch (ParseException e) {
+ Assert.fail("Should throw IllegalArgumentException, not ParseException");
+ }
+
+ // Case 3: Invalid format (not integer or percentage)
+ try {
+ parseQueryWithMinimumShouldMatch("java OR python", "abc");
+ Assert.fail("Should throw IllegalArgumentException for invalid format");
+ } catch (IllegalArgumentException e) {
+ // Expected
+ } catch (ParseException e) {
+ Assert.fail("Should throw IllegalArgumentException, not ParseException");
+ }
+
+ // Case 4: Invalid decimal percentage
+ try {
+ parseQueryWithMinimumShouldMatch("java OR python", "50.5%");
+ Assert.fail("Should throw IllegalArgumentException for invalid decimal
percentage");
+ } catch (IllegalArgumentException e) {
+ // Expected
+ } catch (ParseException e) {
+ Assert.fail("Should throw IllegalArgumentException, not ParseException");
+ }
+
+ // Case 5: Null query
+ try {
+ parseQueryWithMinimumShouldMatch(null, null);
+ Assert.fail("Should throw ParseException for null query");
+ } catch (ParseException e) {
+ // Expected
+ }
+
+ // Case 6: Empty query
+ try {
+ parseQueryWithMinimumShouldMatch("", null);
+ Assert.fail("Should throw ParseException for empty query");
+ } catch (ParseException e) {
+ // Expected
+ }
+
+ // Case 7: Whitespace-only query
+ try {
+ parseQueryWithMinimumShouldMatch(" ", null);
+ Assert.fail("Should throw ParseException for whitespace-only query");
+ } catch (ParseException e) {
+ // Expected
+ }
+ // Case 8: Non-Boolean queries (phrase, trailing wildcard) parse without error
+ parseQueryWithMinimumShouldMatch("\"java programming\"", null);
+ parseQueryWithMinimumShouldMatch("java*", null);
+ }
+
+ @Test
+ public void testEdgeCases()
+ throws ParseException {
+ // Case 1: minimum_should_match value greater than number of should
clauses (boundary)
+ Query result1 = parseQueryWithMinimumShouldMatch("java OR python", "5");
+ Assert.assertTrue(result1 instanceof BooleanQuery);
+ BooleanQuery booleanQuery1 = (BooleanQuery) result1;
+ Assert.assertEquals(booleanQuery1.clauses().size(), 2);
+ // Should cap at the number of available clauses (2)
+ Assert.assertEquals(booleanQuery1.getMinimumNumberShouldMatch(), 2);
+
+ // Case 2: negative minimum_should_match that would result in negative
value (boundary)
+ Query result2 = parseQueryWithMinimumShouldMatch("java OR python", "-5");
+ Assert.assertTrue(result2 instanceof BooleanQuery);
+ BooleanQuery booleanQuery2 = (BooleanQuery) result2;
+ Assert.assertEquals(booleanQuery2.clauses().size(), 2);
+ // Should not go below 0 (2 - 5 = -3, but capped at 0)
+ Assert.assertEquals(booleanQuery2.getMinimumNumberShouldMatch(), 0);
+
+ // Case 3: negative percentage that would result in zero (boundary)
+ Query result3 = parseQueryWithMinimumShouldMatch("java OR python", "-80%");
+ Assert.assertTrue(result3 instanceof BooleanQuery);
+ BooleanQuery booleanQuery3 = (BooleanQuery) result3;
+ Assert.assertEquals(booleanQuery3.clauses().size(), 2);
+ // -80% means 20% must match, but 20% of 2 = 0.4, rounded down to 0
+ Assert.assertEquals(booleanQuery3.getMinimumNumberShouldMatch(), 0);
+
+ // Case 4: minimum_should_match value equal to number of should clauses
(boundary)
+ Query result4 = parseQueryWithMinimumShouldMatch("java OR python OR
scala", "3");
+ Assert.assertTrue(result4 instanceof BooleanQuery);
+ BooleanQuery booleanQuery4 = (BooleanQuery) result4;
+ Assert.assertEquals(booleanQuery4.clauses().size(), 3);
+ // Should require all 3 clauses to match
+ Assert.assertEquals(booleanQuery4.getMinimumNumberShouldMatch(), 3);
+
+ // Case 5: percentage that results in decimal value (boundary rounding)
+ Query result5 = parseQueryWithMinimumShouldMatch("java OR python OR scala
OR kotlin", "75%");
+ Assert.assertTrue(result5 instanceof BooleanQuery);
+ BooleanQuery booleanQuery5 = (BooleanQuery) result5;
+ Assert.assertEquals(booleanQuery5.clauses().size(), 4);
+ // 75% of 4 = 3.0, should round down to 3
+ Assert.assertEquals(booleanQuery5.getMinimumNumberShouldMatch(), 3);
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]