This is an automated email from the ASF dual-hosted git repository.
jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new a4193e6c270 Use 2 bitsets in ScanBasedRegexpLikePredicateEvaluator to
track DictIdToRegexMatcher for REGEX_LIKE predicate (#16922)
a4193e6c270 is described below
commit a4193e6c270b207fcf9310dc477c2fba2330d1c0
Author: Chaitanya Deepthi <[email protected]>
AuthorDate: Mon Oct 6 17:25:24 2025 -0700
Use 2 bitsets in ScanBasedRegexpLikePredicateEvaluator to track
DictIdToRegexMatcher for REGEX_LIKE predicate (#16922)
* Take in common constants
* Checkstyle fixes
* Remove unused variable
* Format the comment
* Checkstyle fix
* Remove the count in ScanBasedRegexpLikePredicateEvaluator
* Address review comments
* Add comment
* Review comment
* Change the datatype to BitSet
* Remove the dictionary based scan for Regex Like Expressions
* Remove the configs that are unused
* checkstyle changes
* checkstyle fixes
* fix test
* Add back dict based scanning in REGEX_LIKE expressions
* Revert back the test
* Change config key and minor cleanup
* Rename a variable
---------
Co-authored-by: Xiaotian (Jackie) Jiang <[email protected]>
---
.../requesthandler/BaseBrokerRequestHandler.java | 7 ++++
.../BaseSingleStageBrokerRequestHandler.java | 4 ++
.../common/utils/config/QueryOptionsUtils.java | 11 ++++++
.../predicate/PredicateEvaluatorProvider.java | 14 +++----
.../RegexpLikePredicateEvaluatorFactory.java | 45 ++++++++++++++++++----
.../apache/pinot/spi/utils/CommonConstants.java | 10 +++++
6 files changed, 76 insertions(+), 15 deletions(-)
diff --git
a/pinot-broker/src/main/java/org/apache/pinot/broker/requesthandler/BaseBrokerRequestHandler.java
b/pinot-broker/src/main/java/org/apache/pinot/broker/requesthandler/BaseBrokerRequestHandler.java
index 615d9c0383b..269cc395e67 100644
---
a/pinot-broker/src/main/java/org/apache/pinot/broker/requesthandler/BaseBrokerRequestHandler.java
+++
b/pinot-broker/src/main/java/org/apache/pinot/broker/requesthandler/BaseBrokerRequestHandler.java
@@ -90,6 +90,8 @@ public abstract class BaseBrokerRequestHandler implements
BrokerRequestHandler {
protected final QueryLogger _queryLogger;
@Nullable
protected final String _enableNullHandling;
+ @Nullable
+ protected final String _regexDictSizeThreshold;
protected final boolean _enableQueryCancellation;
/**
@@ -121,6 +123,7 @@ public abstract class BaseBrokerRequestHandler implements
BrokerRequestHandler {
Broker.DEFAULT_BROKER_ENABLE_ROW_COLUMN_LEVEL_AUTH);
_queryLogger = new QueryLogger(config);
_enableNullHandling =
config.getProperty(Broker.CONFIG_OF_BROKER_QUERY_ENABLE_NULL_HANDLING);
+ _regexDictSizeThreshold =
config.getProperty(Broker.CONFIG_OF_BROKER_QUERY_REGEX_DICT_SIZE_THRESHOLD);
_enableQueryCancellation =
config.getProperty(Broker.CONFIG_OF_BROKER_ENABLE_QUERY_CANCELLATION,
Broker.DEFAULT_BROKER_ENABLE_QUERY_CANCELLATION);
if (_enableQueryCancellation) {
@@ -203,6 +206,10 @@ public abstract class BaseBrokerRequestHandler implements
BrokerRequestHandler {
sqlNodeAndOptions.getOptions().putIfAbsent(QueryOptionKey.ENABLE_NULL_HANDLING,
_enableNullHandling);
}
+ if (_regexDictSizeThreshold != null) {
+
sqlNodeAndOptions.getOptions().putIfAbsent(QueryOptionKey.REGEX_DICT_SIZE_THRESHOLD,
_regexDictSizeThreshold);
+ }
+
BrokerResponse brokerResponse =
handleRequest(requestId, query, sqlNodeAndOptions, request,
requesterIdentity, requestContext, httpHeaders,
accessControl);
diff --git
a/pinot-broker/src/main/java/org/apache/pinot/broker/requesthandler/BaseSingleStageBrokerRequestHandler.java
b/pinot-broker/src/main/java/org/apache/pinot/broker/requesthandler/BaseSingleStageBrokerRequestHandler.java
index 36feabc993a..4cf79bd233d 100644
---
a/pinot-broker/src/main/java/org/apache/pinot/broker/requesthandler/BaseSingleStageBrokerRequestHandler.java
+++
b/pinot-broker/src/main/java/org/apache/pinot/broker/requesthandler/BaseSingleStageBrokerRequestHandler.java
@@ -1213,6 +1213,10 @@ public abstract class
BaseSingleStageBrokerRequestHandler extends BaseBrokerRequ
.putIfAbsent(Broker.Request.QueryOptionKey.ENABLE_NULL_HANDLING,
_enableNullHandling);
}
+ if (_regexDictSizeThreshold != null) {
+
sqlNodeAndOptions.getOptions().putIfAbsent(QueryOptionKey.REGEX_DICT_SIZE_THRESHOLD,
_regexDictSizeThreshold);
+ }
+
BrokerResponse response =
doHandleRequest(requestId, subquery, sqlNodeAndOptions, jsonRequest,
requesterIdentity, requestContext,
httpHeaders, accessControl);
diff --git
a/pinot-common/src/main/java/org/apache/pinot/common/utils/config/QueryOptionsUtils.java
b/pinot-common/src/main/java/org/apache/pinot/common/utils/config/QueryOptionsUtils.java
index f4bb0469ed5..4236e379617 100644
---
a/pinot-common/src/main/java/org/apache/pinot/common/utils/config/QueryOptionsUtils.java
+++
b/pinot-common/src/main/java/org/apache/pinot/common/utils/config/QueryOptionsUtils.java
@@ -585,4 +585,15 @@ public class QueryOptionsUtils {
}
return Boolean.parseBoolean(value);
}
+
+ /// When evaluating REGEXP_LIKE predicate on a dictionary encoded column:
+ /// - If dictionary size is smaller than this threshold, scan the dictionary
to get the matching dictionary ids
+ /// first, where inverted index can be applied if one exists
+ /// - Otherwise, read dictionary while scanning the forward index, cache the
matching/non-matching dictionary ids
+ /// during the scan
+ @Nullable
+ public static Integer getRegexDictSizeThreshold(Map<String, String>
queryOptions) {
+ String regexDictSizeThreshold =
queryOptions.get(QueryOptionKey.REGEX_DICT_SIZE_THRESHOLD);
+ return uncheckedParseInt(QueryOptionKey.REGEX_DICT_SIZE_THRESHOLD,
regexDictSizeThreshold);
+ }
}
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/PredicateEvaluatorProvider.java
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/PredicateEvaluatorProvider.java
index 6db11163282..033cb68b537 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/PredicateEvaluatorProvider.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/PredicateEvaluatorProvider.java
@@ -49,11 +49,11 @@ public class PredicateEvaluatorProvider {
// dictionary based predicate evaluators
switch (predicate.getType()) {
case EQ:
- return EqualsPredicateEvaluatorFactory
- .newDictionaryBasedEvaluator((EqPredicate) predicate,
dictionary, dataType);
+ return
EqualsPredicateEvaluatorFactory.newDictionaryBasedEvaluator((EqPredicate)
predicate, dictionary,
+ dataType);
case NOT_EQ:
- return NotEqualsPredicateEvaluatorFactory
- .newDictionaryBasedEvaluator((NotEqPredicate) predicate,
dictionary, dataType);
+ return
NotEqualsPredicateEvaluatorFactory.newDictionaryBasedEvaluator((NotEqPredicate)
predicate,
+ dictionary, dataType);
case IN:
return
InPredicateEvaluatorFactory.newDictionaryBasedEvaluator((InPredicate)
predicate, dictionary,
dataType, queryContext);
@@ -61,11 +61,11 @@ public class PredicateEvaluatorProvider {
return
NotInPredicateEvaluatorFactory.newDictionaryBasedEvaluator((NotInPredicate)
predicate, dictionary,
dataType, queryContext);
case RANGE:
- return RangePredicateEvaluatorFactory
- .newDictionaryBasedEvaluator((RangePredicate) predicate,
dictionary, dataType);
+ return
RangePredicateEvaluatorFactory.newDictionaryBasedEvaluator((RangePredicate)
predicate, dictionary,
+ dataType);
case REGEXP_LIKE:
return
RegexpLikePredicateEvaluatorFactory.newDictionaryBasedEvaluator((RegexpLikePredicate)
predicate,
- dictionary, dataType);
+ dictionary, dataType, queryContext);
default:
throw new UnsupportedOperationException("Unsupported predicate
type: " + predicate.getType());
}
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
index ac91cad9c52..a022f905cf7 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
@@ -23,10 +23,16 @@ import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import it.unimi.dsi.fastutil.ints.IntSet;
+import java.util.BitSet;
+import javax.annotation.Nullable;
import org.apache.pinot.common.request.context.predicate.RegexpLikePredicate;
+import org.apache.pinot.common.utils.config.QueryOptionsUtils;
import org.apache.pinot.common.utils.regex.Matcher;
+import org.apache.pinot.core.query.request.context.QueryContext;
import org.apache.pinot.segment.spi.index.reader.Dictionary;
import org.apache.pinot.spi.data.FieldSpec.DataType;
+import
org.apache.pinot.spi.utils.CommonConstants.Broker.Request.QueryOptionValue;
+
/**
* Factory for REGEXP_LIKE predicate evaluators.
@@ -35,21 +41,27 @@ public class RegexpLikePredicateEvaluatorFactory {
private RegexpLikePredicateEvaluatorFactory() {
}
- /// When the cardinality of the dictionary is less than this threshold, scan
the dictionary to get the matching ids.
- public static final int DICTIONARY_CARDINALITY_THRESHOLD_FOR_SCAN = 10000;
-
/**
* Create a new instance of dictionary based REGEXP_LIKE predicate evaluator.
*
* @param regexpLikePredicate REGEXP_LIKE predicate to evaluate
- * @param dictionary Dictionary for the column
- * @param dataType Data type for the column
+ * @param dictionary Dictionary for the column
+ * @param dataType Data type for the column
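+ * @param queryContext Query context whose query options may supply the dictionary size threshold; when it is
+ * null or yields no threshold, the default threshold is used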
* @return Dictionary based REGEXP_LIKE predicate evaluator
*/
public static BaseDictionaryBasedPredicateEvaluator
newDictionaryBasedEvaluator(
- RegexpLikePredicate regexpLikePredicate, Dictionary dictionary, DataType
dataType) {
+ RegexpLikePredicate regexpLikePredicate, Dictionary dictionary, DataType
dataType,
+ @Nullable QueryContext queryContext) {
Preconditions.checkArgument(dataType.getStoredType() == DataType.STRING,
"Unsupported data type: " + dataType);
- if (dictionary.length() < DICTIONARY_CARDINALITY_THRESHOLD_FOR_SCAN) {
+ Integer regexDictSizeThreshold = null;
+ if (queryContext != null) {
+ regexDictSizeThreshold =
QueryOptionsUtils.getRegexDictSizeThreshold(queryContext.getQueryOptions());
+ }
+ if (regexDictSizeThreshold == null) {
+ regexDictSizeThreshold =
QueryOptionValue.DEFAULT_REGEX_DICT_SIZE_THRESHOLD;
+ }
+ if (dictionary.length() < regexDictSizeThreshold) {
return new DictIdBasedRegexpLikePredicateEvaluator(regexpLikePredicate,
dictionary);
} else {
return new ScanBasedRegexpLikePredicateEvaluator(regexpLikePredicate,
dictionary);
@@ -122,14 +134,31 @@ public class RegexpLikePredicateEvaluatorFactory {
// within the scope of a single thread.
final Matcher _matcher;
+ // _evaluatedIds: tracks which dictionary IDs have been evaluated
+ // _matchingIds: tracks which dictionary IDs match the regex pattern
+ final BitSet _evaluatedIds;
+ final BitSet _matchingIds;
+
public ScanBasedRegexpLikePredicateEvaluator(RegexpLikePredicate
regexpLikePredicate, Dictionary dictionary) {
super(regexpLikePredicate, dictionary);
_matcher = regexpLikePredicate.getPattern().matcher("");
+ int dictionarySize = dictionary.length();
+ _evaluatedIds = new BitSet(dictionarySize);
+ _matchingIds = new BitSet(dictionarySize);
}
@Override
public boolean applySV(int dictId) {
- return _matcher.reset(_dictionary.getStringValue(dictId)).find();
+ // Check if already evaluated
+ if (_evaluatedIds.get(dictId)) {
+ return _matchingIds.get(dictId);
+ }
+ boolean match =
_matcher.reset(_dictionary.getStringValue(dictId)).find();
+ _evaluatedIds.set(dictId);
+ if (match) {
+ _matchingIds.set(dictId);
+ }
+ return match;
}
@Override
diff --git
a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java
b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java
index 9864ad80fb5..2b229abe89d 100644
--- a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java
@@ -350,6 +350,9 @@ public class CommonConstants {
"pinot.broker.query.log.logBeforeProcessing";
public static final boolean DEFAULT_BROKER_QUERY_LOG_BEFORE_PROCESSING =
true;
public static final String CONFIG_OF_BROKER_QUERY_ENABLE_NULL_HANDLING =
"pinot.broker.query.enable.null.handling";
+ /// Provide broker level default for query option
[Request.QueryOptionKey#REGEX_DICT_SIZE_THRESHOLD]
+ public static final String
CONFIG_OF_BROKER_QUERY_REGEX_DICT_SIZE_THRESHOLD =
+ "pinot.broker.query.regex.dict.size.threshold";
public static final String CONFIG_OF_BROKER_ENABLE_QUERY_CANCELLATION =
"pinot.broker.enable.query.cancellation";
public static final boolean DEFAULT_BROKER_ENABLE_QUERY_CANCELLATION =
true;
public static final double DEFAULT_BROKER_QUERY_LOG_MAX_RATE_PER_SECOND =
10_000d;
@@ -715,6 +718,12 @@ public class CommonConstants {
public static final String IN_PREDICATE_PRE_SORTED =
"inPredicatePreSorted";
public static final String IN_PREDICATE_LOOKUP_ALGORITHM =
"inPredicateLookupAlgorithm";
+ // When evaluating REGEXP_LIKE predicate on a dictionary encoded
column:
+ // - If dictionary size is smaller than this threshold, scan the
dictionary to get the matching dictionary ids
+ // first, where inverted index can be applied if one exists
+ // - Otherwise, read dictionary while scanning the forward index,
cache the matching/non-matching dictionary ids
+ // during the scan
+ public static final String REGEX_DICT_SIZE_THRESHOLD =
"regexDictSizeThreshold";
public static final String DROP_RESULTS = "dropResults";
@@ -816,6 +825,7 @@ public class CommonConstants {
public static class QueryOptionValue {
public static final int DEFAULT_MAX_STREAMING_PENDING_BLOCKS = 100;
+ public static final int DEFAULT_REGEX_DICT_SIZE_THRESHOLD = 10000;
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]