xiangfu0 commented on code in PR #18588:
URL: https://github.com/apache/pinot/pull/18588#discussion_r3316442417
##########
pinot-core/src/main/java/org/apache/pinot/core/operator/query/JsonIndexDistinctOperator.java:
##########
@@ -50,516 +47,396 @@
import org.apache.pinot.core.query.distinct.table.StringDistinctTable;
import org.apache.pinot.core.query.request.context.QueryContext;
import org.apache.pinot.segment.spi.IndexSegment;
-import org.apache.pinot.segment.spi.SegmentContext;
import org.apache.pinot.segment.spi.datasource.DataSource;
import org.apache.pinot.segment.spi.index.IndexService;
import org.apache.pinot.segment.spi.index.IndexType;
import org.apache.pinot.segment.spi.index.reader.JsonIndexReader;
-import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.data.FieldSpec.DataType;
import org.apache.pinot.spi.query.QueryThreadContext;
-import org.apache.pinot.sql.parsers.CalciteSqlParser;
+import org.apache.pinot.spi.utils.JsonUtils;
import org.roaringbitmap.RoaringBitmap;
import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
-/**
- * Distinct operator for the scalar {@code jsonExtractIndex(column, path, type[, defaultValue])} form.
- *
- * <p>Execution flow:
- * 1. Push a same-path {@code JSON_MATCH} predicate into the JSON-index lookup when it cannot match missing paths.
- * 2. Convert matching flattened doc ids back to segment doc ids.
- * 3. Apply any remaining row-level filter and materialize DISTINCT results, including missing-path handling.
- */
+/// Distinct operator for `jsonExtractIndex(column, path, type[, defaultValue[, filterJsonExpression]])`.
+///
+/// Supports both SV (e.g. `STRING`) and MV (e.g. `STRING_ARRAY`) result types — DISTINCT collapses MV array elements
+/// to scalar rows, matching the scan-based `SELECT DISTINCT mvCol` convention. The 4-arg default is a single value
+/// for SV; for MV it's a JSON array whose elements are each added to the distinct set when no doc matches the path.
+///
+/// Execution flow:
+/// 1. Pass the optional 5-arg `filterJsonExpression` directly to the JSON-index lookup (matches
+/// `JsonExtractIndexTransformFunction`'s convention).
+/// 2. Convert matching flattened doc ids back to segment doc ids.
+/// 3. Apply any remaining row-level WHERE filter and materialize DISTINCT results, including missing-path handling.
public class JsonIndexDistinctOperator extends
BaseOperator<DistinctResultsBlock> {
private static final String EXPLAIN_NAME = "DISTINCT_JSON_INDEX";
private static final String FUNCTION_NAME = "jsonExtractIndex";
+ /// Returns true if the expression is a `jsonExtractIndex` function call.
All other validation (argument count/types,
+ /// column existence, JSON index presence, path support) happens inside the
operator's constructor and matches what
+ /// the scan-based fallback (`JsonExtractIndexTransformFunction`) would
surface during its own `init`.
+ public static boolean canUseJsonIndexDistinct(ExpressionContext expr) {
+ return expr.getType() == ExpressionContext.Type.FUNCTION &&
FUNCTION_NAME.equalsIgnoreCase(
+ expr.getFunction().getFunctionName());
+ }
+
private final IndexSegment _indexSegment;
- private final SegmentContext _segmentContext;
+ private final int _totalDocs;
private final QueryContext _queryContext;
private final BaseFilterOperator _filterOperator;
+ private final ExpressionContext _expression;
+ private final boolean _skipMissingPath;
+ private final JsonIndexReader _jsonIndexReader;
+ private final String _jsonPathString;
+ private final DataType _dataType;
+ @Nullable
+ private final String[] _defaultValueLiterals;
+ @Nullable
+ private final String _filterJsonExpression;
+ private final DataSchema _dataSchema;
+ @Nullable
+ private final OrderByExpressionContext _orderByExpression;
- private int _numEntriesExamined = 0;
+ private int _numDocsScanned = 0;
private long _numEntriesScannedInFilter = 0;
+ private int _numEntriesExaminedPostFilter = 0;
- public JsonIndexDistinctOperator(IndexSegment indexSegment, SegmentContext
segmentContext,
- QueryContext queryContext, BaseFilterOperator filterOperator) {
+ public JsonIndexDistinctOperator(IndexSegment indexSegment, QueryContext
queryContext,
+ BaseFilterOperator filterOperator) {
_indexSegment = indexSegment;
- _segmentContext = segmentContext;
+ _totalDocs = indexSegment.getSegmentMetadata().getTotalDocs();
_queryContext = queryContext;
_filterOperator = filterOperator;
- }
-
- @Override
- protected DistinctResultsBlock getNextBlock() {
- List<ExpressionContext> expressions = _queryContext.getSelectExpressions();
+ List<ExpressionContext> expressions = queryContext.getSelectExpressions();
if (expressions.size() != 1) {
throw new IllegalStateException("JsonIndexDistinctOperator supports
single expression only");
}
+ _expression = expressions.get(0);
+ _skipMissingPath =
QueryOptionsUtils.isJsonIndexDistinctSkipMissingPath(queryContext.getQueryOptions());
- ExpressionContext expr = expressions.get(0);
- ParsedJsonExtractIndex parsed = parseJsonExtractIndex(expr);
- if (parsed == null) {
- throw new IllegalStateException("Expected 3/4-arg scalar
jsonExtractIndex expression");
- }
+ // Mirrors the arguments handling logic in
`JsonExtractIndexTransformFunction`
- DataSource dataSource = _indexSegment.getDataSource(parsed._columnName,
_queryContext.getSchema());
- JsonIndexReader jsonIndexReader = getJsonIndexReader(dataSource);
- if (jsonIndexReader == null) {
- throw new IllegalStateException("Column " + parsed._columnName + " has
no JSON index");
+ List<ExpressionContext> arguments =
_expression.getFunction().getArguments();
+ int numArguments = arguments.size();
+ // Check that there are exactly 3 or 4 or 5 arguments
+ if (numArguments < 3 || numArguments > 5) {
+ throw new IllegalArgumentException(
+ "Expected 3/4/5 arguments for jsonExtractIndex(jsonFieldName,
'jsonPath', 'resultsType',"
+ + " ['defaultValue'], ['jsonFilterExpression'])");
}
- String pushedDownFilterJson = extractSamePathJsonMatchFilter(parsed,
_queryContext.getFilter());
Review Comment:
Would removing this short circuit cause a performance regression?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]