Copilot commented on code in PR #17247:
URL: https://github.com/apache/pinot/pull/17247#discussion_r2605037608
##########
pinot-core/src/main/java/org/apache/pinot/core/operator/query/DistinctOperator.java:
##########
@@ -40,31 +43,81 @@
*/
public class DistinctOperator extends BaseOperator<DistinctResultsBlock> {
private static final String EXPLAIN_NAME = "DISTINCT";
+ private static final int UNLIMITED_ROWS = Integer.MAX_VALUE;
private final IndexSegment _indexSegment;
private final QueryContext _queryContext;
private final BaseProjectOperator<?> _projectOperator;
private int _numDocsScanned = 0;
+ private final int _maxRowsInDistinct;
+ private final int _numRowsWithoutChangeInDistinct;
+ private int _numRowsWithoutNewDistinct = 0;
+ private boolean _hitMaxRowsLimit = false;
+ private boolean _hitNoChangeLimit = false;
public DistinctOperator(IndexSegment indexSegment, QueryContext queryContext,
BaseProjectOperator<?> projectOperator) {
_indexSegment = indexSegment;
_queryContext = queryContext;
_projectOperator = projectOperator;
+ Map<String, String> queryOptions = queryContext.getQueryOptions();
+ if (queryOptions != null) {
+ Integer maxRowsInDistinct =
QueryOptionsUtils.getMaxRowsInDistinct(queryOptions);
+ _maxRowsInDistinct = maxRowsInDistinct != null ? maxRowsInDistinct :
UNLIMITED_ROWS;
+ Integer numRowsWithoutChange =
QueryOptionsUtils.getNumRowsWithoutChangeInDistinct(queryOptions);
+ _numRowsWithoutChangeInDistinct =
+ numRowsWithoutChange != null ? numRowsWithoutChange : UNLIMITED_ROWS;
+ } else {
+ _maxRowsInDistinct = UNLIMITED_ROWS;
+ _numRowsWithoutChangeInDistinct = UNLIMITED_ROWS;
+ }
}
@Override
protected DistinctResultsBlock getNextBlock() {
DistinctExecutor executor =
DistinctExecutorFactory.getDistinctExecutor(_projectOperator, _queryContext);
+ executor.setMaxRowsToProcess(_maxRowsInDistinct);
ValueBlock valueBlock;
+ boolean enforceRowLimit = _maxRowsInDistinct != UNLIMITED_ROWS;
+ boolean enforceNoChangeLimit = _numRowsWithoutChangeInDistinct !=
UNLIMITED_ROWS;
while ((valueBlock = _projectOperator.nextBlock()) != null) {
- _numDocsScanned += valueBlock.getNumDocs();
- if (executor.process(valueBlock)) {
+ if (enforceRowLimit && executor.getRemainingRowsToProcess() <= 0) {
+ _hitMaxRowsLimit = true;
+ break;
+ }
+ int rowsRemainingBefore = executor.getRemainingRowsToProcess();
+ int distinctCountBeforeBlock = enforceNoChangeLimit ?
executor.getNumDistinctRowsCollected() : -1;
+ boolean satisfied = executor.process(valueBlock);
+ int rowsRemainingAfter = executor.getRemainingRowsToProcess();
+ int docsProcessedForLimit =
+ enforceRowLimit ? Math.max(0, rowsRemainingBefore -
rowsRemainingAfter) : valueBlock.getNumDocs();
Review Comment:
The calculation of `docsProcessedForLimit` uses `valueBlock.getNumDocs()`
when row limits are not enforced, but when limits are active, it computes the
difference in remaining rows. This creates inconsistent semantics: with limits,
it counts rows consumed by the executor (which may be less than the block
size), while without limits, it counts the full block. Consider unifying the
logic to always derive processed rows from the executor's state or documenting
why this asymmetry is intentional.
```suggestion
int docsProcessedForLimit = Math.max(0, rowsRemainingBefore -
rowsRemainingAfter);
```
##########
pinot-core/src/main/java/org/apache/pinot/core/query/distinct/DistinctExecutorFactory.java:
##########
@@ -55,7 +55,18 @@ private DistinctExecutorFactory() {
public static DistinctExecutor getDistinctExecutor(BaseProjectOperator<?>
projectOperator,
QueryContext queryContext) {
List<ExpressionContext> expressions = queryContext.getSelectExpressions();
- int limit = queryContext.getLimit();
+ int queryLimit = queryContext.getLimit();
+ // The distinct table capacity is independent from the early-termination
row budgets enforced by DistinctOperator.
+ // Keep a dedicated variable so it is clear we are only constraining how
many unique rows we can store.
+ int tableCapacity = queryLimit;
+ if (queryContext.getQueryOptions() != null) {
+ Integer maxRowsInDistinct =
+
org.apache.pinot.common.utils.config.QueryOptionsUtils.getMaxRowsInDistinct(queryContext.getQueryOptions());
Review Comment:
The fully qualified class name
`org.apache.pinot.common.utils.config.QueryOptionsUtils` should be imported at
the top of the file instead of being used inline. This improves readability and
follows standard Java conventions.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]