This is an automated email from the ASF dual-hosted git repository.
zhenchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/calcite.git
The following commit(s) were added to refs/heads/main by this push:
new 43e7fbf0ee [CALCITE-5597] SELECT DISTINCT query with ORDER BY column
will get error result
43e7fbf0ee is described below
commit 43e7fbf0eec99955408c91ef064d18a827ac2caf
Author: Zhen Chen <[email protected]>
AuthorDate: Wed Jan 14 09:02:27 2026 +0800
[CALCITE-5597] SELECT DISTINCT query with ORDER BY column will get error
result
---
.../apache/calcite/sql2rel/SqlToRelConverter.java | 192 +++++++++++++++------
.../apache/calcite/test/SqlToRelConverterTest.java | 8 +
.../apache/calcite/test/SqlToRelConverterTest.xml | 15 ++
3 files changed, 165 insertions(+), 50 deletions(-)
diff --git
a/core/src/main/java/org/apache/calcite/sql2rel/SqlToRelConverter.java
b/core/src/main/java/org/apache/calcite/sql2rel/SqlToRelConverter.java
index 1ad87eaed6..8fc83104c1 100644
--- a/core/src/main/java/org/apache/calcite/sql2rel/SqlToRelConverter.java
+++ b/core/src/main/java/org/apache/calcite/sql2rel/SqlToRelConverter.java
@@ -842,8 +842,14 @@ protected void convertSelectImpl(
}
/**
- * Having translated 'SELECT ... FROM ... [GROUP BY ...] [HAVING ...]', adds
- * a relational expression to make the results unique.
+ * The translation of 'SELECT ... FROM ... [GROUP BY ...] [HAVING ...]' uses
+ * an {@link org.apache.calcite.rel.core.Aggregate}.
+ *
+ * <p>For example, {@code SELECT DISTINCT x FROM t ORDER BY y} is converted
+ * to an {@code Aggregate} on {@code (x, y)} if {@code y} is deterministic.
+ * If {@code y} is non-deterministic (e.g. {@code RAND()}), it is converted
+ * to an {@code Aggregate} on {@code x}, and {@code y} is applied over the
+ * result.
*
* <p>If the SELECT clause contains duplicate expressions, adds
* {@link org.apache.calcite.rel.logical.LogicalProject}s so that we are
@@ -864,69 +870,155 @@ private void distinctify(
throw new IllegalArgumentException("rel must not be null");
}
final RelNode rel = bb.root;
+ int groupCount = rel.getRowType().getFieldCount();
+ if (bb.scope != null && bb.scope.getNode() instanceof SqlSelect) {
+ groupCount =
validator().getValidatedNodeType(bb.scope.getNode()).getFieldCount();
+ }
+ distinctify(bb, checkForDupExprs, groupCount);
+ }
+
+ /**
+ * The translation of 'SELECT ... FROM ... [GROUP BY ...] [HAVING ...]' uses
+ * an {@link org.apache.calcite.rel.core.Aggregate}.
+ *
+ * <p>For example, {@code SELECT DISTINCT x FROM t ORDER BY y} is converted
+ * to an {@code Aggregate} on {@code (x, y)} if {@code y} is deterministic.
+ * If {@code y} is non-deterministic (e.g. {@code RAND()}), it is converted
+ * to an {@code Aggregate} on {@code x}, and {@code y} is applied over the
+ * result.
+ *
+ * <p>If the SELECT clause contains duplicate expressions, adds
+ * {@link org.apache.calcite.rel.logical.LogicalProject}s so that we are
+ * grouping on the minimal set of keys. The performance gain isn't huge, but
+ * it is difficult to detect these duplicate expressions later.
+ *
+ * @param bb Blackboard
+ * @param checkForDupExprs Check for duplicate expressions
+ * @param groupCount Number of fields in the SELECT clause
+ */
+ private void distinctify(
+ Blackboard bb,
+ boolean checkForDupExprs,
+ int groupCount) {
+ if (bb.root == null) {
+ throw new IllegalArgumentException("rel must not be null");
+ }
+ RelNode rel = bb.root;
+
+ // 1. Handle duplicate expressions in the Project if requested.
if (checkForDupExprs && (rel instanceof LogicalProject)) {
- LogicalProject project = (LogicalProject) rel;
+ final LogicalProject project = (LogicalProject) rel;
final List<RexNode> projectExprs = project.getProjects();
final List<Integer> origins = new ArrayList<>();
- int dupCount = 0;
+ final Map<RexNode, Integer> seen = new HashMap<>();
for (int i = 0; i < projectExprs.size(); i++) {
- int x = projectExprs.indexOf(projectExprs.get(i));
- if (x >= 0 && x < i) {
- origins.add(x);
- ++dupCount;
- } else {
- origins.add(i);
+ Integer first = seen.putIfAbsent(projectExprs.get(i), i);
+ origins.add(first != null ? first : i);
+ }
+
+ if (seen.size() < projectExprs.size()) {
+ final List<RelDataTypeField> fields = rel.getRowType().getFieldList();
+ final PairList<RexNode, String> newProjects = PairList.of();
+ final List<Integer> mapping = new ArrayList<>();
+ for (int i = 0; i < fields.size(); i++) {
+ if (origins.get(i) == i) {
+ mapping.add(newProjects.size());
+ newProjects.add(projectExprs.get(i), fields.get(i).getName());
+ } else {
+ mapping.add(-1);
+ }
}
- }
- if (dupCount == 0) {
- distinctify(bb, false);
+ bb.setRoot(
+ LogicalProject.create(project.getInput(), project.getHints(),
+ newProjects.leftList(), newProjects.rightList(),
+ project.getVariablesSet()), false);
+
+ int newGroupCount = 0;
+ for (int i = 0; i < groupCount; i++) {
+ if (origins.get(i) == i) {
+ newGroupCount++;
+ }
+ }
+ distinctify(bb, false, newGroupCount);
+
+ final RelNode distinctRel = bb.root();
+ final PairList<RexNode, String> undoProjects = PairList.of();
+ for (int i = 0; i < fields.size(); i++) {
+ final int origin = origins.get(i);
+ final int newIdx = mapping.get(origin);
+ undoProjects.add(rexBuilder.makeInputRef(distinctRel, newIdx),
+ fields.get(i).getName());
+ }
+
+ bb.setRoot(
+ LogicalProject.create(distinctRel, ImmutableList.of(),
+ undoProjects.leftList(), undoProjects.rightList(),
+ ImmutableSet.of()),
+ false);
return;
}
+ }
- final Map<Integer, Integer> squished = new HashMap<>();
- final List<RelDataTypeField> fields = rel.getRowType().getFieldList();
- final PairList<RexNode, String> newProjects = PairList.of();
- for (int i = 0; i < fields.size(); i++) {
- if (origins.get(i) == i) {
- squished.put(i, newProjects.size());
- RexInputRef.add2(newProjects, i, fields);
- }
- }
- bb.root =
- LogicalProject.create(rel, ImmutableList.of(),
- newProjects.leftList(), newProjects.rightList(),
- project.getVariablesSet());
- distinctify(bb, false);
- final RelNode rel3 = bb.root();
-
- // Create the expressions to reverse the mapping.
- // Project($0, $1, $0, $2).
- final PairList<RexNode, String> undoProjects = PairList.of();
- for (int i = 0; i < fields.size(); i++) {
- final int origin = origins.get(i);
- RelDataTypeField field = fields.get(i);
- undoProjects.add(
- new RexInputRef(castNonNull(squished.get(origin)),
- field.getType()),
- field.getName());
+ // 2. Determine group set and mapping for non-deterministic columns.
+ final int totalCount = rel.getRowType().getFieldCount();
+ final Project project = rel instanceof Project ? (Project) rel : null;
+ final ImmutableBitSet.Builder groupSetBuilder = ImmutableBitSet.builder();
+
+ for (int i = 0; i < totalCount; i++) {
+ if (i < groupCount
+ || project == null
+ || RexUtil.isDeterministic(project.getProjects().get(i))) {
+ groupSetBuilder.set(i);
}
+ }
+ final ImmutableBitSet groupSet = groupSetBuilder.build();
+ if (groupSet.cardinality() == totalCount) {
bb.setRoot(
- LogicalProject.create(rel3, ImmutableList.of(),
- undoProjects.leftList(), undoProjects.rightList(),
- ImmutableSet.of()),
- false);
-
+ createAggregate(bb, groupSet, ImmutableList.of(groupSet),
+ ImmutableList.of()), false);
return;
}
- // Usual case: all expressions in the SELECT clause are different.
- final ImmutableBitSet groupSet =
- ImmutableBitSet.range(rel.getRowType().getFieldCount());
+ // 3. Handle non-deterministic ORDER BY columns using the mapping.
+ final List<RexNode> bottomExprs = new ArrayList<>();
+ final List<String> bottomNames = new ArrayList<>();
+ for (int i : groupSet) {
+ bottomExprs.add(castNonNull(project).getProjects().get(i));
+ bottomNames.add(rel.getRowType().getFieldNames().get(i));
+ }
+
bb.setRoot(
- createAggregate(bb, groupSet, ImmutableList.of(groupSet),
- ImmutableList.of()),
- false);
+ LogicalProject.create(castNonNull(project).getInput(),
project.getHints(),
+ bottomExprs, bottomNames, project.getVariablesSet()), false);
+
+ final ImmutableBitSet aggGroupSet =
ImmutableBitSet.range(groupSet.cardinality());
+ bb.setRoot(
+ createAggregate(bb, aggGroupSet, ImmutableList.of(aggGroupSet),
+ ImmutableList.of()), false);
+
+ final RelNode aggregate = bb.root();
+ final RexShuttle shuttle = new RexShuttle() {
+ @Override public RexNode visitInputRef(RexInputRef ref) {
+ int idx = groupSet.indexOf(ref.getIndex());
+ return idx >= 0
+ ? rexBuilder.makeInputRef(aggregate, idx)
+ : super.visitInputRef(ref);
+ }
+ };
+
+ final List<RexNode> topExprs = new ArrayList<>();
+ for (int i = 0; i < totalCount; i++) {
+ int idx = groupSet.indexOf(i);
+ if (idx >= 0) {
+ topExprs.add(rexBuilder.makeInputRef(aggregate, idx));
+ } else {
+
topExprs.add(castNonNull(project).getProjects().get(i).accept(shuttle));
+ }
+ }
+ bb.setRoot(
+ LogicalProject.create(aggregate, ImmutableList.of(), topExprs,
+ rel.getRowType().getFieldNames(), ImmutableSet.of()), false);
}
/**
diff --git
a/core/src/test/java/org/apache/calcite/test/SqlToRelConverterTest.java
b/core/src/test/java/org/apache/calcite/test/SqlToRelConverterTest.java
index 8a485421cf..28173513d8 100644
--- a/core/src/test/java/org/apache/calcite/test/SqlToRelConverterTest.java
+++ b/core/src/test/java/org/apache/calcite/test/SqlToRelConverterTest.java
@@ -6001,4 +6001,12 @@ void checkUserDefinedOrderByOver(NullCollation
nullCollation) {
+ "and t1.ename in (select t3.ename from emp t3 )";
sql(sql).ok();
}
+
+ /** Test case of
+ * <a
href="https://issues.apache.org/jira/browse/CALCITE-5597">[CALCITE-5597]
+ * SELECT DISTINCT query with ORDER BY column will get error result</a>. */
+ @Test void testDistinctOrderByRand() {
+ final String sql = "select distinct deptno, deptno, empno, 1, 'a' from emp
order by rand(), 1";
+ sql(sql).ok();
+ }
}
diff --git
a/core/src/test/resources/org/apache/calcite/test/SqlToRelConverterTest.xml
b/core/src/test/resources/org/apache/calcite/test/SqlToRelConverterTest.xml
index 57dc035f9c..bd829189cf 100644
--- a/core/src/test/resources/org/apache/calcite/test/SqlToRelConverterTest.xml
+++ b/core/src/test/resources/org/apache/calcite/test/SqlToRelConverterTest.xml
@@ -1887,6 +1887,21 @@ LogicalTableModify(table=[[CATALOG, SALES, EMP]],
operation=[DELETE], flattened=
LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],
SAL=[$5], COMM=[$6], DEPTNO=[$7], SLACKER=[$8])
LogicalFilter(condition=[=($7, 10)])
LogicalTableScan(table=[[CATALOG, SALES, EMP]])
+]]>
+ </Resource>
+ </TestCase>
+ <TestCase name="testDistinctOrderByRand">
+ <Resource name="sql">
+ <![CDATA[select distinct deptno, deptno, empno, 1, 'a' from emp order by
rand(), 1]]>
+ </Resource>
+ <Resource name="plan">
+ <![CDATA[
+LogicalProject(DEPTNO=[$0], DEPTNO1=[$1], EMPNO=[$2], EXPR$2=[$3], EXPR$3=[$4])
+ LogicalSort(sort0=[$5], sort1=[$0], dir0=[ASC], dir1=[ASC])
+ LogicalProject(DEPTNO=[$0], DEPTNO0=[$0], EMPNO=[$1], EXPR$3=[$2],
EXPR$4=[$3], EXPR$5=[RAND()])
+ LogicalAggregate(group=[{0, 1, 2, 3}])
+ LogicalProject(DEPTNO=[$7], EMPNO=[$0], EXPR$3=[1], EXPR$4=['a'])
+ LogicalTableScan(table=[[CATALOG, SALES, EMP]])
]]>
</Resource>
</TestCase>