(calcite) branch main updated: [CALCITE-5597] SELECT DISTINCT query with ORDER BY column will get error result

zhenchen Wed, 14 Jan 2026 06:28:02 -0800

This is an automated email from the ASF dual-hosted git repository.

zhenchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/calcite.git



The following commit(s) were added to refs/heads/main by this push:
     new 43e7fbf0ee [CALCITE-5597] SELECT DISTINCT query with ORDER BY column 
will get error result
43e7fbf0ee is described below

commit 43e7fbf0eec99955408c91ef064d18a827ac2caf
Author: Zhen Chen <[email protected]>
AuthorDate: Wed Jan 14 09:02:27 2026 +0800

    [CALCITE-5597] SELECT DISTINCT query with ORDER BY column will get error 
result
---
 .../apache/calcite/sql2rel/SqlToRelConverter.java  | 192 +++++++++++++++------
 .../apache/calcite/test/SqlToRelConverterTest.java |   8 +
 .../apache/calcite/test/SqlToRelConverterTest.xml  |  15 ++
 3 files changed, 165 insertions(+), 50 deletions(-)

diff --git 
a/core/src/main/java/org/apache/calcite/sql2rel/SqlToRelConverter.java 
b/core/src/main/java/org/apache/calcite/sql2rel/SqlToRelConverter.java
index 1ad87eaed6..8fc83104c1 100644
--- a/core/src/main/java/org/apache/calcite/sql2rel/SqlToRelConverter.java
+++ b/core/src/main/java/org/apache/calcite/sql2rel/SqlToRelConverter.java
@@ -842,8 +842,14 @@ protected void convertSelectImpl(
   }
 
   /**
-   * Having translated 'SELECT ... FROM ... [GROUP BY ...] [HAVING ...]', adds
-   * a relational expression to make the results unique.
+   * The translation of 'SELECT ... FROM ... [GROUP BY ...] [HAVING ...]' uses
+   * an {@link org.apache.calcite.rel.core.Aggregate}.
+   *
+   * <p>For example, {@code SELECT DISTINCT x FROM t ORDER BY y} is converted
+   * to an {@code Aggregate} on {@code (x, y)} if {@code y} is deterministic.
+   * If {@code y} is non-deterministic (e.g. {@code RAND()}), it is converted
+   * to an {@code Aggregate} on {@code x}, and {@code y} is applied over the
+   * result.
    *
    * <p>If the SELECT clause contains duplicate expressions, adds
    * {@link org.apache.calcite.rel.logical.LogicalProject}s so that we are
@@ -864,69 +870,155 @@ private void distinctify(
       throw new IllegalArgumentException("rel must not be null");
     }
     final RelNode rel = bb.root;
+    int groupCount = rel.getRowType().getFieldCount();
+    if (bb.scope != null && bb.scope.getNode() instanceof SqlSelect) {
+      groupCount = 
validator().getValidatedNodeType(bb.scope.getNode()).getFieldCount();
+    }
+    distinctify(bb, checkForDupExprs, groupCount);
+  }
+
+  /**
+   * The translation of 'SELECT ... FROM ... [GROUP BY ...] [HAVING ...]' uses
+   * an {@link org.apache.calcite.rel.core.Aggregate}.
+   *
+   * <p>For example, {@code SELECT DISTINCT x FROM t ORDER BY y} is converted
+   * to an {@code Aggregate} on {@code (x, y)} if {@code y} is deterministic.
+   * If {@code y} is non-deterministic (e.g. {@code RAND()}), it is converted
+   * to an {@code Aggregate} on {@code x}, and {@code y} is applied over the
+   * result.
+   *
+   * <p>If the SELECT clause contains duplicate expressions, adds
+   * {@link org.apache.calcite.rel.logical.LogicalProject}s so that we are
+   * grouping on the minimal set of keys. The performance gain isn't huge, but
+   * it is difficult to detect these duplicate expressions later.
+   *
+   * @param bb               Blackboard
+   * @param checkForDupExprs Check for duplicate expressions
+   * @param groupCount       Number of fields in the SELECT clause
+   */
+  private void distinctify(
+      Blackboard bb,
+      boolean checkForDupExprs,
+      int groupCount) {
+    if (bb.root == null) {
+      throw new IllegalArgumentException("rel must not be null");
+    }
+    RelNode rel = bb.root;
+
+    // 1. Handle duplicate expressions in the Project if requested.
     if (checkForDupExprs && (rel instanceof LogicalProject)) {
-      LogicalProject project = (LogicalProject) rel;
+      final LogicalProject project = (LogicalProject) rel;
       final List<RexNode> projectExprs = project.getProjects();
       final List<Integer> origins = new ArrayList<>();
-      int dupCount = 0;
+      final Map<RexNode, Integer> seen = new HashMap<>();
       for (int i = 0; i < projectExprs.size(); i++) {
-        int x = projectExprs.indexOf(projectExprs.get(i));
-        if (x >= 0 && x < i) {
-          origins.add(x);
-          ++dupCount;
-        } else {
-          origins.add(i);
+        Integer first = seen.putIfAbsent(projectExprs.get(i), i);
+        origins.add(first != null ? first : i);
+      }
+
+      if (seen.size() < projectExprs.size()) {
+        final List<RelDataTypeField> fields = rel.getRowType().getFieldList();
+        final PairList<RexNode, String> newProjects = PairList.of();
+        final List<Integer> mapping = new ArrayList<>();
+        for (int i = 0; i < fields.size(); i++) {
+          if (origins.get(i) == i) {
+            mapping.add(newProjects.size());
+            newProjects.add(projectExprs.get(i), fields.get(i).getName());
+          } else {
+            mapping.add(-1);
+          }
         }
-      }
-      if (dupCount == 0) {
-        distinctify(bb, false);
+        bb.setRoot(
+            LogicalProject.create(project.getInput(), project.getHints(),
+                newProjects.leftList(), newProjects.rightList(),
+                project.getVariablesSet()), false);
+
+        int newGroupCount = 0;
+        for (int i = 0; i < groupCount; i++) {
+          if (origins.get(i) == i) {
+            newGroupCount++;
+          }
+        }
+        distinctify(bb, false, newGroupCount);
+
+        final RelNode distinctRel = bb.root();
+        final PairList<RexNode, String> undoProjects = PairList.of();
+        for (int i = 0; i < fields.size(); i++) {
+          final int origin = origins.get(i);
+          final int newIdx = mapping.get(origin);
+          undoProjects.add(rexBuilder.makeInputRef(distinctRel, newIdx),
+              fields.get(i).getName());
+        }
+
+        bb.setRoot(
+            LogicalProject.create(distinctRel, ImmutableList.of(),
+                undoProjects.leftList(), undoProjects.rightList(),
+                ImmutableSet.of()),
+            false);
         return;
       }
+    }
 
-      final Map<Integer, Integer> squished = new HashMap<>();
-      final List<RelDataTypeField> fields = rel.getRowType().getFieldList();
-      final PairList<RexNode, String> newProjects = PairList.of();
-      for (int i = 0; i < fields.size(); i++) {
-        if (origins.get(i) == i) {
-          squished.put(i, newProjects.size());
-          RexInputRef.add2(newProjects, i, fields);
-        }
-      }
-      bb.root =
-          LogicalProject.create(rel, ImmutableList.of(),
-              newProjects.leftList(), newProjects.rightList(),
-              project.getVariablesSet());
-      distinctify(bb, false);
-      final RelNode rel3 = bb.root();
-
-      // Create the expressions to reverse the mapping.
-      // Project($0, $1, $0, $2).
-      final PairList<RexNode, String> undoProjects = PairList.of();
-      for (int i = 0; i < fields.size(); i++) {
-        final int origin = origins.get(i);
-        RelDataTypeField field = fields.get(i);
-        undoProjects.add(
-            new RexInputRef(castNonNull(squished.get(origin)),
-                field.getType()),
-            field.getName());
+    // 2. Determine group set and mapping for non-deterministic columns.
+    final int totalCount = rel.getRowType().getFieldCount();
+    final Project project = rel instanceof Project ? (Project) rel : null;
+    final ImmutableBitSet.Builder groupSetBuilder = ImmutableBitSet.builder();
+
+    for (int i = 0; i < totalCount; i++) {
+      if (i < groupCount
+          || project == null
+          || RexUtil.isDeterministic(project.getProjects().get(i))) {
+        groupSetBuilder.set(i);
       }
+    }
 
+    final ImmutableBitSet groupSet = groupSetBuilder.build();
+    if (groupSet.cardinality() == totalCount) {
       bb.setRoot(
-          LogicalProject.create(rel3, ImmutableList.of(),
-              undoProjects.leftList(), undoProjects.rightList(),
-              ImmutableSet.of()),
-          false);
-
+          createAggregate(bb, groupSet, ImmutableList.of(groupSet),
+              ImmutableList.of()), false);
       return;
     }
 
-    // Usual case: all expressions in the SELECT clause are different.
-    final ImmutableBitSet groupSet =
-        ImmutableBitSet.range(rel.getRowType().getFieldCount());
+    // 3. Handle non-deterministic ORDER BY columns using the mapping.
+    final List<RexNode> bottomExprs = new ArrayList<>();
+    final List<String> bottomNames = new ArrayList<>();
+    for (int i : groupSet) {
+      bottomExprs.add(castNonNull(project).getProjects().get(i));
+      bottomNames.add(rel.getRowType().getFieldNames().get(i));
+    }
+
     bb.setRoot(
-        createAggregate(bb, groupSet, ImmutableList.of(groupSet),
-            ImmutableList.of()),
-        false);
+        LogicalProject.create(castNonNull(project).getInput(), 
project.getHints(),
+            bottomExprs, bottomNames, project.getVariablesSet()), false);
+
+    final ImmutableBitSet aggGroupSet = 
ImmutableBitSet.range(groupSet.cardinality());
+    bb.setRoot(
+        createAggregate(bb, aggGroupSet, ImmutableList.of(aggGroupSet),
+            ImmutableList.of()), false);
+
+    final RelNode aggregate = bb.root();
+    final RexShuttle shuttle = new RexShuttle() {
+      @Override public RexNode visitInputRef(RexInputRef ref) {
+        int idx = groupSet.indexOf(ref.getIndex());
+        return idx >= 0
+            ? rexBuilder.makeInputRef(aggregate, idx)
+            : super.visitInputRef(ref);
+      }
+    };
+
+    final List<RexNode> topExprs = new ArrayList<>();
+    for (int i = 0; i < totalCount; i++) {
+      int idx = groupSet.indexOf(i);
+      if (idx >= 0) {
+        topExprs.add(rexBuilder.makeInputRef(aggregate, idx));
+      } else {
+        
topExprs.add(castNonNull(project).getProjects().get(i).accept(shuttle));
+      }
+    }
+    bb.setRoot(
+        LogicalProject.create(aggregate, ImmutableList.of(), topExprs,
+            rel.getRowType().getFieldNames(), ImmutableSet.of()), false);
   }
 
   /**
diff --git 
a/core/src/test/java/org/apache/calcite/test/SqlToRelConverterTest.java 
b/core/src/test/java/org/apache/calcite/test/SqlToRelConverterTest.java
index 8a485421cf..28173513d8 100644
--- a/core/src/test/java/org/apache/calcite/test/SqlToRelConverterTest.java
+++ b/core/src/test/java/org/apache/calcite/test/SqlToRelConverterTest.java
@@ -6001,4 +6001,12 @@ void checkUserDefinedOrderByOver(NullCollation 
nullCollation) {
         + "and t1.ename in (select t3.ename from emp t3 )";
     sql(sql).ok();
   }
+
+  /** Test case of
+   * <a 
href="https://issues.apache.org/jira/browse/CALCITE-5597";>[CALCITE-5597]
+   * SELECT DISTINCT query with ORDER BY column will get error result</a>. */
+  @Test void testDistinctOrderByRand() {
+    final String sql = "select distinct deptno, deptno, empno, 1, 'a' from emp 
order by rand(), 1";
+    sql(sql).ok();
+  }
 }
diff --git 
a/core/src/test/resources/org/apache/calcite/test/SqlToRelConverterTest.xml 
b/core/src/test/resources/org/apache/calcite/test/SqlToRelConverterTest.xml
index 57dc035f9c..bd829189cf 100644
--- a/core/src/test/resources/org/apache/calcite/test/SqlToRelConverterTest.xml
+++ b/core/src/test/resources/org/apache/calcite/test/SqlToRelConverterTest.xml
@@ -1887,6 +1887,21 @@ LogicalTableModify(table=[[CATALOG, SALES, EMP]], 
operation=[DELETE], flattened=
   LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], 
SAL=[$5], COMM=[$6], DEPTNO=[$7], SLACKER=[$8])
     LogicalFilter(condition=[=($7, 10)])
       LogicalTableScan(table=[[CATALOG, SALES, EMP]])
+]]>
+    </Resource>
+  </TestCase>
+  <TestCase name="testDistinctOrderByRand">
+    <Resource name="sql">
+      <![CDATA[select distinct deptno, deptno, empno, 1, 'a' from emp order by 
rand(), 1]]>
+    </Resource>
+    <Resource name="plan">
+      <![CDATA[
+LogicalProject(DEPTNO=[$0], DEPTNO1=[$1], EMPNO=[$2], EXPR$2=[$3], EXPR$3=[$4])
+  LogicalSort(sort0=[$5], sort1=[$0], dir0=[ASC], dir1=[ASC])
+    LogicalProject(DEPTNO=[$0], DEPTNO0=[$0], EMPNO=[$1], EXPR$3=[$2], 
EXPR$4=[$3], EXPR$5=[RAND()])
+      LogicalAggregate(group=[{0, 1, 2, 3}])
+        LogicalProject(DEPTNO=[$7], EMPNO=[$0], EXPR$3=[1], EXPR$4=['a'])
+          LogicalTableScan(table=[[CATALOG, SALES, EMP]])
 ]]>
     </Resource>
   </TestCase>

(calcite) branch main updated: [CALCITE-5597] SELECT DISTINCT query with ORDER BY column will get error result

Reply via email to