abhishekrb19 opened a new issue, #17951:
URL: https://github.com/apache/druid/issues/17951
I have the following SQL query involving unnest and two CTEs that doesn't
yield correct results. The query appears to be generating an incorrect native
query plan:
```sql
WITH cte1 AS (
SELECT
c1,
TIME_FLOOR(__time, 'PT5M') AS floored_time,
STRLEN(c1) AS c1_len
FROM unnest_ds
GROUP BY 1, 2
),
cte2 AS (
SELECT
c1,
ARRAY_AGG(ARRAY[floored_time, c1_len]) AS pair
FROM cte1
GROUP BY 1
)
SELECT
tr.c1,
tv
FROM cte2 AS tr
CROSS JOIN UNNEST(pair) AS tv
```
The above SQL query results in a native plan:
<details>
<summary>Native JSON query</summary>
```json
{
"queryType": "scan",
"dataSource": {
"type": "unnest",
"base": {
"type": "query",
"query": {
"queryType": "groupBy",
"dataSource": {
"type": "query",
"query": {
"queryType": "groupBy",
"dataSource": {
"type": "table",
"name": "unnest_ds"
},
"intervals": {
"type": "intervals",
"intervals": [
"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
]
},
"virtualColumns": [
{
"type": "expression",
"name": "v0",
"expression":
"timestamp_floor(\"__time\",'PT5M',null,'UTC')",
"outputType": "LONG"
}
],
"granularity": {
"type": "all"
},
"dimensions": [
{
"type": "default",
"dimension": "c1",
"outputName": "d0",
"outputType": "STRING"
},
{
"type": "default",
"dimension": "v0",
"outputName": "d1",
"outputType": "LONG"
}
],
"limitSpec": {
"type": "NoopLimitSpec"
},
"context": {
"queryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
"sqlOuterLimit": 1001,
"sqlQueryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
"sqlStringifyArrays": false,
"timestampResultField": "d1",
"timestampResultFieldGranularity": "\"FIVE_MINUTE\"",
"timestampResultFieldInOriginalDimensions": 1,
"useNativeQueryExplain": true
}
}
},
"intervals": {
"type": "intervals",
"intervals": [
"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
]
},
"virtualColumns": [
{
"type": "expression",
"name": "v0",
"expression": "array(\"d1\",strlen(\"d0\"))",
"outputType": "ARRAY<LONG>"
}
],
"granularity": {
"type": "all"
},
"dimensions": [
{
"type": "default",
"dimension": "d0",
"outputName": "_d0",
"outputType": "STRING"
}
],
"aggregations": [
{
"type": "expression",
"name": "a0",
"fields": [
"v0"
],
"accumulatorIdentifier": "__acc",
"initialValue": "ARRAY<ARRAY<LONG>>[]",
"initialCombineValue": "ARRAY<ARRAY<LONG>>[]",
"isNullUnlessAggregated": true,
"shouldAggregateNullInputs": true,
"shouldCombineAggregateNullInputs": false,
"fold": "array_append(\"__acc\", \"v0\")",
"combine": "array_concat(\"__acc\", \"a0\")",
"maxSizeBytes": 1024
}
],
"limitSpec": {
"type": "NoopLimitSpec"
},
"context": {
"queryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
"sqlOuterLimit": 1001,
"sqlQueryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
"sqlStringifyArrays": false,
"useNativeQueryExplain": true
}
}
},
"virtualColumn": {
"type": "expression",
"name": "j0.unnest",
"expression": "timestamp_floor(\"__time\",'PT5M',null,'UTC')",
"outputType": "LONG"
},
"unnestFilter": null
},
"intervals": {
"type": "intervals",
"intervals": [
"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
]
},
"resultFormat": "compactedList",
"limit": 1001,
"columns": [
"_d0",
"j0.unnest"
],
"context": {
"queryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
"sqlOuterLimit": 1001,
"sqlQueryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
"sqlStringifyArrays": false,
"useNativeQueryExplain": true
},
"columnTypes": [
"STRING",
"ARRAY<LONG>"
],
"granularity": {
"type": "all"
},
"legacy": false
}
```
</details>
The problematic part of the plan is:
```
"virtualColumn": {
"type": "expression",
"name": "j0.unnest",
"expression": "timestamp_floor(\"__time\",'PT5M',null,'UTC')",
"outputType": "LONG"
},
```
The unnest column is using the `timestamp_floor` expression from the first
CTE instead of the `a0` expression from the second CTE whose output type is
`"ARRAY<ARRAY<LONG>>"`.
Updating that in the plan and running it yields correct results.
<details>
<summary>Updated plan with the correct unnest expression</summary>
```json
{
"queryType": "scan",
"dataSource": {
"type": "unnest",
"base": {
"type": "query",
"query": {
"queryType": "groupBy",
"dataSource": {
"type": "query",
"query": {
"queryType": "groupBy",
"dataSource": {
"type": "table",
"name": "unnest_ds"
},
"intervals": {
"type": "intervals",
"intervals": [
"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
]
},
"virtualColumns": [
{
"type": "expression",
"name": "v0",
"expression":
"timestamp_floor(\"__time\",'PT5M',null,'UTC')",
"outputType": "LONG"
}
],
"granularity": {
"type": "all"
},
"dimensions": [
{
"type": "default",
"dimension": "c1",
"outputName": "d0",
"outputType": "STRING"
},
{
"type": "default",
"dimension": "v0",
"outputName": "d1",
"outputType": "LONG"
}
],
"limitSpec": {
"type": "NoopLimitSpec"
},
"context": {
"queryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
"sqlOuterLimit": 1001,
"sqlQueryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
"sqlStringifyArrays": false,
"timestampResultField": "d1",
"timestampResultFieldGranularity": "\"FIVE_MINUTE\"",
"timestampResultFieldInOriginalDimensions": 1,
"useNativeQueryExplain": true
}
}
},
"intervals": {
"type": "intervals",
"intervals": [
"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
]
},
"virtualColumns": [
{
"type": "expression",
"name": "v0",
"expression": "array(\"d1\",strlen(\"d0\"))",
"outputType": "ARRAY<LONG>"
}
],
"granularity": {
"type": "all"
},
"dimensions": [
{
"type": "default",
"dimension": "d0",
"outputName": "_d0",
"outputType": "STRING"
}
],
"aggregations": [
{
"type": "expression",
"name": "a0",
"fields": [
"v0"
],
"accumulatorIdentifier": "__acc",
"initialValue": "ARRAY<ARRAY<LONG>>[]",
"initialCombineValue": "ARRAY<ARRAY<LONG>>[]",
"isNullUnlessAggregated": true,
"shouldAggregateNullInputs": true,
"shouldCombineAggregateNullInputs": false,
"fold": "array_append(\"__acc\", \"v0\")",
"combine": "array_concat(\"__acc\", \"a0\")",
"maxSizeBytes": 1024
}
],
"limitSpec": {
"type": "NoopLimitSpec"
},
"context": {
"queryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
"sqlOuterLimit": 1001,
"sqlQueryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
"sqlStringifyArrays": false,
"useNativeQueryExplain": true
}
}
},
"virtualColumn": {
"type": "expression",
"name": "j0.unnest",
"expression": "a0",
"outputType": "ARRAY<ARRAY<LONG>>"
},
"unnestFilter": null
},
"intervals": {
"type": "intervals",
"intervals": [
"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
]
},
"resultFormat": "compactedList",
"limit": 1001,
"columns": [
"_d0",
"j0.unnest"
],
"context": {
"queryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
"sqlOuterLimit": 1001,
"sqlQueryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
"sqlStringifyArrays": false,
"useNativeQueryExplain": true
},
"columnTypes": [
"STRING",
"ARRAY<LONG>"
],
"granularity": {
"type": "all"
},
"legacy": false
}
```
</details>
```diff
< "expression": "timestamp_floor(\"__time\",'PT5M',null,'UTC')",
< "outputType": "LONG"
---
> "expression": "a0",
> "outputType": "ARRAY<ARRAY<LONG>>"
```
### Affected Version
32.0.0 & 2c46787d8c399b96f88dcf3a14c68b6753b12464 (tip of master)
### Description
The issue appears to be related to how virtual columns are generated and
handled during the native query planning stage. Specifically, when a `GROUP BY`
clause is present in the first CTE, the planner incorrectly references virtual
columns from the first native query in the unnest phase, instead of those
generated by the second native query. Removing the `GROUP BY` from the first
CTE and modifying the structure of the second CTE resolves the issue, but this
approach loses the original grouping context.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]