mgrenonville opened a new issue, #20315:
URL: https://github.com/apache/datafusion/issues/20315
### Describe the bug
When a SQL query contains multiple correlated scalar subqueries in the
SELECT clause, the `scalar_subquery_to_join` optimizer rule fails with:
```
Schema error: Ambiguous reference to unqualified field __always_true
```
### To Reproduce
```rust
// Reproduction case for DataFusion bug: Ambiguous reference to __always_true
// with multiple correlated scalar subqueries
use datafusion::arrow::array::{Float64Array, StringArray};
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::datasource::MemTable;
use datafusion::prelude::*;
use std::sync::Arc;
#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
println!("DataFusion Multiple Correlated Scalar Subqueries Bug
Reproduction\n");
// Create a simple table with region and duration columns
let schema = Arc::new(Schema::new(vec![
Field::new("region", DataType::Utf8, false),
Field::new("duration", DataType::Float64, false),
]));
let region = StringArray::from(vec!["US", "EU", "US", "EU", "APAC"]);
let duration = Float64Array::from(vec![10.0, 20.0, 15.0, 25.0, 30.0]);
let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(region),
Arc::new(duration)])?;
// Create session context
let config = SessionConfig::new().with_target_partitions(1);
let ctx = SessionContext::new_with_config(config);
// Register the table
let table = MemTable::try_new(schema, vec![vec![batch]])?;
ctx.register_table("activity", Arc::new(table))?;
println!("=== Test 1: Single correlated scalar subquery (WORKS) ===");
let sql_single = "
SELECT
region,
SUM(duration) as total_duration,
(SELECT COUNT(*) FROM activity WHERE region = a.region) as count
FROM activity a
GROUP BY region
";
match ctx.sql(sql_single).await {
Ok(df) => match df.collect().await {
Ok(results) => {
println!("Single subquery works!");
println!(
"Results: {} rows\n",
results.iter().map(|b| b.num_rows()).sum::<usize>()
);
}
Err(e) => println!("Execution error: {}\n", e),
},
Err(e) => println!("Planning error: {}\n", e),
}
println!("=== Test 2: Multiple correlated scalar subqueries (FAILS)
===");
let sql_multiple = "
SELECT
region,
SUM(duration) as total_duration,
(SELECT COUNT(*) FROM activity WHERE region = a.region) as count,
(SELECT MAX(duration) FROM activity WHERE region = a.region) as
max_duration
FROM activity a
GROUP BY region
";
match ctx.sql(sql_multiple).await {
Ok(df) => match df.collect().await {
Ok(results) => {
println!("Multiple subqueries work!");
println!(
"Results: {} rows\n",
results.iter().map(|b| b.num_rows()).sum::<usize>()
);
}
Err(e) => {
println!("Execution error:");
println!("{}\n", e);
}
},
Err(e) => {
println!("❌ Planning error: {}\n", e);
}
}
println!("=== Test 3: Multiple subqueries with different tables (for
comparison) ===");
// Create a second table
let schema2 = Arc::new(Schema::new(vec![
Field::new("region", DataType::Utf8, false),
Field::new("revenue", DataType::Float64, false),
]));
let region2 = StringArray::from(vec!["US", "EU", "APAC"]);
let revenue = Float64Array::from(vec![100.0, 200.0, 150.0]);
let batch2 = RecordBatch::try_new(schema2.clone(),
vec![Arc::new(region2), Arc::new(revenue)])?;
let table2 = MemTable::try_new(schema2, vec![vec![batch2]])?;
ctx.register_table("sales", Arc::new(table2))?;
let sql_different_tables = "
SELECT
region,
SUM(duration) as total_duration,
(SELECT COUNT(*) FROM activity WHERE region = a.region) as
activity_count,
(SELECT MAX(revenue) FROM sales WHERE region = a.region) as
max_revenue
FROM activity a
GROUP BY region
";
match ctx.sql(sql_different_tables).await {
Ok(df) => match df.collect().await {
Ok(results) => {
println!("Multiple subqueries on different tables work!");
println!(
"Results: {} rows\n",
results.iter().map(|b| b.num_rows()).sum::<usize>()
);
}
Err(e) => println!("Execution error: {}\n", e),
},
Err(e) => println!("Planning error: {}\n", e),
}
Ok(())
}
```
This outputs errors like :
```
Optimizer rule 'scalar_subquery_to_join' failed
caused by
Schema error: Ambiguous reference to unqualified field __always_true
```
### Expected behavior
The query should execute successfully, returning aggregated results with the
scalar subquery values.
It's working with single correlated subquery.
### Additional context
_No response_
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]