mgrenonville opened a new issue, #20315:
URL: https://github.com/apache/datafusion/issues/20315

   ### Describe the bug
   
   When a SQL query contains multiple correlated scalar subqueries in the 
SELECT clause, the `scalar_subquery_to_join` optimizer rule fails with:
   ```
   Schema error: Ambiguous reference to unqualified field __always_true
   ```
   
   ### To Reproduce
   
   ```rust
   // Reproduction case for DataFusion bug: Ambiguous reference to __always_true
   // with multiple correlated scalar subqueries
   
   use datafusion::arrow::array::{Float64Array, StringArray};
   use datafusion::arrow::datatypes::{DataType, Field, Schema};
   use datafusion::arrow::record_batch::RecordBatch;
   use datafusion::datasource::MemTable;
   use datafusion::prelude::*;
   use std::sync::Arc;
   
   #[tokio::main]
   async fn main() -> datafusion::error::Result<()> {
       println!("DataFusion Multiple Correlated Scalar Subqueries Bug 
Reproduction\n");
   
       // Create a simple table with region and duration columns
       let schema = Arc::new(Schema::new(vec![
           Field::new("region", DataType::Utf8, false),
           Field::new("duration", DataType::Float64, false),
       ]));
   
       let region = StringArray::from(vec!["US", "EU", "US", "EU", "APAC"]);
       let duration = Float64Array::from(vec![10.0, 20.0, 15.0, 25.0, 30.0]);
   
       let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(region), 
Arc::new(duration)])?;
   
       // Create session context
       let config = SessionConfig::new().with_target_partitions(1);
       let ctx = SessionContext::new_with_config(config);
   
       // Register the table
       let table = MemTable::try_new(schema, vec![vec![batch]])?;
       ctx.register_table("activity", Arc::new(table))?;
   
       println!("=== Test 1: Single correlated scalar subquery (WORKS) ===");
       let sql_single = "
           SELECT
               region,
               SUM(duration) as total_duration,
               (SELECT COUNT(*) FROM activity WHERE region = a.region) as count
           FROM activity a
           GROUP BY region
       ";
   
       match ctx.sql(sql_single).await {
           Ok(df) => match df.collect().await {
               Ok(results) => {
                   println!("Single subquery works!");
                   println!(
                       "Results: {} rows\n",
                       results.iter().map(|b| b.num_rows()).sum::<usize>()
                   );
               }
               Err(e) => println!("Execution error: {}\n", e),
           },
           Err(e) => println!("Planning error: {}\n", e),
       }
   
       println!("=== Test 2: Multiple correlated scalar subqueries (FAILS) 
===");
       let sql_multiple = "
           SELECT
               region,
               SUM(duration) as total_duration,
               (SELECT COUNT(*) FROM activity WHERE region = a.region) as count,
               (SELECT MAX(duration) FROM activity WHERE region = a.region) as 
max_duration
           FROM activity a
           GROUP BY region
       ";
   
       match ctx.sql(sql_multiple).await {
           Ok(df) => match df.collect().await {
               Ok(results) => {
                   println!("Multiple subqueries work!");
                   println!(
                       "Results: {} rows\n",
                       results.iter().map(|b| b.num_rows()).sum::<usize>()
                   );
               }
               Err(e) => {
                   println!("Execution error:");
                   println!("{}\n", e);
               }
           },
           Err(e) => {
               println!("❌ Planning error: {}\n", e);
           }
       }
   
       println!("=== Test 3: Multiple subqueries with different tables (for 
comparison) ===");
       // Create a second table
       let schema2 = Arc::new(Schema::new(vec![
           Field::new("region", DataType::Utf8, false),
           Field::new("revenue", DataType::Float64, false),
       ]));
   
       let region2 = StringArray::from(vec!["US", "EU", "APAC"]);
       let revenue = Float64Array::from(vec![100.0, 200.0, 150.0]);
   
       let batch2 = RecordBatch::try_new(schema2.clone(), 
vec![Arc::new(region2), Arc::new(revenue)])?;
   
       let table2 = MemTable::try_new(schema2, vec![vec![batch2]])?;
       ctx.register_table("sales", Arc::new(table2))?;
   
       let sql_different_tables = "
           SELECT
               region,
               SUM(duration) as total_duration,
               (SELECT COUNT(*) FROM activity WHERE region = a.region) as 
activity_count,
               (SELECT MAX(revenue) FROM sales WHERE region = a.region) as 
max_revenue
           FROM activity a
           GROUP BY region
       ";
   
       match ctx.sql(sql_different_tables).await {
           Ok(df) => match df.collect().await {
               Ok(results) => {
                   println!("Multiple subqueries on different tables work!");
                   println!(
                       "Results: {} rows\n",
                       results.iter().map(|b| b.num_rows()).sum::<usize>()
                   );
               }
               Err(e) => println!("Execution error: {}\n", e),
           },
           Err(e) => println!("Planning error: {}\n", e),
       }
   
       Ok(())
   }
   
   ```
   This outputs errors like : 
   ```
   Optimizer rule 'scalar_subquery_to_join' failed
   caused by
   Schema error: Ambiguous reference to unqualified field __always_true
   ```
   
   
   ### Expected behavior
   
   The query should execute successfully, returning aggregated results with the 
scalar subquery values.
   It's working with single correlated subquery.
   
   ### Additional context
   
   _No response_


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to