drewrip opened a new issue, #23138:
URL: https://github.com/apache/datafusion/issues/23138

   ### Describe the bug
   
   This is a follow to #22961 where I described bugs in the `Unparser` that is 
creating invalid SQL. In general, I'm still seeing a lot of errors in generated 
SQL due to column references not being appropriately qualified. This usually is 
showing up where we have an unnamed subquery.
   
   ### To Reproduce
   
   This is a simple reproducer that uses DuckDB because DataFusion seems to 
accept the SQL. It targets datafusion `main` so that we can @goutamadwant 's 
changes from #23002 .
   
   The Rust project:
   
   ```toml
   datafusion = { git = "https://github.com/apache/datafusion.git"; }
   duckdb = { version = "1.10503.1", features = ["bundled"] }
   tokio = { version = "1", features = ["rt-multi-thread", "macros"] }
   ```
   
   ```rust
   use std::sync::Arc;
   
   use datafusion::arrow::datatypes::{DataType, Field, Schema};
   use datafusion::catalog::{
       CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, 
SchemaProvider,
   };
   use datafusion::datasource::empty::EmptyTable;
   use datafusion::prelude::*;
   use datafusion::sql::unparser::Unparser;
   use datafusion::sql::unparser::dialect::DuckDBDialect;
   use duckdb::Connection;
   
   const QUERY: &str = r#"
   SELECT * FROM
   (
   SELECT
           order_id
       FROM
           "warehouse"."main"."order_items"
   ) oi
   JOIN (
       SELECT
           order_id,
           coalesce(discount_pct, 0) AS discount_pct_2
       FROM
           "warehouse"."main"."orders"
   ) o USING (order_id)
   "#;
   
   #[tokio::main]
   async fn main() -> Result<(), Box<dyn std::error::Error>> {
       let order_items_schema = Arc::new(Schema::new(vec![Field::new(
           "order_id",
           DataType::Int32,
           true,
       )]));
   
       let orders_schema = Arc::new(Schema::new(vec![
           Field::new("order_id", DataType::Int32, false),
           Field::new("customer_id", DataType::Int32, true),
           Field::new("discount_pct", DataType::Decimal128(5, 2), true),
       ]));
   
       let ctx = SessionContext::new();
   
       let schema_provider = Arc::new(MemorySchemaProvider::new());
       schema_provider.register_table(
           "order_items".to_string(),
           Arc::new(EmptyTable::new(order_items_schema)),
       )?;
       schema_provider.register_table(
           "orders".to_string(),
           Arc::new(EmptyTable::new(orders_schema)),
       )?;
   
       let catalog = Arc::new(MemoryCatalogProvider::new());
       catalog.register_schema("main", schema_provider)?;
       ctx.register_catalog("warehouse", catalog);
   
       let dialect = DuckDBDialect::new();
       let unparser = Unparser::new(&dialect);
       let conn = Connection::open("warehouse.duckdb")?;
   
       {
           let unopt_plan = ctx.sql(QUERY).await?.into_unoptimized_plan();
           let sql = unparser.plan_to_sql(&unopt_plan)?;
           let mut stmt = conn.prepare(&sql.to_string())?;
           match stmt.query([]) {
               Ok(_) => {
                   println!("original query is valid");
               }
               Err(e) => {
                   println!("original query is not valid: {e}");
               }
           }
       }
   
       let plan = ctx.sql(QUERY).await?.into_optimized_plan()?;
       let sql = unparser.plan_to_sql(&plan)?;
       match conn.execute(&sql.to_string(), []) {
           Ok(_) => {
               println!("success");
           }
           Err(e) => {
               println!("failed: {e}");
               println!("Optimized sql =\n{sql}");
               println!("Optimized plan =\n{}", plan.display_indent());
           }
       }
   
       Ok(())
   }
   ```
   
   This yields the following error:
   ```
   original query is valid
   failed: Binder Error: Referenced table "o" not found!
   Candidate tables: "oi"
   
   LINE 1: ... "warehouse"."main"."order_items" AS "oi" INNER JOIN (SELECT 
"o"."order_id", CASE WHEN "__common_expr_1" IS NOT NULL...
                                                                           ^
   Optimized sql =
   SELECT "o"."order_id", "o"."discount_pct_2" FROM 
"warehouse"."main"."order_items" AS "oi" INNER JOIN (SELECT "o"."order_id", 
CASE WHEN "__common_expr_1" IS NOT NULL THEN "__common_expr_1" ELSE 0.00 END AS 
"discount_pct_2" FROM (SELECT CAST("o"."discount_pct" AS DECIMAL(22,2)) AS 
"__common_expr_1", "o"."order_id" FROM "warehouse"."main"."orders" AS "o")) AS 
"o" ON "oi"."order_id" = "o"."order_id"
   Optimized plan =
   Projection: o.order_id, o.discount_pct_2
     Inner Join: oi.order_id = o.order_id
       SubqueryAlias: oi
         TableScan: warehouse.main.order_items projection=[order_id]
       SubqueryAlias: o
         Projection: warehouse.main.orders.order_id, CASE WHEN __common_expr_1 
IS NOT NULL THEN __common_expr_1 ELSE Decimal128(0.00,22,2) END AS 
discount_pct_2
           Projection: CAST(warehouse.main.orders.discount_pct AS 
Decimal128(22, 2)) AS __common_expr_1, warehouse.main.orders.order_id
             TableScan: warehouse.main.orders projection=[order_id, 
discount_pct]
   ```
   
   ### Expected behavior
   
   The `Unparser` should generate valid SQL. Specifically, we shouldn't be 
referencing "o" outside of the unnamed subquery.
   
   ```
   SELECT
       "o"."order_id",
       "o"."discount_pct_2"
   FROM
       "warehouse"."main"."order_items" AS "oi"
       INNER JOIN (
           SELECT
               "o"."order_id",                   <------ "o" doesn't exist here
               CASE
                   WHEN "__common_expr_1" IS NOT NULL THEN "__common_expr_1"
                   ELSE 0.00
               END AS "discount_pct_2"
           FROM
               (
                   SELECT
                       CAST("o"."discount_pct" AS DECIMAL(22, 2)) AS 
"__common_expr_1",
                       "o"."order_id"
                   FROM
                       "warehouse"."main"."orders" AS "o"
               )
       ) AS "o" ON "oi"."order_id" = "o"."order_id"
   ```
   
   ### Additional context
   
   The `LogicalPlan` that triggers this invalid SQL in this case is a direct 
result of the CSE pass. However, it does not seem to be doing anything wrong, 
we just can't accurately unparse the resulting `LogicalPlan`.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to