Jefffrey opened a new issue, #18746:
URL: https://github.com/apache/datafusion/issues/18746
### Describe the bug
Full reproduction:
```rust
use std::any::Any;
use arrow::datatypes::DataType;
use datafusion::common::types::{logical_binary, logical_string, NativeType};
use datafusion::common::ScalarValue;
use datafusion::error::Result;
use datafusion::logical_expr::{Coercion, TypeSignatureClass, Volatility};
use datafusion::logical_expr::{
ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature,
};
use datafusion::prelude::*;
/// Minimal scalar UDF used to reproduce the type-coercion problem.
#[derive(Debug, PartialEq, Eq, Hash)]
struct ExampleUdf {
    /// Argument signature returned from `ScalarUDFImpl::signature`.
    signature: Signature,
}
impl ExampleUdf {
    /// Creates the UDF with a one-argument coercible signature.
    ///
    /// The argument is declared as logical binary, with logical string listed
    /// as an implicitly coercible source and `NativeType::Binary` passed as
    /// the final `new_implicit` argument. The function is immutable.
    fn new() -> Self {
        let binary_arg = Coercion::new_implicit(
            TypeSignatureClass::Native(logical_binary()),
            vec![TypeSignatureClass::Native(logical_string())],
            NativeType::Binary,
        );
        let signature = Signature::coercible(vec![binary_arg], Volatility::Immutable);
        Self { signature }
    }
}
impl ScalarUDFImpl for ExampleUdf {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn name(&self) -> &str {
        "example_udf"
    }

    fn signature(&self) -> &Signature {
        &self.signature
    }

    /// Picks the string return type from the first argument's type:
    /// `Binary`/`BinaryView` give `Utf8`, `LargeBinary` gives `LargeUtf8`.
    ///
    /// # Panics
    ///
    /// Panics via `unreachable!()` for any other argument type, and on an
    /// empty `arg_types` slice because of the direct index.
    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
        let output_type = match &arg_types[0] {
            DataType::LargeBinary => DataType::LargeUtf8,
            DataType::Binary | DataType::BinaryView => DataType::Utf8,
            // Presumably the signature has already coerced the argument to a
            // binary type by the time this runs.
            _ => unreachable!(),
        };
        Ok(output_type)
    }

    /// Returns the constant scalar `"a"`, as `Utf8` or `LargeUtf8` depending
    /// on the first argument's data type (same mapping as `return_type`).
    ///
    /// # Panics
    ///
    /// Panics via `unreachable!()` when the first argument is not one of the
    /// three binary types handled below.
    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
        let input_type = args.args[0].data_type();
        let scalar = match input_type {
            DataType::LargeBinary => ScalarValue::LargeUtf8(Some(String::from("a"))),
            DataType::Binary | DataType::BinaryView => {
                ScalarValue::Utf8(Some(String::from("a")))
            }
            _ => unreachable!(),
        };
        Ok(ColumnarValue::Scalar(scalar))
    }
}
/// Registers `example_udf` and runs it against every input type the signature
/// should accept, printing each result and stopping at the first error.
#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    let udf = ScalarUDF::from(ExampleUdf::new());
    ctx.register_udf(udf);

    let queries = [
        // These five run without error.
        "select example_udf(arrow_cast('a', 'Binary'))",
        "select example_udf(arrow_cast('a', 'BinaryView'))",
        "select example_udf(arrow_cast('a', 'LargeBinary'))",
        "select example_udf(arrow_cast('a', 'Utf8'))",
        "select example_udf(arrow_cast('a', 'Utf8View'))",
        // This one fails.
        "select example_udf(arrow_cast('a', 'LargeUtf8'))",
    ];

    for query in queries {
        let df = ctx.sql(query).await?;
        df.show().await?;
    }

    Ok(())
}
```
Here we have a UDF which accepts string and binary inputs, where strings are
coerced to binary. The example SQL statements test each type that should be
accepted. The first five statements run fine; however, the last one fails with
this error:
```
Error: Context("Optimizer rule 'optimize_projections' failed",
Context("Check optimizer-specific invariants after optimizer rule:
optimize_projections", Internal("Failed due to a difference in schemas:
original schema: DFSchema { inner: Schema { fields: [Field { name:
\"example_udf(arrow_cast(Utf8(\\\"a\\\"),Utf8(\\\"LargeUtf8\\\")))\",
data_type: Utf8, nullable: true }], metadata: {} }, field_qualifiers: [None],
functional_dependencies: FunctionalDependencies { deps: [] } }, new schema:
DFSchema { inner: Schema { fields: [Field { name:
\"example_udf(arrow_cast(Utf8(\\\"a\\\"),Utf8(\\\"LargeUtf8\\\")))\",
data_type: LargeUtf8 }], metadata: {} }, field_qualifiers: [None],
functional_dependencies: FunctionalDependencies { deps: [] } }")))
```
Formatted for readability:
```
Failed due to a difference in schemas:
original schema: DFSchema { inner: Schema { fields: [Field { name:
"example_udf(arrow_cast(Utf8("a"),Utf8("LargeUtf8")))", data_type: Utf8,
nullable: true }] } },
new schema: DFSchema { inner: Schema { fields: [Field { name:
"example_udf(arrow_cast(Utf8("a"),Utf8("LargeUtf8")))", data_type: LargeUtf8 }]
} }"
```
- Removed unrelated info and extracted the difference between the schemas
- Note the error is from `optimize_projections` but I believe this is only
because this is where it checks the schema; error stems from type_coercion rule
- Note that the data type differs, as well as the nullability
There seems to be something going on in the type coercion analyzer rule that
incorrectly changes the projection schema from `LargeUtf8` to `Utf8`?
### To Reproduce
_No response_
### Expected behavior
_No response_
### Additional context
_No response_
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]