alamb commented on code in PR #11787:
URL: https://github.com/apache/datafusion/pull/11787#discussion_r1702177110


##########
datafusion/functions/src/string/starts_with.rs:
##########
@@ -29,14 +29,37 @@ use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
 use crate::utils::make_scalar_function;
 
 /// Returns true if string starts with prefix.
-/// starts_with('alphabet', 'alph') = 't'
+/// starts_with('alphabet', 'alph') = t
 pub fn starts_with<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
-    let left = as_generic_string_array::<T>(&args[0])?;
-    let right = as_generic_string_array::<T>(&args[1])?;
+    let bool_result = match (args[0].data_type(), args[1].data_type()) {
+        (DataType::Utf8View, DataType::Utf8View) => {

Review Comment:
   Since 
https://docs.rs/arrow/latest/arrow/compute/kernels/comparison/fn.starts_with.html
 already does this downcasting,  the I think you could simply call this with 
two `&dyn Array` rather than having to check the types
   
   In particular I think you could simply call 
   
   ```
   arrow::compute::kernels::comparison::starts_with(left, right)?
   ```
   
   and remove the `<T: OffsetSizeTrait>` from this call
   
   BTW I double checked that arrow has native StringView support (thanks to 
@XiangpengHao ):
   https://docs.rs/arrow-string/52.2.0/src/arrow_string/like.rs.html#129-148 ✅ 



##########
datafusion/functions/src/string/starts_with.rs:
##########
@@ -81,18 +106,73 @@ impl ScalarUDFImpl for StartsWithFunc {
     }
 
     fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
-        use DataType::*;
-
-        Ok(Boolean)
+        Ok(DataType::Boolean)
     }
 
     fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
         match args[0].data_type() {
             DataType::Utf8 => make_scalar_function(starts_with::<i32>, 
vec![])(args),
-            DataType::LargeUtf8 => {
-                return make_scalar_function(starts_with::<i64>, vec![])(args);
-            }
-            _ => internal_err!("Unsupported data type"),
+            DataType::LargeUtf8 => make_scalar_function(starts_with::<i64>, 
vec![])(args),
+            DataType::Utf8View => make_scalar_function(starts_with::<i32>, 
vec![])(args),
+            _ => internal_err!("Unsupported data types for starts_with")?,
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use crate::utils::test::test_function;
+    use arrow::array::{Array, BooleanArray};
+    use arrow::datatypes::DataType::Boolean;
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+    use super::*;
+
+    #[test]
+    fn test_functions() -> Result<()> {
+        // Generate test cases for starts_with
+        let test_cases = vec![

Review Comment:
   I recommend using a slt test (example of sql test above)



##########
datafusion/functions/src/string/starts_with.rs:
##########
@@ -53,16 +76,18 @@ impl Default for StartsWithFunc {
 impl StartsWithFunc {
     pub fn new() -> Self {
         use DataType::*;
+
+        let string_types = vec![Utf8, LargeUtf8, Utf8View];
+        let mut type_signatures = vec![];
+
+        for left in &string_types {

Review Comment:
   I don't think the underlying kernel supports different string types (you 
can't actually call it with (`Utf8`, `LargeUtf8`) so the existing function 
actually has a bug
   
   In fact here is reproducer showing the probelm on main
   ```sql
   DataFusion CLI v40.0.0
   > create table foo as values (arrow_cast('foo', 'Utf8'), arrow_cast('bar', 
'LargeUtf8'));
   0 row(s) fetched.
   Elapsed 0.046 seconds.
   
   > select * from foo;
   +---------+---------+
   | column1 | column2 |
   +---------+---------+
   | foo     | bar     |
   +---------+---------+
   1 row(s) fetched.
   Elapsed 0.010 seconds.
   
   > select starts_with(column1, column2) from foo;
   Internal error: could not cast value to 
arrow_array::array::byte_array::GenericByteArray<arrow_array::types::GenericStringType<ii32>>.
   This was likely caused by a bug in DataFusion's code and we would welcome 
that you file an bug report in our issue tracker
   ```
   
   So in other words, I think the correct signature is
   
   ```rust
               signature: Signature::one_of(
                   vec![
                       Exact(vec![Utf8, Utf8]),
                       Exact(vec![LargeUtf8, LargeUtf8]),
                       Exact(vec![Utf8View, Utf8View]),
                   ],
                   Volatility::Immutable,
               ),
           }
   ```
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to