Copilot commented on code in PR #19581:
URL: https://github.com/apache/datafusion/pull/19581#discussion_r2655423711


##########
datafusion/functions/src/string/octet_length.rs:
##########
@@ -90,7 +94,45 @@ impl ScalarUDFImpl for OctetLengthFunc {
         let [array] = take_function_args(self.name(), &args.args)?;
 
         match array {
-            ColumnarValue::Array(v) => 
Ok(ColumnarValue::Array(length(v.as_ref())?)),
+            ColumnarValue::Array(v) => {
+                let arr: ArrayRef = v.clone();

Review Comment:
   The clone of the ArrayRef on this line is unnecessary. Since ArrayRef is 
already an Arc, cloning it is cheap but redundant here. You can use the 
original `v` directly in the downcasts below without cloning.
   
   Consider removing the clone:
   ```rust
   ColumnarValue::Array(v) => {
       if let Some(arr) = v.as_any().downcast_ref::<StringArray>() {
           // ...
   ```
   
   This eliminates an unnecessary atomic reference count increment/decrement.



##########
datafusion/functions/src/string/octet_length.rs:
##########
@@ -90,7 +94,45 @@ impl ScalarUDFImpl for OctetLengthFunc {
         let [array] = take_function_args(self.name(), &args.args)?;
 
         match array {
-            ColumnarValue::Array(v) => 
Ok(ColumnarValue::Array(length(v.as_ref())?)),
+            ColumnarValue::Array(v) => {
+                let arr: ArrayRef = v.clone();
+
+                if let Some(arr) = arr.as_any().downcast_ref::<StringArray>() {
+                    let mut builder = Int32Builder::with_capacity(arr.len());
+                    for i in 0..arr.len() {
+                        if arr.is_null(i) {
+                            builder.append_null();
+                        } else {
+                            builder.append_value(arr.value_length(i) as i32);
+                        }
+                    }
+                    Ok(ColumnarValue::Array(Arc::new(builder.finish())))
+                } else if let Some(arr) = 
arr.as_any().downcast_ref::<LargeStringArray>()
+                {
+                    let mut builder = Int64Builder::with_capacity(arr.len());
+                    for i in 0..arr.len() {
+                        if arr.is_null(i) {
+                            builder.append_null();
+                        } else {
+                            builder.append_value(arr.value_length(i) as i64);
+                        }
+                    }
+                    Ok(ColumnarValue::Array(Arc::new(builder.finish())))

Review Comment:
   The manual loop with index-based access can be replaced with a more 
idiomatic and potentially more efficient iterator-based approach. The pattern 
used in this codebase (e.g., levenshtein.rs lines 184-193) uses 
`.iter().map().collect()` which is cleaner and allows the compiler to optimize 
better.
   
   Consider replacing this with:
   ```rust
   let result = arr.iter().map(|s| s.map(|s| s.len() as 
i64)).collect::<Int64Array>();
   Ok(ColumnarValue::Array(Arc::new(result)))
   ```
   
   This approach:
   - Eliminates the need for manual capacity management
   - Handles nulls automatically through Option mapping
   - Is more consistent with similar functions in the codebase
   - May enable better compiler optimizations



##########
datafusion/functions/src/string/octet_length.rs:
##########
@@ -90,7 +94,45 @@ impl ScalarUDFImpl for OctetLengthFunc {
         let [array] = take_function_args(self.name(), &args.args)?;
 
         match array {
-            ColumnarValue::Array(v) => 
Ok(ColumnarValue::Array(length(v.as_ref())?)),
+            ColumnarValue::Array(v) => {
+                let arr: ArrayRef = v.clone();
+
+                if let Some(arr) = arr.as_any().downcast_ref::<StringArray>() {
+                    let mut builder = Int32Builder::with_capacity(arr.len());
+                    for i in 0..arr.len() {
+                        if arr.is_null(i) {
+                            builder.append_null();
+                        } else {
+                            builder.append_value(arr.value_length(i) as i32);
+                        }
+                    }
+                    Ok(ColumnarValue::Array(Arc::new(builder.finish())))
+                } else if let Some(arr) = 
arr.as_any().downcast_ref::<LargeStringArray>()
+                {
+                    let mut builder = Int64Builder::with_capacity(arr.len());
+                    for i in 0..arr.len() {
+                        if arr.is_null(i) {
+                            builder.append_null();
+                        } else {
+                            builder.append_value(arr.value_length(i) as i64);
+                        }
+                    }
+                    Ok(ColumnarValue::Array(Arc::new(builder.finish())))
+                } else if let Some(arr) = 
arr.as_any().downcast_ref::<StringViewArray>() {
+                    let mut builder = Int32Builder::with_capacity(arr.len());
+                    for i in 0..arr.len() {
+                        if arr.is_null(i) {
+                            builder.append_null();
+                        } else {
+                            builder.append_value(arr.value(i).len() as i32);
+                        }
+                    }
+                    Ok(ColumnarValue::Array(Arc::new(builder.finish())))

Review Comment:
   The manual loop with index-based access can be replaced with a more 
idiomatic and potentially more efficient iterator-based approach. The pattern 
used in this codebase (e.g., levenshtein.rs lines 154-163) uses 
`.iter().map().collect()` which is cleaner and allows the compiler to optimize 
better.
   
   Consider replacing this with:
   ```rust
   let result = arr.iter().map(|s| s.map(|s| s.len() as 
i32)).collect::<Int32Array>();
   Ok(ColumnarValue::Array(Arc::new(result)))
   ```
   
   This approach:
   - Eliminates the need for manual capacity management
   - Handles nulls automatically through Option mapping
   - Is more consistent with similar functions in the codebase
   - May enable better compiler optimizations



##########
datafusion/functions/src/string/octet_length.rs:
##########
@@ -90,7 +94,45 @@ impl ScalarUDFImpl for OctetLengthFunc {
         let [array] = take_function_args(self.name(), &args.args)?;
 
         match array {
-            ColumnarValue::Array(v) => 
Ok(ColumnarValue::Array(length(v.as_ref())?)),
+            ColumnarValue::Array(v) => {
+                let arr: ArrayRef = v.clone();
+
+                if let Some(arr) = arr.as_any().downcast_ref::<StringArray>() {
+                    let mut builder = Int32Builder::with_capacity(arr.len());
+                    for i in 0..arr.len() {
+                        if arr.is_null(i) {
+                            builder.append_null();
+                        } else {
+                            builder.append_value(arr.value_length(i) as i32);
+                        }
+                    }
+                    Ok(ColumnarValue::Array(Arc::new(builder.finish())))
+                } else if let Some(arr) = 
arr.as_any().downcast_ref::<LargeStringArray>()
+                {
+                    let mut builder = Int64Builder::with_capacity(arr.len());
+                    for i in 0..arr.len() {
+                        if arr.is_null(i) {
+                            builder.append_null();
+                        } else {
+                            builder.append_value(arr.value_length(i) as i64);
+                        }
+                    }
+                    Ok(ColumnarValue::Array(Arc::new(builder.finish())))
+                } else if let Some(arr) = 
arr.as_any().downcast_ref::<StringViewArray>() {
+                    let mut builder = Int32Builder::with_capacity(arr.len());
+                    for i in 0..arr.len() {
+                        if arr.is_null(i) {
+                            builder.append_null();
+                        } else {
+                            builder.append_value(arr.value(i).len() as i32);
+                        }
+                    }
+                    Ok(ColumnarValue::Array(Arc::new(builder.finish())))
+                } else {
+                    unreachable!("octet_length expects string arrays")
+                }

Review Comment:
   There's significant code duplication across the three branches for 
StringArray, LargeStringArray, and StringViewArray. The logic is identical 
except for the array type and return type. This can be refactored using a 
generic helper function or by extracting the common logic.
   
   Consider a refactoring similar to how other functions in this codebase 
handle multiple string types (e.g., repeat.rs uses a generic `repeat_impl` 
function with StringArrayType trait). This would:
   - Reduce code duplication
   - Make the code easier to maintain
   - Reduce the chance of introducing bugs when updating one branch but not 
others
   
   The three branches (lines 100-130) could potentially be consolidated into a 
single implementation using Arrow's iterator pattern combined with generics.



##########
datafusion/functions/src/string/octet_length.rs:
##########
@@ -90,7 +94,45 @@ impl ScalarUDFImpl for OctetLengthFunc {
         let [array] = take_function_args(self.name(), &args.args)?;
 
         match array {
-            ColumnarValue::Array(v) => 
Ok(ColumnarValue::Array(length(v.as_ref())?)),
+            ColumnarValue::Array(v) => {
+                let arr: ArrayRef = v.clone();
+
+                if let Some(arr) = arr.as_any().downcast_ref::<StringArray>() {
+                    let mut builder = Int32Builder::with_capacity(arr.len());
+                    for i in 0..arr.len() {
+                        if arr.is_null(i) {
+                            builder.append_null();
+                        } else {
+                            builder.append_value(arr.value_length(i) as i32);
+                        }
+                    }
+                    Ok(ColumnarValue::Array(Arc::new(builder.finish())))

Review Comment:
   The manual loop with index-based access can be replaced with a more 
idiomatic and potentially more efficient iterator-based approach. The pattern 
used in this codebase (e.g., levenshtein.rs lines 169-178) uses 
`.iter().map().collect()` which is cleaner and allows the compiler to optimize 
better.
   
   Consider replacing this with:
   ```rust
   let result = arr.iter().map(|s| s.map(|s| s.len() as 
i32)).collect::<Int32Array>();
   Ok(ColumnarValue::Array(Arc::new(result)))
   ```
   
   This approach:
   - Eliminates the need for manual capacity management
   - Handles nulls automatically through Option mapping
   - Is more consistent with similar functions in the codebase
   - May enable better compiler optimizations



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to