seddonm1 commented on a change in pull request #8966:
URL: https://github.com/apache/arrow/pull/8966#discussion_r546443419
##########
File path: rust/datafusion/src/physical_plan/string_expressions.rs
##########
@@ -66,3 +71,73 @@ pub fn concatenate(args: &[ArrayRef]) -> Result<StringArray>
{
}
Ok(builder.finish())
}
+
+/// Returns the number of characters in each string of the input array.
+///
+/// Characters are counted with `str::chars` (Unicode scalar values), not bytes:
+/// character_length('josé') = 4
+///
+/// Only `args[0]` is read. A null input element produces a null output element.
+///
+/// # Errors
+///
+/// Returns `DataFusionError::Internal` when no argument is supplied or when
+/// the first argument cannot be downcast to a `StringArray`.
+pub fn character_length(args: &[ArrayRef]) -> Result<Int32Array> {
+    // `first()` instead of `args[0]` so that an empty argument list surfaces
+    // as an error rather than an index-out-of-bounds panic.
+    let string_args = args
+        .first()
+        .ok_or_else(|| {
+            DataFusionError::Internal(
+                "character_length expects one argument".to_string(),
+            )
+        })?
+        .as_any()
+        .downcast_ref::<StringArray>()
+        .ok_or_else(|| {
+            DataFusionError::Internal(
+                "could not cast input to StringArray".to_string(),
+            )
+        })?;
+
+    // Validity is decided per element through `is_null(i)` instead of cloning
+    // the input's null buffer into an output with offset 0: for an input that
+    // has a non-zero offset (a slice), the cloned bitmap would not line up
+    // with the freshly built values.
+    let lengths: Vec<Option<i32>> = (0..string_args.len())
+        .map(|i| {
+            if string_args.is_null(i) {
+                None
+            } else {
+                // NOTE(review): the cast assumes the count fits in i32, which
+                // should hold because StringArray uses 32-bit offsets - confirm.
+                Some(string_args.value(i).chars().count() as i32)
+            }
+        })
+        .collect();
+
+    Ok(Int32Array::from(lengths))
+}
+
+macro_rules! string_unary_function {
+ ($NAME:ident, $FUNC:ident) => {
+ /// string function that accepts utf8 and returns utf8
+ pub fn $NAME(args: &[ArrayRef]) -> Result<StringArray> {
+ let string_args = &args[0]
+ .as_any()
+ .downcast_ref::<StringArray>()
+ .ok_or_else(|| {
+ DataFusionError::Internal(
+ "could not cast input to StringArray".to_string(),
+ )
+ })?;
+
+ let mut builder = StringBuilder::new(args.len());
+ for index in 0..args[0].len() {
+ if string_args.is_null(index) {
+ builder.append_null()?;
+ } else {
+ builder.append_value(&string_args.value(index).$FUNC())?;
+ }
+ }
+ Ok(builder.finish())
Review comment:
Thanks @jorgecarleitao. Your code makes a lot of sense and the macro is
much cleaner; however, I am stuck on the next bit, which is how to pass in `T`. I
can do it in `functions::create_physical_expr` like below, but this does not
feel correct.
```rust
BuiltinScalarFunction::Lower => |args| match args[0].data_type() {
DataType::Utf8 => Ok(Arc::new(string_expressions::lower::<i32>(args)?)),
DataType::LargeUtf8 =>
Ok(Arc::new(string_expressions::lower::<i64>(args)?)),
other => Err(DataFusionError::Internal(format!(
"Unsupported data type {:?} for function lower",
other,
))),
},
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]