seddonm1 commented on a change in pull request #9243:
URL: https://github.com/apache/arrow/pull/9243#discussion_r561653478
##########
File path: rust/datafusion/src/physical_plan/string_expressions.rs
##########
@@ -34,42 +34,446 @@ macro_rules! downcast_vec {
}};
}
-/// concatenate string columns together.
-pub fn concatenate(args: &[ArrayRef]) -> Result<StringArray> {
+/// Returns the numeric code of the first character of the argument.
+pub fn ascii<T: StringOffsetSizeTrait>(args: &[ArrayRef]) ->
Result<Int32Array> {
+ let array = args[0]
+ .as_any()
+ .downcast_ref::<GenericStringArray<T>>()
+ .unwrap();
+ // first map is the iterator, second is for the `Option<_>`
+ Ok(array
+ .iter()
+ .map(|x| {
+ x.map(|x: &str| {
+ let mut chars = x.chars();
+ chars.next().map_or(0, |v| v as i32)
+ })
+ })
+ .collect())
+}
+
+/// Removes the longest string containing only characters in characters (a
space by default) from the start and end of string.
+pub fn btrim<T: StringOffsetSizeTrait>(args: &[ArrayRef]) ->
Result<StringArray> {
+ match args.len() {
+ 0 => Err(DataFusionError::Internal(
+ "btrim was called with 0 arguments. It requires at least
one.".to_string(),
+ )),
+ 1 => {
+ let string_array = args[0]
+ .as_any()
+ .downcast_ref::<GenericStringArray<T>>()
+ .unwrap();
+
+ Ok(string_array
+ .iter()
+ .map(|x| x.map(|x: &str| x.trim()))
+ .collect())
+ }
+ 2 => {
+ let string_array = args[0]
+ .as_any()
+ .downcast_ref::<GenericStringArray<T>>()
+ .unwrap();
+
+ let characters_array = args[1]
+ .as_any()
+ .downcast_ref::<GenericStringArray<T>>()
+ .unwrap();
+
+ Ok(string_array
+ .iter()
+ .enumerate()
+ .map(|(i, x)| {
+ if characters_array.is_null(i) {
+ None
+ } else {
+ x.map(|x: &str| {
+ let chars: Vec<char> =
+ characters_array.value(i).chars().collect();
+ x.trim_start_matches(&chars[..])
+ .trim_end_matches(&chars[..])
+ })
+ }
+ })
+ .collect())
+ }
+ other => Err(DataFusionError::Internal(format!(
+ "btrim was called with {} arguments. It requires at most two.",
+ other
+ ))),
+ }
+}
+
+/// Returns the character with the given code.
+pub fn chr(args: &[ArrayRef]) -> Result<StringArray> {
+ let array = args[0].as_any().downcast_ref::<Int64Array>().unwrap();
+ // first map is the iterator, second is for the `Option<_>`
+ Ok(array
+ .iter()
+ .map(|x: Option<i64>| {
+ x.map(|x| {
+ if x == 0 {
+ Err(DataFusionError::Internal(
+ "null character not permitted.".to_string(),
+ ))
+ } else {
+ match core::char::from_u32(x as u32) {
+ Some(x) => Ok(x.to_string()),
+ None => Err(DataFusionError::Internal(
+ "requested character too large for
encoding.".to_string(),
+ )),
+ }
+ }
+ .unwrap()
Review comment:
I'm not sure we should be panicking if these characters appear.
##########
File path: rust/datafusion/src/physical_plan/type_coercion.rs
##########
@@ -69,13 +69,42 @@ pub fn data_types(
signature: &Signature,
) -> Result<Vec<DataType>> {
let valid_types = match signature {
- Signature::Variadic(valid_types) => valid_types
+ Signature::Any(number) => {
+ if current_types.len() != *number {
+ return Err(DataFusionError::Plan(format!(
+ "The function expected {} arguments but received {}",
+ number,
+ current_types.len()
+ )));
+ }
+ vec![(0..*number).map(|i| current_types[i].clone()).collect()]
+ }
+ Signature::Exact(valid_types) => vec![valid_types.clone()],
+ Signature::Uniform(valid_types) => {
+ let valid_signature = valid_types
+ .iter()
+ .filter(|x| x.len() == current_types.len())
+ .collect::<Vec<_>>();
+ if valid_signature.len() != 1 {
+ return Err(DataFusionError::Plan(format!(
+ "The function expected {} arguments but received {}",
+ valid_types
+ .iter()
+ .map(|x| x.len().to_string())
+ .collect::<Vec<_>>()
+ .join(" or "),
+ current_types.len()
+ )));
+ }
+ cartesian_product(valid_signature.first().unwrap())
Review comment:
Thanks @jorgecarleitao. Yes, I will split this out.
A good example is lpad, which takes either
[string, int] or [string, int, string]. I am away for a couple of days but will
split this out so we can work through it methodically.
##########
File path: rust/datafusion/src/physical_plan/string_expressions.rs
##########
@@ -34,42 +35,553 @@ macro_rules! downcast_vec {
}};
}
-/// concatenate string columns together.
-pub fn concatenate(args: &[ArrayRef]) -> Result<StringArray> {
+/// Returns the numeric code of the first character of the argument.
+pub fn ascii<T: StringOffsetSizeTrait>(args: &[ArrayRef]) ->
Result<Int32Array> {
+ let array = args[0]
+ .as_any()
+ .downcast_ref::<GenericStringArray<T>>()
+ .unwrap();
+ // first map is the iterator, second is for the `Option<_>`
+ Ok(array
+ .iter()
+ .map(|x| {
+ x.map(|x: &str| {
+ let mut chars = x.chars();
+ chars.next().map_or(0, |v| v as i32)
+ })
+ })
+ .collect())
+}
+
+/// Removes the longest string containing only characters in characters (a
space by default) from the start and end of string.
+pub fn btrim<T: StringOffsetSizeTrait>(args: &[ArrayRef]) ->
Result<StringArray> {
+ match args.len() {
+ 0 => Err(DataFusionError::Internal(
+ "btrim was called with 0 arguments. It requires at least
1.".to_string(),
+ )),
+ 1 => {
+ let string_array = args[0]
+ .as_any()
+ .downcast_ref::<GenericStringArray<T>>()
+ .unwrap();
+
+ Ok(string_array
+ .iter()
+ .map(|x| x.map(|x: &str| x.trim()))
+ .collect())
+ }
+ 2 => {
+ let string_array = args[0]
+ .as_any()
+ .downcast_ref::<GenericStringArray<T>>()
+ .unwrap();
+
+ let characters_array = args[1]
+ .as_any()
+ .downcast_ref::<GenericStringArray<T>>()
+ .unwrap();
+
+ Ok(string_array
+ .iter()
+ .enumerate()
+ .map(|(i, x)| {
+ if characters_array.is_null(i) {
+ None
+ } else {
+ x.map(|x: &str| {
+ let chars: Vec<char> =
+ characters_array.value(i).chars().collect();
+ x.trim_start_matches(&chars[..])
+ .trim_end_matches(&chars[..])
+ })
+ }
+ })
+ .collect())
+ }
+ other => Err(DataFusionError::Internal(format!(
+ "btrim was called with {} arguments. It requires at most 2.",
+ other
+ ))),
+ }
+}
+
+/// Returns number of characters in the string.
+pub fn character_length_i32(args: &[ArrayRef]) -> Result<Int32Array> {
+ let array = args[0]
+ .as_any()
+ .downcast_ref::<GenericStringArray<i32>>()
+ .unwrap();
+ // first map is the iterator, second is for the `Option<_>`
+ Ok(array
+ .iter()
+ .map(|x| x.map(|x: &str| x.graphemes(true).count() as i32))
+ .collect())
+}
+
+/// Returns number of characters in the string.
+pub fn character_length_i64(args: &[ArrayRef]) -> Result<Int64Array> {
+ let array = args[0]
+ .as_any()
+ .downcast_ref::<GenericStringArray<i64>>()
+ .unwrap();
+ // first map is the iterator, second is for the `Option<_>`
+ Ok(array
+ .iter()
+ .map(|x| x.map(|x: &str| x.graphemes(true).count() as i64))
+ .collect())
+}
+
+/// Returns the character with the given code.
+pub fn chr(args: &[ArrayRef]) -> Result<StringArray> {
+ let array = args[0].as_any().downcast_ref::<Int64Array>().unwrap();
+ // first map is the iterator, second is for the `Option<_>`
+ Ok(array
Review comment:
Ah of course 🤦
##########
File path: rust/datafusion/src/physical_plan/functions.rs
##########
@@ -60,10 +59,15 @@ pub enum Signature {
// A function such as `array` is `VariadicEqual`
// The first argument decides the type used for coercion
VariadicEqual,
+ /// fixed number of arguments of vector of vectors of valid types
+ // A function of one argument of f64 is
`Uniform(vc![vec![vec![DataType::Float64]]])`
+ // A function of one argument of f64 or f32 is
`Uniform(vec![vec![vec![DataType::Float32, DataType::Float64]]])`
+ // A function of two arguments with first argument of f64 or f32 and
second argument of utf8 is `Uniform(vec![vec![vec![DataType::Float32,
DataType::Float64], vec![DataType::Utf8]]])`
+ Uniform(Vec<Vec<Vec<DataType>>>),
Review comment:
Yes, agreed. The existing code clearly took some thought, so I wanted to
leave it until we can agree on the correct course of action.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org