HaoYang670 commented on code in PR #1784:
URL: https://github.com/apache/arrow-rs/pull/1784#discussion_r890100818
##########
arrow/src/compute/kernels/substring.rs:
##########
@@ -150,6 +152,56 @@ pub fn substring(array: &dyn Array, start: i64, length: Option<u64>) -> Result<A
}
}
+/// # Arguments
+/// * `array` - The input string array
+///
+/// * `start` - The start index of all substrings.
+/// If `start >= 0`, then count from the start of the string,
+/// otherwise count from the end of the string.
+///
+/// * `length`(option) - The length of all substrings.
+/// If `length` is `None`, then the substring is from `start` to the end of the string.
+///
+/// Attention: Both `start` and `length` are counted by char.
+///
+/// # Performance
+/// This function is slower than [substring].
+/// Theoretically, the time complexity is `O(n)` where `n` is the length of the value buffer.
+/// It is recommended to use [substring] if the input array only contains ASCII chars.
+///
+/// # Basic usage
+/// ```
+/// # use arrow::array::StringArray;
+/// # use arrow::compute::kernels::substring::substring_by_char;
+/// let array = StringArray::from(vec![Some("arrow"), None, Some("Γ ⊢x:T")]);
+/// let result = substring_by_char(&array, 1, Some(4)).unwrap();
+/// assert_eq!(result, StringArray::from(vec![Some("rrow"), None, Some(" ⊢x:")]));
+/// ```
+pub fn substring_by_char<OffsetSize: OffsetSizeTrait>(
+ array: &GenericStringArray<OffsetSize>,
+ start: i64,
+ length: Option<u64>,
+) -> Result<GenericStringArray<OffsetSize>> {
+ Ok(array
+ .iter()
+ .map(|val| {
+ val.map(|val| {
+ let char_count = val.chars().count();
+ let start = if start >= 0 {
+ start.to_usize().unwrap().min(char_count)
+ } else {
+ char_count - (-start).to_usize().unwrap().min(char_count)
+ };
+ let length = length.map_or(char_count - start, |length| {
+ length.to_usize().unwrap().min(char_count - start)
+ });
+
+ val.chars().skip(start).take(length).collect::<String>()
Review Comment:
Tracked by #1800
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]