tustvold commented on code in PR #5245:
URL: https://github.com/apache/arrow-rs/pull/5245#discussion_r1438881918
##########
arrow-string/src/regexp.rs:
##########
@@ -248,6 +227,186 @@ pub fn regexp_match<OffsetSize: OffsetSizeTrait>(
Ok(Arc::new(list_builder.finish()))
}
+fn get_scalar_pattern_flag<'a, OffsetSize: OffsetSizeTrait>(
+ regex_array: &'a dyn Array,
+ flag_array: Option<&'a dyn Array>,
+) -> (Option<&'a str>, Option<&'a str>) {
+ let regex = regex_array
Review Comment:
We could use `as_string` here instead of the manual `as_any().downcast_ref()` chain.
##########
arrow-string/src/regexp.rs:
##########
@@ -248,6 +227,186 @@ pub fn regexp_match<OffsetSize: OffsetSizeTrait>(
Ok(Arc::new(list_builder.finish()))
}
+fn get_scalar_pattern_flag<'a, OffsetSize: OffsetSizeTrait>(
+ regex_array: &'a dyn Array,
+ flag_array: Option<&'a dyn Array>,
+) -> (Option<&'a str>, Option<&'a str>) {
+ let regex = regex_array
+ .as_any()
+ .downcast_ref::<GenericStringArray<OffsetSize>>()
+ .expect("Unable to downcast to StringArray/LargeStringArray");
+ let regex = if regex.is_valid(0) {
Review Comment:
Could use `bool::then` to make this simpler, e.g. `regex.is_valid(0).then(|| regex.value(0))`.
##########
arrow-string/src/regexp.rs:
##########
@@ -248,6 +227,186 @@ pub fn regexp_match<OffsetSize: OffsetSizeTrait>(
Ok(Arc::new(list_builder.finish()))
}
+fn get_scalar_pattern_flag<'a, OffsetSize: OffsetSizeTrait>(
+ regex_array: &'a dyn Array,
+ flag_array: Option<&'a dyn Array>,
+) -> (Option<&'a str>, Option<&'a str>) {
+ let regex = regex_array
+ .as_any()
+ .downcast_ref::<GenericStringArray<OffsetSize>>()
+ .expect("Unable to downcast to StringArray/LargeStringArray");
+ let regex = if regex.is_valid(0) {
+ Some(regex.value(0))
+ } else {
+ None
+ };
+
+ if flag_array.is_some() {
+ let flag = flag_array
+ .unwrap()
+ .as_any()
+ .downcast_ref::<GenericStringArray<OffsetSize>>()
+ .expect("Unable to downcast to StringArray/LargeStringArray");
+
+ if flag.is_valid(0) {
+ let flag = flag.value(0);
+ (regex, Some(flag))
+ } else {
+ (regex, None)
+ }
+ } else {
+ (regex, None)
+ }
+}
+
+fn regexp_scalar_match<OffsetSize: OffsetSizeTrait>(
+ array: &GenericStringArray<OffsetSize>,
+ regex: &Regex,
+) -> std::result::Result<ArrayRef, ArrowError> {
+ let builder: GenericStringBuilder<OffsetSize> = GenericStringBuilder::with_capacity(0, 0);
+ let mut list_builder = ListBuilder::new(builder);
+
+ array
+ .iter()
+ .map(|value| {
+ match value {
+ // Required for Postgres compatibility:
+ // SELECT regexp_match('foobarbequebaz', ''); = {""}
+ Some(_) if regex.as_str() == "" => {
+ list_builder.values().append_value("");
+ list_builder.append(true);
+ }
+ Some(value) => match regex.captures(value) {
+ Some(caps) => {
+ let mut iter = caps.iter();
+ if caps.len() > 1 {
+ iter.next();
+ }
+ for m in iter.flatten() {
+ list_builder.values().append_value(m.as_str());
+ }
+
+ list_builder.append(true);
+ }
+ None => list_builder.append(false),
+ },
+ _ => list_builder.append(false),
+ }
+ Ok(())
+ })
+ .collect::<Result<Vec<()>, ArrowError>>()?;
+
+ Ok(Arc::new(list_builder.finish()))
+}
+
+/// Extract all groups matched by a regular expression for a given String array.
+///
+/// Modelled after the Postgres [regexp_match].
+///
+/// Returns a ListArray of [`GenericStringArray`] with each element containing the leftmost-first
+/// match of the corresponding index in `regex_array` to string in `array`
+///
+/// If there is no match, the list element is NULL.
+///
+/// If a match is found, and the pattern contains no capturing parenthesized subexpressions,
+/// then the list element is a single-element [`GenericStringArray`] containing the substring
+/// matching the whole pattern.
+///
+/// If a match is found, and the pattern contains capturing parenthesized subexpressions, then the
+/// list element is a [`GenericStringArray`] whose n'th element is the substring matching
+/// the n'th capturing parenthesized subexpression of the pattern.
+///
+/// The flags parameter is an optional text string containing zero or more single-letter flags
+/// that change the function's behavior.
+///
+/// [regexp_match]: https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-POSIX-REGEXP
+pub fn regexp_match<OffsetSize: OffsetSizeTrait>(
+ array: &GenericStringArray<OffsetSize>,
Review Comment:
Should this be `&dyn Array`?
##########
arrow-string/src/regexp.rs:
##########
@@ -248,6 +227,186 @@ pub fn regexp_match<OffsetSize: OffsetSizeTrait>(
Ok(Arc::new(list_builder.finish()))
}
+fn get_scalar_pattern_flag<'a, OffsetSize: OffsetSizeTrait>(
+ regex_array: &'a dyn Array,
+ flag_array: Option<&'a dyn Array>,
+) -> (Option<&'a str>, Option<&'a str>) {
+ let regex = regex_array
+ .as_any()
+ .downcast_ref::<GenericStringArray<OffsetSize>>()
+ .expect("Unable to downcast to StringArray/LargeStringArray");
+ let regex = if regex.is_valid(0) {
+ Some(regex.value(0))
+ } else {
+ None
+ };
+
+ if flag_array.is_some() {
+ let flag = flag_array
+ .unwrap()
+ .as_any()
+ .downcast_ref::<GenericStringArray<OffsetSize>>()
+ .expect("Unable to downcast to StringArray/LargeStringArray");
+
+ if flag.is_valid(0) {
+ let flag = flag.value(0);
+ (regex, Some(flag))
+ } else {
+ (regex, None)
+ }
+ } else {
+ (regex, None)
+ }
+}
+
+fn regexp_scalar_match<OffsetSize: OffsetSizeTrait>(
+ array: &GenericStringArray<OffsetSize>,
+ regex: &Regex,
+) -> std::result::Result<ArrayRef, ArrowError> {
Review Comment:
```suggestion
) -> Result<ArrayRef, ArrowError> {
```
##########
arrow-string/src/regexp.rs:
##########
@@ -248,6 +227,186 @@ pub fn regexp_match<OffsetSize: OffsetSizeTrait>(
Ok(Arc::new(list_builder.finish()))
}
+fn get_scalar_pattern_flag<'a, OffsetSize: OffsetSizeTrait>(
+ regex_array: &'a dyn Array,
+ flag_array: Option<&'a dyn Array>,
+) -> (Option<&'a str>, Option<&'a str>) {
+ let regex = regex_array
+ .as_any()
+ .downcast_ref::<GenericStringArray<OffsetSize>>()
+ .expect("Unable to downcast to StringArray/LargeStringArray");
+ let regex = if regex.is_valid(0) {
+ Some(regex.value(0))
+ } else {
+ None
+ };
+
+ if flag_array.is_some() {
Review Comment:
Using `if let Some(flag_array) = flag_array` here would avoid the later `unwrap`.
##########
arrow-string/src/regexp.rs:
##########
@@ -248,6 +227,186 @@ pub fn regexp_match<OffsetSize: OffsetSizeTrait>(
Ok(Arc::new(list_builder.finish()))
}
+fn get_scalar_pattern_flag<'a, OffsetSize: OffsetSizeTrait>(
+ regex_array: &'a dyn Array,
+ flag_array: Option<&'a dyn Array>,
+) -> (Option<&'a str>, Option<&'a str>) {
+ let regex = regex_array
+ .as_any()
+ .downcast_ref::<GenericStringArray<OffsetSize>>()
+ .expect("Unable to downcast to StringArray/LargeStringArray");
+ let regex = if regex.is_valid(0) {
+ Some(regex.value(0))
+ } else {
+ None
+ };
+
+ if flag_array.is_some() {
+ let flag = flag_array
+ .unwrap()
+ .as_any()
+ .downcast_ref::<GenericStringArray<OffsetSize>>()
Review Comment:
`as_string` could be used here as well.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]