wiedld commented on code in PR #6849:
URL: https://github.com/apache/arrow-rs/pull/6849#discussion_r1893486940
##########
arrow-string/src/regexp.rs:
##########
@@ -333,42 +353,74 @@ fn get_scalar_pattern_flag<'a, OffsetSize:
OffsetSizeTrait>(
}
}
+fn get_scalar_pattern_flag_utf8view<'a>(
+ regex_array: &'a dyn Array,
+ flag_array: Option<&'a dyn Array>,
+) -> (Option<&'a str>, Option<&'a str>) {
+ let regex = regex_array.as_string_view();
+ let regex = regex.is_valid(0).then(|| regex.value(0));
+
+ if let Some(flag_array) = flag_array {
+ let flag = flag_array.as_string_view();
+ (regex, flag.is_valid(0).then(|| flag.value(0)))
+ } else {
+ (regex, None)
+ }
+}
+
+macro_rules! process_regexp_match {
+ ($array:expr, $regex:expr, $list_builder:expr) => {
+ $array
+ .iter()
+ .map(|value| {
+ match value {
+ // Required for Postgres compatibility:
+ // SELECT regexp_match('foobarbequebaz', ''); = {""}
+ Some(_) if $regex.as_str().is_empty() => {
+ $list_builder.values().append_value("");
+ $list_builder.append(true);
+ }
+ Some(value) => match $regex.captures(value) {
+ Some(caps) => {
+ let mut iter = caps.iter();
+ if caps.len() > 1 {
+ iter.next();
+ }
+ for m in iter.flatten() {
+
$list_builder.values().append_value(m.as_str());
+ }
+ $list_builder.append(true);
+ }
+ None => $list_builder.append(false),
+ },
+ None => $list_builder.append(false),
+ }
+ Ok(())
+ })
+ .collect::<Result<Vec<()>, ArrowError>>()?
+ };
+}
+
fn regexp_scalar_match<OffsetSize: OffsetSizeTrait>(
array: &GenericStringArray<OffsetSize>,
regex: &Regex,
) -> Result<ArrayRef, ArrowError> {
let builder: GenericStringBuilder<OffsetSize> =
GenericStringBuilder::with_capacity(0, 0);
let mut list_builder = ListBuilder::new(builder);
- array
- .iter()
- .map(|value| {
- match value {
- // Required for Postgres compatibility:
- // SELECT regexp_match('foobarbequebaz', ''); = {""}
- Some(_) if regex.as_str() == "" => {
- list_builder.values().append_value("");
- list_builder.append(true);
- }
- Some(value) => match regex.captures(value) {
- Some(caps) => {
- let mut iter = caps.iter();
- if caps.len() > 1 {
- iter.next();
- }
- for m in iter.flatten() {
- list_builder.values().append_value(m.as_str());
- }
+ process_regexp_match!(array, regex, list_builder);
- list_builder.append(true);
- }
- None => list_builder.append(false),
- },
- _ => list_builder.append(false),
- }
- Ok(())
- })
- .collect::<Result<Vec<()>, ArrowError>>()?;
+ Ok(Arc::new(list_builder.finish()))
+}
+
+fn regexp_scalar_match_utf8view(
+ array: &StringViewArray,
+ regex: &Regex,
+) -> Result<ArrayRef, ArrowError> {
+ let builder = StringViewBuilder::with_capacity(0);
+ let mut list_builder = ListBuilder::new(builder);
+
+ process_regexp_match!(array, regex, list_builder);
Review Comment:
Same pattern, and same question as before.
It's the same logic (for a single regex) into a macro to use for both
StringView and StringArray. Why the macro?
##########
arrow-string/src/regexp.rs:
##########
@@ -243,78 +245,96 @@ where
Ok(BooleanArray::from(data))
}
-fn regexp_array_match<OffsetSize: OffsetSizeTrait>(
- array: &GenericStringArray<OffsetSize>,
- regex_array: &GenericStringArray<OffsetSize>,
- flags_array: Option<&GenericStringArray<OffsetSize>>,
-) -> Result<ArrayRef, ArrowError> {
- let mut patterns: HashMap<String, Regex> = HashMap::new();
- let builder: GenericStringBuilder<OffsetSize> =
GenericStringBuilder::with_capacity(0, 0);
- let mut list_builder = ListBuilder::new(builder);
+macro_rules! process_regexp_array_match {
+ ($array:expr, $regex_array:expr, $flags_array:expr, $list_builder:expr) =>
{
+ let mut patterns: HashMap<String, Regex> = HashMap::new();
- let complete_pattern = match flags_array {
- Some(flags) => Box::new(
- regex_array
- .iter()
- .zip(flags.iter())
- .map(|(pattern, flags)| {
+ let complete_pattern = match $flags_array {
+ Some(flags) => Box::new($regex_array.iter().zip(flags.iter()).map(
+ |(pattern, flags)| {
pattern.map(|pattern| match flags {
Some(value) => format!("(?{value}){pattern}"),
None => pattern.to_string(),
})
- }),
- ) as Box<dyn Iterator<Item = Option<String>>>,
- None => Box::new(
- regex_array
- .iter()
- .map(|pattern| pattern.map(|pattern| pattern.to_string())),
- ),
- };
+ },
+ )) as Box<dyn Iterator<Item = Option<String>>>,
+ None => Box::new(
+ $regex_array
+ .iter()
+ .map(|pattern| pattern.map(|pattern| pattern.to_string())),
+ ),
+ };
- array
- .iter()
- .zip(complete_pattern)
- .map(|(value, pattern)| {
- match (value, pattern) {
- // Required for Postgres compatibility:
- // SELECT regexp_match('foobarbequebaz', ''); = {""}
- (Some(_), Some(pattern)) if pattern == *"" => {
- list_builder.values().append_value("");
- list_builder.append(true);
- }
- (Some(value), Some(pattern)) => {
- let existing_pattern = patterns.get(&pattern);
- let re = match existing_pattern {
- Some(re) => re,
- None => {
- let re = Regex::new(pattern.as_str()).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Regular expression did not compile: {e:?}"
- ))
- })?;
- patterns.entry(pattern).or_insert(re)
- }
- };
- match re.captures(value) {
- Some(caps) => {
- let mut iter = caps.iter();
- if caps.len() > 1 {
- iter.next();
- }
- for m in iter.flatten() {
- list_builder.values().append_value(m.as_str());
+ $array
+ .iter()
+ .zip(complete_pattern)
+ .map(|(value, pattern)| {
+ match (value, pattern) {
+ // Required for Postgres compatibility:
+ // SELECT regexp_match('foobarbequebaz', ''); = {""}
+ (Some(_), Some(pattern)) if pattern == *"" => {
+ $list_builder.values().append_value("");
+ $list_builder.append(true);
+ }
+ (Some(value), Some(pattern)) => {
+ let existing_pattern = patterns.get(&pattern);
+ let re = match existing_pattern {
+ Some(re) => re,
+ None => {
+ let re =
Regex::new(pattern.as_str()).map_err(|e| {
+ ArrowError::ComputeError(format!(
+ "Regular expression did not compile:
{e:?}"
+ ))
+ })?;
+ patterns.entry(pattern).or_insert(re)
}
+ };
+ match re.captures(value) {
+ Some(caps) => {
+ let mut iter = caps.iter();
+ if caps.len() > 1 {
+ iter.next();
+ }
+ for m in iter.flatten() {
+
$list_builder.values().append_value(m.as_str());
+ }
- list_builder.append(true);
+ $list_builder.append(true);
+ }
+ None => $list_builder.append(false),
}
- None => list_builder.append(false),
}
+ _ => $list_builder.append(false),
}
- _ => list_builder.append(false),
- }
- Ok(())
- })
- .collect::<Result<Vec<()>, ArrowError>>()?;
+ Ok(())
+ })
+ .collect::<Result<Vec<()>, ArrowError>>()?;
+ };
+}
+
+fn regexp_array_match<OffsetSize: OffsetSizeTrait>(
+ array: &GenericStringArray<OffsetSize>,
+ regex_array: &GenericStringArray<OffsetSize>,
+ flags_array: Option<&GenericStringArray<OffsetSize>>,
+) -> Result<ArrayRef, ArrowError> {
+ let builder: GenericStringBuilder<OffsetSize> =
GenericStringBuilder::with_capacity(0, 0);
+ let mut list_builder = ListBuilder::new(builder);
+
+ process_regexp_array_match!(array, regex_array, flags_array, list_builder);
+
+ Ok(Arc::new(list_builder.finish()))
+}
+
+fn regexp_array_match_utf8view(
+ array: &StringViewArray,
+ regex_array: &StringViewArray,
+ flags_array: Option<&StringViewArray>,
+) -> Result<ArrayRef, ArrowError> {
+ let builder = StringViewBuilder::with_capacity(0);
+ let mut list_builder = ListBuilder::new(builder);
+
+ process_regexp_array_match!(array, regex_array, flags_array, list_builder);
Review Comment:
Confirmed this expands to the original logic, for both StringView (here) and
StringArrays (above), to match on an array of regex. Why the use of macros?
##########
arrow-string/src/regexp.rs:
##########
@@ -497,114 +559,292 @@ pub fn regexp_match(
mod tests {
use super::*;
- #[test]
- fn match_single_group() {
- let values = vec![
+ macro_rules! test_match_single_group {
+ ($test_name:ident, $values:expr, $patterns:expr, $arr_type:ty,
$builder_type:ty, $expected:expr) => {
+ #[test]
+ fn $test_name() {
+ let array: $arr_type = <$arr_type>::from($values);
+ let pattern: $arr_type = <$arr_type>::from($patterns);
+
+ let actual = regexp_match(&array, &pattern, None).unwrap();
+
+ let elem_builder: $builder_type = <$builder_type>::new();
+ let mut expected_builder = ListBuilder::new(elem_builder);
+
+ for val in $expected {
+ match val {
+ Some(v) => {
+ expected_builder.values().append_value(v);
+ expected_builder.append(true);
+ }
+ None => expected_builder.append(false),
+ }
+ }
+
+ let expected = expected_builder.finish();
+ let result =
actual.as_any().downcast_ref::<ListArray>().unwrap();
+ assert_eq!(&expected, result);
+ }
+ };
+ }
+
+ test_match_single_group!(
+ match_single_group_string,
+ vec![
Some("abc-005-def"),
Some("X-7-5"),
Some("X545"),
None,
Some("foobarbequebaz"),
Some("foobarbequebaz"),
- ];
- let array = StringArray::from(values);
- let mut pattern_values = vec![r".*-(\d*)-.*"; 4];
- pattern_values.push(r"(bar)(bequ1e)");
- pattern_values.push("");
- let pattern = GenericStringArray::<i32>::from(pattern_values);
- let actual = regexp_match(&array, &pattern, None).unwrap();
- let elem_builder: GenericStringBuilder<i32> =
GenericStringBuilder::new();
- let mut expected_builder = ListBuilder::new(elem_builder);
- expected_builder.values().append_value("005");
- expected_builder.append(true);
- expected_builder.values().append_value("7");
- expected_builder.append(true);
- expected_builder.append(false);
- expected_builder.append(false);
- expected_builder.append(false);
- expected_builder.values().append_value("");
- expected_builder.append(true);
Review Comment:
Much nicer not having this anymore.
##########
arrow-string/src/regexp.rs:
##########
@@ -497,114 +559,292 @@ pub fn regexp_match(
mod tests {
use super::*;
- #[test]
- fn match_single_group() {
- let values = vec![
+ macro_rules! test_match_single_group {
+ ($test_name:ident, $values:expr, $patterns:expr, $arr_type:ty,
$builder_type:ty, $expected:expr) => {
+ #[test]
+ fn $test_name() {
+ let array: $arr_type = <$arr_type>::from($values);
+ let pattern: $arr_type = <$arr_type>::from($patterns);
+
+ let actual = regexp_match(&array, &pattern, None).unwrap();
+
+ let elem_builder: $builder_type = <$builder_type>::new();
+ let mut expected_builder = ListBuilder::new(elem_builder);
+
+ for val in $expected {
+ match val {
+ Some(v) => {
+ expected_builder.values().append_value(v);
+ expected_builder.append(true);
+ }
+ None => expected_builder.append(false),
+ }
+ }
+
+ let expected = expected_builder.finish();
+ let result =
actual.as_any().downcast_ref::<ListArray>().unwrap();
+ assert_eq!(&expected, result);
+ }
+ };
+ }
+
+ test_match_single_group!(
+ match_single_group_string,
+ vec![
Some("abc-005-def"),
Some("X-7-5"),
Some("X545"),
None,
Some("foobarbequebaz"),
Some("foobarbequebaz"),
- ];
- let array = StringArray::from(values);
- let mut pattern_values = vec![r".*-(\d*)-.*"; 4];
- pattern_values.push(r"(bar)(bequ1e)");
- pattern_values.push("");
- let pattern = GenericStringArray::<i32>::from(pattern_values);
- let actual = regexp_match(&array, &pattern, None).unwrap();
- let elem_builder: GenericStringBuilder<i32> =
GenericStringBuilder::new();
- let mut expected_builder = ListBuilder::new(elem_builder);
- expected_builder.values().append_value("005");
- expected_builder.append(true);
- expected_builder.values().append_value("7");
- expected_builder.append(true);
- expected_builder.append(false);
- expected_builder.append(false);
- expected_builder.append(false);
- expected_builder.values().append_value("");
- expected_builder.append(true);
- let expected = expected_builder.finish();
- let result = actual.as_any().downcast_ref::<ListArray>().unwrap();
- assert_eq!(&expected, result);
- }
+ ],
+ vec![
+ r".*-(\d*)-.*",
+ r".*-(\d*)-.*",
+ r".*-(\d*)-.*",
+ r".*-(\d*)-.*",
+ r"(bar)(bequ1e)",
+ ""
+ ],
+ StringArray,
+ GenericStringBuilder<i32>,
+ [Some("005"), Some("7"), None, None, None, Some("")]
+ );
+ test_match_single_group!(
+ match_single_group_string_view,
+ vec![
+ Some("abc-005-def"),
+ Some("X-7-5"),
+ Some("X545"),
+ None,
+ Some("foobarbequebaz"),
+ Some("foobarbequebaz"),
+ ],
+ vec![
+ r".*-(\d*)-.*",
+ r".*-(\d*)-.*",
+ r".*-(\d*)-.*",
+ r".*-(\d*)-.*",
+ r"(bar)(bequ1e)",
+ ""
+ ],
+ StringViewArray,
+ StringViewBuilder,
+ [Some("005"), Some("7"), None, None, None, Some("")]
+ );
+
+ macro_rules! test_match_single_group_with_flags {
+ ($test_name:ident, $values:expr, $patterns:expr, $flags:expr,
$array_type:ty, $builder_type:ty, $expected:expr) => {
+ #[test]
+ fn $test_name() {
+ let array: $array_type = <$array_type>::from($values);
+ let pattern: $array_type = <$array_type>::from($patterns);
+ let flags: $array_type = <$array_type>::from($flags);
+
+ let actual = regexp_match(&array, &pattern,
Some(&flags)).unwrap();
+
+ let elem_builder: $builder_type = <$builder_type>::new();
+ let mut expected_builder = ListBuilder::new(elem_builder);
+
+ for val in $expected {
+ match val {
+ Some(v) => {
+ expected_builder.values().append_value(v);
+ expected_builder.append(true);
+ }
+ None => {
+ expected_builder.append(false);
+ }
+ }
+ }
- #[test]
- fn match_single_group_with_flags() {
- let values = vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"),
None];
- let array = StringArray::from(values);
- let pattern = StringArray::from(vec![r"x.*-(\d*)-.*"; 4]);
- let flags = StringArray::from(vec!["i"; 4]);
- let actual = regexp_match(&array, &pattern, Some(&flags)).unwrap();
- let elem_builder: GenericStringBuilder<i32> =
GenericStringBuilder::with_capacity(0, 0);
- let mut expected_builder = ListBuilder::new(elem_builder);
- expected_builder.append(false);
- expected_builder.values().append_value("7");
- expected_builder.append(true);
- expected_builder.append(false);
- expected_builder.append(false);
- let expected = expected_builder.finish();
- let result = actual.as_any().downcast_ref::<ListArray>().unwrap();
- assert_eq!(&expected, result);
+ let expected = expected_builder.finish();
+ let result =
actual.as_any().downcast_ref::<ListArray>().unwrap();
+ assert_eq!(&expected, result);
+ }
+ };
}
- #[test]
- fn match_scalar_pattern() {
- let values = vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"),
None];
- let array = StringArray::from(values);
- let pattern = Scalar::new(StringArray::from(vec![r"x.*-(\d*)-.*"; 1]));
- let flags = Scalar::new(StringArray::from(vec!["i"; 1]));
- let actual = regexp_match(&array, &pattern, Some(&flags)).unwrap();
- let elem_builder: GenericStringBuilder<i32> =
GenericStringBuilder::with_capacity(0, 0);
- let mut expected_builder = ListBuilder::new(elem_builder);
- expected_builder.append(false);
- expected_builder.values().append_value("7");
- expected_builder.append(true);
- expected_builder.append(false);
- expected_builder.append(false);
- let expected = expected_builder.finish();
- let result = actual.as_any().downcast_ref::<ListArray>().unwrap();
- assert_eq!(&expected, result);
-
- // No flag
- let values = vec![Some("abc-005-def"), Some("x-7-5"), Some("X545"),
None];
- let array = StringArray::from(values);
- let actual = regexp_match(&array, &pattern, None).unwrap();
- let result = actual.as_any().downcast_ref::<ListArray>().unwrap();
- assert_eq!(&expected, result);
+ test_match_single_group_with_flags!(
+ match_single_group_with_flags_string,
+ vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None],
+ vec![r"x.*-(\d*)-.*"; 4],
+ vec!["i"; 4],
+ StringArray,
+ GenericStringBuilder<i32>,
+ [None, Some("7"), None, None]
+ );
+ test_match_single_group_with_flags!(
+ match_single_group_with_flags_stringview,
+ vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None],
+ vec![r"x.*-(\d*)-.*"; 4],
+ vec!["i"; 4],
+ StringViewArray,
+ StringViewBuilder,
+ [None, Some("7"), None, None]
+ );
+
+ macro_rules! test_match_scalar_pattern {
+ ($test_name:ident, $values:expr, $pattern:expr, $flag:expr,
$array_type:ty, $builder_type:ty, $expected:expr) => {
+ #[test]
+ fn $test_name() {
+ let array: $array_type = <$array_type>::from($values);
+
+ let pattern_scalar =
Scalar::new(<$array_type>::from(vec![$pattern; 1]));
+ let flag_scalar = Scalar::new(<$array_type>::from(vec![$flag;
1]));
+
+ let actual = regexp_match(&array, &pattern_scalar,
Some(&flag_scalar)).unwrap();
+
+ let elem_builder: $builder_type = <$builder_type>::new();
+ let mut expected_builder = ListBuilder::new(elem_builder);
+
+ for val in $expected {
+ match val {
+ Some(v) => {
+ expected_builder.values().append_value(v);
+ expected_builder.append(true);
+ }
+ None => expected_builder.append(false),
+ }
+ }
+
+ let expected = expected_builder.finish();
+ let result =
actual.as_any().downcast_ref::<ListArray>().unwrap();
+ assert_eq!(&expected, result);
+ }
+ };
}
- #[test]
- fn match_scalar_no_pattern() {
- let values = vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"),
None];
- let array = StringArray::from(values);
- let pattern = Scalar::new(new_null_array(&DataType::Utf8, 1));
- let actual = regexp_match(&array, &pattern, None).unwrap();
- let elem_builder: GenericStringBuilder<i32> =
GenericStringBuilder::with_capacity(0, 0);
- let mut expected_builder = ListBuilder::new(elem_builder);
- expected_builder.append(false);
- expected_builder.append(false);
- expected_builder.append(false);
- expected_builder.append(false);
- let expected = expected_builder.finish();
- let result = actual.as_any().downcast_ref::<ListArray>().unwrap();
- assert_eq!(&expected, result);
+ test_match_scalar_pattern!(
+ match_scalar_pattern_string_with_flags,
+ vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None],
+ r"x.*-(\d*)-.*",
+ Some("i"),
+ StringArray,
+ GenericStringBuilder<i32>,
+ [None, Some("7"), None, None]
+ );
+ test_match_scalar_pattern!(
+ match_scalar_pattern_stringview_with_flags,
+ vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None],
+ r"x.*-(\d*)-.*",
+ Some("i"),
+ StringViewArray,
+ StringViewBuilder,
+ [None, Some("7"), None, None]
+ );
+
+ test_match_scalar_pattern!(
+ match_scalar_pattern_string_no_flags,
+ vec![Some("abc-005-def"), Some("x-7-5"), Some("X545"), None],
+ r"x.*-(\d*)-.*",
+ None::<&str>,
+ StringArray,
+ GenericStringBuilder<i32>,
+ [None, Some("7"), None, None]
+ );
Review Comment:
There is no difference with or without flags. Might be nice to have test
cases demonstrating this difference. 🙏🏼
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]