This is an automated email from the ASF dual-hosted git repository.
luchunliang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/inlong.git
The following commit(s) were added to refs/heads/master by this push:
new b68bba28e2 [INLONG-11172][SDK] Transform REGEXP_MATCHES() function supports more flags (#11174)
b68bba28e2 is described below
commit b68bba28e2908370d86bc4c88951588c2f495f6d
Author: emptyOVO <[email protected]>
AuthorDate: Tue Sep 24 09:50:50 2024 +0800
[INLONG-11172][SDK] Transform REGEXP_MATCHES() function supports more flags (#11174)
---
.../process/function/RegexpMatchesFunction.java | 44 +++++++++++++---------
.../function/string/TestRegexpMatchesFunction.java | 13 +++++++
2 files changed, 39 insertions(+), 18 deletions(-)
diff --git a/inlong-sdk/transform-sdk/src/main/java/org/apache/inlong/sdk/transform/process/function/RegexpMatchesFunction.java b/inlong-sdk/transform-sdk/src/main/java/org/apache/inlong/sdk/transform/process/function/RegexpMatchesFunction.java
index d86641654d..e12e8c4a3f 100644
--- a/inlong-sdk/transform-sdk/src/main/java/org/apache/inlong/sdk/transform/process/function/RegexpMatchesFunction.java
+++ b/inlong-sdk/transform-sdk/src/main/java/org/apache/inlong/sdk/transform/process/function/RegexpMatchesFunction.java
@@ -41,7 +41,8 @@ import java.util.stream.Collectors;
* 3) flags: one or more characters that control the behavior of a
function,
* 'g' flag can be used when we want to match all the
substrings that occur,
* 'i' flag to ignore case for matching,
- * 'm' flag allows regular expressions to match across multiple
lines
+ * 'x' flag to extend syntax (ignoring whitespace and comments in regular expressions),
+ * 'm' and 'n' flags allow regular expressions to match across multiple lines
*/
@TransformFunction(names = {"regexp_matches"})
public class RegexpMatchesFunction implements ValueParser {
@@ -78,30 +79,37 @@ public class RegexpMatchesFunction implements ValueParser {
private String regexpMatches(String input, String regex, String flags) {
int flag = 0;
- if (flags.contains("i")) {
- flag |= Pattern.CASE_INSENSITIVE;
- }
- if (flags.contains("m")) {
- flag |= Pattern.MULTILINE;
- }
- if (flags.contains("g")) {
- flag |= Pattern.DOTALL;
+ if (flags != null) {
+ if (flags.contains("i")) {
+ flag |= Pattern.CASE_INSENSITIVE;
+ }
+ if (flags.contains("m") || flags.contains("n")) {
+ flag |= Pattern.MULTILINE;
+ }
+ if (flags.contains("s")) {
+ flag |= Pattern.DOTALL;
+ }
+ if (flags.contains("x")) {
+ flag |= Pattern.COMMENTS;
+ }
}
Pattern pattern = Pattern.compile(regex, flag);
Matcher matcher = pattern.matcher(input);
-
+ boolean isGlobalMatch = flags != null && flags.contains("g");
List<String[]> matches = new ArrayList<>();
while (matcher.find()) {
- if (matcher.groupCount() == 0) {
- matches.add(new String[]{matcher.group(0)});
- } else {
- String[] matchGroups = new String[matcher.groupCount()];
- for (int i = 1; i <= matcher.groupCount(); i++) {
- matchGroups[i - 1] = matcher.group(i) != null ?
matcher.group(i) : "";
- }
- matches.add(matchGroups);
+ int groupCount = matcher.groupCount();
+ String[] matchGroups = new String[groupCount > 0 ? groupCount : 1];
+
+ for (int i = 0; i <= groupCount; i++) {
+ matchGroups[i == 0 ? 0 : i - 1] = matcher.group(i) != null ?
matcher.group(i) : "";
+ }
+ matches.add(matchGroups);
+
+ if (!isGlobalMatch) {
+ break;
}
}
return listToString(matches);
diff --git a/inlong-sdk/transform-sdk/src/test/java/org/apache/inlong/sdk/transform/process/function/string/TestRegexpMatchesFunction.java b/inlong-sdk/transform-sdk/src/test/java/org/apache/inlong/sdk/transform/process/function/string/TestRegexpMatchesFunction.java
index f3569ecaff..48c2a6f835 100644
--- a/inlong-sdk/transform-sdk/src/test/java/org/apache/inlong/sdk/transform/process/function/string/TestRegexpMatchesFunction.java
+++ b/inlong-sdk/transform-sdk/src/test/java/org/apache/inlong/sdk/transform/process/function/string/TestRegexpMatchesFunction.java
@@ -37,6 +37,7 @@ public class TestRegexpMatchesFunction extends
AbstractFunctionStringTestBase {
TransformProcessor<String, String> processor1 = TransformProcessor
.create(config1,
SourceDecoderFactory.createCsvDecoder(csvSource),
SinkEncoderFactory.createKvEncoder(kvSink));
+
// case1: regexp_matches("The quick brown fox", "quick")
List<String> output1 = processor1.transform("The quick brown
fox|quick|5|2|1|3", new HashMap<>());
Assert.assertEquals(1, output1.size());
@@ -46,11 +47,13 @@ public class TestRegexpMatchesFunction extends
AbstractFunctionStringTestBase {
TransformProcessor<String, String> processor2 = TransformProcessor
.create(config2,
SourceDecoderFactory.createCsvDecoder(csvSource),
SinkEncoderFactory.createKvEncoder(kvSink));
+
// case2: regexp_matches("User: Alice, ID: 12345", "User: (\\w+), ID:
(\\d+)")
List<String> output2 =
processor2.transform("User: Alice, ID: 12345|User: (\\\\w+),
ID: (\\\\d+)|5|2|1|3", new HashMap<>());
Assert.assertEquals(1, output2.size());
Assert.assertEquals(output2.get(0), "result=[{\"Alice\",\"12345\"}]");
+
// case3: regexp_matches("User: Alice, ID: 12345User: Alice, ID: 12345;
// User: Bob, ID: 67890", "User: (\\w+), ID: (\\d+)")
List<String> output3 =
@@ -62,10 +65,12 @@ public class TestRegexpMatchesFunction extends
AbstractFunctionStringTestBase {
TransformProcessor<String, String> processor3 = TransformProcessor
.create(config3,
SourceDecoderFactory.createCsvDecoder(csvSource),
SinkEncoderFactory.createKvEncoder(kvSink));
+
// case4: regexp_matches("foo 123 bar 456", "\\d+", "g")
List<String> output4 = processor3.transform("foo 123 bar
456|\\\\d+|g|2|1|3", new HashMap<>());
Assert.assertEquals(1, output4.size());
Assert.assertEquals(output4.get(0), "result=[{\"123\"},{\"456\"}]");
+
// case5: regexp_matches("User: Alice, ID: 12345User: Alice, ID: 12345;
// User: Bob, ID: 67890", "User: (\\w+),ID: (\\d+)", "g")
List<String> output5 = processor3.transform(
@@ -77,6 +82,7 @@ public class TestRegexpMatchesFunction extends
AbstractFunctionStringTestBase {
TransformProcessor<String, String> processor4 = TransformProcessor
.create(config4,
SourceDecoderFactory.createCsvDecoder(csvSource),
SinkEncoderFactory.createKvEncoder(kvSink));
+
// case6: regexp_matches("Hello! hello World", "hello", "ig")
List<String> output6 = processor4.transform("Hello! hello
World|hello|ig|2|1|3", new HashMap<>());
Assert.assertEquals(1, output6.size());
@@ -86,9 +92,16 @@ public class TestRegexpMatchesFunction extends
AbstractFunctionStringTestBase {
TransformProcessor<String, String> processor5 = TransformProcessor
.create(config5,
SourceDecoderFactory.createCsvDecoder(csvSource),
SinkEncoderFactory.createKvEncoder(kvSink));
+
// case7: regexp_matches("First line\nSecond line", "^Second", "m")
List<String> output7 = processor5.transform("First line\\\nSecond
line|^Second|m|2|1|3", new HashMap<>());
Assert.assertEquals(1, output7.size());
Assert.assertEquals(output7.get(0), "result=[{\"Second\"}]");
+
+ // without 'g' flag
+ // case8: regexp_matches("Hello! hello World", "hello", "i")
+ List<String> output8 = processor5.transform("Hello! hello
World|hello|i|2|1|3", new HashMap<>());
+ Assert.assertEquals(1, output8.size());
+ Assert.assertEquals(output8.get(0), "result=[{\"Hello\"}]");
}
}