This is an automated email from the ASF dual-hosted git repository.

luchunliang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/inlong.git


The following commit(s) were added to refs/heads/master by this push:
     new b68bba28e2 [INLONG-11172][SDK] Transform REGEXP_MATCHES() function 
supports more flags (#11174)
b68bba28e2 is described below

commit b68bba28e2908370d86bc4c88951588c2f495f6d
Author: emptyOVO <[email protected]>
AuthorDate: Tue Sep 24 09:50:50 2024 +0800

    [INLONG-11172][SDK] Transform REGEXP_MATCHES() function supports more flags 
(#11174)
---
 .../process/function/RegexpMatchesFunction.java    | 44 +++++++++++++---------
 .../function/string/TestRegexpMatchesFunction.java | 13 +++++++
 2 files changed, 39 insertions(+), 18 deletions(-)

diff --git 
a/inlong-sdk/transform-sdk/src/main/java/org/apache/inlong/sdk/transform/process/function/RegexpMatchesFunction.java
 
b/inlong-sdk/transform-sdk/src/main/java/org/apache/inlong/sdk/transform/process/function/RegexpMatchesFunction.java
index d86641654d..e12e8c4a3f 100644
--- 
a/inlong-sdk/transform-sdk/src/main/java/org/apache/inlong/sdk/transform/process/function/RegexpMatchesFunction.java
+++ 
b/inlong-sdk/transform-sdk/src/main/java/org/apache/inlong/sdk/transform/process/function/RegexpMatchesFunction.java
@@ -41,7 +41,8 @@ import java.util.stream.Collectors;
  *             3) flags: one or more characters that control the behavior of a 
function,
  *                'g' flag can be used when we want to match all the 
substrings that occur,
  *                'i' flag to ignore case for matching,
- *                'm' flag allows regular expressions to match across multiple 
lines
+ *                'x' flag to extend syntax (ignoring whitespace and comments 
in regular expressions)
+ *                'm' and 'n' flags allow regular expressions to match across 
multiple lines
  */
 @TransformFunction(names = {"regexp_matches"})
 public class RegexpMatchesFunction implements ValueParser {
@@ -78,30 +79,37 @@ public class RegexpMatchesFunction implements ValueParser {
 
     private String regexpMatches(String input, String regex, String flags) {
         int flag = 0;
-        if (flags.contains("i")) {
-            flag |= Pattern.CASE_INSENSITIVE;
-        }
-        if (flags.contains("m")) {
-            flag |= Pattern.MULTILINE;
-        }
-        if (flags.contains("g")) {
-            flag |= Pattern.DOTALL;
+        if (flags != null) {
+            if (flags.contains("i")) {
+                flag |= Pattern.CASE_INSENSITIVE;
+            }
+            if (flags.contains("m") || flags.contains("n")) {
+                flag |= Pattern.MULTILINE;
+            }
+            if (flags.contains("s")) {
+                flag |= Pattern.DOTALL;
+            }
+            if (flags.contains("x")) {
+                flag |= Pattern.COMMENTS;
+            }
         }
 
         Pattern pattern = Pattern.compile(regex, flag);
         Matcher matcher = pattern.matcher(input);
-
+        boolean isGlobalMatch = flags != null && flags.contains("g");
         List<String[]> matches = new ArrayList<>();
 
         while (matcher.find()) {
-            if (matcher.groupCount() == 0) {
-                matches.add(new String[]{matcher.group(0)});
-            } else {
-                String[] matchGroups = new String[matcher.groupCount()];
-                for (int i = 1; i <= matcher.groupCount(); i++) {
-                    matchGroups[i - 1] = matcher.group(i) != null ? 
matcher.group(i) : "";
-                }
-                matches.add(matchGroups);
+            int groupCount = matcher.groupCount();
+            String[] matchGroups = new String[groupCount > 0 ? groupCount : 1];
+
+            for (int i = 0; i <= groupCount; i++) {
+                matchGroups[i == 0 ? 0 : i - 1] = matcher.group(i) != null ? 
matcher.group(i) : "";
+            }
+            matches.add(matchGroups);
+
+            if (!isGlobalMatch) {
+                break;
             }
         }
         return listToString(matches);
diff --git 
a/inlong-sdk/transform-sdk/src/test/java/org/apache/inlong/sdk/transform/process/function/string/TestRegexpMatchesFunction.java
 
b/inlong-sdk/transform-sdk/src/test/java/org/apache/inlong/sdk/transform/process/function/string/TestRegexpMatchesFunction.java
index f3569ecaff..48c2a6f835 100644
--- 
a/inlong-sdk/transform-sdk/src/test/java/org/apache/inlong/sdk/transform/process/function/string/TestRegexpMatchesFunction.java
+++ 
b/inlong-sdk/transform-sdk/src/test/java/org/apache/inlong/sdk/transform/process/function/string/TestRegexpMatchesFunction.java
@@ -37,6 +37,7 @@ public class TestRegexpMatchesFunction extends 
AbstractFunctionStringTestBase {
         TransformProcessor<String, String> processor1 = TransformProcessor
                 .create(config1, 
SourceDecoderFactory.createCsvDecoder(csvSource),
                         SinkEncoderFactory.createKvEncoder(kvSink));
+
         // case1: regexp_matches("The quick brown fox", "quick")
         List<String> output1 = processor1.transform("The quick brown 
fox|quick|5|2|1|3", new HashMap<>());
         Assert.assertEquals(1, output1.size());
@@ -46,11 +47,13 @@ public class TestRegexpMatchesFunction extends 
AbstractFunctionStringTestBase {
         TransformProcessor<String, String> processor2 = TransformProcessor
                 .create(config2, 
SourceDecoderFactory.createCsvDecoder(csvSource),
                         SinkEncoderFactory.createKvEncoder(kvSink));
+
         // case2: regexp_matches("User: Alice, ID: 12345", "User: (\\w+), ID: 
(\\d+)")
         List<String> output2 =
                 processor2.transform("User: Alice, ID: 12345|User: (\\\\w+), 
ID: (\\\\d+)|5|2|1|3", new HashMap<>());
         Assert.assertEquals(1, output2.size());
         Assert.assertEquals(output2.get(0), "result=[{\"Alice\",\"12345\"}]");
+
         // case3: regexp_matches("User: Alice, ID: 12345User: Alice, ID: 12345;
         // User: Bob, ID: 67890", "User: (\\w+), ID: (\\d+)")
         List<String> output3 =
@@ -62,10 +65,12 @@ public class TestRegexpMatchesFunction extends 
AbstractFunctionStringTestBase {
         TransformProcessor<String, String> processor3 = TransformProcessor
                 .create(config3, 
SourceDecoderFactory.createCsvDecoder(csvSource),
                         SinkEncoderFactory.createKvEncoder(kvSink));
+
         // case4: regexp_matches("foo 123 bar 456", "\\d+", "g")
         List<String> output4 = processor3.transform("foo 123 bar 
456|\\\\d+|g|2|1|3", new HashMap<>());
         Assert.assertEquals(1, output4.size());
         Assert.assertEquals(output4.get(0), "result=[{\"123\"},{\"456\"}]");
+
         // case5: regexp_matches("User: Alice, ID: 12345User: Alice, ID: 12345;
         // User: Bob, ID: 67890", "User: (\\w+),ID: (\\d+)", "g")
         List<String> output5 = processor3.transform(
@@ -77,6 +82,7 @@ public class TestRegexpMatchesFunction extends 
AbstractFunctionStringTestBase {
         TransformProcessor<String, String> processor4 = TransformProcessor
                 .create(config4, 
SourceDecoderFactory.createCsvDecoder(csvSource),
                         SinkEncoderFactory.createKvEncoder(kvSink));
+
         // case6: regexp_matches("Hello! hello World", "hello", "ig")
         List<String> output6 = processor4.transform("Hello! hello 
World|hello|ig|2|1|3", new HashMap<>());
         Assert.assertEquals(1, output6.size());
@@ -86,9 +92,16 @@ public class TestRegexpMatchesFunction extends 
AbstractFunctionStringTestBase {
         TransformProcessor<String, String> processor5 = TransformProcessor
                 .create(config5, 
SourceDecoderFactory.createCsvDecoder(csvSource),
                         SinkEncoderFactory.createKvEncoder(kvSink));
+
         // case7: regexp_matches("First line\nSecond line", "^Second", "m")
         List<String> output7 = processor5.transform("First line\\\nSecond 
line|^Second|m|2|1|3", new HashMap<>());
         Assert.assertEquals(1, output7.size());
         Assert.assertEquals(output7.get(0), "result=[{\"Second\"}]");
+
+        // without 'g' flag
+        // case8: regexp_matches("Hello! hello World", "hello", "i")
+        List<String> output8 = processor5.transform("Hello! hello 
World|hello|i|2|1|3", new HashMap<>());
+        Assert.assertEquals(1, output8.size());
+        Assert.assertEquals(output8.get(0), "result=[{\"Hello\"}]");
     }
 }

Reply via email to