This is an automated email from the ASF dual-hosted git repository.

dockerzhang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/inlong.git


The following commit(s) were added to refs/heads/master by this push:
     new 45e7afc108 [INLONG-9788][Sort] Supports data parse that contains delimiters in kv and csv data format (#9789)
45e7afc108 is described below

commit 45e7afc108c0987d79c252f44cfe974273c48b6e
Author: baomingyu <[email protected]>
AuthorDate: Sun Mar 10 16:49:55 2024 +0800

    [INLONG-9788][Sort] Supports data parse that contains delimiters in kv and csv data format (#9789)
---
 .../inlong/sort/formats/util/StringUtils.java      |  74 ++++++++++---
 .../sort/formats/common/StringUtilsTest.java       | 115 +++++++++++++++++++++
 .../inlong/sort/formats/csv/CsvUtilsTest.java      |  10 +-
 .../apache/inlong/sort/formats/kv/KvUtilsTest.java |  36 +++++--
 .../apache/inlong/sort/formats/kv/KvUtilsTest.java |  30 ++++--
 5 files changed, 226 insertions(+), 39 deletions(-)

diff --git a/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java b/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java
index 5ba0c248f4..3d06625212 100644
--- a/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java
+++ b/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java
@@ -86,6 +86,9 @@ public class StringUtils {
         String key = "";
         String value;
 
+        String lastKey = null;
+        String lastValue = null;
+
         int state = STATE_KEY;
 
         /*
@@ -93,9 +96,9 @@ public class StringUtils {
          */
         int kvState = STATE_KEY;
 
+        char lastCh = 0;
         for (int i = 0; i < text.length(); ++i) {
             char ch = text.charAt(i);
-
             if (ch == kvDelimiter) {
                 switch (state) {
                     case STATE_KEY:
@@ -104,8 +107,8 @@ public class StringUtils {
                         state = STATE_VALUE;
                         break;
                     case STATE_VALUE:
-                        throw new IllegalArgumentException("Unexpected token " +
-                                ch + " at position " + i + ".");
+                        stringBuilder.append(ch);
+                        break;
                     case STATE_ESCAPING:
                         stringBuilder.append(ch);
                         state = kvState;
@@ -117,12 +120,22 @@ public class StringUtils {
             } else if (ch == entryDelimiter) {
                 switch (state) {
                     case STATE_KEY:
-                        throw new IllegalArgumentException("Unexpected token " +
-                                ch + " at position " + i + ".");
+                        key = lastKey;
+                        if (lastValue == null) {
+                            value = ch + stringBuilder.toString();
+                        } else {
+                            value = lastValue + ch + stringBuilder.toString();
+                        }
+                        fields.put(key, value);
+                        lastKey = key;
+                        lastValue = value;
+                        stringBuilder.setLength(0);
+                        break;
                     case STATE_VALUE:
                         value = stringBuilder.toString();
                         fields.put(key, value);
-
+                        lastKey = key;
+                        lastValue = value;
                         stringBuilder.setLength(0);
                         state = STATE_KEY;
                         break;
@@ -137,6 +150,12 @@ public class StringUtils {
             } else if (escapeChar != null && ch == escapeChar) {
                 switch (state) {
                     case STATE_KEY:
+                        if (lastCh != 0) {
+                            stringBuilder.append(lastCh);
+                        }
+                        kvState = state;
+                        state = STATE_ESCAPING;
+                        break;
                     case STATE_VALUE:
                         kvState = state;
                         state = STATE_ESCAPING;
@@ -152,6 +171,12 @@ public class StringUtils {
             } else if (quoteChar != null && ch == quoteChar) {
                 switch (state) {
                     case STATE_KEY:
+                        if (lastCh != 0) {
+                            stringBuilder.append(lastCh);
+                        }
+                        kvState = state;
+                        state = STATE_QUOTING;
+                        break;
                     case STATE_VALUE:
                         kvState = state;
                         state = STATE_QUOTING;
@@ -166,7 +191,18 @@ public class StringUtils {
                 }
             } else if (lineDelimiter != null && ch == lineDelimiter) {
                 switch (state) {
+                    case STATE_KEY:
+                        key = lastKey;
+                        stringBuilder.append(lastValue).append(lastCh);
+                        value = stringBuilder.toString();
+                        fields.put(key, value);
+                        lastKey = null;
+                        lastValue = null;
+                        stringBuilder.setLength(0);
+                        break;
                     case STATE_VALUE:
+                        lastKey = null;
+                        lastValue = null;
                         value = stringBuilder.toString();
                         fields.put(key, value);
                         Map<String, String> copyFields = new HashMap<>();
@@ -187,20 +223,33 @@ public class StringUtils {
             } else {
                 stringBuilder.append(ch);
             }
+            lastCh = ch;
         }
 
         switch (state) {
             case STATE_KEY:
-                throw new IllegalArgumentException("Dangling key.");
+                if (lastKey != null && lastValue != null && text != null) {
+                    fields.put(lastKey, lastValue + lastCh);
+                }
+                lines.add(fields);
+                return lines;
             case STATE_VALUE:
                 value = stringBuilder.toString();
                 fields.put(key, value);
                 lines.add(fields);
                 return lines;
             case STATE_ESCAPING:
-                throw new IllegalArgumentException("Not closed escaping.");
             case STATE_QUOTING:
-                throw new IllegalArgumentException("Not closed quoting.");
+                value = stringBuilder.toString();
+                String oldValue = fields.get(key);
+                if (value != null && !"".equals(value)
+                        && oldValue != null && !"".equals(oldValue)) {
+                    fields.put(key, oldValue + value);
+                } else if (value != null && !"".equals(value)) {
+                    fields.put(key, value);
+                }
+                lines.add(fields);
+                return lines;
             default:
                 throw new IllegalStateException();
         }
@@ -441,6 +490,8 @@ public class StringUtils {
 
         switch (state) {
             case STATE_NORMAL:
+            case STATE_ESCAPING:
+            case STATE_QUOTING:
                 String field = stringBuilder.toString();
                 fields.add(field);
                 lines.add(fields.toArray(new String[0]));
@@ -450,11 +501,6 @@ public class StringUtils {
                     result[i] = lines.get(i);
                 }
                 return result;
-
-            case STATE_ESCAPING:
-                throw new IllegalArgumentException(String.format("Not closed escaping. Text=[%s].", text));
-            case STATE_QUOTING:
-                throw new IllegalArgumentException(String.format("Not closed quoting. Text=[%s].", text));
             default:
                 throw new IllegalStateException(String.format("Text=[%s].", text));
         }
diff --git a/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java b/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java
new file mode 100644
index 0000000000..714652664e
--- /dev/null
+++ b/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.inlong.sort.formats.common;
+
+import org.apache.inlong.sort.formats.util.StringUtils;
+
+import org.junit.Test;
+
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.Assert.assertEquals;
+
+public class StringUtilsTest {
+
+    @Test
+    public void testSplitKvString() {
+
+        String kvString1 = "name=n&age=10";
+        Map<String, String> map1 = StringUtils.splitKv(kvString1, '&',
+                '=', '\\', '\'');
+        assertEquals("n", map1.get("name"));
+        assertEquals("10", map1.get("age"));
+
+        String kvString2 = "name=&age=20&";
+        Map<String, String> map2 = StringUtils.splitKv(kvString2, '&',
+                '=', '\\', '\'');
+        assertEquals("", map2.get("name"));
+        assertEquals("20&", map2.get("age"));
+
+        String kvString3 = "name==&age=20&&&value=aaa&dddd&";
+        Map<String, String> map3 = StringUtils.splitKv(kvString3, '&',
+                '=', '\\', '\'');
+        assertEquals("=", map3.get("name"));
+        assertEquals("20&&", map3.get("age"));
+        assertEquals("aaa&dddd&", map3.get("value"));
+
+        String kvString4 = "name==&age=20&&\nname1==&age1=20&&";
+        List<Map<String, String>> map4 = StringUtils.splitKv(kvString4, '&',
+                '=', '\\', '\'', '\n');
+        assertEquals("=", map4.get(0).get("name"));
+        assertEquals("20&&", map4.get(0).get("age"));
+        assertEquals("=", map4.get(0).get("name1"));
+        assertEquals("20&&", map4.get(0).get("age1"));
+
+        String kvString5 = "name==&age=20&&\nname1==&age1=20&&&value=aaa&dddd&";
+        List<Map<String, String>> map5 = StringUtils.splitKv(kvString5, '&',
+                '=', '\\', '\'', '\n');
+        assertEquals("=", map5.get(0).get("name"));
+        assertEquals("20&&", map5.get(0).get("age"));
+        assertEquals("=", map5.get(0).get("name1"));
+        assertEquals("20&&", map5.get(0).get("age1"));
+        assertEquals("aaa&dddd&", map5.get(0).get("value"));
+
+        String kvString6 = "name==&age=20&&\\";
+        List<Map<String, String>> map6 = StringUtils.splitKv(kvString6, '&',
+                '=', '\\', '\'', '\n');
+        assertEquals("=", map6.get(0).get("name"));
+        assertEquals("20&&", map6.get(0).get("age"));
+
+        String kvString7 = "name==&age=20&&'";
+        List<Map<String, String>> map7 = StringUtils.splitKv(kvString7, '&',
+                '=', '\\', '\'', '\n');
+        assertEquals("=", map7.get(0).get("name"));
+        assertEquals("20&&", map7.get(0).get("age"));
+
+        String kvString8 = "name=\\=&age=20&a&'";
+        List<Map<String, String>> map8 = StringUtils.splitKv(kvString8, '&',
+                '=', '\\', '\'', '\n');
+        assertEquals("=", map8.get(0).get("name"));
+        assertEquals("20&a&", map8.get(0).get("age"));
+
+        String kvString9 = "name=\\=&age=20&a\\&'";
+        List<Map<String, String>> map9 = StringUtils.splitKv(kvString9, '&',
+                '=', '\\', '\'', '\n');
+        assertEquals("=", map8.get(0).get("name"));
+        assertEquals("20&a&", map8.get(0).get("age"));
+    }
+
+    @Test
+    public void testSplitCsvString() {
+        String csvString1 = "name|age=20\\||&'";
+        String[][] csv1Array1 = StringUtils.splitCsv(csvString1, '|',
+                '\\', '\'', '\n');
+
+        assertEquals("age=20|", csv1Array1[0][1]);
+        assertEquals("&", csv1Array1[0][2]);
+
+        String csvString2 = "name|age=20\\||&'\n\name|age=20\\||&'\n\n|home|\\home\\";
+        String[][] csv1Array2 = StringUtils.splitCsv(csvString2, '|',
+                '\\', '\'', '\n');
+
+        assertEquals("name", csv1Array2[0][0]);
+        assertEquals("age=20|", csv1Array2[0][1]);
+        assertEquals("&\n\name|age=20\\||&", csv1Array2[0][2]);
+        assertEquals("", csv1Array2[2][0]);
+        assertEquals("home", csv1Array2[2][1]);
+        assertEquals("home", csv1Array2[2][2]);
+    }
+}
diff --git a/inlong-sort/sort-formats/format-row/format-csv/src/test/java/org/apache/inlong/sort/formats/csv/CsvUtilsTest.java b/inlong-sort/sort-formats/format-row/format-csv/src/test/java/org/apache/inlong/sort/formats/csv/CsvUtilsTest.java
index da2820d87b..47157dc522 100644
--- a/inlong-sort/sort-formats/format-row/format-csv/src/test/java/org/apache/inlong/sort/formats/csv/CsvUtilsTest.java
+++ b/inlong-sort/sort-formats/format-row/format-csv/src/test/java/org/apache/inlong/sort/formats/csv/CsvUtilsTest.java
@@ -67,14 +67,16 @@ public class CsvUtilsTest {
                 StringUtils.splitCsv("a|\\\"b|c\\\"|d", '|', '\\', '\"'));
     }
 
-    @Test(expected = IllegalArgumentException.class)
+    @Test
     public void testSplitUnclosedEscaping() {
-        StringUtils.splitCsv("a|b\\", '|', '\\', '\"');
+        String[] csvStr = StringUtils.splitCsv("a|b\\", '|', '\\', '\"');
+        Assert.assertEquals("b", csvStr[1]);
     }
 
-    @Test(expected = IllegalArgumentException.class)
+    @Test
     public void testSplitUnclosedQuoting() {
-        StringUtils.splitCsv("a|b\"", '|', '\\', '\"');
+        String[] csvStr = StringUtils.splitCsv("a|b\"", '|', '\\', '\"');
+        Assert.assertEquals("b", csvStr[1]);
     }
 
     @Test
diff --git a/inlong-sort/sort-formats/format-row/format-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java b/inlong-sort/sort-formats/format-row/format-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java
index 8a79966e64..37bbe758aa 100644
--- a/inlong-sort/sort-formats/format-row/format-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java
+++ b/inlong-sort/sort-formats/format-row/format-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java
@@ -17,9 +17,11 @@
 
 package org.apache.inlong.sort.formats.kv;
 
+import org.junit.Assert;
 import org.junit.Test;
 
 import java.util.HashMap;
+import java.util.Map;
 
 import static org.apache.inlong.sort.formats.util.StringUtils.concatKv;
 import static org.apache.inlong.sort.formats.util.StringUtils.splitKv;
@@ -152,7 +154,8 @@ public class KvUtilsTest {
                         put("f4", "d");
                     }
                 },
-                splitKv("f1=a&f2=\\\"b&f3=c\\\"&f4=d", '&', '=', '\\', '\"'));
+                splitKv("f1=a&f2=\\\"b&f3=c\\\"&f4=d", '&',
+                        '=', '\\', '\"'));
 
         assertEquals(
                 new HashMap<String, String>() {
@@ -174,29 +177,40 @@ public class KvUtilsTest {
                 splitKv("=a&f=", '&', '=', '\\', '\"'));
     }
 
-    @Test(expected = IllegalArgumentException.class)
+    @Test
     public void testSplitNestedValue() {
-        splitKv("f1=a=a&f2=b&f3=c", '&', '=', '\\', '\"');
+        Map<String, String> kvMap = splitKv("f1=a=a&f2=b&f3=c", '&', '=',
+                '\\', '\"');
+        Assert.assertEquals("a=a", kvMap.get("f1"));
+
     }
 
-    @Test(expected = IllegalArgumentException.class)
+    @Test
     public void testSplitUnclosedEscaping() {
-        splitKv("f1=a&f2=b\\", '&', '=', '\\', '\"');
+        Map<String, String> kvMap = splitKv("f1=a&f2=b\\", '&', '=',
+                '\\', '\"');
+        Assert.assertEquals("b", kvMap.get("f2"));
     }
 
-    @Test(expected = IllegalArgumentException.class)
+    @Test
     public void testSplitUnclosedQuoting() {
-        splitKv("f1=a&f2=b\"", '&', '=', '\\', '\"');
+        Map<String, String> kvMap = splitKv("f1=a&f2=b\"",
+                '&', '=', '\\', '\"');
+        Assert.assertEquals("b", kvMap.get("f2"));
     }
 
-    @Test(expected = IllegalArgumentException.class)
+    @Test
     public void testSplitDanglingKey1() {
-        splitKv("f1", '&', '=', null, null);
+        Map<String, String> kvMap = splitKv("f1", '&',
+                '=', null, null);
+        Assert.assertEquals(null, kvMap.get("f1"));
     }
 
-    @Test(expected = IllegalArgumentException.class)
+    @Test
     public void testSplitDanglingKey2() {
-        splitKv("f1&f2=3", '&', '=', null, null);
+        Map<String, String> kvMap = splitKv("f1&f2=3", '&',
+                '=', null, null);
+        Assert.assertEquals("3", kvMap.get("f2"));
     }
 
     @Test
diff --git a/inlong-sort/sort-formats/format-rowdata/format-rowdata-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java b/inlong-sort/sort-formats/format-rowdata/format-rowdata-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java
index 4ad02f88e8..9452395f59 100644
--- a/inlong-sort/sort-formats/format-rowdata/format-rowdata-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java
+++ b/inlong-sort/sort-formats/format-rowdata/format-rowdata-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java
@@ -17,6 +17,7 @@
 
 package org.apache.inlong.sort.formats.kv;
 
+import org.junit.Assert;
 import org.junit.Test;
 
 import java.util.ArrayList;
@@ -194,29 +195,38 @@ public class KvUtilsTest {
                 splitKv("=a&f=", '&', '=', '\\', '\"'));
     }
 
-    @Test(expected = RuntimeException.class)
+    @Test
     public void testSplitNestedValue() {
-        splitKv("f1=a=a&f2=b&f3=c", '&', '=', '\\', '\"');
+        Map<String, String> kvMap = splitKv("f1=a=a&f2=b&f3=c", '&', '=',
+                '\\', '\"');
+        Assert.assertEquals("a=a", kvMap.get("f1"));
     }
 
-    @Test(expected = RuntimeException.class)
+    @Test
     public void testSplitUnclosedEscaping() {
-        splitKv("f1=a&f2=b\\", '&', '=', '\\', '\"');
+        Map<String, String> kvMap = splitKv("f1=a&f2=b\\", '&', '=',
+                '\\', '\"');
+        Assert.assertEquals("b", kvMap.get("f2"));
     }
 
-    @Test(expected = RuntimeException.class)
+    @Test
     public void testSplitUnclosedQuoting() {
-        splitKv("f1=a&f2=b\"", '&', '=', '\\', '\"');
+        Map<String, String> kvMap = splitKv("f1=a&f2=b\"", '&', '=',
+                '\\', '\"');
+        Assert.assertEquals("b", kvMap.get("f2"));
     }
 
-    @Test(expected = RuntimeException.class)
+    @Test
     public void testSplitDanglingKey1() {
-        splitKv("f1", '&', '=', null, null);
+        Map<String, String> kvMap = splitKv("f1", '&', '=',
+                null, null);
+        Assert.assertEquals(null, kvMap.get("f1"));
     }
 
-    @Test(expected = RuntimeException.class)
+    @Test
     public void testSplitDanglingKey2() {
-        splitKv("f1&f2=3", '&', '=', null, null);
+        Map<String, String> kvMap = splitKv("f1&f2=3", '&', '=',
+                null, null);
     }
 
     @Test

Reply via email to