This is an automated email from the ASF dual-hosted git repository.
dockerzhang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/inlong.git
The following commit(s) were added to refs/heads/master by this push:
new 45e7afc108 [INLONG-9788][Sort] Supports data parse that contains
delimiters in kv and csv data format (#9789)
45e7afc108 is described below
commit 45e7afc108c0987d79c252f44cfe974273c48b6e
Author: baomingyu <[email protected]>
AuthorDate: Sun Mar 10 16:49:55 2024 +0800
[INLONG-9788][Sort] Supports data parse that contains delimiters in kv and
csv data format (#9789)
---
.../inlong/sort/formats/util/StringUtils.java | 74 ++++++++++---
.../sort/formats/common/StringUtilsTest.java | 115 +++++++++++++++++++++
.../inlong/sort/formats/csv/CsvUtilsTest.java | 10 +-
.../apache/inlong/sort/formats/kv/KvUtilsTest.java | 36 +++++--
.../apache/inlong/sort/formats/kv/KvUtilsTest.java | 30 ++++--
5 files changed, 226 insertions(+), 39 deletions(-)
diff --git a/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java b/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java
index 5ba0c248f4..3d06625212 100644
--- a/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java
+++ b/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java
@@ -86,6 +86,9 @@ public class StringUtils {
String key = "";
String value;
+ String lastKey = null;
+ String lastValue = null;
+
int state = STATE_KEY;
/*
@@ -93,9 +96,9 @@ public class StringUtils {
*/
int kvState = STATE_KEY;
+ char lastCh = 0;
for (int i = 0; i < text.length(); ++i) {
char ch = text.charAt(i);
-
if (ch == kvDelimiter) {
switch (state) {
case STATE_KEY:
@@ -104,8 +107,8 @@ public class StringUtils {
state = STATE_VALUE;
break;
case STATE_VALUE:
- throw new IllegalArgumentException("Unexpected token " +
- ch + " at position " + i + ".");
+ stringBuilder.append(ch);
+ break;
case STATE_ESCAPING:
stringBuilder.append(ch);
state = kvState;
@@ -117,12 +120,22 @@ public class StringUtils {
} else if (ch == entryDelimiter) {
switch (state) {
case STATE_KEY:
- throw new IllegalArgumentException("Unexpected token " +
- ch + " at position " + i + ".");
+ key = lastKey;
+ if (lastValue == null) {
+ value = ch + stringBuilder.toString();
+ } else {
+ value = lastValue + ch + stringBuilder.toString();
+ }
+ fields.put(key, value);
+ lastKey = key;
+ lastValue = value;
+ stringBuilder.setLength(0);
+ break;
case STATE_VALUE:
value = stringBuilder.toString();
fields.put(key, value);
-
+ lastKey = key;
+ lastValue = value;
stringBuilder.setLength(0);
state = STATE_KEY;
break;
@@ -137,6 +150,12 @@ public class StringUtils {
} else if (escapeChar != null && ch == escapeChar) {
switch (state) {
case STATE_KEY:
+ if (lastCh != 0) {
+ stringBuilder.append(lastCh);
+ }
+ kvState = state;
+ state = STATE_ESCAPING;
+ break;
case STATE_VALUE:
kvState = state;
state = STATE_ESCAPING;
@@ -152,6 +171,12 @@ public class StringUtils {
} else if (quoteChar != null && ch == quoteChar) {
switch (state) {
case STATE_KEY:
+ if (lastCh != 0) {
+ stringBuilder.append(lastCh);
+ }
+ kvState = state;
+ state = STATE_QUOTING;
+ break;
case STATE_VALUE:
kvState = state;
state = STATE_QUOTING;
@@ -166,7 +191,18 @@ public class StringUtils {
}
} else if (lineDelimiter != null && ch == lineDelimiter) {
switch (state) {
+ case STATE_KEY:
+ key = lastKey;
+ stringBuilder.append(lastValue).append(lastCh);
+ value = stringBuilder.toString();
+ fields.put(key, value);
+ lastKey = null;
+ lastValue = null;
+ stringBuilder.setLength(0);
+ break;
case STATE_VALUE:
+ lastKey = null;
+ lastValue = null;
value = stringBuilder.toString();
fields.put(key, value);
Map<String, String> copyFields = new HashMap<>();
@@ -187,20 +223,33 @@ public class StringUtils {
} else {
stringBuilder.append(ch);
}
+ lastCh = ch;
}
switch (state) {
case STATE_KEY:
- throw new IllegalArgumentException("Dangling key.");
+ if (lastKey != null && lastValue != null && text != null) {
+ fields.put(lastKey, lastValue + lastCh);
+ }
+ lines.add(fields);
+ return lines;
case STATE_VALUE:
value = stringBuilder.toString();
fields.put(key, value);
lines.add(fields);
return lines;
case STATE_ESCAPING:
- throw new IllegalArgumentException("Not closed escaping.");
case STATE_QUOTING:
- throw new IllegalArgumentException("Not closed quoting.");
+ value = stringBuilder.toString();
+ String oldValue = fields.get(key);
+ if (value != null && !"".equals(value)
+ && oldValue != null && !"".equals(oldValue)) {
+ fields.put(key, oldValue + value);
+ } else if (value != null && !"".equals(value)) {
+ fields.put(key, value);
+ }
+ lines.add(fields);
+ return lines;
default:
throw new IllegalStateException();
}
@@ -441,6 +490,8 @@ public class StringUtils {
switch (state) {
case STATE_NORMAL:
+ case STATE_ESCAPING:
+ case STATE_QUOTING:
String field = stringBuilder.toString();
fields.add(field);
lines.add(fields.toArray(new String[0]));
@@ -450,11 +501,6 @@ public class StringUtils {
result[i] = lines.get(i);
}
return result;
-
- case STATE_ESCAPING:
- throw new IllegalArgumentException(String.format("Not closed escaping. Text=[%s].", text));
- case STATE_QUOTING:
- throw new IllegalArgumentException(String.format("Not closed quoting. Text=[%s].", text));
default:
throw new IllegalStateException(String.format("Text=[%s].", text));
}
diff --git a/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java b/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java
new file mode 100644
index 0000000000..714652664e
--- /dev/null
+++ b/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.inlong.sort.formats.common;
+
+import org.apache.inlong.sort.formats.util.StringUtils;
+
+import org.junit.Test;
+
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.Assert.assertEquals;
+
+public class StringUtilsTest {
+
+ @Test
+ public void testSplitKvString() {
+
+ String kvString1 = "name=n&age=10";
+ Map<String, String> map1 = StringUtils.splitKv(kvString1, '&',
+ '=', '\\', '\'');
+ assertEquals("n", map1.get("name"));
+ assertEquals("10", map1.get("age"));
+
+ String kvString2 = "name=&age=20&";
+ Map<String, String> map2 = StringUtils.splitKv(kvString2, '&',
+ '=', '\\', '\'');
+ assertEquals("", map2.get("name"));
+ assertEquals("20&", map2.get("age"));
+
+ String kvString3 = "name==&age=20&&&value=aaa&dddd&";
+ Map<String, String> map3 = StringUtils.splitKv(kvString3, '&',
+ '=', '\\', '\'');
+ assertEquals("=", map3.get("name"));
+ assertEquals("20&&", map3.get("age"));
+ assertEquals("aaa&dddd&", map3.get("value"));
+
+ String kvString4 = "name==&age=20&&\nname1==&age1=20&&";
+ List<Map<String, String>> map4 = StringUtils.splitKv(kvString4, '&',
+ '=', '\\', '\'', '\n');
+ assertEquals("=", map4.get(0).get("name"));
+ assertEquals("20&&", map4.get(0).get("age"));
+ assertEquals("=", map4.get(0).get("name1"));
+ assertEquals("20&&", map4.get(0).get("age1"));
+
+ String kvString5 = "name==&age=20&&\nname1==&age1=20&&&value=aaa&dddd&";
+ List<Map<String, String>> map5 = StringUtils.splitKv(kvString5, '&',
+ '=', '\\', '\'', '\n');
+ assertEquals("=", map5.get(0).get("name"));
+ assertEquals("20&&", map5.get(0).get("age"));
+ assertEquals("=", map5.get(0).get("name1"));
+ assertEquals("20&&", map5.get(0).get("age1"));
+ assertEquals("aaa&dddd&", map5.get(0).get("value"));
+
+ String kvString6 = "name==&age=20&&\\";
+ List<Map<String, String>> map6 = StringUtils.splitKv(kvString6, '&',
+ '=', '\\', '\'', '\n');
+ assertEquals("=", map6.get(0).get("name"));
+ assertEquals("20&&", map6.get(0).get("age"));
+
+ String kvString7 = "name==&age=20&&'";
+ List<Map<String, String>> map7 = StringUtils.splitKv(kvString7, '&',
+ '=', '\\', '\'', '\n');
+ assertEquals("=", map7.get(0).get("name"));
+ assertEquals("20&&", map7.get(0).get("age"));
+
+ String kvString8 = "name=\\=&age=20&a&'";
+ List<Map<String, String>> map8 = StringUtils.splitKv(kvString8, '&',
+ '=', '\\', '\'', '\n');
+ assertEquals("=", map8.get(0).get("name"));
+ assertEquals("20&a&", map8.get(0).get("age"));
+
+ String kvString9 = "name=\\=&age=20&a\\&'";
+ List<Map<String, String>> map9 = StringUtils.splitKv(kvString9, '&',
+ '=', '\\', '\'', '\n');
+ assertEquals("=", map8.get(0).get("name"));
+ assertEquals("20&a&", map8.get(0).get("age"));
+ }
+
+ @Test
+ public void testSplitCsvString() {
+ String csvString1 = "name|age=20\\||&'";
+ String[][] csv1Array1 = StringUtils.splitCsv(csvString1, '|',
+ '\\', '\'', '\n');
+
+ assertEquals("age=20|", csv1Array1[0][1]);
+ assertEquals("&", csv1Array1[0][2]);
+
+ String csvString2 = "name|age=20\\||&'\n\name|age=20\\||&'\n\n|home|\\home\\";
+ String[][] csv1Array2 = StringUtils.splitCsv(csvString2, '|',
+ '\\', '\'', '\n');
+
+ assertEquals("name", csv1Array2[0][0]);
+ assertEquals("age=20|", csv1Array2[0][1]);
+ assertEquals("&\n\name|age=20\\||&", csv1Array2[0][2]);
+ assertEquals("", csv1Array2[2][0]);
+ assertEquals("home", csv1Array2[2][1]);
+ assertEquals("home", csv1Array2[2][2]);
+ }
+}
diff --git a/inlong-sort/sort-formats/format-row/format-csv/src/test/java/org/apache/inlong/sort/formats/csv/CsvUtilsTest.java b/inlong-sort/sort-formats/format-row/format-csv/src/test/java/org/apache/inlong/sort/formats/csv/CsvUtilsTest.java
index da2820d87b..47157dc522 100644
--- a/inlong-sort/sort-formats/format-row/format-csv/src/test/java/org/apache/inlong/sort/formats/csv/CsvUtilsTest.java
+++ b/inlong-sort/sort-formats/format-row/format-csv/src/test/java/org/apache/inlong/sort/formats/csv/CsvUtilsTest.java
@@ -67,14 +67,16 @@ public class CsvUtilsTest {
StringUtils.splitCsv("a|\\\"b|c\\\"|d", '|', '\\', '\"'));
}
- @Test(expected = IllegalArgumentException.class)
+ @Test
public void testSplitUnclosedEscaping() {
- StringUtils.splitCsv("a|b\\", '|', '\\', '\"');
+ String[] csvStr = StringUtils.splitCsv("a|b\\", '|', '\\', '\"');
+ Assert.assertEquals("b", csvStr[1]);
}
- @Test(expected = IllegalArgumentException.class)
+ @Test
public void testSplitUnclosedQuoting() {
- StringUtils.splitCsv("a|b\"", '|', '\\', '\"');
+ String[] csvStr = StringUtils.splitCsv("a|b\"", '|', '\\', '\"');
+ Assert.assertEquals("b", csvStr[1]);
}
@Test
diff --git a/inlong-sort/sort-formats/format-row/format-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java b/inlong-sort/sort-formats/format-row/format-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java
index 8a79966e64..37bbe758aa 100644
--- a/inlong-sort/sort-formats/format-row/format-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java
+++ b/inlong-sort/sort-formats/format-row/format-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java
@@ -17,9 +17,11 @@
package org.apache.inlong.sort.formats.kv;
+import org.junit.Assert;
import org.junit.Test;
import java.util.HashMap;
+import java.util.Map;
import static org.apache.inlong.sort.formats.util.StringUtils.concatKv;
import static org.apache.inlong.sort.formats.util.StringUtils.splitKv;
@@ -152,7 +154,8 @@ public class KvUtilsTest {
put("f4", "d");
}
},
- splitKv("f1=a&f2=\\\"b&f3=c\\\"&f4=d", '&', '=', '\\', '\"'));
+ splitKv("f1=a&f2=\\\"b&f3=c\\\"&f4=d", '&',
+ '=', '\\', '\"'));
assertEquals(
new HashMap<String, String>() {
@@ -174,29 +177,40 @@ public class KvUtilsTest {
splitKv("=a&f=", '&', '=', '\\', '\"'));
}
- @Test(expected = IllegalArgumentException.class)
+ @Test
public void testSplitNestedValue() {
- splitKv("f1=a=a&f2=b&f3=c", '&', '=', '\\', '\"');
+ Map<String, String> kvMap = splitKv("f1=a=a&f2=b&f3=c", '&', '=',
+ '\\', '\"');
+ Assert.assertEquals("a=a", kvMap.get("f1"));
+
}
- @Test(expected = IllegalArgumentException.class)
+ @Test
public void testSplitUnclosedEscaping() {
- splitKv("f1=a&f2=b\\", '&', '=', '\\', '\"');
+ Map<String, String> kvMap = splitKv("f1=a&f2=b\\", '&', '=',
+ '\\', '\"');
+ Assert.assertEquals("b", kvMap.get("f2"));
}
- @Test(expected = IllegalArgumentException.class)
+ @Test
public void testSplitUnclosedQuoting() {
- splitKv("f1=a&f2=b\"", '&', '=', '\\', '\"');
+ Map<String, String> kvMap = splitKv("f1=a&f2=b\"",
+ '&', '=', '\\', '\"');
+ Assert.assertEquals("b", kvMap.get("f2"));
}
- @Test(expected = IllegalArgumentException.class)
+ @Test
public void testSplitDanglingKey1() {
- splitKv("f1", '&', '=', null, null);
+ Map<String, String> kvMap = splitKv("f1", '&',
+ '=', null, null);
+ Assert.assertEquals(null, kvMap.get("f1"));
}
- @Test(expected = IllegalArgumentException.class)
+ @Test
public void testSplitDanglingKey2() {
- splitKv("f1&f2=3", '&', '=', null, null);
+ Map<String, String> kvMap = splitKv("f1&f2=3", '&',
+ '=', null, null);
+ Assert.assertEquals("3", kvMap.get("f2"));
}
@Test
diff --git a/inlong-sort/sort-formats/format-rowdata/format-rowdata-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java b/inlong-sort/sort-formats/format-rowdata/format-rowdata-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java
index 4ad02f88e8..9452395f59 100644
--- a/inlong-sort/sort-formats/format-rowdata/format-rowdata-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java
+++ b/inlong-sort/sort-formats/format-rowdata/format-rowdata-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java
@@ -17,6 +17,7 @@
package org.apache.inlong.sort.formats.kv;
+import org.junit.Assert;
import org.junit.Test;
import java.util.ArrayList;
@@ -194,29 +195,38 @@ public class KvUtilsTest {
splitKv("=a&f=", '&', '=', '\\', '\"'));
}
- @Test(expected = RuntimeException.class)
+ @Test
public void testSplitNestedValue() {
- splitKv("f1=a=a&f2=b&f3=c", '&', '=', '\\', '\"');
+ Map<String, String> kvMap = splitKv("f1=a=a&f2=b&f3=c", '&', '=',
+ '\\', '\"');
+ Assert.assertEquals("a=a", kvMap.get("f1"));
}
- @Test(expected = RuntimeException.class)
+ @Test
public void testSplitUnclosedEscaping() {
- splitKv("f1=a&f2=b\\", '&', '=', '\\', '\"');
+ Map<String, String> kvMap = splitKv("f1=a&f2=b\\", '&', '=',
+ '\\', '\"');
+ Assert.assertEquals("b", kvMap.get("f2"));
}
- @Test(expected = RuntimeException.class)
+ @Test
public void testSplitUnclosedQuoting() {
- splitKv("f1=a&f2=b\"", '&', '=', '\\', '\"');
+ Map<String, String> kvMap = splitKv("f1=a&f2=b\"", '&', '=',
+ '\\', '\"');
+ Assert.assertEquals("b", kvMap.get("f2"));
}
- @Test(expected = RuntimeException.class)
+ @Test
public void testSplitDanglingKey1() {
- splitKv("f1", '&', '=', null, null);
+ Map<String, String> kvMap = splitKv("f1", '&', '=',
+ null, null);
+ Assert.assertEquals(null, kvMap.get("f1"));
}
- @Test(expected = RuntimeException.class)
+ @Test
public void testSplitDanglingKey2() {
- splitKv("f1&f2=3", '&', '=', null, null);
+ Map<String, String> kvMap = splitKv("f1&f2=3", '&', '=',
+ null, null);
}
@Test