This is an automated email from the ASF dual-hosted git repository.
dockerzhang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/inlong.git
The following commit(s) were added to refs/heads/master by this push:
new e50f45ba8c [INLONG-10768][Sort] Csv utils support specified the max
split field size (#10769)
e50f45ba8c is described below
commit e50f45ba8c40f7d61ea0079657cc683f2619c8c3
Author: vernedeng <[email protected]>
AuthorDate: Sun Aug 11 15:44:58 2024 +0800
[INLONG-10768][Sort] Csv utils support specified the max split field size
(#10769)
---
.../inlong/sort/formats/util/StringUtils.java | 69 +++++++++++++++++++++-
.../sort/formats/common/StringUtilsTest.java | 41 +++++++++++++
2 files changed, 108 insertions(+), 2 deletions(-)
diff --git
a/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java
b/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java
index f33ad8e825..3ea6678ca1 100644
---
a/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java
+++
b/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java
@@ -387,6 +387,19 @@ public class StringUtils {
return splitCsv(text, delimiter, escapeChar, quoteChar, lineDelimiter,
false);
}
+ /**
+  * Splits the csv text without limiting the number of fields per line.
+  *
+  * <p>Delegates to the seven-argument overload, passing {@code null} as the max field size.
+  *
+  * @see StringUtils#splitCsv(String, Character, Character, Character, Character, boolean, Integer)
+  */
+ public static String[][] splitCsv(
+         @Nonnull String text,
+         @Nonnull Character delimiter,
+         @Nullable Character escapeChar,
+         @Nullable Character quoteChar,
+         @Nullable Character lineDelimiter,
+         boolean deleteHeadDelimiter) {
+     // The seven-argument overload only applies a limit when the value is non-null.
+     final Integer unlimitedFieldSize = null;
+     return splitCsv(
+             text, delimiter, escapeChar, quoteChar, lineDelimiter, deleteHeadDelimiter, unlimitedFieldSize);
+ }
+
/**
* Splits the csv text, which may contains multiple lines of data.
*
@@ -402,6 +415,7 @@ public class StringUtils {
* @param lineDelimiter The delimiter between lines, e.g. '\n'.
* @param deleteHeadDelimiter If true and the leading character of a line
* is a delimiter, it will be ignored.
+ * @param maxFieldSize The max number of fields in one single line; extra fields are
+ *                     merged into the last one. May be null for no limit.
* @return A 2-D String array representing the parsed data, where the 1st
* dimension is row and the 2nd dimension is column.
*/
@@ -411,9 +425,16 @@ public class StringUtils {
@Nullable Character escapeChar,
@Nullable Character quoteChar,
@Nullable Character lineDelimiter,
- boolean deleteHeadDelimiter) {
+ boolean deleteHeadDelimiter,
+ @Nullable Integer maxFieldSize) {
+ if (maxFieldSize != null && maxFieldSize <= 0) {
+ return new String[0][];
+ }
+
List<String[]> lines = new ArrayList<>();
List<String> fields = new ArrayList<>();
+ int splittedSize = 0;
+ int lastFieldStartIndex = 0;
StringBuilder stringBuilder = new StringBuilder();
int state = STATE_NORMAL;
@@ -431,6 +452,14 @@ public class StringUtils {
String field = stringBuilder.toString();
fields.add(field);
stringBuilder.setLength(0);
+
+ splittedSize++;
+ // if the next field is the last one allowed, record where it starts in the text
+ if (maxFieldSize != null && splittedSize ==
maxFieldSize - 1) {
+ if (i + 1 < text.length()) {
+ lastFieldStartIndex = i + 1;
+ }
+ }
break;
case STATE_ESCAPING:
stringBuilder.append(ch);
@@ -471,10 +500,19 @@ public class StringUtils {
case STATE_NORMAL:
String field = stringBuilder.toString();
fields.add(field);
- lines.add(fields.toArray(new String[0]));
+ // if the real field count exceeds the max field size,
+ // drop the extra fields and rebuild the last field from the raw text
+ // between lastFieldStartIndex and the current index
+ if (maxFieldSize != null && fields.size() >
maxFieldSize) {
+ fields = replaceLastField(fields, maxFieldSize,
text, lastFieldStartIndex, i);
+ }
+ // reset the lastFieldStartIndex for new line
+ lastFieldStartIndex = i + 1;
+
+ lines.add(fields.toArray(new String[0]));
stringBuilder.setLength(0);
fields.clear();
+ splittedSize = 0;
break;
case STATE_ESCAPING:
stringBuilder.append(ch);
@@ -498,6 +536,11 @@ public class StringUtils {
case STATE_QUOTING:
String field = stringBuilder.toString();
fields.add(field);
+
+ if (maxFieldSize != null && fields.size() > maxFieldSize) {
+ fields = replaceLastField(fields, maxFieldSize, text,
lastFieldStartIndex, text.length());
+ }
+
lines.add(fields.toArray(new String[0]));
String[][] result = new String[lines.size()][];
@@ -510,6 +553,28 @@ public class StringUtils {
}
}
+ /**
+  * Caps a parsed line at {@code maxFieldSize} fields.
+  *
+  * <p>Keeps the first {@code maxFieldSize - 1} parsed fields and appends, as the last field,
+  * the raw text between {@code lastFieldStartIndex} (inclusive) and
+  * {@code lastFieldEndIndex} (exclusive). The last field is copied verbatim from
+  * {@code text}, so it is not re-parsed.
+  *
+  * <p>The given {@code fields} list is not modified; a new list is returned.
+  *
+  * @param fields Target field list, holding more than {@code maxFieldSize} entries
+  * @param maxFieldSize Specified max field size, must be at least 1
+  * @param text Origin text
+  * @param lastFieldStartIndex Start index (inclusive) of the last field in {@code text}
+  * @param lastFieldEndIndex End index (exclusive) of the last field in {@code text}
+  * @return A new list with exactly {@code maxFieldSize} fields
+  */
+ private static List<String> replaceLastField(
+         List<String> fields,
+         int maxFieldSize,
+         String text,
+         int lastFieldStartIndex,
+         int lastFieldEndIndex) {
+     // Copy the prefix instead of returning the subList view: adding to the view would
+     // write into the caller's list, and the caller keeps reusing the returned list, so
+     // the surplus fields would stay in the backing list and views would nest per line.
+     List<String> cappedFields = new ArrayList<>(fields.subList(0, maxFieldSize - 1));
+     cappedFields.add(text.substring(lastFieldStartIndex, lastFieldEndIndex));
+     return cappedFields;
+ }
+
/**
* Concat the given fields.
*
diff --git
a/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java
b/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java
index 714652664e..fc64811a97 100644
---
a/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java
+++
b/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java
@@ -112,4 +112,45 @@ public class StringUtilsTest {
assertEquals("home", csv1Array2[2][1]);
assertEquals("home", csv1Array2[2][2]);
}
+
+ @Test // Exercises splitCsv with maxFieldSize values 0 through 4 against the same input text.
+ public void testSplitCsvStringWithMaxFields() {
+
+ String csvString =
"name|age=20\\||&'\n\name|age=20\\||&'\n\n|home|\\home\\";
+ String[][] csv1Array0 = StringUtils.splitCsv(csvString, '|',
+ '\\', '\'', '\n', false, 0);
+ assertEquals(0, csv1Array0.length); // a limit of 0 yields no rows at all
+
+ String[][] csv1Array1 = StringUtils.splitCsv(csvString, '|',
+ '\\', '\'', '\n', false, 1); // limit 1: each row is expected as one single field
+ assertEquals("name|age=20\\||&'\n\name|age=20\\||&'", // expected text still holds the escape and quote chars
csv1Array1[0][0]);
+ assertEquals("", csv1Array1[1][0]);
+ assertEquals("|home|\\home\\", csv1Array1[2][0]);
+
+ String[][] csv1Array2 = StringUtils.splitCsv(csvString, '|',
+ '\\', '\'', '\n', false, 2); // limit 2: first field parsed, rest of the row expected as the second field
+ assertEquals("name", csv1Array2[0][0]);
+ assertEquals("age=20\\||&'\n\name|age=20\\||&'", csv1Array2[0][1]);
+ assertEquals("", csv1Array2[1][0]);
+ assertEquals("", csv1Array2[2][0]);
+ assertEquals("home|\\home\\", csv1Array2[2][1]);
+
+ String[][] csv1Array3 = StringUtils.splitCsv(csvString, '|',
+ '\\', '\'', '\n', false, 3); // limit 3: expectations no longer contain the quote characters
+ assertEquals("name", csv1Array3[0][0]);
+ assertEquals("age=20|", csv1Array3[0][1]);
+ assertEquals("&\n\name|age=20\\||&", csv1Array3[0][2]);
+ assertEquals("", csv1Array3[2][0]);
+ assertEquals("home", csv1Array3[2][1]);
+ assertEquals("home", csv1Array3[2][2]);
+
+ String[][] csv1Array4 = StringUtils.splitCsv(csvString, '|',
+ '\\', '\'', '\n', false, 4); // limit 4: the assertions below match the limit-3 expectations
+ assertEquals("name", csv1Array4[0][0]);
+ assertEquals("age=20|", csv1Array4[0][1]);
+ assertEquals("&\n\name|age=20\\||&", csv1Array4[0][2]);
+ assertEquals("", csv1Array4[2][0]);
+ assertEquals("home", csv1Array4[2][1]);
+ assertEquals("home", csv1Array4[2][2]);
+ }
}