This is an automated email from the ASF dual-hosted git repository.

lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git


The following commit(s) were added to refs/heads/master by this push:
     new 184b95aaea [format] csv: optimize parsing of short, byte, int and long (#6856)
184b95aaea is described below

commit 184b95aaeaeaa36472996ef5bb42df82406df8e1
Author: jerry <[email protected]>
AuthorDate: Wed Dec 24 21:57:59 2025 +0800

    [format] csv: optimize parsing of short, byte, int and long (#6856)
---
 .../org/apache/paimon/format/csv/CsvParser.java    | 163 ++++++++++++++++++---
 .../paimon/format/csv/CsvFileFormatTest.java       |  36 +++++
 2 files changed, 176 insertions(+), 23 deletions(-)

diff --git a/paimon-format/src/main/java/org/apache/paimon/format/csv/CsvParser.java b/paimon-format/src/main/java/org/apache/paimon/format/csv/CsvParser.java
index a74908f73c..8ea5e60904 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/csv/CsvParser.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/csv/CsvParser.java
@@ -18,6 +18,7 @@
 
 package org.apache.paimon.format.csv;
 
+import org.apache.paimon.annotation.VisibleForTesting;
 import org.apache.paimon.casting.CastExecutor;
 import org.apache.paimon.casting.CastExecutors;
 import org.apache.paimon.data.BinaryString;
@@ -27,6 +28,7 @@ import org.apache.paimon.types.DataType;
 import org.apache.paimon.types.DataTypeRoot;
 import org.apache.paimon.types.DataTypes;
 import org.apache.paimon.types.RowType;
+import org.apache.paimon.utils.Pair;
 
 import javax.annotation.Nullable;
 
@@ -35,6 +37,8 @@ import java.util.Base64;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 
+import static org.apache.paimon.format.csv.CsvOptions.Mode.DROPMALFORMED;
+import static org.apache.paimon.format.csv.CsvOptions.Mode.PERMISSIVE;
 import static org.apache.paimon.utils.Preconditions.checkArgument;
 import static org.apache.paimon.utils.StringUtils.isNullOrWhitespaceOnly;
 
@@ -44,6 +48,7 @@ public class CsvParser {
     private static final Base64.Decoder BASE64_DECODER = Base64.getDecoder();
     private static final Map<String, CastExecutor<?, ?>> CAST_EXECUTOR_CACHE =
             new ConcurrentHashMap<>();
+    private static final int NUMBER_PARSE_RADIX = 10;
 
     private final RowType dataSchemaRowType;
     private final int[] projectMapping;
@@ -157,20 +162,28 @@ public class CsvParser {
         for (int i = 0; i < projectMapping.length; i++) {
             int ordinal = projectMapping[i];
             DataType type = dataSchemaRowType.getTypeAt(ordinal);
-            Object field = null;
+            Pair<Boolean, Object> parseResult = null;
+            Exception exception = null;
+            String parseValue = rowValues[ordinal];
             try {
-                field = parseField(rowValues[ordinal], type);
+                parseResult = parseField(parseValue, type);
             } catch (Exception e) {
-                switch (mode) {
-                    case PERMISSIVE:
-                        break;
-                    case DROPMALFORMED:
-                        return null;
-                    case FAILFAST:
-                        throw e;
-                }
+                exception = e;
+            }
+            if (parseResult != null && parseResult.getLeft()) {
+                row.setField(i, parseResult.getValue());
+            } else if (mode == PERMISSIVE
+                    && (parseResult == null || !parseResult.getLeft() || 
exception != null)) {
+                break;
+            } else if (mode == DROPMALFORMED
+                    && (parseResult == null || !parseResult.getLeft() || 
exception != null)) {
+                return null;
+            } else if (exception != null) {
+                throw new RuntimeException(exception);
+            } else if (parseResult == null
+                    || !parseResult.getLeft() && parseResult.getValue() == 
null) {
+                throw new NumberFormatException("For input string: \"" + 
parseValue + "\"");
             }
-            row.setField(i, field);
         }
         return row;
     }
@@ -188,35 +201,46 @@ public class CsvParser {
         return true;
     }
 
-    private Object parseField(String field, DataType dataType) {
+    @VisibleForTesting
+    public Pair<Boolean, Object> parseField(String field, DataType dataType) {
         if (field == null || field.equals(nullLiteral)) {
-            return null;
+            return Pair.of(true, null);
         }
 
         DataTypeRoot typeRoot = dataType.getTypeRoot();
         switch (typeRoot) {
             case TINYINT:
-                return Byte.parseByte(field);
+                Integer intVal = parseInt(field);
+                if (intVal == null || intVal > Byte.MAX_VALUE || intVal < 
Byte.MIN_VALUE) {
+                    return Pair.of(false, null);
+                }
+                return Pair.of(true, intVal.byteValue());
             case SMALLINT:
-                return Short.parseShort(field);
+                intVal = parseInt(field);
+                if (intVal == null || intVal > Short.MAX_VALUE || intVal < 
Short.MIN_VALUE) {
+                    return Pair.of(false, null);
+                }
+                return Pair.of(true, intVal.shortValue());
             case INTEGER:
-                return Integer.parseInt(field);
+                intVal = parseInt(field);
+                return Pair.of(intVal != null, intVal);
             case BIGINT:
-                return Long.parseLong(field);
+                Long longVal = parseLong(field);
+                return Pair.of(longVal != null, longVal);
             case FLOAT:
-                return Float.parseFloat(field);
+                return Pair.of(true, Float.parseFloat(field));
             case DOUBLE:
-                return Double.parseDouble(field);
+                return Pair.of(true, Double.parseDouble(field));
             case BOOLEAN:
-                return Boolean.parseBoolean(field);
+                return Pair.of(true, Boolean.parseBoolean(field));
             case CHAR:
             case VARCHAR:
-                return BinaryString.fromString(field);
+                return Pair.of(true, BinaryString.fromString(field));
             case BINARY:
             case VARBINARY:
-                return BASE64_DECODER.decode(field);
+                return Pair.of(true, BASE64_DECODER.decode(field));
             default:
-                return parseByCastExecutor(field, dataType);
+                return Pair.of(true, parseByCastExecutor(field, dataType));
         }
     }
 
@@ -233,4 +257,97 @@ public class CsvParser {
         }
         return BinaryString.fromString(field);
     }
+
+    private static Integer parseInt(String s) {
+        if (s == null || s.isEmpty()) {
+            return null;
+        }
+        int len = s.length();
+        int i = 0;
+        char firstChar = s.charAt(0);
+        boolean negative = false;
+        int limit = -Integer.MAX_VALUE;
+
+        if (firstChar < '0') {
+            if (firstChar == '-') {
+                negative = true;
+                limit = Integer.MIN_VALUE;
+            } else if (firstChar != '+') {
+                return null;
+            }
+
+            if (len == 1) {
+                return null;
+            }
+            i++;
+        }
+
+        int multmin = limit / NUMBER_PARSE_RADIX;
+        int result = 0;
+        int digit;
+
+        while (i < len) {
+            digit = Character.digit(s.charAt(i++), NUMBER_PARSE_RADIX);
+            if (digit < 0) {
+                return null;
+            }
+            if (result < multmin) {
+                return null;
+            }
+            result *= NUMBER_PARSE_RADIX;
+            if (result < limit + digit) {
+                return null;
+            }
+            result -= digit;
+        }
+
+        return negative ? result : -result;
+    }
+
+    private static Long parseLong(String s) {
+        if (s == null || s.isEmpty()) {
+            return null;
+        }
+        int len = s.length();
+        int i = 0;
+        char firstChar = s.charAt(0);
+        boolean negative = false;
+        long limit = -Long.MAX_VALUE;
+
+        if (firstChar < '0') {
+            if (firstChar == '-') {
+                negative = true;
+                limit = Long.MIN_VALUE;
+            } else if (firstChar != '+') {
+                return null;
+            }
+
+            if (len == 1) {
+                return null;
+            }
+            i++;
+        }
+
+        long multmin = limit / NUMBER_PARSE_RADIX;
+        long result = 0;
+        int digit;
+
+        while (i < len) {
+            digit = Character.digit(s.charAt(i++), NUMBER_PARSE_RADIX);
+            if (digit < 0) {
+                return null;
+            }
+
+            if (result < multmin) {
+                return null;
+            }
+            result *= NUMBER_PARSE_RADIX;
+            if (result < limit + digit) {
+                return null;
+            }
+            result -= digit;
+        }
+
+        return negative ? result : -result;
+    }
 }
diff --git a/paimon-format/src/test/java/org/apache/paimon/format/csv/CsvFileFormatTest.java b/paimon-format/src/test/java/org/apache/paimon/format/csv/CsvFileFormatTest.java
index a325bdb415..8e3b223a82 100644
--- a/paimon-format/src/test/java/org/apache/paimon/format/csv/CsvFileFormatTest.java
+++ b/paimon-format/src/test/java/org/apache/paimon/format/csv/CsvFileFormatTest.java
@@ -534,6 +534,7 @@ public class CsvFileFormatTest extends FormatReadWriteTest {
                                     rowType,
                                     testFile);
                         })
+                .cause()
                 .isInstanceOf(IllegalArgumentException.class);
     }
 
@@ -569,6 +570,41 @@ public class CsvFileFormatTest extends FormatReadWriteTest {
         assertThat(permissiveResult.get(3).getDouble(2)).isEqualTo(400.81);
     }
 
+    @Test
+    public void testCsvParserParseField() {
+        RowType rowType =
+                DataTypes.ROW(
+                        DataTypes.TINYINT(),
+                        DataTypes.SMALLINT(),
+                        DataTypes.INT(),
+                        DataTypes.BIGINT());
+        int[] projection = {0, 1, 2, 3};
+        CsvParser parser = new CsvParser(rowType, projection, new 
CsvOptions(new Options()));
+
+        // Test normal cases
+        assertThat(parser.parseField("123", 
DataTypes.INT()).getValue()).isEqualTo(123);
+        assertThat(parser.parseField("-0", 
DataTypes.INT()).getValue()).isEqualTo(0);
+        assertThat(parser.parseField("0", 
DataTypes.INT()).getValue()).isEqualTo(0);
+        assertThat(parser.parseField("123", 
DataTypes.BIGINT()).getValue()).isEqualTo(123L);
+        assertThat(parser.parseField("-0", 
DataTypes.BIGINT()).getValue()).isEqualTo(0L);
+        assertThat(parser.parseField("0", 
DataTypes.BIGINT()).getValue()).isEqualTo(0L);
+        assertThat(parser.parseField("123", 
DataTypes.TINYINT()).getValue()).isEqualTo((byte) 123);
+        assertThat(parser.parseField("12345", DataTypes.SMALLINT()).getValue())
+                .isEqualTo((short) 12345);
+
+        // Test invalid format
+        assertThat(parser.parseField("abc", 
DataTypes.INT()).getValue()).isNull();
+        assertThat(parser.parseField("12.3", 
DataTypes.INT()).getValue()).isNull();
+
+        // Test overflow
+        assertThat(parser.parseField("2147483648", 
DataTypes.INT()).getValue()).isNull();
+        assertThat(parser.parseField("-2147483649", 
DataTypes.INT()).getValue()).isNull();
+        assertThat(parser.parseField("9223372036854775808", 
DataTypes.BIGINT()).getValue())
+                .isNull();
+        assertThat(parser.parseField("128", 
DataTypes.TINYINT()).getValue()).isNull();
+        assertThat(parser.parseField("32768", 
DataTypes.SMALLINT()).getValue()).isNull();
+    }
+
     private List<InternalRow> read(
             FileFormat format, RowType fullRowType, RowType readRowType, Path 
testFile)
             throws IOException {

Reply via email to