This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch dev-1.1.2
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/dev-1.1.2 by this push:
     new f0996b4f69 [feature](spark-load)Spark load supports string type data 
import (#11927)
f0996b4f69 is described below

commit f0996b4f6939a7d32de54937b268390d951be1d4
Author: jiafeng.zhang <[email protected]>
AuthorDate: Mon Aug 22 08:56:59 2022 +0800

    [feature](spark-load)Spark load supports string type data import (#11927)
---
 be/src/olap/push_handler.cpp                       |  1 +
 .../apache/doris/load/loadv2/dpp/ColumnParser.java | 23 ++++++++++++++++++++++
 .../org/apache/doris/load/loadv2/dpp/DppUtils.java |  7 +++++++
 .../org/apache/doris/load/loadv2/dpp/SparkDpp.java | 17 ++++++++++++++++
 4 files changed, 48 insertions(+)

diff --git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp
index bb3149216b..f01ae29c60 100644
--- a/be/src/olap/push_handler.cpp
+++ b/be/src/olap/push_handler.cpp
@@ -1000,6 +1000,7 @@ OLAPStatus 
PushBrokerReader::fill_field_row(RowCursorCell* dst, const char* src,
     case OLAP_FIELD_TYPE_DOUBLE:
     case OLAP_FIELD_TYPE_CHAR:
     case OLAP_FIELD_TYPE_VARCHAR:
+    case OLAP_FIELD_TYPE_STRING:
     case OLAP_FIELD_TYPE_HLL:
     case OLAP_FIELD_TYPE_OBJECT: {
         dst->set_is_null(src_null);
diff --git 
a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java 
b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java
index a7b8d3df5b..d3dc27970f 100644
--- 
a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java
+++ 
b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java
@@ -28,6 +28,7 @@ import java.io.Serializable;
 import java.math.BigDecimal;
 import java.math.BigInteger;
 
+
 // Parser to validate value for different type
 public abstract class ColumnParser implements Serializable {
 
@@ -57,6 +58,9 @@ public abstract class ColumnParser implements Serializable {
             return new DateParser();
         } else if (columnType.equalsIgnoreCase("DATETIME")) {
             return new DatetimeParser();
+        } else if (columnType.equalsIgnoreCase("STRING")
+                || columnType.equalsIgnoreCase("TEXT")) {
+            return new StringTypeParser(etlColumn);
         } else if (columnType.equalsIgnoreCase("VARCHAR")
                 || columnType.equalsIgnoreCase("CHAR")
                 || columnType.equalsIgnoreCase("BITMAP")
@@ -200,6 +204,25 @@ class StringParser extends ColumnParser {
     }
 }
 
+class StringTypeParser extends ColumnParser {
+
+    private EtlJobConfig.EtlColumn etlColumn;
+
+    public StringTypeParser(EtlJobConfig.EtlColumn etlColumn) {
+        this.etlColumn = etlColumn;
+    }
+
+    @Override
+    public boolean parse(String value) {
+        try {
+            return value.getBytes("UTF-8").length <= 
DppUtils.STRING_LENGTH_LIMIT;
+        } catch (Exception e) {
+            throw new RuntimeException("string check failed ", e);
+        }
+    }
+}
+
+
 class DecimalParser extends ColumnParser {
 
     public static int PRECISION = 27;
diff --git 
a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java 
b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java
index 1249cac993..3f6cabdfc3 100644
--- a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java
+++ b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java
@@ -41,6 +41,9 @@ import java.util.zip.CRC32;
 
 public class DppUtils {
     public static final String BUCKET_ID = "__bucketId__";
+
+    public static final int STRING_LENGTH_LIMIT = 1048576;
+
     public static Class getClassFromDataType(DataType dataType) {
         if (dataType == null) {
             return null;
@@ -94,6 +97,8 @@ public class DppUtils {
             case "HLL":
             case "CHAR":
             case "VARCHAR":
+            case "STRING":
+            case "TEXT":
             case "BITMAP":
             case "OBJECT":
                 return String.class;
@@ -139,6 +144,8 @@ public class DppUtils {
                 break;
             case "CHAR":
             case "VARCHAR":
+            case "STRING":
+            case "TEXT":
             case "OBJECT":
                 dataType = DataTypes.StringType;
                 break;
diff --git 
a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java 
b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java
index f94a82700b..4e78c63d8b 100644
--- a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java
+++ b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java
@@ -403,6 +403,20 @@ public final class SparkDpp implements java.io.Serializable {
                     return false;
                 }
                 break;
+            case "STRING":
+            case "TEXT":
+                // TODO(zjf) padding string type
+                int strDataSize = 0;
+                if (srcValue != null && (strDataSize = srcValue.toString().getBytes(StandardCharsets.UTF_8).length)
+                        > DppUtils.STRING_LENGTH_LIMIT) {
+                    LOG.warn(String.format("The string type is limited to a maximum of %s bytes."
+                                    + " column_name:%s,input_str[%s],actual length:%s",
+                            DppUtils.STRING_LENGTH_LIMIT, etlColumn.columnName, row.toString(), strDataSize));
+                    return false;
+                }
+                break;
+            default:
+                return true;
         }
         return true;
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to