This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 915d8989c5 [feature](spark-load)Spark load supports string type data import (#11927)
915d8989c5 is described below
commit 915d8989c5774638190d487d77c2079a422aea27
Author: jiafeng.zhang <[email protected]>
AuthorDate: Mon Aug 22 08:56:59 2022 +0800
[feature](spark-load)Spark load supports string type data import (#11927)
---
be/src/olap/push_handler.cpp | 1 +
.../apache/doris/load/loadv2/dpp/ColumnParser.java | 23 ++++++++++++++++++++++
.../org/apache/doris/load/loadv2/dpp/DppUtils.java | 6 ++++++
.../org/apache/doris/load/loadv2/dpp/SparkDpp.java | 12 +++++++++++
4 files changed, 42 insertions(+)
diff --git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp
index 7cae605904..e6b858015a 100644
--- a/be/src/olap/push_handler.cpp
+++ b/be/src/olap/push_handler.cpp
@@ -905,6 +905,7 @@ Status PushBrokerReader::fill_field_row(RowCursorCell* dst, const char* src, boo
case OLAP_FIELD_TYPE_DOUBLE:
case OLAP_FIELD_TYPE_CHAR:
case OLAP_FIELD_TYPE_VARCHAR:
+ case OLAP_FIELD_TYPE_STRING:
case OLAP_FIELD_TYPE_HLL:
case OLAP_FIELD_TYPE_OBJECT: {
dst->set_is_null(src_null);
diff --git a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java
index e72d471a9f..1425dd3bef 100644
--- a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java
+++ b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java
@@ -29,6 +29,7 @@ import java.math.BigInteger;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
+
// Parser to validate value for different type
public abstract class ColumnParser implements Serializable {
@@ -62,6 +63,9 @@ public abstract class ColumnParser implements Serializable {
return new DateParser();
} else if (columnType.equalsIgnoreCase("DATETIME")) {
return new DatetimeParser();
+ } else if (columnType.equalsIgnoreCase("STRING")
+ || columnType.equalsIgnoreCase("TEXT")) {
+ return new StringTypeParser(etlColumn);
} else if (columnType.equalsIgnoreCase("VARCHAR")
|| columnType.equalsIgnoreCase("CHAR")
|| columnType.equalsIgnoreCase("BITMAP")
@@ -208,6 +212,25 @@ class StringParser extends ColumnParser {
}
}
+// Parser for STRING/TEXT columns: a value is valid when its UTF-8 encoding
+// is no longer than DppUtils.STRING_LENGTH_LIMIT bytes.
+class StringTypeParser extends ColumnParser {
+
+    // Column definition handed in by the factory; kept for parity with the
+    // constructor signature, parse() does not read it.
+    private final EtlJobConfig.EtlColumn etlColumn;
+
+    public StringTypeParser(EtlJobConfig.EtlColumn etlColumn) {
+        this.etlColumn = etlColumn;
+    }
+
+    // Returns true when the UTF-8 byte length of value is within the limit,
+    // false when it exceeds it. Throws RuntimeException for a null value.
+    @Override
+    public boolean parse(String value) {
+        if (value == null) {
+            // A null input used to surface through a catch-all as this same
+            // RuntimeException; keep that contract, but make it explicit.
+            throw new RuntimeException("string check failed ", new NullPointerException("value is null"));
+        }
+        // The Charset overload cannot throw UnsupportedEncodingException, so
+        // no try/catch is needed around the encoding step.
+        int utf8Length = value.getBytes(java.nio.charset.StandardCharsets.UTF_8).length;
+        return utf8Length <= DppUtils.STRING_LENGTH_LIMIT;
+    }
+}
+
+
class DecimalParser extends ColumnParser {
public static int PRECISION = 27;
diff --git a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java
index c53e1ae087..9a252b38b7 100644
--- a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java
+++ b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java
@@ -41,6 +41,8 @@ import java.util.zip.CRC32;
public class DppUtils {
public static final String BUCKET_ID = "__bucketId__";
+ public static final int STRING_LENGTH_LIMIT = 1048576;
+
public static Class getClassFromDataType(DataType dataType) {
if (dataType == null) {
return null;
@@ -94,6 +96,8 @@ public class DppUtils {
case "HLL":
case "CHAR":
case "VARCHAR":
+ case "STRING":
+ case "TEXT":
case "BITMAP":
case "OBJECT":
return String.class;
@@ -142,6 +146,8 @@ public class DppUtils {
break;
case "CHAR":
case "VARCHAR":
+ case "STRING":
+ case "TEXT":
case "OBJECT":
dataType = DataTypes.StringType;
break;
diff --git a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java
index ab7e791a1c..5d951ad70b 100644
--- a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java
+++ b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java
@@ -405,6 +405,18 @@ public final class SparkDpp implements java.io.Serializable {
return false;
}
break;
+ case "STRING":
+ case "TEXT":
+ // TODO(zjf) padding string type
+ int strDataSize = 0;
+ if (srcValue != null && (strDataSize = srcValue.toString().getBytes(StandardCharsets.UTF_8).length)
+ > DppUtils.STRING_LENGTH_LIMIT) {
+ LOG.warn(String.format("The string type is limited to a maximum of %s bytes."
+ + " column_name:%s,input_str[%s],actual length:%s",
+ DppUtils.STRING_LENGTH_LIMIT, etlColumn.columnName, row.toString(), strDataSize));
+ return false;
+ }
+ break;
default:
return true;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]