This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch dev-1.1.2
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/dev-1.1.2 by this push:
new f0996b4f69 [feature](spark-load)Spark load supports string type data
import (#11927)
f0996b4f69 is described below
commit f0996b4f6939a7d32de54937b268390d951be1d4
Author: jiafeng.zhang <[email protected]>
AuthorDate: Mon Aug 22 08:56:59 2022 +0800
[feature](spark-load)Spark load supports string type data import (#11927)
---
be/src/olap/push_handler.cpp | 1 +
.../apache/doris/load/loadv2/dpp/ColumnParser.java | 23 ++++++++++++++++++++++
.../org/apache/doris/load/loadv2/dpp/DppUtils.java | 7 +++++++
.../org/apache/doris/load/loadv2/dpp/SparkDpp.java | 17 ++++++++++++++++
4 files changed, 48 insertions(+)
diff --git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp
index bb3149216b..f01ae29c60 100644
--- a/be/src/olap/push_handler.cpp
+++ b/be/src/olap/push_handler.cpp
@@ -1000,6 +1000,7 @@ OLAPStatus
PushBrokerReader::fill_field_row(RowCursorCell* dst, const char* src,
case OLAP_FIELD_TYPE_DOUBLE:
case OLAP_FIELD_TYPE_CHAR:
case OLAP_FIELD_TYPE_VARCHAR:
+ case OLAP_FIELD_TYPE_STRING:
case OLAP_FIELD_TYPE_HLL:
case OLAP_FIELD_TYPE_OBJECT: {
dst->set_is_null(src_null);
diff --git a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java
index a7b8d3df5b..d3dc27970f 100644
--- a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java
+++ b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java
@@ -28,6 +28,7 @@ import java.io.Serializable;
import java.math.BigDecimal;
import java.math.BigInteger;
+
// Parser to validate value for different type
public abstract class ColumnParser implements Serializable {
@@ -57,6 +58,9 @@ public abstract class ColumnParser implements Serializable {
return new DateParser();
} else if (columnType.equalsIgnoreCase("DATETIME")) {
return new DatetimeParser();
+ } else if (columnType.equalsIgnoreCase("STRING")
+ || columnType.equalsIgnoreCase("TEXT")) {
+ return new StringTypeParser(etlColumn);
} else if (columnType.equalsIgnoreCase("VARCHAR")
|| columnType.equalsIgnoreCase("CHAR")
|| columnType.equalsIgnoreCase("BITMAP")
@@ -200,6 +204,25 @@ class StringParser extends ColumnParser {
}
}
+class StringTypeParser extends ColumnParser { // Validates a value by the byte length of its UTF-8 encoding.
+
+ private EtlJobConfig.EtlColumn etlColumn; // Stored by the constructor; parse() below does not read it.
+
+ public StringTypeParser(EtlJobConfig.EtlColumn etlColumn) { // Keeps the column definition handed in by the caller.
+ this.etlColumn = etlColumn;
+ }
+
+ @Override
+ public boolean parse(String value) { // Returns true when value's UTF-8 byte length is at most DppUtils.STRING_LENGTH_LIMIT.
+ try {
+ return value.getBytes("UTF-8").length <=
DppUtils.STRING_LENGTH_LIMIT;
+ } catch (Exception e) { // Broad catch: covers the checked encoding exception and the NullPointerException raised for a null value.
+ throw new RuntimeException("string check failed ", e); // Rethrown unchecked with the original exception as the cause.
+ }
+ }
+}
+
+
class DecimalParser extends ColumnParser {
public static int PRECISION = 27;
diff --git a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java
index 1249cac993..3f6cabdfc3 100644
--- a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java
+++ b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java
@@ -41,6 +41,9 @@ import java.util.zip.CRC32;
public class DppUtils {
public static final String BUCKET_ID = "__bucketId__";
+
+ public static final int STRING_LENGTH_LIMIT = 1048576;
+
public static Class getClassFromDataType(DataType dataType) {
if (dataType == null) {
return null;
@@ -94,6 +97,8 @@ public class DppUtils {
case "HLL":
case "CHAR":
case "VARCHAR":
+ case "STRING":
+ case "TEXT":
case "BITMAP":
case "OBJECT":
return String.class;
@@ -139,6 +144,8 @@ public class DppUtils {
break;
case "CHAR":
case "VARCHAR":
+ case "STRING":
+ case "TEXT":
case "OBJECT":
dataType = DataTypes.StringType;
break;
diff --git a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java
index f94a82700b..4e78c63d8b 100644
--- a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java
+++ b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java
@@ -403,6 +403,23 @@ public final class SparkDpp implements
java.io.Serializable {
return false;
}
break;
+<<<<<<< HEAD
+=======
+ case "STRING":
+ case "TEXT":
+ // TODO(zjf) padding string type
+ int strDataSize = 0;
+ if (srcValue != null && (strDataSize =
srcValue.toString().getBytes(StandardCharsets.UTF_8).length)
+ > DppUtils.STRING_LENGTH_LIMIT) {
+ LOG.warn(String.format("The string type is limited to a
maximum of %s bytes."
+ + " column_name:%s,input_str[%s],actual
length:%s",
+ DppUtils.STRING_LENGTH_LIMIT,
etlColumn.columnName, row.toString(), strDataSize));
+ return false;
+ }
+ break;
+ default:
+ return true;
+>>>>>>> 915d8989c ([feature](spark-load)Spark load supports string type data
import (#11927))
}
return true;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]