This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 77edc59eca1 [enhance](hive) support reading hive table with
OpenCSVSerde #42257 (#42942)
77edc59eca1 is described below
commit 77edc59eca191f5c01f5f38c804551e984d4a505
Author: Mingyu Chen (Rayner) <[email protected]>
AuthorDate: Thu Oct 31 09:54:36 2024 +0800
[enhance](hive) support reading hive table with OpenCSVSerde #42257 (#42942)
cherry pick from #42257
Co-authored-by: Socrates <[email protected]>
---
.../regression/serde_prop/some_serde_table.hql | 64 ++++++++++++++++++++++
.../doris/datasource/hive/HiveProperties.java | 50 +++++++++--------
.../doris/datasource/hive/source/HiveScanNode.java | 44 ++++++++++-----
.../hive/test_hive_serde_prop.out | 23 ++++++++
.../hive/test_hive_serde_prop.groovy | 4 ++
5 files changed, 148 insertions(+), 37 deletions(-)
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
index 4de85bc19f0..0368547f8be 100644
---
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
+++
b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql
@@ -106,3 +106,67 @@ insert into serde_test4 values(1, "abc"),(2, "def");
insert into serde_test5 values(1, "abc"),(2, "def");
insert into serde_test6 values(1, "abc"),(2, "def");
insert into serde_test7 values(1, null),(2, "|||"),(3, "aaa"),(4, "\"null\"");
+
+CREATE TABLE test_open_csv_default_prop (
+ id INT,
+ name STRING,
+ age INT,
+ salary DOUBLE,
+ is_active BOOLEAN,
+ hire_date DATE,
+ last_login TIMESTAMP,
+ rating FLOAT,
+ description STRING
+)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+STORED AS TEXTFILE;
+
+CREATE TABLE test_open_csv_standard_prop (
+ id INT,
+ name STRING,
+ age INT,
+ salary DOUBLE,
+ is_active BOOLEAN,
+ hire_date DATE,
+ last_login TIMESTAMP,
+ rating FLOAT,
+ description STRING
+)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES (
+ "separatorChar" = ",",
+ "quoteChar" = "\"",
+ "escapeChar" = "\\"
+)
+STORED AS TEXTFILE;
+
+CREATE TABLE test_open_csv_custom_prop (
+ id INT,
+ name STRING,
+ age INT,
+ salary DOUBLE,
+ is_active BOOLEAN,
+ hire_date DATE,
+ last_login TIMESTAMP,
+ rating FLOAT,
+ description STRING
+)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES (
+ "separatorChar" = "\t",
+ "quoteChar" = "\'",
+ "escapeChar" = "|"
+)
+STORED AS TEXTFILE;
+
+INSERT INTO TABLE test_open_csv_default_prop VALUES
+(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5,
'Senior Developer'),
+(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project
Manager\"');
+
+INSERT INTO TABLE test_open_csv_standard_prop VALUES
+(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5,
'Senior Developer'),
+(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project
Manager\"');
+
+INSERT INTO TABLE test_open_csv_custom_prop VALUES
+(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5,
'Senior Developer'),
+(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project
Manager\"');
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
index 5ded87e0d23..74f3dcc1a9d 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
@@ -19,6 +19,7 @@ package org.apache.doris.datasource.hive;
import com.google.common.collect.ImmutableSet;
import org.apache.hadoop.hive.metastore.api.Table;
+import org.apache.hadoop.hive.serde2.OpenCSVSerde;
import java.util.HashMap;
import java.util.Map;
@@ -27,15 +28,12 @@ import java.util.Set;
public class HiveProperties {
public static final String PROP_FIELD_DELIMITER = "field.delim";
- public static final String PROP_SEPARATOR_CHAR = "separatorChar";
public static final String PROP_SERIALIZATION_FORMAT =
"serialization.format";
public static final String DEFAULT_FIELD_DELIMITER = "\1"; // "\x01"
public static final String PROP_LINE_DELIMITER = "line.delim";
public static final String DEFAULT_LINE_DELIMITER = "\n";
- public static final String PROP_QUOTE_CHAR = "quoteChar";
-
public static final String PROP_COLLECTION_DELIMITER_HIVE2 =
"colelction.delim";
public static final String PROP_COLLECTION_DELIMITER_HIVE3 =
"collection.delim";
public static final String DEFAULT_COLLECTION_DELIMITER = "\2";
@@ -49,6 +47,14 @@ public class HiveProperties {
public static final String PROP_NULL_FORMAT = "serialization.null.format";
public static final String DEFAULT_NULL_FORMAT = "\\N";
+ // The following properties are used for OpenCsvSerde.
+ public static final String PROP_SEPARATOR_CHAR =
OpenCSVSerde.SEPARATORCHAR;
+ public static final String DEFAULT_SEPARATOR_CHAR = ",";
+ public static final String PROP_QUOTE_CHAR = OpenCSVSerde.QUOTECHAR;
+ public static final String DEFAULT_QUOTE_CHAR = "\"";
+ public static final String PROP_ESCAPE_CHAR = OpenCSVSerde.ESCAPECHAR;
+ public static final String DEFAULT_ESCAPE_CHAR = "\\";
+
public static final Set<String> HIVE_SERDE_PROPERTIES = ImmutableSet.of(
PROP_FIELD_DELIMITER,
PROP_COLLECTION_DELIMITER_HIVE2,
@@ -59,37 +65,33 @@ public class HiveProperties {
PROP_QUOTE_CHAR,
PROP_MAP_KV_DELIMITER,
PROP_ESCAPE_DELIMITER,
- PROP_NULL_FORMAT
- );
+ PROP_ESCAPE_CHAR,
+ PROP_NULL_FORMAT);
public static String getFieldDelimiter(Table table) {
// This method is used for text format.
- // If you need compatibility with csv format, please use
`getColumnSeparator`.
Optional<String> fieldDelim =
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_FIELD_DELIMITER);
Optional<String> serFormat =
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SERIALIZATION_FORMAT);
return
HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
DEFAULT_FIELD_DELIMITER, fieldDelim, serFormat));
}
- public static String getColumnSeparator(Table table) {
- Optional<String> fieldDelim =
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_FIELD_DELIMITER);
- Optional<String> columnSeparator =
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SEPARATOR_CHAR);
- Optional<String> serFormat =
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SERIALIZATION_FORMAT);
- return
HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
- DEFAULT_FIELD_DELIMITER, fieldDelim, columnSeparator, serFormat));
+ public static String getSeparatorChar(Table table) {
+ Optional<String> separatorChar =
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SEPARATOR_CHAR);
+ return HiveMetaStoreClientHelper.firstPresentOrDefault(
+ DEFAULT_SEPARATOR_CHAR, separatorChar);
}
-
public static String getLineDelimiter(Table table) {
Optional<String> lineDelim =
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_LINE_DELIMITER);
return
HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
- DEFAULT_LINE_DELIMITER, lineDelim));
+ DEFAULT_LINE_DELIMITER, lineDelim));
}
public static String getMapKvDelimiter(Table table) {
Optional<String> mapkvDelim =
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_MAP_KV_DELIMITER);
return
HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
- DEFAULT_MAP_KV_DELIMITER, mapkvDelim));
+ DEFAULT_MAP_KV_DELIMITER, mapkvDelim));
}
public static String getCollectionDelimiter(Table table) {
@@ -101,14 +103,6 @@ public class HiveProperties {
DEFAULT_COLLECTION_DELIMITER, collectionDelimHive2,
collectionDelimHive3));
}
- public static Optional<String> getQuoteChar(Table table) {
- Map<String, String> serdeParams =
table.getSd().getSerdeInfo().getParameters();
- if (serdeParams.containsKey(PROP_QUOTE_CHAR)) {
- return Optional.of(serdeParams.get(PROP_QUOTE_CHAR));
- }
- return Optional.empty();
- }
-
public static Optional<String> getEscapeDelimiter(Table table) {
Optional<String> escapeDelim =
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_ESCAPE_DELIMITER);
if (escapeDelim.isPresent()) {
@@ -127,6 +121,16 @@ public class HiveProperties {
return
HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_NULL_FORMAT,
nullFormat);
}
+ public static String getQuoteChar(Table table) {
+ Optional<String> quoteChar =
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_QUOTE_CHAR);
+ return
HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_QUOTE_CHAR, quoteChar);
+ }
+
+ public static String getEscapeChar(Table table) {
+ Optional<String> escapeChar =
HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_ESCAPE_CHAR);
+ return
HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_ESCAPE_CHAR,
escapeChar);
+ }
+
// Set properties to table
public static void setTableProperties(Table table, Map<String, String>
properties) {
HashMap<String, String> serdeProps = new HashMap<>();
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
index e62971ffd69..111d82cfe84 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
@@ -416,20 +416,36 @@ public class HiveScanNode extends FileQueryScanNode {
protected TFileAttributes getFileAttributes() throws UserException {
TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
Table table = hmsTable.getRemoteTable();
- // 1. set column separator
-
textParams.setColumnSeparator(HiveProperties.getColumnSeparator(table));
- // 2. set line delimiter
- textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
- // 3. set mapkv delimiter
- textParams.setMapkvDelimiter(HiveProperties.getMapKvDelimiter(table));
- // 4. set collection delimiter
-
textParams.setCollectionDelimiter(HiveProperties.getCollectionDelimiter(table));
- // 5. set quote char
- HiveProperties.getQuoteChar(table).ifPresent(d ->
textParams.setEnclose(d.getBytes()[0]));
- // 6. set escape delimiter
- HiveProperties.getEscapeDelimiter(table).ifPresent(d ->
textParams.setEscape(d.getBytes()[0]));
- // 7. set null format
- textParams.setNullFormat(HiveProperties.getNullFormat(table));
+ // TODO: separate hive text table and OpenCsv table
+ String serDeLib = table.getSd().getSerdeInfo().getSerializationLib();
+ if
(serDeLib.equals("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) {
+ // set properties of LazySimpleSerDe
+ // 1. set column separator
+
textParams.setColumnSeparator(HiveProperties.getFieldDelimiter(table));
+ // 2. set line delimiter
+
textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
+ // 3. set mapkv delimiter
+
textParams.setMapkvDelimiter(HiveProperties.getMapKvDelimiter(table));
+ // 4. set collection delimiter
+
textParams.setCollectionDelimiter(HiveProperties.getCollectionDelimiter(table));
+ // 5. set escape delimiter
+ HiveProperties.getEscapeDelimiter(table).ifPresent(d ->
textParams.setEscape(d.getBytes()[0]));
+ // 6. set null format
+ textParams.setNullFormat(HiveProperties.getNullFormat(table));
+ } else if
(serDeLib.equals("org.apache.hadoop.hive.serde2.OpenCSVSerde")) {
+ // set set properties of OpenCSVSerde
+ // 1. set column separator
+
textParams.setColumnSeparator(HiveProperties.getSeparatorChar(table));
+ // 2. set line delimiter
+
textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
+ // 3. set enclose char
+
textParams.setEnclose(HiveProperties.getQuoteChar(table).getBytes()[0]);
+ // 4. set escape char
+
textParams.setEscape(HiveProperties.getEscapeChar(table).getBytes()[0]);
+ } else {
+ throw new UserException(
+ "unsupported hive table serde: " + serDeLib);
+ }
TFileAttributes fileAttributes = new TFileAttributes();
fileAttributes.setTextParams(textParams);
diff --git
a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
index a527c7b687d..3d45e525ecf 100644
--- a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
+++ b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
@@ -39,6 +39,18 @@ b 2.2
3 aaa
4 "null"
+-- !test_open_csv_default_prop --
+1 John Doe 28 50000.75 TRUE 2022-01-15
2023-10-21 14:30:00 4.5 Senior Developer
+2 Jane,Smith FALSE 2020-05-20
""Project Manager""
+
+-- !test_open_csv_standard_prop --
+1 John Doe 28 50000.75 TRUE 2022-01-15
2023-10-21 14:30:00 4.5 Senior Developer
+2 Jane,Smith FALSE 2020-05-20
"Project Manager"
+
+-- !test_open_csv_custom_prop --
+1 John Doe 28 50000.75 TRUE 2022-01-15
2023-10-21 14:30:00 4.5 Senior Developer
+2 Jane,Smith FALSE 2020-05-20
"Project Manager"
+
-- !1 --
a 1.1
b 2.2
@@ -79,3 +91,14 @@ b 2.2
3 aaa
4 "null"
+-- !test_open_csv_default_prop --
+1 John Doe 28 50000.75 TRUE 2022-01-15
2023-10-21 14:30:00 4.5 Senior Developer
+2 Jane,Smith FALSE 2020-05-20
""Project Manager""
+
+-- !test_open_csv_standard_prop --
+1 John Doe 28 50000.75 TRUE 2022-01-15
2023-10-21 14:30:00 4.5 Senior Developer
+2 Jane,Smith FALSE 2020-05-20
"Project Manager"
+
+-- !test_open_csv_custom_prop --
+1 John Doe 28 50000.75 TRUE 2022-01-15
2023-10-21 14:30:00 4.5 Senior Developer
+2 Jane,Smith FALSE 2020-05-20
"Project Manager"
diff --git
a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
index d0c191f7c67..52cdd25eb07 100644
--- a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
+++ b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy
@@ -51,6 +51,10 @@ suite("test_hive_serde_prop",
"external_docker,hive,external_docker_hive,p0,exte
hive_docker """truncate table regression.serde_test8;"""
sql """insert into ${catalog_name}.regression.serde_test8 select *
from ${catalog_name}.regression.serde_test7;"""
qt_9 """select * from ${catalog_name}.regression.serde_test8 order by
id;"""
+
+ qt_test_open_csv_default_prop """select * from
${catalog_name}.regression.test_open_csv_default_prop order by id;"""
+ qt_test_open_csv_standard_prop """select * from
${catalog_name}.regression.test_open_csv_standard_prop order by id;"""
+ qt_test_open_csv_custom_prop """select * from
${catalog_name}.regression.test_open_csv_custom_prop order by id;"""
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]