This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 0d09cfefaed Reapply "[enhancement](hive)Initial support for Hive org.openx.data.jsonserde.JsonSerDe" (#49928) (#49958)
0d09cfefaed is described below
commit 0d09cfefaed57d3521005d47bea3607854eee5f0
Author: daidai <[email protected]>
AuthorDate: Sat Apr 19 06:10:47 2025 +0800
Reapply "[enhancement](hive)Initial support for Hive org.openx.data.jsonserde.JsonSerDe" (#49928) (#49958)
This reverts commit fa511851863e883487faead5e0bccc2876671523.
---
be/src/vec/exec/format/json/new_json_reader.cpp | 33 +++++++-
be/src/vec/exec/format/json/new_json_reader.h | 8 +-
.../hive/scripts/auxlib/json-serde-1.3.9.tar.gz | Bin 0 -> 78992 bytes
.../scripts/create_preinstalled_scripts/run76.hql | 59 ++++++++++++++-
.../docker-compose/hive/scripts/hive-metastore.sh | 9 +++
.../json/openx_json/json_data_arrays_tb/1 | 2 +
.../json/openx_json/json_one_column_table/1 | 5 ++
.../preinstalled_data/json/openx_json/json_table/1 | 2 +
.../preinstalled_data/json/openx_json/json_table/2 | 11 +++
.../json/openx_json/scalar_to_array_tb/1 | 1 +
.../doris/datasource/hive/HMSExternalTable.java | 8 ++
.../datasource/hive/HiveMetaStoreClientHelper.java | 1 +
.../doris/datasource/hive/HiveProperties.java | 10 +++
.../doris/datasource/hive/source/HiveScanNode.java | 48 +++++++++++-
.../java/org/apache/doris/qe/SessionVariable.java | 21 ++++++
gensrc/thrift/PlanNodes.thrift | 3 +
.../hive/test_hive_openx_json.out | Bin 0 -> 709 bytes
.../hive/test_hive_openx_json.groovy | 84 +++++++++++++++++++++
18 files changed, 295 insertions(+), 10 deletions(-)
diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp
b/be/src/vec/exec/format/json/new_json_reader.cpp
index 5820174e7fa..a125654eada 100644
--- a/be/src/vec/exec/format/json/new_json_reader.cpp
+++ b/be/src/vec/exec/format/json/new_json_reader.cpp
@@ -399,6 +399,20 @@ Status NewJsonReader::_get_range_params() {
if (_range.table_format_params.table_format_type == "hive") {
_is_hive_table = true;
}
+ if (_params.file_attributes.__isset.openx_json_ignore_malformed) {
+ _openx_json_ignore_malformed = _params.file_attributes.openx_json_ignore_malformed;
+ }
+ return Status::OK();
+}
+
+static Status ignore_malformed_json_append_null(Block& block) {
+ for (auto& column : block.get_columns()) {
+ if (!column->is_nullable()) [[unlikely]] {
+ return Status::DataQualityError("malformed json, but the column `{}` is not nullable.",
+ column->get_name());
+ }
+ static_cast<ColumnNullable*>(column->assume_mutable().get())->insert_default();
+ }
return Status::OK();
}
@@ -486,8 +500,13 @@ Status NewJsonReader::_vhandle_simple_json(RuntimeState* /*state*/, Block& block
bool valid = false;
if (_next_row >= _total_rows) { // parse json and generic document
Status st = _parse_json(is_empty_row, eof);
- if (_is_load && st.is<DATA_QUALITY_ERROR>()) {
- continue; // continue to read next (for load, after this , already append error to file.)
+ if (st.is<DATA_QUALITY_ERROR>()) {
+ if (_is_load) {
+ continue; // continue to read the next line (for load, the error has already been appended to the error file)
+ } else if (_openx_json_ignore_malformed) {
+ RETURN_IF_ERROR(ignore_malformed_json_append_null(block));
+ continue;
+ }
}
RETURN_IF_ERROR(st);
if (*is_empty_row) {
@@ -1296,9 +1315,15 @@ Status NewJsonReader::_simdjson_handle_simple_json(RuntimeState* /*state*/, Bloc
// step2: get json value by json doc
Status st = _get_json_value(&size, eof, &error, is_empty_row);
- if (_is_load && st.is<DATA_QUALITY_ERROR>()) {
- return Status::OK();
+ if (st.is<DATA_QUALITY_ERROR>()) {
+ if (_is_load) {
+ return Status::OK();
+ } else if (_openx_json_ignore_malformed) {
+ RETURN_IF_ERROR(ignore_malformed_json_append_null(block));
+ return Status::OK();
+ }
}
+
RETURN_IF_ERROR(st);
if (*is_empty_row || *eof) {
return Status::OK();
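In user-facing terms, the branches added above mean a malformed json line no longer fails the whole query when the table's serde sets `ignore.malformed.json`; instead, every column (all of which must be nullable) gets NULL for that line. A rough sketch from the query side, reusing the `json_table_ignore_malformed` table defined later in this commit:

    -- malformed lines come back as rows of NULLs instead of raising a
    -- DATA_QUALITY_ERROR
    SELECT id, name, numbers, scores, details
    FROM json_table_ignore_malformed
    WHERE id IS NULL;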
diff --git a/be/src/vec/exec/format/json/new_json_reader.h
b/be/src/vec/exec/format/json/new_json_reader.h
index 430f8c7af18..6b42ca23b4f 100644
--- a/be/src/vec/exec/format/json/new_json_reader.h
+++ b/be/src/vec/exec/format/json/new_json_reader.h
@@ -293,18 +293,22 @@ private:
int32_t skip_bitmap_col_idx {-1};
- bool _is_load = true;
//Used to indicate whether it is a stream load. When loading, only data will be inserted into columnString.
//If an illegal value is encountered during the load process, `_append_error_msg` should be called
//instead of directly returning `Status::DataQualityError`
+ bool _is_load = true;
- bool _is_hive_table = false;
// In hive : create table xxx ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe';
// Hive will not allow you to create columns with the same name but different case, including field names inside
// structs, and will automatically convert uppercase names in create sql to lowercase.However, when Hive loads data
// to table, the column names in the data may be uppercase,and there may be multiple columns with
// the same name but different capitalization.We refer to the behavior of hive, convert all column names
// in the data to lowercase,and use the last one as the insertion value
+ bool _is_hive_table = false;
+
+ // hive : org.openx.data.jsonserde.JsonSerDe, `ignore.malformed.json` property.
+ // If it is true, `null` will be inserted for illegal json instead of returning an error.
+ bool _openx_json_ignore_malformed = false;
DataTypeSerDeSPtrs _serdes;
vectorized::DataTypeSerDe::FormatOptions _serde_options;
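The case-folding behaviour described in the comment block above can be pictured with one hypothetical data line (not part of the preinstalled files) for a table whose only column is `name STRING`:

    -- data line: {"Name": "a", "NAME": "b"}
    -- both keys fold to `name`; following Hive, the last occurrence wins
    SELECT name FROM some_hive_json_table;   -- that line reads as name = "b"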
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/auxlib/json-serde-1.3.9.tar.gz
b/docker/thirdparties/docker-compose/hive/scripts/auxlib/json-serde-1.3.9.tar.gz
new file mode 100644
index 00000000000..1eb63aa7727
Binary files /dev/null and
b/docker/thirdparties/docker-compose/hive/scripts/auxlib/json-serde-1.3.9.tar.gz
differ
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run76.hql
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run76.hql
index 451f74482e3..c003c9e7d50 100755
---
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run76.hql
+++
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run76.hql
@@ -18,4 +18,61 @@ ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user/doris/preinstalled_data/text/text_table_compressed_skip_header'
-TBLPROPERTIES ("skip.header.line.count"="5");
\ No newline at end of file
+TBLPROPERTIES ("skip.header.line.count"="5");
+
+create database if not exists openx_json;
+use openx_json;
+
+
+CREATE TABLE IF NOT EXISTS json_table (
+ id INT,
+ name STRING,
+ numbers ARRAY<INT>,
+ scores MAP<STRING, INT>,
+ details STRUCT<a:INT, b:STRING, c:BIGINT>
+)
+ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
+LOCATION '/user/doris/preinstalled_data/json/openx_json/json_table';
+
+
+CREATE TABLE IF NOT EXISTS json_table_ignore_malformed (
+ id INT,
+ name STRING,
+ numbers ARRAY<INT>,
+ scores MAP<STRING, INT>,
+ details STRUCT<a:INT, b:STRING, c:BIGINT>
+)
+ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
+WITH SERDEPROPERTIES ("ignore.malformed.json" = "true" )
+LOCATION '/user/doris/preinstalled_data/json/openx_json/json_table';
+
+
+CREATE TABLE json_data_arrays_tb (
+ name string, age int)
+ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
+LOCATION '/user/doris/preinstalled_data/json/openx_json/json_data_arrays_tb';
+
+
+CREATE TABLE IF NOT EXISTS scalar_to_array_tb(
+ id INT,
+ name STRING,
+ tags ARRAY<STRING>
+)ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
+LOCATION '/user/doris/preinstalled_data/json/openx_json/scalar_to_array_tb';
+
+
+CREATE TABLE IF NOT EXISTS json_one_column_table (
+ name STRING,
+ id INT,
+ numbers ARRAY<INT>,
+ scores MAP<STRING, INT>,
+ details STRUCT<a:INT, b:STRING, c:BIGINT>
+)
+ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
+LOCATION '/user/doris/preinstalled_data/json/openx_json/json_one_column_table';
+
+msck repair table json_table;
+msck repair table json_table_ignore_malformed;
+msck repair table json_data_arrays_tb;
+msck repair table scalar_to_array_tb;
+msck repair table json_one_column_table;
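These are the tables the new regression test drives from Doris; the setup there looks roughly like the following (catalog name and metastore endpoint are placeholders):

    CREATE CATALOG IF NOT EXISTS hive_openx PROPERTIES (
        'type' = 'hms',
        'hive.metastore.uris' = 'thrift://<metastore-host>:<hms-port>'
    );
    USE hive_openx.openx_json;
    -- json_table does not set ignore.malformed.json, so the broken file is
    -- expected to fail with a DATA_QUALITY_ERROR
    SELECT * FROM json_table;
    -- json_table_ignore_malformed points at the same location but sets the
    -- property, so the malformed line surfaces as NULLs
    SELECT * FROM json_table_ignore_malformed;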
diff --git a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
index ac4c9ae4480..20c469a880d 100755
--- a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
+++ b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
@@ -20,6 +20,15 @@ set -e -x
parallel=$(getconf _NPROCESSORS_ONLN)
+AUX_LIB="/mnt/scripts/auxlib"
+for file in "${AUX_LIB}"/*.tar.gz; do
+ [ -e "$file" ] || continue
+ tar -xzvf "$file" -C "$AUX_LIB"
+ echo "file = ${file}"
+done
+ls "${AUX_LIB}/"
+cp -r "${AUX_LIB}"/ /opt/hive
+
nohup /opt/hive/bin/hive --service metastore &
# wait lockfile
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_data_arrays_tb/1
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_data_arrays_tb/1
new file mode 100644
index 00000000000..098bb346b50
--- /dev/null
+++
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_data_arrays_tb/1
@@ -0,0 +1,2 @@
+["John", 26 ]
+["Mary", 23 ]
\ No newline at end of file
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_one_column_table/1
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_one_column_table/1
new file mode 100644
index 00000000000..d396f66a079
--- /dev/null
+++
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_one_column_table/1
@@ -0,0 +1,5 @@
+
+{"name":"bad1","id":5,"numbers":[1,2,3]
+[1,2,3]
+"just a string"
+{"name":"bad2","id":6,"numbers":"not an array","scores":{"key4":40},"details":{"a":4,"b":"text","c":4000000}}
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_table/1
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_table/1
new file mode 100644
index 00000000000..11a3edf6e80
--- /dev/null
+++
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_table/1
@@ -0,0 +1,2 @@
+{"id": 1, "name": "Alice", "numbers": [1, 2, 3], "scores": {"math": 90, "english": 85}, "details": {"a": 100, "b": "test1", "c": 1234567890}}
+{"id": 2, "name": "Bob", "numbers": [4, 5], "scores": {"math": 80, "science": 95}, "details": {"a": 200, "b": "test2", "c": 9876543210}}
\ No newline at end of file
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_table/2
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_table/2
new file mode 100644
index 00000000000..e77c1f49d85
--- /dev/null
+++
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_table/2
@@ -0,0 +1,11 @@
+{"id" 3 "name" "Bob",
+"numbers": [
+ 4 5
+],
+"scores": {
+"math": 80
+},
+"details"
+: {
+"a": 200 , "b" } "test2", "c": 9876543210
+}}
\ No newline at end of file
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/scalar_to_array_tb/1
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/scalar_to_array_tb/1
new file mode 100644
index 00000000000..24a9acc63a3
--- /dev/null
+++
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/scalar_to_array_tb/1
@@ -0,0 +1 @@
+{"name":"Charlie","id":4,"tags":"flink"}
\ No newline at end of file
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
index dfd7d6fe4b8..33c945272e7 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
@@ -1084,6 +1084,14 @@ public class HMSExternalTable extends ExternalTable implements MTMVRelatedTableI
public void beforeMTMVRefresh(MTMV mtmv) throws DdlException {
}
+ public boolean firstColumnIsString() {
+ List<Column> columns = getColumns();
+ if (columns == null || columns.isEmpty()) {
+ return false;
+ }
+ return columns.get(0).getType().isScalarType(PrimitiveType.STRING);
+ }
+
public HoodieTableMetaClient getHudiClient() {
return Env.getCurrentEnv()
.getExtMetaCacheMgr()
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java
index 9bb09225607..d98bf8227e1 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java
@@ -95,6 +95,7 @@ public class HiveMetaStoreClientHelper {
public static final String HIVE_JSON_SERDE = "org.apache.hive.hcatalog.data.JsonSerDe";
public static final String LEGACY_HIVE_JSON_SERDE = "org.apache.hadoop.hive.serde2.JsonSerDe";
+ public static final String OPENX_JSON_SERDE = "org.openx.data.jsonserde.JsonSerDe";
public enum HiveFileFormat {
TEXT_FILE(0, "text"),
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
index 7c4d5279969..bdc8e0cacd9 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
@@ -61,6 +61,10 @@ public class HiveProperties {
public static final String PROP_ESCAPE_CHAR = OpenCSVSerde.ESCAPECHAR;
public static final String DEFAULT_ESCAPE_CHAR = "\\";
+ // org.openx.data.jsonserde.JsonSerDe
+ public static final String PROP_OPENX_IGNORE_MALFORMED_JSON = "ignore.malformed.json";
+ public static final String DEFAULT_OPENX_IGNORE_MALFORMED_JSON = "false";
+
public static final Set<String> HIVE_SERDE_PROPERTIES = ImmutableSet.of(
PROP_FIELD_DELIMITER,
PROP_COLLECTION_DELIMITER_HIVE2,
@@ -151,6 +155,12 @@ public class HiveProperties {
.parseInt(HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_SKIP_FOOTER_COUNT, skipFooterCount));
}
+ public static String getOpenxJsonIgnoreMalformed(Table table) {
+ Optional<String> ignoreMalformed = HiveMetaStoreClientHelper.getSerdeProperty(table,
+ PROP_OPENX_IGNORE_MALFORMED_JSON);
+ return HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_OPENX_IGNORE_MALFORMED_JSON, ignoreMalformed);
+ }
+
// Set properties to table
public static void setTableProperties(Table table, Map<String, String> properties) {
HashMap<String, String> serdeProps = new HashMap<>();
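Since the flag is read from the table's serde properties, it can also be toggled on the Hive side with standard DDL (not part of this patch), for example:

    ALTER TABLE openx_json.json_table
    SET SERDEPROPERTIES ('ignore.malformed.json' = 'true');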
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
index 8c8558ced6a..a65d057b8f3 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
@@ -125,7 +125,7 @@ public class HiveScanNode extends FileQueryScanNode {
this.hiveTransaction = new HiveTransaction(DebugUtil.printId(ConnectContext.get().queryId()),
ConnectContext.get().getQualifiedUser(), hmsTable, hmsTable.isFullAcidTable());
Env.getCurrentHiveTransactionMgr().register(hiveTransaction);
- skipCheckingAcidVersionFile = ConnectContext.get().getSessionVariable().skipCheckingAcidVersionFile;
+ skipCheckingAcidVersionFile = sessionVariable.skipCheckingAcidVersionFile;
}
}
@@ -413,6 +413,17 @@ public class HiveScanNode extends FileQueryScanNode {
if (serDeLib.equals(HiveMetaStoreClientHelper.HIVE_JSON_SERDE)
|| serDeLib.equals(HiveMetaStoreClientHelper.LEGACY_HIVE_JSON_SERDE)) {
type = TFileFormatType.FORMAT_JSON;
+ } else if (serDeLib.equals(HiveMetaStoreClientHelper.OPENX_JSON_SERDE)) {
+ if (!sessionVariable.isReadHiveJsonInOneColumn()) {
+ type = TFileFormatType.FORMAT_JSON;
+ } else if (sessionVariable.isReadHiveJsonInOneColumn()
+ && hmsTable.firstColumnIsString()) {
+ type = TFileFormatType.FORMAT_CSV_PLAIN;
+ } else {
+ throw new UserException("You set read_hive_json_in_one_column = true, but the first column of "
+ + "table " + hmsTable.getName()
+ + " is not a string column.");
+ }
} else {
type = TFileFormatType.FORMAT_CSV_PLAIN;
}
@@ -452,7 +463,7 @@ public class HiveScanNode extends FileQueryScanNode {
fileAttributes.setTextParams(textParams);
fileAttributes.setHeaderType("");
fileAttributes.setEnableTextValidateUtf8(
- ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
+ sessionVariable.enableTextValidateUtf8);
} else if (serDeLib.equals("org.apache.hadoop.hive.serde2.OpenCSVSerde")) {
TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
// set set properties of OpenCSVSerde
@@ -470,7 +481,7 @@ public class HiveScanNode extends FileQueryScanNode {
fileAttributes.setTrimDoubleQuotes(true);
}
fileAttributes.setEnableTextValidateUtf8(
- ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
+ sessionVariable.enableTextValidateUtf8);
} else if (serDeLib.equals("org.apache.hive.hcatalog.data.JsonSerDe")) {
TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
textParams.setColumnSeparator("\t");
@@ -484,6 +495,37 @@ public class HiveScanNode extends FileQueryScanNode {
fileAttributes.setReadJsonByLine(true);
fileAttributes.setStripOuterArray(false);
fileAttributes.setHeaderType("");
+ } else if (serDeLib.equals("org.openx.data.jsonserde.JsonSerDe")) {
+ if (!sessionVariable.isReadHiveJsonInOneColumn()) {
+ TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
+ textParams.setColumnSeparator("\t");
+ textParams.setLineDelimiter("\n");
+ fileAttributes.setTextParams(textParams);
+
+ fileAttributes.setJsonpaths("");
+ fileAttributes.setJsonRoot("");
+ fileAttributes.setNumAsString(true);
+ fileAttributes.setFuzzyParse(false);
+ fileAttributes.setReadJsonByLine(true);
+ fileAttributes.setStripOuterArray(false);
+ fileAttributes.setHeaderType("");
+
+ fileAttributes.setOpenxJsonIgnoreMalformed(
+ Boolean.parseBoolean(HiveProperties.getOpenxJsonIgnoreMalformed(table)));
+ } else if (sessionVariable.isReadHiveJsonInOneColumn()
+ && hmsTable.firstColumnIsString()) {
+ TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
+ textParams.setLineDelimiter("\n");
+ textParams.setColumnSeparator("\n");
+ // Rows are first split by `\n`; since the column separator is also `\n`,
+ // no further split happens and the whole json line lands in a single column.
+ fileAttributes.setTextParams(textParams);
+ fileAttributes.setHeaderType("");
+ } else {
+ throw new UserException("You set read_hive_json_in_one_column = true, but the first column of table "
+ + hmsTable.getName()
+ + " is not a string column.");
+ }
} else {
throw new UserException(
"unsupported hive table serde: " + serDeLib);
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index d2b0bc7ee6c..73b0725233a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -712,6 +712,8 @@ public class SessionVariable implements Serializable, Writable {
"enable_cooldown_replica_affinity";
public static final String SKIP_CHECKING_ACID_VERSION_FILE = "skip_checking_acid_version_file";
+ public static final String READ_HIVE_JSON_IN_ONE_COLUMN = "read_hive_json_in_one_column";
+
/**
* Inserting overwrite for auto partition table allows creating partition for
* datas which cannot find partition to overwrite.
@@ -1224,6 +1226,17 @@ public class SessionVariable implements Serializable, Writable {
@VariableMgr.VarAttr(name = PARALLEL_PREPARE_THRESHOLD, fuzzy = true)
public int parallelPrepareThreshold = 32;
+ @VariableMgr.VarAttr(name = READ_HIVE_JSON_IN_ONE_COLUMN,
+ description = {"在读取hive json的时候,由于存在一些不支持的json格式,我们默认会报错。为了让用户使用体验更好,"
+ + "当该变量为true的时候,将一整行json读取到第一列中,用户可以自行选择对一整行json进行处理,例如JSON_PARSE。"
+ + "需要表的第一列的数据类型为string.",
+ "When reading hive json, we will report an error by default because there are some unsupported "
+ + "json formats. In order to provide users with a better experience, when this variable is true, "
+ + "a whole line of json is read into the first column. Users can choose to process a whole line "
+ + "of json, such as JSON_PARSE. The data type of the first column of the table needs to "
+ + "be string."})
+ private boolean readHiveJsonInOneColumn = false;
+
@VariableMgr.VarAttr(name = ENABLE_COST_BASED_JOIN_REORDER)
private boolean enableJoinReorderBasedCost = false;
@@ -3782,6 +3795,14 @@ public class SessionVariable implements Serializable, Writable {
this.keepCarriageReturn = keepCarriageReturn;
}
+ public boolean isReadHiveJsonInOneColumn() {
+ return readHiveJsonInOneColumn;
+ }
+
+ public void setReadHiveJsonInOneColumn(boolean readHiveJsonInOneColumn) {
+ this.readHiveJsonInOneColumn = readHiveJsonInOneColumn;
+ }
+
public boolean isDropTableIfCtasFailed() {
return dropTableIfCtasFailed;
}
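For the one-column fallback controlled by this new session variable, the intended usage (sketched from the variable's description and the regression test at the end of this commit) looks roughly like:

    SET read_hive_json_in_one_column = true;
    -- each raw json line is read into the first column, which must be STRING;
    -- the user can then post-process it, e.g. with JSON_PARSE, as the
    -- variable description suggests
    SELECT name FROM json_one_column_table;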
diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift
index 407c77fc340..3165a6ac764 100644
--- a/gensrc/thrift/PlanNodes.thrift
+++ b/gensrc/thrift/PlanNodes.thrift
@@ -271,6 +271,9 @@ struct TFileAttributes {
11: optional i32 skip_lines;
//For text type file reading, whether to enable utf8 encoding check.(Catalog && TVF)
12: optional bool enable_text_validate_utf8 = true;
+ // org.openx.data.jsonserde.JsonSerDe
+ 13: optional bool openx_json_ignore_malformed = false;
+
// for cloud copy into
1001: optional bool ignore_csv_redundant_col;
}
diff --git
a/regression-test/data/external_table_p0/hive/test_hive_openx_json.out
b/regression-test/data/external_table_p0/hive/test_hive_openx_json.out
new file mode 100644
index 00000000000..6eadea56694
Binary files /dev/null and
b/regression-test/data/external_table_p0/hive/test_hive_openx_json.out differ
diff --git
a/regression-test/suites/external_table_p0/hive/test_hive_openx_json.groovy
b/regression-test/suites/external_table_p0/hive/test_hive_openx_json.groovy
new file mode 100644
index 00000000000..b9698809c4d
--- /dev/null
+++ b/regression-test/suites/external_table_p0/hive/test_hive_openx_json.groovy
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_hive_openx_json", "p0,external,hive,external_docker,external_docker_hive") {
+
+
+ String enabled = context.config.otherConfigs.get("enableHiveTest")
+ if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+ logger.info("Hive test is disabled.")
+ return;
+ }
+
+ for (String hivePrefix : ["hive3"]) {
+ try {
+ sql """set enable_fallback_to_original_planner=false"""
+ String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+ String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")
+ String catalog_name = "${hivePrefix}_test_hive_openx_json"
+ String broker_name = "hdfs"
+
+ sql """drop catalog if exists ${catalog_name}"""
+ sql """create catalog if not exists ${catalog_name} properties (
+ 'type'='hms',
+ 'hive.metastore.uris'='thrift://${externalEnvIp}:${hms_port}'
+ );"""
+ sql """use `${catalog_name}`.`openx_json`"""
+
+ try {
+ sql """ select * from json_table """;
+ } catch (Exception e) {
+ log.info(e.getMessage())
+ assertTrue(e.getMessage().contains("DATA_QUALITY_ERROR"))
+ }
+
+ order_qt_q1 """ select * from json_table_ignore_malformed """
+
+
+ try {
+ sql """ select * from json_data_arrays_tb """;
+ } catch (Exception e) {
+ log.info(e.getMessage())
+ assertTrue(e.getMessage().contains("DATA_QUALITY_ERROR"))
+ }
+
+
+ try {
+ sql """ select * from scalar_to_array_tb """;
+ } catch (Exception e) {
+ log.info(e.getMessage())
+ assertTrue(e.getMessage().contains("DATA_QUALITY_ERROR"))
+ }
+
+ sql """ set read_hive_json_in_one_column = true; """
+
+ order_qt_2 """ select * from json_data_arrays_tb """
+ order_qt_3 """ select * from json_one_column_table """
+
+ try {
+ sql """ select * from scalar_to_array_tb """;
+ } catch (Exception e) {
+ log.info(e.getMessage())
+ assertTrue(e.getMessage().contains("is not a string column."))
+ }
+
+
+ sql """drop catalog if exists ${catalog_name}"""
+ } finally {
+ }
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]