This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new ef2f765e432 [Bug](json reader) object should stop processing when
encounter error (#31159)
ef2f765e432 is described below
commit ef2f765e4324035242525deaa8321bfb71fa7f33
Author: lihangyu <[email protected]>
AuthorDate: Wed Feb 21 10:13:43 2024 +0800
[Bug](json reader) object should stop processing when encounter error
(#31159)
If a DATA_QUALITY_ERROR is encountered, we should stop processing this document.
Otherwise there will be undefined behavior (UB) in simdjson.
---
be/src/exprs/json_functions.cpp | 2 +-
be/src/vec/exec/format/json/new_json_reader.cpp | 6 ++--
.../data/load_p0/stream_load/test_json_load.out | 4 +++
.../stream_load/test_malformed_json_with_path.json | 3 ++
.../load_p0/stream_load/test_json_load.groovy | 33 ++++++++++++++++++++--
5 files changed, 42 insertions(+), 6 deletions(-)
diff --git a/be/src/exprs/json_functions.cpp b/be/src/exprs/json_functions.cpp
index ff432c4655a..29c1596ed8f 100644
--- a/be/src/exprs/json_functions.cpp
+++ b/be/src/exprs/json_functions.cpp
@@ -261,7 +261,7 @@ Status
JsonFunctions::extract_from_object(simdjson::ondemand::object& obj,
const std::string& _msg = msg;
\
if (UNLIKELY(_err)) {
\
if (_err == simdjson::NO_SUCH_FIELD || _err ==
simdjson::INDEX_OUT_OF_BOUNDS) { \
- return Status::DataQualityError(
\
+ return Status::NotFound<false>(
\
fmt::format("Not found target filed, err: {}, msg:
{}", \
simdjson::error_message(_err), _msg));
\
}
\
diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp
b/be/src/vec/exec/format/json/new_json_reader.cpp
index 97affdcd0bb..514a925cba4 100644
--- a/be/src/vec/exec/format/json/new_json_reader.cpp
+++ b/be/src/vec/exec/format/json/new_json_reader.cpp
@@ -1199,7 +1199,7 @@ Status
NewJsonReader::_simdjson_handle_flat_array_complex_json_write_columns(
simdjson::ondemand::value val;
Status st = JsonFunctions::extract_from_object(cur,
_parsed_json_root, &val);
if (UNLIKELY(!st.ok())) {
- if (st.is<DATA_QUALITY_ERROR>()) {
+ if (st.is<NOT_FOUND>()) {
RETURN_IF_ERROR(_append_error_msg(nullptr,
st.to_string(), "", nullptr));
ADVANCE_ROW();
continue;
@@ -1630,11 +1630,11 @@ Status
NewJsonReader::_simdjson_write_columns_by_jsonpath(
Status st;
if (i < _parsed_jsonpaths.size()) {
st = JsonFunctions::extract_from_object(*value,
_parsed_jsonpaths[i], &json_value);
- if (!st.ok() && !st.is<DATA_QUALITY_ERROR>()) {
+ if (!st.ok() && !st.is<NOT_FOUND>()) {
return st;
}
}
- if (i >= _parsed_jsonpaths.size() || st.is<DATA_QUALITY_ERROR>()) {
+ if (i >= _parsed_jsonpaths.size() || st.is<NOT_FOUND>()) {
// not match in jsondata, filling with default value
RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr,
valid));
if (!(*valid)) {
diff --git a/regression-test/data/load_p0/stream_load/test_json_load.out
b/regression-test/data/load_p0/stream_load/test_json_load.out
index b9250608475..7351891633d 100644
--- a/regression-test/data/load_p0/stream_load/test_json_load.out
+++ b/regression-test/data/load_p0/stream_load/test_json_load.out
@@ -241,3 +241,7 @@ John 30 New York
{"email":"[email protected]","phone":"+1-123-456-7890"}
100 2345676
200 755
+-- !select26 --
+android \N \N \N \N \N
+android \N \N \N \N \N
+
diff --git
a/regression-test/data/load_p0/stream_load/test_malformed_json_with_path.json
b/regression-test/data/load_p0/stream_load/test_malformed_json_with_path.json
new file mode 100644
index 00000000000..f87ebaa5d3b
--- /dev/null
+++
b/regression-test/data/load_p0/stream_load/test_malformed_json_with_path.json
@@ -0,0 +1,3 @@
+{"app_version":"v1.0.0","app_package":"com.fdf.listen","subject":"USER","ip":"45334","platform":"android","app_name":"听听","pro_brand":"图书","report_time":0,"user_id":"unknown","platform_ID":"1","action":"CLICK","event_name":"section_play","phone_num":"45645642692","pro_code":"unknown","event_value":"device_id":"gikj78675678","media_id":"67867","album_id":"1734","duration":"60","event_time":1706841911773,"object":"play_content"}
+{"app_version":"v1.0.0","app_package":"com.fdf.listen","subject":"USER","ip":"45334","platform":"android","app_name":"听听","pro_brand":"图书","report_time":0,"user_id":"unknown","platform_ID":"1","action":"CLICK","event_name":"section_play","phone_num":"45645642692","pro_code":"unknown","device_id":"gikj78675678","media_id":"67867","album_id":"1734","duration":"60","event_time":1706841911773,"object":"play_content"}
+{"app_version":"v1.0.0","app_package":"com.fdf.listen","subject":"USER","ip":"45334","platform":"android","app_name":"听听","pro_brand":"图书","report_time":0,"user_id":"unknown","platform_ID":"1","action":"CLICK","event_name":"section_play","phone_num":"45645642692","pro_code":"unknown","device_id":"gikj78675678","syscode":123,
"media_id":"67867","album_id":"1734","duration":"60","event_time":1706841911773,"object":"play_content"}
\ No newline at end of file
diff --git a/regression-test/suites/load_p0/stream_load/test_json_load.groovy
b/regression-test/suites/load_p0/stream_load/test_json_load.groovy
index 816765b1233..f41610f3ba2 100644
--- a/regression-test/suites/load_p0/stream_load/test_json_load.groovy
+++ b/regression-test/suites/load_p0/stream_load/test_json_load.groovy
@@ -727,6 +727,35 @@ suite("test_json_load", "p0") {
try_sql("DROP TABLE IF EXISTS ${testTable}")
}
+ // case27: import json with malformed json along with json path
+ try {
+ sql "DROP TABLE IF EXISTS ${testTable}"
+
+ sql """CREATE TABLE IF NOT EXISTS ${testTable}
+ (
+ `syscode` VARCHAR(20) NOT NULL COMMENT "",
+ `event_dt` DateTime NULL COMMENT "",
+ `pro_brand` VARCHAR(20) COMMENT "",
+ `app_package` VARCHAR(50) COMMENT "",
+ `platform` VARCHAR(20) COMMENT "",
+ `log_num` BIGINT DEFAULT "0" COMMENT ""
+ )
+ DUPLICATE KEY(`syscode`,
`event_dt`,`pro_brand`,`app_package`,`platform`)
+ COMMENT ''
+ DISTRIBUTED BY RANDOM BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );"""
+
+ load_json_data.call("${testTable}", "${testTable}_case27_1", 'false',
'true', 'json', 'id= id * 10',
'[\"$.platform\",\"$.app_package\",\"$.sysCode\",\"$.sys_code\",\"$.proBrand\",\"$.pro_brand\",\"$.event_time\"]',
+ '', '', '', 'test_malformed_json_with_path.json',
false, 2)
+ sql "sync"
+ qt_select26 "select * from ${testTable}"
+
+ } finally {
+ try_sql("DROP TABLE IF EXISTS ${testTable}")
+ }
+
// test jsonpaths error
try {
sql "DROP TABLE IF EXISTS ${testTable}"
@@ -734,7 +763,7 @@ suite("test_json_load", "p0") {
create_json_test_table.call(testTable)
streamLoad {
table "${testTable}"
- set 'jsonpaths', '[\"Name\", \"Age\", \"Agent_id\"]'
+ set 'jsonpaths', '[\"$.Name\", \"$.Age\", \"$.Agent_id\"]'
set 'format', 'json'
file 'test_json_error.json' // import json file
time 10000 // limit inflight 10s
@@ -754,7 +783,7 @@ suite("test_json_load", "p0") {
def code = process.waitFor()
def out = process.text
log.info("result: ${out}".toString())
- def reason = "Reason: There is no column matching jsonpaths in
the json file, columns:[name, age, agent_id, ], please check columns and
jsonpaths:[\"Name\", \"Age\", \"Agent_id\"]. src line
[{\"name\":\"Name1\",\"age\":21,\"agent_id\":\"1\"}]; \n"
+ def reason = "Reason: There is no column matching jsonpaths in
the json file, columns:[name, age, agent_id, ], please check columns and
jsonpaths:[\"\$.Name\", \"\$.Age\", \"\$.Agent_id\"]. src line
[{\"name\":\"Name1\",\"age\":21,\"agent_id\":\"1\"}]; \n"
assertEquals("${reason}", "${out}")
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]