This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 436a73b231b branch-4.0: [fix](load) Fix undefined behavior in the JSON
reader caused by multiple calls to the get_string method #58107 (#58185)
436a73b231b is described below
commit 436a73b231b939d103264cff2ab141d68d442255
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Thu Nov 20 22:23:56 2025 +0800
branch-4.0: [fix](load) Fix undefined behavior in the JSON reader caused by
multiple calls to the get_string method #58107 (#58185)
Cherry-picked from #58107
Co-authored-by: Jerry Hu <[email protected]>
---
be/src/vec/exec/format/json/new_json_reader.cpp | 34 +++++++--
be/src/vec/exec/format/json/new_json_reader.h | 20 +++++
.../stream_load/load_json_with_jsonpath.out | 10 +++
.../stream_load/test_load_with_jsonpath2.json | 20 +++++
.../stream_load/load_json_with_jsonpath.groovy | 87 +++++++++++++++++++++-
5 files changed, 160 insertions(+), 11 deletions(-)
diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp
b/be/src/vec/exec/format/json/new_json_reader.cpp
index bd143d9c4bd..a78d6ea0ed6 100644
--- a/be/src/vec/exec/format/json/new_json_reader.cpp
+++ b/be/src/vec/exec/format/json/new_json_reader.cpp
@@ -909,6 +909,7 @@ Status
NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val
bool has_valid_value = false;
// iterate through object, simdjson::ondemand will parse on the fly
size_t key_index = 0;
+
for (auto field : *value) {
std::string_view key = field.unescaped_key();
StringRef name_ref(key.data(), key.size());
@@ -937,7 +938,7 @@ Status
NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val
}
simdjson::ondemand::value val = field.value();
auto* column_ptr =
block.get_by_position(column_index).column->assume_mutable().get();
- RETURN_IF_ERROR(_simdjson_write_data_to_column(
+ RETURN_IF_ERROR(_simdjson_write_data_to_column<false>(
val, slot_descs[column_index]->type(), column_ptr,
slot_descs[column_index]->col_name(), _serdes[column_index],
valid));
if (!(*valid)) {
@@ -1028,6 +1029,7 @@ Status
NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val
return Status::OK();
}
+template <bool use_string_cache>
Status
NewJsonReader::_simdjson_write_data_to_column(simdjson::ondemand::value& value,
const DataTypePtr&
type_desc,
vectorized::IColumn*
column_ptr,
@@ -1064,7 +1066,20 @@ Status
NewJsonReader::_simdjson_write_data_to_column(simdjson::ondemand::value&
auto primitive_type = type_desc->get_primitive_type();
if (_is_load || !is_complex_type(primitive_type)) {
if (value.type() == simdjson::ondemand::json_type::string) {
- std::string_view value_string = value.get_string();
+ std::string_view value_string;
+ if constexpr (use_string_cache) {
+ const auto cache_key = value.raw_json().value();
+ if (_cached_string_values.contains(cache_key)) {
+ value_string = _cached_string_values[cache_key];
+ } else {
+ value_string = value.get_string();
+ _cached_string_values.emplace(cache_key, value_string);
+ }
+ } else {
+ DCHECK(_cached_string_values.empty());
+ value_string = value.get_string();
+ }
+
Slice slice {value_string.data(), value_string.size()};
RETURN_IF_ERROR(data_serde->deserialize_one_cell_from_json(*data_column_ptr,
slice,
_serde_options));
@@ -1125,7 +1140,7 @@ Status
NewJsonReader::_simdjson_write_data_to_column(simdjson::ondemand::value&
has_value[sub_column_idx] = true;
const auto& sub_col_type =
type_struct->get_element(sub_column_idx);
- RETURN_IF_ERROR(_simdjson_write_data_to_column(
+ RETURN_IF_ERROR(_simdjson_write_data_to_column<use_string_cache>(
sub.value(), sub_col_type, sub_column_ptr.get(),
column_name + "." + sub_key,
sub_serdes[sub_column_idx], valid));
}
@@ -1184,7 +1199,7 @@ Status
NewJsonReader::_simdjson_write_data_to_column(simdjson::ondemand::value&
sub_serdes[0], _serde_options, valid));
simdjson::ondemand::value field_value = member_value.value();
- RETURN_IF_ERROR(_simdjson_write_data_to_column(
+ RETURN_IF_ERROR(_simdjson_write_data_to_column<use_string_cache>(
field_value,
assert_cast<const
DataTypeMap*>(remove_nullable(type_desc).get())
->get_value_type(),
@@ -1209,7 +1224,7 @@ Status
NewJsonReader::_simdjson_write_data_to_column(simdjson::ondemand::value&
int field_count = 0;
for (simdjson::ondemand::value sub_value : array_value) {
- RETURN_IF_ERROR(_simdjson_write_data_to_column(
+ RETURN_IF_ERROR(_simdjson_write_data_to_column<use_string_cache>(
sub_value,
assert_cast<const
DataTypeArray*>(remove_nullable(type_desc).get())
->get_nested_type(),
@@ -1405,6 +1420,9 @@ Status NewJsonReader::_simdjson_write_columns_by_jsonpath(
Block& block, bool* valid) {
// write by jsonpath
bool has_valid_value = false;
+
+ Defer clear_defer([this]() { _cached_string_values.clear(); });
+
for (size_t i = 0; i < slot_descs.size(); i++) {
auto* slot_desc = slot_descs[i];
if (!slot_desc->is_materialized()) {
@@ -1439,9 +1457,9 @@ Status NewJsonReader::_simdjson_write_columns_by_jsonpath(
return Status::OK();
}
} else {
- RETURN_IF_ERROR(_simdjson_write_data_to_column(json_value,
slot_desc->type(),
- column_ptr,
slot_desc->col_name(),
- _serdes[i], valid));
+ RETURN_IF_ERROR(_simdjson_write_data_to_column<true>(json_value,
slot_desc->type(),
+ column_ptr,
slot_desc->col_name(),
+ _serdes[i],
valid));
if (!(*valid)) {
return Status::OK();
}
diff --git a/be/src/vec/exec/format/json/new_json_reader.h
b/be/src/vec/exec/format/json/new_json_reader.h
index 283b1c6251e..338611ecacf 100644
--- a/be/src/vec/exec/format/json/new_json_reader.h
+++ b/be/src/vec/exec/format/json/new_json_reader.h
@@ -26,6 +26,7 @@
#include <memory>
#include <string>
+#include <string_view>
#include <unordered_map>
#include <unordered_set>
#include <vector>
@@ -143,6 +144,7 @@ private:
Status _simdjson_set_column_value(simdjson::ondemand::object* value,
Block& block,
const std::vector<SlotDescriptor*>&
slot_descs, bool* valid);
+ template <bool use_string_cache>
Status _simdjson_write_data_to_column(simdjson::ondemand::value& value,
const DataTypePtr& type_desc,
vectorized::IColumn* column_ptr,
@@ -258,6 +260,24 @@ private:
// column to default value string map
std::unordered_map<std::string, std::string> _col_default_value_map;
+ // From document of simdjson:
+ // ```
+ // Important: a value should be consumed once. Calling get_string()
twice on the same value is an error.
+ // ```
+ // We should cache the string_views to avoid multiple get_string() calls.
+ struct StringViewHash {
+ size_t operator()(const std::string_view& str) const {
+ return std::hash<int64_t>()(reinterpret_cast<int64_t>(str.data()));
+ }
+ };
+ struct StringViewEqual {
+ bool operator()(const std::string_view& lhs, const std::string_view&
rhs) const {
+ return lhs.data() == rhs.data() && lhs.size() == rhs.size();
+ }
+ };
+ std::unordered_map<std::string_view, std::string_view, StringViewHash,
StringViewEqual>
+ _cached_string_values;
+
int32_t skip_bitmap_col_idx {-1};
// Used to indicate whether it is a stream load. When loading, only data
will be inserted into columnString.
diff --git
a/regression-test/data/load_p0/stream_load/load_json_with_jsonpath.out
b/regression-test/data/load_p0/stream_load/load_json_with_jsonpath.out
index 43037b624dd..84e9bd05565 100644
--- a/regression-test/data/load_p0/stream_load/load_json_with_jsonpath.out
+++ b/regression-test/data/load_p0/stream_load/load_json_with_jsonpath.out
@@ -9,3 +9,13 @@
1000 7395.231067
2000 \N
+-- !test_select2 --
+22 7291 7291 7291 \N I am a long string here. I am a
long string here. I am a long string here. I am another long
string here 4. I am another long string here 4. I am another long
string here 4. I am another long string here 4. I am another long
string here 4. I am another long string here 4. I am another long
string here 4. I am another long string here 4. \N
+7291.703724 2000 2000 2000 \N I am a long string here.
I am a long string here. I am a long string here. I am another
long string here 3. I am another long string here 3. I am another
long string here 3. I am another long string here 3. I am another
long string here 3. I am another long string here 3. I am another
long string here 3. I am another long string here 3. \N
+7395.231067 1000 1000 1000 \N I am a long string here.
I am a long string here. I am a long string here. I am another
long string here 2. I am another long string here 2. I am another
long string here 2. I am another long string here 2. I am another
long string here 2. I am another long string here 2. I am another
long string here 2. I am another long string here 2. \N
+
+-- !test_select3 --
+22 7291 I am a long string here. I am another long string here 4.
+7291.703724 2000 I am a long string here. I am another long
string here 3.
+7395.231067 1000 I am a long string here. I am another long
string here 2.
+
diff --git
a/regression-test/data/load_p0/stream_load/test_load_with_jsonpath2.json
b/regression-test/data/load_p0/stream_load/test_load_with_jsonpath2.json
new file mode 100644
index 00000000000..0028701ca0e
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/test_load_with_jsonpath2.json
@@ -0,0 +1,20 @@
+[
+ {
+ "k1": "7395.231067",
+ "k2": "1000",
+ "k3": "I am a long string here.",
+ "k4": "I am another long string here 2."
+ },
+ {
+ "k1": "7291.703724",
+ "k2": "2000",
+ "k3": "I am a long string here.",
+ "k4": "I am another long string here 3."
+ },
+ {
+ "k1": "22",
+ "k2": "7291",
+ "k3": "I am a long string here.",
+ "k4": "I am another long string here 4."
+ }
+]
\ No newline at end of file
diff --git
a/regression-test/suites/load_p0/stream_load/load_json_with_jsonpath.groovy
b/regression-test/suites/load_p0/stream_load/load_json_with_jsonpath.groovy
index 963cfbfa9a6..d28a17140b8 100644
--- a/regression-test/suites/load_p0/stream_load/load_json_with_jsonpath.groovy
+++ b/regression-test/suites/load_p0/stream_load/load_json_with_jsonpath.groovy
@@ -34,7 +34,7 @@ suite("test_load_json_with_jsonpath", "p0") {
"""
}
- def load_array_data = {new_json_reader_flag, table_name, strip_flag,
read_flag, format_flag, exprs, json_paths,
+ def load_array_data = { table_name, strip_flag, read_flag, format_flag,
exprs, json_paths,
json_root, where_expr, fuzzy_flag, column_sep,
file_name ->
// load the json data
streamLoad {
@@ -82,17 +82,98 @@ suite("test_load_json_with_jsonpath", "p0") {
create_test_table.call()
- load_array_data.call('false', testTable, 'true', '', 'json', '',
'["$.k1", "$.v1"]', '', '', '', '', 'test_load_with_jsonpath.json')
+ load_array_data.call(testTable, 'true', '', 'json', '', '["$.k1",
"$.v1"]', '', '', '', '', 'test_load_with_jsonpath.json')
check_data_correct(testTable)
// test new json load, should be deleted after new_load_scan ready
sql "DROP TABLE IF EXISTS ${testTable}"
create_test_table.call()
- load_array_data.call('true', testTable, 'true', '', 'json', '',
'["$.k1", "$.v1"]', '', '', '', '', 'test_load_with_jsonpath.json')
+ load_array_data.call(testTable, 'true', '', 'json', '', '["$.k1",
"$.v1"]', '', '', '', '', 'test_load_with_jsonpath.json')
check_data_correct(testTable)
} finally {
try_sql("DROP TABLE IF EXISTS ${testTable}")
}
+
+ // case2 with duplicate json paths
+ try {
+ sql "DROP TABLE IF EXISTS tbl_test_load_json_with_jsonpath2"
+
+ sql """
+ CREATE TABLE IF NOT EXISTS tbl_test_load_json_with_jsonpath2 (
+ `k1` varchar(128) NULL COMMENT "",
+ `k2` int NULL COMMENT "",
+ `k22` int NULL COMMENT "",
+ `k222` int NULL COMMENT "",
+ `k2222` int NULL COMMENT "",
+ `k3` STRING NULL COMMENT "",
+ `k33` STRING NULL COMMENT "",
+ `k333` STRING NULL COMMENT "",
+ `k3333` STRING NULL COMMENT "",
+ `k4` STRING NULL COMMENT "",
+ `k44` STRING NULL COMMENT "",
+ `k444` STRING NULL COMMENT "",
+ `k4444` STRING NULL COMMENT "",
+ `k44444` STRING NULL COMMENT "",
+ `k444444` STRING NULL COMMENT "",
+ `k4444444` STRING NULL COMMENT "",
+ `k44444444` STRING NULL COMMENT "",
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`k1`)
+ DISTRIBUTED BY HASH(`k1`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "storage_format" = "V2"
+ )
+ """
+
+ load_array_data.call(
+ 'tbl_test_load_json_with_jsonpath2',
+ 'true',
+ '',
+ 'json',
+ '',
+ '["$.k1", "$.k2", "$.k2", "$.k2", "$.k3", "$.k3", "$.k3","$.k3",
"$.k4", "$.k4", "$.k4", "$.k4", "$.k4", "$.k4", "$.k4", "$.k4"]',
+ '', '', '', '', 'test_load_with_jsonpath2.json')
+
+ qt_test_select2 "select * from tbl_test_load_json_with_jsonpath2 order
by k1"
+
+ } finally {
+
+ }
+
+ // case3 without json paths
+ try {
+ sql "DROP TABLE IF EXISTS tbl_test_load_json_with_jsonpath3"
+
+ sql """
+ CREATE TABLE IF NOT EXISTS tbl_test_load_json_with_jsonpath3 (
+ `k1` varchar(128) NULL COMMENT "",
+ `k2` int NULL COMMENT "",
+ `k3` STRING NULL COMMENT "",
+ `k4` STRING NULL COMMENT ""
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`k1`)
+ DISTRIBUTED BY HASH(`k1`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "storage_format" = "V2"
+ )
+ """
+
+ load_array_data.call(
+ 'tbl_test_load_json_with_jsonpath3',
+ 'true',
+ '',
+ 'json',
+ '',
+ '[]',
+ '', '', '', '', 'test_load_with_jsonpath2.json')
+
+ qt_test_select3 "select * from tbl_test_load_json_with_jsonpath3 order
by k1"
+
+ } finally {
+
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]