This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 6bb31b7e1d6f3ce92c3433221ad944090622c583 Author: lihangyu <[email protected]> AuthorDate: Thu Jul 20 10:33:17 2023 +0800 [Improve](simdjson) put unescaped string value after parsed (#21866) In some cases, it is necessary to unescape the original value, such as when converting a string to JSONB. If not unescape, then later jsonb parse will be failed --- be/src/vec/exec/format/json/new_json_reader.cpp | 13 +++- be/src/vec/exec/format/json/new_json_reader.h | 1 + .../data/load_p0/stream_load/test_json_load.out | 3 + .../data/load_p0/stream_load/with_jsonb.json | 1 + .../load_p0/stream_load/test_json_load.groovy | 87 ++++++++++++++-------- 5 files changed, 71 insertions(+), 34 deletions(-) diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp index 94c376ae69..02145c3360 100644 --- a/be/src/vec/exec/format/json/new_json_reader.cpp +++ b/be/src/vec/exec/format/json/new_json_reader.cpp @@ -1095,6 +1095,7 @@ Status NewJsonReader::_simdjson_init_reader() { _slot_desc_index[_file_slot_descs[i]->col_name()] = i; } _simdjson_ondemand_padding_buffer.resize(_padded_size); + _simdjson_ondemand_unscape_padding_buffer.resize(_padded_size); return Status::OK(); } @@ -1479,15 +1480,18 @@ Status NewJsonReader::_simdjson_write_data_to_column(simdjson::ondemand::value& break; } default: { - auto str_view = simdjson::to_json_string(value).value(); if (value.type() == simdjson::ondemand::json_type::string) { + uint8_t* unescape_buffer = + reinterpret_cast<uint8_t*>(&_simdjson_ondemand_unscape_padding_buffer[0]); + std::string_view unescaped_value = + _ondemand_json_parser->unescape(value.get_raw_json_string(), unescape_buffer); nullable_column->get_null_map_data().push_back(0); - // trim - column_string->insert_data(str_view.data() + 1, str_view.length() - 2); + column_string->insert_data(unescaped_value.data(), unescaped_value.length()); break; } + auto value_str = simdjson::to_json_string(value).value(); nullable_column->get_null_map_data().push_back(0); - column_string->insert_data(str_view.data(), str_view.length()); + column_string->insert_data(value_str.data(), value_str.length()); } } *valid = true; @@ -1570,6 +1574,7 @@ Status NewJsonReader::_simdjson_parse_json_doc(size_t* size, bool* eof) { // For efficiency reasons, simdjson requires a string with a few bytes (simdjson::SIMDJSON_PADDING) at the end. // Hence, a re-allocation is needed if the space is not enough. _simdjson_ondemand_padding_buffer.resize(*size + simdjson::SIMDJSON_PADDING); + _simdjson_ondemand_unscape_padding_buffer.resize(*size + simdjson::SIMDJSON_PADDING); _padded_size = *size + simdjson::SIMDJSON_PADDING; } // trim BOM since simdjson does not handle UTF-8 Unicode (with BOM) diff --git a/be/src/vec/exec/format/json/new_json_reader.h b/be/src/vec/exec/format/json/new_json_reader.h index 5dbd5b5c28..99106d100f 100644 --- a/be/src/vec/exec/format/json/new_json_reader.h +++ b/be/src/vec/exec/format/json/new_json_reader.h @@ -254,6 +254,7 @@ private: static constexpr size_t _init_buffer_size = 1024 * 1024 * 8; size_t _padded_size = _init_buffer_size + simdjson::SIMDJSON_PADDING; std::string _simdjson_ondemand_padding_buffer; + std::string _simdjson_ondemand_unscape_padding_buffer; // char _simdjson_ondemand_padding_buffer[_padded_size]; simdjson::ondemand::document _original_json_doc; simdjson::ondemand::value _json_value; diff --git a/regression-test/data/load_p0/stream_load/test_json_load.out b/regression-test/data/load_p0/stream_load/test_json_load.out index f7e7d61607..6296f37099 100644 --- a/regression-test/data/load_p0/stream_load/test_json_load.out +++ b/regression-test/data/load_p0/stream_load/test_json_load.out @@ -199,3 +199,6 @@ 10 hefei 23456710 200 changsha 3456789 +-- !select1 -- +John 30 New York {"email":"[email protected]","phone":"+1-123-456-7890"} + diff --git a/regression-test/data/load_p0/stream_load/with_jsonb.json b/regression-test/data/load_p0/stream_load/with_jsonb.json new file mode 100644 index 0000000000..97c537fa0e --- /dev/null +++ b/regression-test/data/load_p0/stream_load/with_jsonb.json @@ -0,0 +1 @@ +{ "name": "John", "age": 30, "city": "New York", "contact": "{ \"email\": \"[email protected]\", \"phone\": \"+1-123-456-7890\" }" } diff --git a/regression-test/suites/load_p0/stream_load/test_json_load.groovy b/regression-test/suites/load_p0/stream_load/test_json_load.groovy index 4e3d2dbf81..7a69114d11 100644 --- a/regression-test/suites/load_p0/stream_load/test_json_load.groovy +++ b/regression-test/suites/load_p0/stream_load/test_json_load.groovy @@ -114,13 +114,13 @@ suite("test_json_load", "p0") { assertTrue(result1[0][0] == 0, "Create table should update 0 rows") } - def load_json_data = {label, strip_flag, read_flag, format_flag, exprs, json_paths, + def load_json_data = {table_name, label, strip_flag, read_flag, format_flag, exprs, json_paths, json_root, where_expr, fuzzy_flag, file_name, ignore_failure=false, expected_succ_rows = -1, load_to_single_tablet = 'true' -> // load the json data streamLoad { - table "test_json_load" + table "${table_name}" // set http request header params set 'label', label + "_" + UUID.randomUUID().toString() @@ -216,7 +216,7 @@ suite("test_json_load", "p0") { create_test_table1.call(testTable) - load_json_data.call('test_json_load_case1_2', 'true', '', 'json', '', '', '', '', '', 'simple_json.json') + load_json_data.call("test_json_load", 'test_json_load_case1_2', 'true', '', 'json', '', '', '', '', '', 'simple_json.json') sql "sync" qt_select1 "select * from ${testTable} order by id" @@ -231,7 +231,7 @@ suite("test_json_load", "p0") { create_test_table1.call(testTable) - load_json_data.call('test_json_load_case2_2', 'true', '', 'json', 'id= id * 10', '', '', '', '', 'simple_json.json') + load_json_data.call("test_json_load", 'test_json_load_case2_2', 'true', '', 'json', 'id= id * 10', '', '', '', '', 'simple_json.json') sql "sync" qt_select2 "select * from ${testTable} order by id" @@ -246,7 +246,7 @@ suite("test_json_load", "p0") { create_test_table2.call(testTable) - load_json_data.call('test_json_load_case3_2', 'true', '', 'json', '', '[\"$.id\", \"$.code\"]', + load_json_data.call("test_json_load", 'test_json_load_case3_2', 'true', '', 'json', '', '[\"$.id\", \"$.code\"]', '', '', '', 'simple_json.json') sql "sync" @@ -262,7 +262,7 @@ suite("test_json_load", "p0") { create_test_table2.call(testTable) - load_json_data.call('test_json_load_case4_2', 'true', '', 'json', 'code = id * 10 + 200', '[\"$.id\"]', + load_json_data.call("test_json_load", 'test_json_load_case4_2', 'true', '', 'json', 'code = id * 10 + 200', '[\"$.id\"]', '', '', '', 'simple_json.json') sql "sync" @@ -278,7 +278,7 @@ suite("test_json_load", "p0") { create_test_table2.call(testTable) - load_json_data.call('test_json_load_case5_2', 'true', 'true', 'json', '', '[\"$.id\", \"$.code\"]', + load_json_data.call("test_json_load", 'test_json_load_case5_2', 'true', 'true', 'json', '', '[\"$.id\", \"$.code\"]', '', '', '', 'multi_line_json.json') sql "sync" @@ -294,7 +294,7 @@ suite("test_json_load", "p0") { create_test_table2.call(testTable) - load_json_data.call('test_json_load_case6_2', 'true', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]', + load_json_data.call("test_json_load", 'test_json_load_case6_2', 'true', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]', '', '', '', 'multi_line_json.json') sql "sync" @@ -310,7 +310,7 @@ suite("test_json_load", "p0") { create_test_table2.call(testTable) - load_json_data.call('test_json_load_case7_2', 'true', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]', + load_json_data.call("test_json_load", 'test_json_load_case7_2', 'true', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]', '', 'id > 50', '', 'multi_line_json.json') sql "sync" @@ -326,7 +326,7 @@ suite("test_json_load", "p0") { create_test_table2.call(testTable) - load_json_data.call('test_json_load_case8_2', 'true', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]', + load_json_data.call("test_json_load", 'test_json_load_case8_2', 'true', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]', '', 'id > 50', 'true', 'multi_line_json.json') sql "sync" @@ -342,7 +342,7 @@ suite("test_json_load", "p0") { create_test_table1.call(testTable) - load_json_data.call('test_json_load_case9_2', '', 'true', 'json', 'id= id * 10', '', + load_json_data.call("test_json_load", 'test_json_load_case9_2', '', 'true', 'json', 'id= id * 10', '', '$.item', '', 'true', 'nest_json.json') sql "sync" @@ -358,7 +358,7 @@ suite("test_json_load", "p0") { create_test_table1.call(testTable) - load_json_data.call('test_json_load_case10_2', '', 'true', 'json', 'id= id * 10', '', + load_json_data.call("test_json_load", 'test_json_load_case10_2', '', 'true', 'json', 'id= id * 10', '', '$.item', '', 'false', 'invalid_json.json', false, 4) sql "sync" @@ -374,7 +374,7 @@ suite("test_json_load", "p0") { create_test_table1.call(testTable) - load_json_data.call('test_json_load_case11_2', 'true', '', 'json', '', '', '', '', '', 'simple_json2.json', false, 10) + load_json_data.call("test_json_load", 'test_json_load_case11_2', 'true', '', 'json', '', '', '', '', '', 'simple_json2.json', false, 10) sql "sync" qt_select11 "select * from ${testTable} order by id" @@ -389,7 +389,7 @@ suite("test_json_load", "p0") { create_test_table1.call(testTable) - load_json_data.call('test_json_load_case12_2', 'true', '', 'json', '', '', '', '', '', 'simple_json2_lack_one_column.json') + load_json_data.call("test_json_load", 'test_json_load_case12_2', 'true', '', 'json', '', '', '', '', '', 'simple_json2_lack_one_column.json') sql "sync" qt_select12 "select * from ${testTable} order by id" @@ -441,15 +441,15 @@ suite("test_json_load", "p0") { create_test_table1.call(testTable) - load_json_data.call('test_json_load_case14_2', '', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]', + load_json_data.call("test_json_load", 'test_json_load_case14_2', '', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]', '$.item', '', 'true', 'nest_json.json') // invalid nest_json - load_json_data.call('test_json_load_case14_3', '', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.city\", \"$.code\"]', + load_json_data.call("test_json_load", 'test_json_load_case14_3', '', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.city\", \"$.code\"]', '$.item', '', 'true', 'invalid_nest_json1.json', true) - load_json_data.call('test_json_load_case14_4', '', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.city\", \"$.code\"]', + load_json_data.call("test_json_load", 'test_json_load_case14_4', '', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.city\", \"$.code\"]', '$.item', '', 'true', 'invalid_nest_json2.json', false, 7) - load_json_data.call('test_json_load_case14_5', '', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.city\", \"$.code\"]', + load_json_data.call("test_json_load", 'test_json_load_case14_5', '', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.city\", \"$.code\"]', '$.item', '', 'true', 'invalid_nest_json3.json', true) sql "sync" @@ -465,7 +465,7 @@ suite("test_json_load", "p0") { create_test_table1.call(testTable) - load_json_data.call('test_json_load_case15_2', '', 'true', 'json', 'id, code, city,id= id * 10', + load_json_data.call("test_json_load", 'test_json_load_case15_2', '', 'true', 'json', 'id, code, city,id= id * 10', '[\"$.id\", \"$.code\", \"$.city\"]', '$.item', '', 'true', 'nest_json.json') sql "sync" @@ -481,7 +481,7 @@ suite("test_json_load", "p0") { create_test_table1.call(testTable) - load_json_data.call('test_json_load_case16_2', 'true', '', 'json', 'id, code, city', + load_json_data.call("test_json_load", 'test_json_load_case16_2', 'true', '', 'json', 'id, code, city', '[\"$.id\", \"$.code\", \"$.city[2]\"]', '$.item', '', 'true', 'nest_json_array.json', false, 7) sql "sync" @@ -496,13 +496,13 @@ suite("test_json_load", "p0") { sql "DROP TABLE IF EXISTS ${testTable}" test_invalid_json_array_table.call(testTable) - load_json_data.call('test_json_load_case17', 'true', '', 'json', '', '', + load_json_data.call("test_json_load", 'test_json_load_case17', 'true', '', 'json', '', '', '', '', '', 'invalid_json_array.json', false, 0, 'false') - load_json_data.call('test_json_load_case17_1', 'true', '', 'json', '', '', + load_json_data.call("test_json_load", 'test_json_load_case17_1', 'true', '', 'json', '', '', '$.item', '', '', 'invalid_json_array1.json', false, 0, 'false') - load_json_data.call('test_json_load_case17_2', 'true', '', 'json', '', '', + load_json_data.call("test_json_load", 'test_json_load_case17_2', 'true', '', 'json', '', '', '$.item', '', '', 'invalid_json_array2.json', false, 0, 'false') - load_json_data.call('test_json_load_case17_3', 'true', '', 'json', '', '', + load_json_data.call("test_json_load", 'test_json_load_case17_3', 'true', '', 'json', '', '', '$.item', '', '', 'invalid_json_array3.json', false, 0, 'false') sql "sync" qt_select17 "select * from ${testTable}" @@ -516,13 +516,13 @@ suite("test_json_load", "p0") { sql "DROP TABLE IF EXISTS ${testTable}" create_test_table1.call(testTable) - load_json_data.call('test_json_load_case16_2', 'true', '', 'json', 'id, code, city', + load_json_data.call("test_json_load", 'test_json_load_case16_2', 'true', '', 'json', 'id, code, city', '[\"$.id\", \"$.code\", \"$.city[2]\"]', '$.item', '', 'true', 'invalid_nest_json_array.json', true) - load_json_data.call('test_json_load_case16_2', 'true', '', 'json', 'id, code, city', + load_json_data.call("test_json_load", 'test_json_load_case16_2', 'true', '', 'json', 'id, code, city', '[\"$.id\", \"$.code\", \"$.city[100]\"]', '$.item', '', 'true', 'invalid_nest_json_array1.json', true) - load_json_data.call('test_json_load_case16_2', 'true', '', 'json', 'id, code, city', + load_json_data.call("test_json_load", 'test_json_load_case16_2', 'true', '', 'json', 'id, code, city', '[\"$.id\", \"$.code\", \"$.city\"]', '$.item', '', 'true', 'invalid_nest_json_array2.json', true) - load_json_data.call('test_json_load_case16_2', 'true', '', 'json', 'id, code, city', + load_json_data.call("test_json_load", 'test_json_load_case16_2', 'true', '', 'json', 'id, code, city', '[\"$.id\", \"$.code\", \"$.city[2]\"]', '$.item', '', 'true', 'invalid_nest_json_array3.json', true) sql "sync" @@ -537,7 +537,7 @@ suite("test_json_load", "p0") { sql "DROP TABLE IF EXISTS ${testTable}" create_test_table1.call(testTable) - load_json_data.call('test_json_load_case19', 'false', 'true', 'json', 'Id, cIty, CodE', '', + load_json_data.call("test_json_load", 'test_json_load_case19', 'false', 'true', 'json', 'Id, cIty, CodE', '', '', '', '', 'case_sensitive_json.json', false, 2) sql "sync" qt_select19 "select * from ${testTable} order by id" @@ -552,7 +552,7 @@ suite("test_json_load", "p0") { create_test_table1.call(testTable) - load_json_data.call('test_json_load_case1_2', 'true', '', 'json', '', '', '', '', '', 'simple_json_bom.json') + load_json_data.call("test_json_load", 'test_json_load_case1_2', 'true', '', 'json', '', '', '', '', '', 'simple_json_bom.json') sql "sync" qt_select1 "select * from ${testTable} order by id" @@ -561,6 +561,33 @@ suite("test_json_load", "p0") { try_sql("DROP TABLE IF EXISTS ${testTable}") } + // case21: import json with jsonb field + try { + testTable = "with_jsonb" + sql "DROP TABLE IF EXISTS ${testTable}" + sql """ + CREATE TABLE ${testTable} ( + `name` varchar(29) NOT NULL COMMENT '年月', + `age` int, + `city` varchar(29), + `contact` jsonb + ) ENGINE=OLAP + DUPLICATE KEY(`name`) + COMMENT '流水月结明细表' + DISTRIBUTED BY RANDOM BUCKETS auto + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + load_json_data.call("with_jsonb", "with_jsonb_load" , 'false', '', 'json', '', '', '', '', '', 'with_jsonb.json') + + sql "sync" + qt_select1 "select * from ${testTable}" + + } finally { + try_sql("DROP TABLE IF EXISTS ${testTable}") + } + // if 'enableHdfs' in regression-conf.groovy has been set to true, // the test will run these case as below. if (enableHdfs()) { --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
