This is an automated email from the ASF dual-hosted git repository.
eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 195fb730d40 [improve](serde) support json string format with escaped
charactors (#37176)
195fb730d40 is described below
commit 195fb730d409c522da7bf03705cf9adb633fb679
Author: amory <[email protected]>
AuthorDate: Thu Jul 4 15:06:06 2024 +0800
[improve](serde) support json string format with escaped charactors (#37176)
before this pr
if we use some escape from nested types like array, we can not make this
string cast to json
---
.../vec/data_types/serde/data_type_string_serde.h | 46 ++++++++++++-
be/src/vec/functions/function_cast.h | 1 +
regression-test/data/jsonb_p0/test_jsonb_cast.csv | 4 ++
regression-test/data/jsonb_p0/test_jsonb_cast.out | 27 ++++++++
.../suites/jsonb_p0/test_jsonb_cast.groovy | 79 ++++++++++++++++++++++
5 files changed, 154 insertions(+), 3 deletions(-)
diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h
b/be/src/vec/data_types/serde/data_type_string_serde.h
index b74b5857086..24f99a12e67 100644
--- a/be/src/vec/data_types/serde/data_type_string_serde.h
+++ b/be/src/vec/data_types/serde/data_type_string_serde.h
@@ -73,19 +73,59 @@ public:
auto result = check_column_const_set_readability(column, row_num);
ColumnPtr ptr = result.first;
row_num = result.second;
+ const auto& value = assert_cast<const
ColumnType&>(*ptr).get_data_at(row_num);
if (_nesting_level > 1) {
bw.write('"');
}
-
- const auto& value = assert_cast<const
ColumnType&>(*ptr).get_data_at(row_num);
- bw.write(value.data, value.size);
+ if constexpr (std::is_same_v<ColumnType, ColumnString>) {
+ if (options.escape_char != 0) {
+ // we should make deal with some special characters in json
str if we have escape_char
+ StringRef str_ref = value;
+ write_with_escaped_char_to_json(str_ref, bw);
+ } else {
+ bw.write(value.data, value.size);
+ }
+ } else {
+ bw.write(value.data, value.size);
+ }
if (_nesting_level > 1) {
bw.write('"');
}
+
return Status::OK();
}
+ inline void write_with_escaped_char_to_json(StringRef value,
BufferWritable& bw) const {
+ for (char it : value) {
+ switch (it) {
+ case '\b':
+ bw.write("\\b", 2);
+ break;
+ case '\f':
+ bw.write("\\f", 2);
+ break;
+ case '\n':
+ bw.write("\\n", 2);
+ break;
+ case '\r':
+ bw.write("\\r", 2);
+ break;
+ case '\t':
+ bw.write("\\t", 2);
+ break;
+ case '\\':
+ bw.write("\\\\", 2);
+ break;
+ case '"':
+ bw.write("\\\"", 2);
+ break;
+ default:
+ bw.write(it);
+ }
+ }
+ }
+
Status serialize_column_to_json(const IColumn& column, int start_idx, int
end_idx,
BufferWritable& bw, FormatOptions&
options) const override {
SERIALIZE_COLUMN_TO_JSON();
diff --git a/be/src/vec/functions/function_cast.h
b/be/src/vec/functions/function_cast.h
index 17250e10fd7..d4b21aacc5c 100644
--- a/be/src/vec/functions/function_cast.h
+++ b/be/src/vec/functions/function_cast.h
@@ -766,6 +766,7 @@ struct ConvertImplGenericToJsonb {
auto tmp_col = ColumnString::create();
vectorized::DataTypeSerDe::FormatOptions options;
+ options.escape_char = '\\';
for (size_t i = 0; i < input_rows_count; i++) {
// convert to string
tmp_col->clear();
diff --git a/regression-test/data/jsonb_p0/test_jsonb_cast.csv
b/regression-test/data/jsonb_p0/test_jsonb_cast.csv
new file mode 100644
index 00000000000..08b694ddea8
--- /dev/null
+++ b/regression-test/data/jsonb_p0/test_jsonb_cast.csv
@@ -0,0 +1,4 @@
+1 \N
+2 ['{\'x\' : \'{"y" : 1}\', \'t\' : \'{"y" : 2}\'}', '{"x" : 1}']
+3 ['foo\'bar', 'foo"bar', 'foo\\'bar', 'foo\'\'bar']
+4 ['\/some\/cool\/url', '/some/cool/url',
'a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e']
\ No newline at end of file
diff --git a/regression-test/data/jsonb_p0/test_jsonb_cast.out
b/regression-test/data/jsonb_p0/test_jsonb_cast.out
new file mode 100644
index 00000000000..2ab4174c746
--- /dev/null
+++ b/regression-test/data/jsonb_p0/test_jsonb_cast.out
@@ -0,0 +1,27 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !select_1 --
+1 \N
+2 ["{\\'x\\' : \\'{"y" : 1}\\', \\'t\\' : \\'{"y" : 2}\\'}", "{"x" : 1}"]
+3 ["foo\\'bar', 'foo"bar', 'foo\\\\'bar', 'foo\\'\\'bar"]
+4 ["\\/some\\/cool\\/url", "/some/cool/url",
"a\\\\_\\\\c\\\\l\\\\i\\\\c\\\\k\\\\h\\\\o\\\\u\\\\s\\\\e"]
+
+-- !select_2 --
+1 \N
+2 ["{\\'x\\' : \\'{"y" : 1}\\', \\'t\\' : \\'{"y" : 2}\\'}", "{"x" : 1}"]
+3 ["foo\\'bar', 'foo"bar', 'foo\\\\'bar', 'foo\\'\\'bar"]
+4 ["\\/some\\/cool\\/url", "/some/cool/url",
"a\\\\_\\\\c\\\\l\\\\i\\\\c\\\\k\\\\h\\\\o\\\\u\\\\s\\\\e"]
+27 ["{"k1":"v1", "k2": 200}"]
+28 ["{"a.b.c":{"k1.a1":"v31", "k2": 300},"a":"niu"}"]
+29 ["\n\r", "\n\r"]
+30 ["f\r\n", "f\r\n""]
+
+-- !select_json --
+1 \N
+2 ["{\\\\'x\\\\' : \\\\'{\\"y\\" : 1}\\\\', \\\\'t\\\\' : \\\\'{\\"y\\" :
2}\\\\'}","{\\"x\\" : 1}"]
+3 ["foo\\\\'bar', 'foo\\"bar', 'foo\\\\\\\\'bar', 'foo\\\\'\\\\'bar"]
+4
["\\\\/some\\\\/cool\\\\/url","/some/cool/url","a\\\\\\\\_\\\\\\\\c\\\\\\\\l\\\\\\\\i\\\\\\\\c\\\\\\\\k\\\\\\\\h\\\\\\\\o\\\\\\\\u\\\\\\\\s\\\\\\\\e"]
+27 ["{\\"k1\\":\\"v1\\", \\"k2\\": 200}"]
+28 ["{\\"a.b.c\\":{\\"k1.a1\\":\\"v31\\", \\"k2\\":
300},\\"a\\":\\"niu\\"}"]
+29 ["\\f\\n\\r","\\f\\n\\r"]
+30 ["f\\b\\r\\n","f\\b\\r\\n\\""]
+
diff --git a/regression-test/suites/jsonb_p0/test_jsonb_cast.groovy
b/regression-test/suites/jsonb_p0/test_jsonb_cast.groovy
new file mode 100644
index 00000000000..4d1b2aa7181
--- /dev/null
+++ b/regression-test/suites/jsonb_p0/test_jsonb_cast.groovy
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.codehaus.groovy.runtime.IOGroovyMethods
+
+suite("test_jsonb_cast", "p0") {
+
+ // define a sql table with array<text> which has some Escape Character and
should also to cast to json
+ def testTable = "tbl_test_array_text_cast_jsonb"
+ def dataFile = "test_jsonb_cast.csv"
+
+ sql """ set experimental_enable_nereids_planner = true """
+ sql """ set enable_fallback_to_original_planner = true """
+
+ sql "DROP TABLE IF EXISTS ${testTable}"
+
+ sql """
+ CREATE TABLE IF NOT EXISTS ${testTable} (
+ id INT,
+ a ARRAY<TEXT>,
+ )
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 3
+ PROPERTIES("replication_num" = "1");
+ """
+
+ // load the jsonb data from csv file
+ streamLoad {
+ table testTable
+
+ file dataFile // import csv file
+ time 10000 // limit inflight 10s
+ set 'strict_mode', 'true'
+
+ // if declared a check callback, the default check condition will
ignore.
+ // So you must check all condition
+ check { result, exception, startTime, endTime ->
+ if (exception != null) {
+ throw exception
+ }
+ log.info("Stream load result: ${result}".toString())
+ def json = parseJson(result)
+ assertEquals(4, json.NumberTotalRows)
+ assertEquals(4, json.NumberLoadedRows)
+ assertTrue(json.LoadBytes > 0)
+ }
+ }
+
+ sql """ sync; """
+
+ // check result
+ qt_select_1 "SELECT * FROM ${testTable} ORDER BY id"
+
+
+ // insert into valid json rows
+ sql """INSERT INTO ${testTable} VALUES(27, ['{"k1":"v1", "k2": 200}'])"""
+ sql """INSERT INTO ${testTable} VALUES(28, ['{"a.b.c":{"k1.a1":"v31",
"k2": 300},"a":"niu"}'])"""
+ sql """INSERT INTO ${testTable} VALUES(29, ['\f\n\r', "\f\n\r"])"""
+ sql """INSERT INTO ${testTable} VALUES(30, ["\\f\\b\\r\\n",
'\\f\\b\\r\\n"'])"""
+
+ // check result
+ qt_select_2 "SELECT * FROM ${testTable} ORDER BY id"
+ // check cast as json
+ qt_select_json "SELECT id, cast(a as JSON) FROM ${testTable} ORDER BY id"
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]