This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 677435cef82 [Pick](Branch-2.1) pick json reader fix and support 
specify $. as column (#39271)
677435cef82 is described below

commit 677435cef822ea88ea96954963d571ef0493221d
Author: lihangyu <[email protected]>
AuthorDate: Tue Aug 13 17:44:45 2024 +0800

    [Pick](Branch-2.1) pick json reader fix and support specify $. as column 
(#39271)
    
    #39206
    #38213
---
 be/src/exprs/json_functions.cpp                    | 14 +++++-
 be/src/exprs/json_functions.h                      |  2 +
 be/src/vec/exec/format/json/new_json_reader.cpp    | 14 +++++-
 .../test_json_extract_path_invalid_type.json       | 13 ++++++
 .../data/load_p0/stream_load/test_json_load.out    | 10 ++++
 .../load_p0/stream_load/test_read_root_path.json   |  4 ++
 .../load_p0/stream_load/test_json_load.groovy      | 54 ++++++++++++++++++++++
 7 files changed, 108 insertions(+), 3 deletions(-)

diff --git a/be/src/exprs/json_functions.cpp b/be/src/exprs/json_functions.cpp
index 29c1596ed8f..ac4d64bc62f 100644
--- a/be/src/exprs/json_functions.cpp
+++ b/be/src/exprs/json_functions.cpp
@@ -24,6 +24,7 @@
 #include <rapidjson/stringbuffer.h>
 #include <rapidjson/writer.h>
 #include <re2/re2.h>
+#include <simdjson/error.h>
 #include <simdjson/simdjson.h> // IWYU pragma: keep
 #include <stdlib.h>
 
@@ -254,13 +255,17 @@ Status 
JsonFunctions::extract_from_object(simdjson::ondemand::object& obj,
                                           const std::vector<JsonPath>& 
jsonpath,
                                           simdjson::ondemand::value* value) 
noexcept {
 // Return DataQualityError when it's a malformed json.
-// Otherwise the path was not found, due to array out of bound or not exist
+// Otherwise the path was not found, due to one of:
+// 1. array index out of bounds
+// 2. no such field in the object
+// 3. the input is not an object (null or another type): simdjson::INCORRECT_TYPE
 #define HANDLE_SIMDJSON_ERROR(err, msg)                                        
             \
     do {                                                                       
             \
         const simdjson::error_code& _err = err;                                
             \
         const std::string& _msg = msg;                                         
             \
         if (UNLIKELY(_err)) {                                                  
             \
-            if (_err == simdjson::NO_SUCH_FIELD || _err == 
simdjson::INDEX_OUT_OF_BOUNDS) { \
+            if (_err == simdjson::NO_SUCH_FIELD || _err == 
simdjson::INDEX_OUT_OF_BOUNDS || \
+                _err == simdjson::INCORRECT_TYPE) {                            
             \
                 return Status::NotFound<false>(                                
             \
                         fmt::format("Not found target filed, err: {}, msg: 
{}",             \
                                     simdjson::error_message(_err), _msg));     
             \
@@ -348,4 +353,9 @@ void JsonFunctions::merge_objects(rapidjson::Value& 
dst_object, rapidjson::Value
     }
 }
 
+// True only for the root path "$.": exactly two segments, "$" then an empty key.
+bool JsonFunctions::is_root_path(const std::vector<JsonPath>& json_path) {
+    return json_path.size() == 2 && json_path[0].key == "$" && 
json_path[1].key.empty();
+}
+
 } // namespace doris
diff --git a/be/src/exprs/json_functions.h b/be/src/exprs/json_functions.h
index 72aa522ff37..11970eb8c46 100644
--- a/be/src/exprs/json_functions.h
+++ b/be/src/exprs/json_functions.h
@@ -116,6 +116,8 @@ public:
 
     static std::string print_json_value(const rapidjson::Value& value);
 
+    static bool is_root_path(const std::vector<JsonPath>& json_path);
+
 private:
     static rapidjson::Value* match_value(const std::vector<JsonPath>& 
parsed_paths,
                                          rapidjson::Value* document,
diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp 
b/be/src/vec/exec/format/json/new_json_reader.cpp
index 9872b1c3150..40dc6dda5f9 100644
--- a/be/src/vec/exec/format/json/new_json_reader.cpp
+++ b/be/src/vec/exec/format/json/new_json_reader.cpp
@@ -1659,7 +1659,19 @@ Status 
NewJsonReader::_simdjson_write_columns_by_jsonpath(
                 return st;
             }
         }
-        if (i >= _parsed_jsonpaths.size() || st.is<NOT_FOUND>()) {
+        if (i < _parsed_jsonpaths.size() && 
JsonFunctions::is_root_path(_parsed_jsonpaths[i])) {
+            // Indicate that the jsonpath is "$.", read the full root json 
object, insert the original doc directly
+            ColumnNullable* nullable_column = nullptr;
+            IColumn* target_column_ptr = nullptr;
+            if (slot_desc->is_nullable()) {
+                nullable_column = assert_cast<ColumnNullable*>(column_ptr);
+                target_column_ptr = &nullable_column->get_nested_column();
+            }
+            auto* column_string = 
assert_cast<ColumnString*>(target_column_ptr);
+            
column_string->insert_data(_simdjson_ondemand_padding_buffer.data(),
+                                       _original_doc_size);
+            has_valid_value = true;
+        } else if (i >= _parsed_jsonpaths.size() || st.is<NOT_FOUND>()) {
             // not match in jsondata, filling with default value
             RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, 
valid));
             if (!(*valid)) {
diff --git 
a/regression-test/data/load_p0/stream_load/test_json_extract_path_invalid_type.json
 
b/regression-test/data/load_p0/stream_load/test_json_extract_path_invalid_type.json
new file mode 100644
index 00000000000..945b4143022
--- /dev/null
+++ 
b/regression-test/data/load_p0/stream_load/test_json_extract_path_invalid_type.json
@@ -0,0 +1,13 @@
+[
+    {
+        "id": 789,
+        "city": {
+            "name": "beijing",
+            "region": "haidian"
+        }
+    },
+    {
+        "id": 1111,
+        "city": null
+    }
+]
\ No newline at end of file
diff --git a/regression-test/data/load_p0/stream_load/test_json_load.out 
b/regression-test/data/load_p0/stream_load/test_json_load.out
index 588b6edb004..1d6777bb21e 100644
--- a/regression-test/data/load_p0/stream_load/test_json_load.out
+++ b/regression-test/data/load_p0/stream_load/test_json_load.out
@@ -250,3 +250,13 @@ test       k2_value
 
 -- !select29 --
 10     \N
+
+-- !select30 --
+12345  {"k1":12345,"k2":"11111","k3":111111,"k4":[11111]}      
{"k1":12345,"k2":"11111","k3":111111,"k4":[11111]}      111111
+12346  {"k1":12346,"k2":"22222","k4":[22222]}  
{"k1":12346,"k2":"22222","k4":[22222]}  \N
+12347  {"k1":12347,"k3":"33333","k4":[22222]}  
{"k1":12347,"k3":"33333","k4":[22222]}  33333
+12348  {"k1":12348,"k3":"33333","k5":{"k51":1024,"xxxx":[11111]}}      
{"k1":12348,"k3":"33333","k5":{"k51":1024,"xxxx":[11111]}}      33333
+
+-- !select31 --
+789    beijing haidian
+1111   \N      \N
\ No newline at end of file
diff --git a/regression-test/data/load_p0/stream_load/test_read_root_path.json 
b/regression-test/data/load_p0/stream_load/test_read_root_path.json
new file mode 100644
index 00000000000..777ccbbfb1f
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/test_read_root_path.json
@@ -0,0 +1,4 @@
+{"k1" : 12345, "k2" : "11111", "k3" : 111111, "k4" : [11111]}
+{"k1" : 12346, "k2" : "22222", "k4" : [22222]}
+{"k1" : 12347, "k3" : "33333", "k4" : [22222]}
+{"k1" : 12348, "k3" : "33333", "k5" : {"k51" : 1024, "xxxx" : [11111]}}
\ No newline at end of file
diff --git a/regression-test/suites/load_p0/stream_load/test_json_load.groovy 
b/regression-test/suites/load_p0/stream_load/test_json_load.groovy
index 7f182a44b74..8b8e1417bcf 100644
--- a/regression-test/suites/load_p0/stream_load/test_json_load.groovy
+++ b/regression-test/suites/load_p0/stream_load/test_json_load.groovy
@@ -878,4 +878,58 @@ suite("test_json_load", "p0") {
     } finally {
         try_sql("DROP TABLE IF EXISTS ${testTable}")
     }
+
+    // support reading "$." as the root path
+    try {
+        sql "DROP TABLE IF EXISTS ${testTable}"
+        sql """CREATE TABLE IF NOT EXISTS ${testTable} 
+            (
+                `k1` varchar(1024) NULL,
+                `k2` variant  NULL,
+                `k3` variant  NULL,
+                `k4` variant  NULL
+            )
+            DUPLICATE KEY(`k1`)
+            COMMENT ''
+            DISTRIBUTED BY RANDOM BUCKETS 1
+            PROPERTIES (
+            "replication_allocation" = "tag.location.default: 1"
+            );"""
+
+        load_json_data.call("${testTable}", "${testTable}_case30", 'false', 
'true', 'json', '', '[\"$.k1\",\"$.\", \"$.\", \"$.k3\"]',
+                             '', '', '', 'test_read_root_path.json')
+        
+        sql "sync"
+        qt_select30 "select * from ${testTable} order by k1"
+
+    } finally {
+        // try_sql("DROP TABLE IF EXISTS ${testTable}")
+    }
+
+    // test extracting a json path from an invalid type (non-object types such as null)
+    try {
+        sql "DROP TABLE IF EXISTS ${testTable}"
+        sql """
+            CREATE TABLE ${testTable} (
+              `id` int NOT NULL,
+              `name` varchar(24) NULL,
+              `region` varchar(30) NULL
+            ) ENGINE=OLAP
+            DUPLICATE KEY(`id`)
+            COMMENT ''
+            DISTRIBUTED BY RANDOM BUCKETS AUTO
+            PROPERTIES (
+            "replication_allocation" = "tag.location.default: 1"
+            ); 
+            """
+
+        load_json_data.call("${testTable}", "${testTable}_case31", 'true', 
'false', 'json', '', '[\"$.id\", \"$.city.name\", \"$.city.region\"]',
+                             '', '', '', 
'test_json_extract_path_invalid_type.json', false, 2)
+        
+        sql "sync"
+        qt_select31 "select * from ${testTable} order by id"
+
+    } finally {
+        // try_sql("DROP TABLE IF EXISTS ${testTable}")
+    }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to