This is an automated email from the ASF dual-hosted git repository.
zykkk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new ddd7e9871d [improvement](Jsonb) optimization Jsonb path parse (#21495)
ddd7e9871d is described below
commit ddd7e9871ddcf177c47c7e21e2ec5d9232133739
Author: Liqf <[email protected]>
AuthorDate: Sun Jul 23 18:59:12 2023 +0800
[improvement](Jsonb) optimization Jsonb path parse (#21495)
Previously, the JSON path string was parsed while the JsonbValue was being
read, so complex JSON paths caused a lot of repeated parsing work. This change
separates parsing the JSON path from looking up the value, so that an
already-parsed path can be reused for value lookups.
---
be/src/util/jsonb_document.h | 273 ++++++++++++++++----------------
be/src/vec/functions/function_jsonb.cpp | 101 +++++++++---
2 files changed, 213 insertions(+), 161 deletions(-)
diff --git a/be/src/util/jsonb_document.h b/be/src/util/jsonb_document.h
index d54e7291dd..c21917e066 100644
--- a/be/src/util/jsonb_document.h
+++ b/be/src/util/jsonb_document.h
@@ -309,14 +309,6 @@ public:
bool get_has_escapes() const { return has_escapes; }
- void set_is_invalid_json_path(bool has) { is_invalid_json_path = has; }
-
- bool get_is_invalid_json_path() const { return is_invalid_json_path; }
-
- void set_type(unsigned int code) { type = code; }
-
- bool get_type() const { return type; }
-
private:
/// The current position in the stream.
const char* m_position;
@@ -332,9 +324,17 @@ private:
///Whether to contain escape characters
bool has_escapes = false;
+};
+
+struct leg_info {
+ ///path leg ptr
+ char* leg_ptr;
+
+ ///path leg len
+ unsigned int leg_len;
- ///Is the json path valid
- bool is_invalid_json_path = false;
+ ///array_index
+ int array_index;
///type: 0 is member 1 is array
unsigned int type;
@@ -343,10 +343,24 @@ private:
class JsonbPath {
public:
// parse json path
- static bool parsePath(Stream* stream);
+ static bool parsePath(Stream* stream, JsonbPath* path);
+
+ static bool parse_array(Stream* stream, JsonbPath* path);
+ static bool parse_member(Stream* stream, JsonbPath* path);
+
+ //return true if json path valid else return false
+ bool seek(const char* string, size_t length);
+
+ void add_leg_to_leg_vector(std::unique_ptr<leg_info> leg) {
+ leg_vector.emplace_back(leg.release());
+ }
- static bool parse_array(Stream* stream);
- static bool parse_member(Stream* stream);
+ size_t get_leg_vector_size() { return leg_vector.size(); }
+
+ leg_info* get_leg_from_leg_vector(size_t i) { return leg_vector[i].get(); }
+
+private:
+ std::vector<std::unique_ptr<leg_info>> leg_vector;
};
/*
@@ -529,15 +543,8 @@ public:
// get the raw byte array of the value
const char* getValuePtr() const;
- // find the JSONB value by a key path string (null terminated)
- JsonbValue* findPath(const char* key_path, bool& is_invalid_json_path,
- hDictFind handler = nullptr) {
- return findPath(key_path, (unsigned int)strlen(key_path),
is_invalid_json_path, handler);
- }
-
- // find the JSONB value by a key path string (with length)
- JsonbValue* findPath(const char* key_path, unsigned int len, bool&
is_invalid_json_path,
- hDictFind handler);
+ // find the JSONB value by JsonbPath
+ JsonbValue* findValue(JsonbPath& path, hDictFind handler);
friend class JsonbDocument;
protected:
@@ -1207,154 +1214,100 @@ inline const char* JsonbValue::getValuePtr() const {
}
}
-inline JsonbValue* JsonbValue::findPath(const char* key_path, unsigned int
kp_len,
- bool& is_invalid_json_path, hDictFind
handler = nullptr) {
- if (!key_path) return nullptr;
- if (kp_len == 0) {
- is_invalid_json_path = true;
- return nullptr;
- }
+inline bool JsonbPath::seek(const char* key_path, size_t kp_len) {
+ //path invalid
+ if (!key_path || kp_len == 0) return false;
Stream stream(key_path, kp_len);
stream.skip_whitespace();
if (stream.exhausted() || stream.read() != SCOPE) {
- is_invalid_json_path = true;
- return nullptr;
+ //path invalid
+ return false;
}
- JsonbValue* pval = this;
-
- while (pval && !stream.exhausted()) {
+ while (!stream.exhausted()) {
stream.skip_whitespace();
stream.clear_leg_ptr();
stream.clear_leg_len();
- if (!JsonbPath::parsePath(&stream)) {
- is_invalid_json_path = stream.get_is_invalid_json_path();
- return nullptr;
- }
-
- if (stream.get_leg_len() == 0) {
- return nullptr;
+ if (!JsonbPath::parsePath(&stream, this)) {
+ //path invalid
+ return false;
}
+ }
+ return true;
+}
- if (stream.get_type() == MEMBER_CODE) {
+inline JsonbValue* JsonbValue::findValue(JsonbPath& path, hDictFind handler) {
+ JsonbValue* pval = this;
+ for (size_t i = 0; i < path.get_leg_vector_size(); ++i) {
+ switch (path.get_leg_from_leg_vector(i)->type) {
+ case MEMBER_CODE: {
if (LIKELY(pval->type_ == JsonbType::T_Object)) {
- if (stream.get_leg_len() == 1 && *stream.get_leg_ptr() ==
WILDCARD) {
- return pval;
- } else if (stream.get_has_escapes()) {
- stream.remove_escapes();
+ if (path.get_leg_from_leg_vector(i)->leg_len == 1 &&
+ *path.get_leg_from_leg_vector(i)->leg_ptr == WILDCARD) {
+ continue;
}
pval = ((ObjectVal*)pval)
- ->find(stream.get_leg_ptr(),
stream.get_leg_len(), handler);
+ ->find(path.get_leg_from_leg_vector(i)->leg_ptr,
+
path.get_leg_from_leg_vector(i)->leg_len, handler);
if (!pval) return nullptr;
+ continue;
} else {
return nullptr;
}
- } else if (stream.get_type() == ARRAY_CODE) {
- int index = 0;
- std::string_view idx_string(stream.get_leg_ptr(),
stream.get_leg_len());
-
- if (stream.get_leg_len() == 1 && *stream.get_leg_ptr() ==
WILDCARD) {
+ }
+ case ARRAY_CODE: {
+ if (path.get_leg_from_leg_vector(i)->leg_len == 1 &&
+ *path.get_leg_from_leg_vector(i)->leg_ptr == WILDCARD) {
if (LIKELY(pval->type_ == JsonbType::T_Array)) {
- stream.skip(1);
- stream.skip_whitespace();
continue;
} else {
return nullptr;
}
- } else if (std::equal(LAST, LAST + 4, stream.get_leg_ptr(),
- [](char c1, char c2) {
- return std::tolower(c1) ==
std::tolower(c2);
- }) &&
- stream.get_leg_len() >= 4) {
- auto pos = idx_string.find(MINUS);
-
- if (pos != std::string::npos) {
- idx_string = idx_string.substr(pos + 1);
-
- auto result = std::from_chars(idx_string.data(),
- idx_string.data() +
idx_string.size(), index);
- if (result.ec != std::errc()) {
- is_invalid_json_path = true;
- return nullptr;
- }
-
- if (pval->type_ == JsonbType::T_Object) {
- if (index == 0) {
- continue;
- } else {
- return nullptr;
- }
- } else if (LIKELY(pval->type_ == JsonbType::T_Array)) {
- size_t num = ((ArrayVal*)pval)->numElem();
- if (index > num) return nullptr;
- index = num - 1 - index;
- } else {
- return nullptr;
- }
- } else if (stream.get_leg_len() == 4) {
- if (pval->type_ == JsonbType::T_Object) {
- continue;
- } else if (LIKELY(pval->type_ == JsonbType::T_Array)) {
- index = ((ArrayVal*)pval)->numElem() - 1;
- } else {
- return nullptr;
- }
-
- } else {
- is_invalid_json_path = true;
- return nullptr;
- }
- } else {
- auto result = std::from_chars(idx_string.data(),
- idx_string.data() +
idx_string.size(), index);
- if (result.ec != std::errc()) {
- is_invalid_json_path = true;
- return nullptr;
- }
+ }
- if (pval->type_ == JsonbType::T_Object) {
- if (index == 0) {
- continue;
- } else {
- return nullptr;
- }
- } else if (LIKELY(pval->type_ == JsonbType::T_Array)) {
- if (std::abs(index) >= ((ArrayVal*)pval)->numElem())
return nullptr;
- } else {
- return nullptr;
- }
+ if (pval->type_ == JsonbType::T_Object &&
+ path.get_leg_from_leg_vector(i)->array_index == 0) {
+ continue;
}
- if (index >= 0) {
- pval = ((ArrayVal*)pval)->get(index);
+ if (pval->type_ != JsonbType::T_Array ||
+ path.get_leg_from_leg_vector(i)->leg_ptr != nullptr ||
+ path.get_leg_from_leg_vector(i)->leg_len != 0)
+ return nullptr;
+
+ if (path.get_leg_from_leg_vector(i)->array_index >= 0) {
+ pval =
((ArrayVal*)pval)->get(path.get_leg_from_leg_vector(i)->array_index);
} else {
- pval = ((ArrayVal*)pval)->get(((ArrayVal*)pval)->numElem() +
index);
+ pval = ((ArrayVal*)pval)
+ ->get(((ArrayVal*)pval)->numElem() +
+
path.get_leg_from_leg_vector(i)->array_index);
}
+
+ if (!pval) return nullptr;
+ continue;
+ }
}
}
-
return pval;
}
-inline bool JsonbPath::parsePath(Stream* stream) {
+inline bool JsonbPath::parsePath(Stream* stream, JsonbPath* path) {
if (stream->peek() == BEGIN_ARRAY) {
- return parse_array(stream);
+ return parse_array(stream, path);
} else if (stream->peek() == BEGIN_MEMBER) {
- return parse_member(stream);
+ return parse_member(stream, path);
} else {
- stream->set_is_invalid_json_path(true);
return false; //invalid json path
}
}
-inline bool JsonbPath::parse_array(Stream* stream) {
+inline bool JsonbPath::parse_array(Stream* stream, JsonbPath* path) {
assert(stream->peek() == BEGIN_ARRAY);
stream->skip(1);
if (stream->exhausted()) {
- stream->set_is_invalid_json_path(true);
return false;
}
@@ -1363,10 +1316,12 @@ inline bool JsonbPath::parse_array(Stream* stream) {
stream->add_leg_len();
stream->skip(1);
if (stream->peek() == END_ARRAY) {
- stream->set_type(ARRAY_CODE);
+ std::unique_ptr<leg_info> leg(
+ new leg_info(stream->get_leg_ptr(), stream->get_leg_len(),
0, ARRAY_CODE));
+ path->add_leg_to_leg_vector(std::move(leg));
+ stream->skip(1);
return true;
} else {
- stream->set_is_invalid_json_path(true);
return false;
}
}
@@ -1377,22 +1332,58 @@ inline bool JsonbPath::parse_array(Stream* stream) {
stream->add_leg_len();
}
- if (!stream->exhausted() && stream->peek() == END_ARRAY) {
+ if (stream->exhausted() || stream->peek() != END_ARRAY) {
+ return false;
+ } else {
stream->skip(1);
- stream->set_type(ARRAY_CODE);
+ }
+
+ //parse array index to int
+
+ std::string_view idx_string(stream->get_leg_ptr(), stream->get_leg_len());
+ int index = 0;
+
+ if (stream->get_leg_len() >= 4 &&
+ std::equal(LAST, LAST + 4, stream->get_leg_ptr(),
+ [](char c1, char c2) { return std::tolower(c1) ==
std::tolower(c2); })) {
+ auto pos = idx_string.find(MINUS);
+
+ if (pos != std::string::npos) {
+ idx_string = idx_string.substr(pos + 1);
+
+ auto result = std::from_chars(idx_string.data(), idx_string.data()
+ idx_string.size(),
+ index);
+ if (result.ec != std::errc()) {
+ return false;
+ }
+
+ } else if (stream->get_leg_len() > 4) {
+ return false;
+ }
+
+ std::unique_ptr<leg_info> leg(new leg_info(nullptr, 0, -index - 1,
ARRAY_CODE));
+ path->add_leg_to_leg_vector(std::move(leg));
+
return true;
- } else {
- stream->set_is_invalid_json_path(true);
+ }
+
+ auto result = std::from_chars(idx_string.data(), idx_string.data() +
idx_string.size(), index);
+
+ if (result.ec != std::errc()) {
return false;
}
+
+ std::unique_ptr<leg_info> leg(new leg_info(nullptr, 0, index, ARRAY_CODE));
+ path->add_leg_to_leg_vector(std::move(leg));
+
+ return true;
}
-inline bool JsonbPath::parse_member(Stream* stream) {
+inline bool JsonbPath::parse_member(Stream* stream, JsonbPath* path) {
// advance past the .
assert(stream->peek() == BEGIN_MEMBER);
stream->skip(1);
if (stream->exhausted()) {
- stream->set_is_invalid_json_path(true);
return false;
}
@@ -1400,7 +1391,9 @@ inline bool JsonbPath::parse_member(Stream* stream) {
stream->set_leg_ptr(const_cast<char*>(stream->position()));
stream->add_leg_len();
stream->skip(1);
- stream->set_type(MEMBER_CODE);
+ std::unique_ptr<leg_info> leg(
+ new leg_info(stream->get_leg_ptr(), stream->get_leg_len(), 0,
MEMBER_CODE));
+ path->add_leg_to_leg_vector(std::move(leg));
return true;
}
@@ -1435,12 +1428,18 @@ inline bool JsonbPath::parse_member(Stream* stream) {
stream->add_leg_len();
}
- if (left_quotation_marks != nullptr && right_quotation_marks == nullptr) {
- stream->set_is_invalid_json_path(true);
+ if ((left_quotation_marks != nullptr && right_quotation_marks == nullptr)
||
+ stream->get_leg_ptr() == nullptr || stream->get_leg_len() == 0) {
return false; //invalid json path
}
- stream->set_type(MEMBER_CODE);
+ if (stream->get_has_escapes()) {
+ stream->remove_escapes();
+ }
+
+ std::unique_ptr<leg_info> leg(
+ new leg_info(stream->get_leg_ptr(), stream->get_leg_len(), 0,
MEMBER_CODE));
+ path->add_leg_to_leg_vector(std::move(leg));
return true;
}
diff --git a/be/src/vec/functions/function_jsonb.cpp
b/be/src/vec/functions/function_jsonb.cpp
index d9e498b4cf..9fe20e751f 100644
--- a/be/src/vec/functions/function_jsonb.cpp
+++ b/be/src/vec/functions/function_jsonb.cpp
@@ -436,10 +436,7 @@ private:
ColumnString::Offsets&
res_offsets, NullMap& null_map,
const
std::unique_ptr<JsonbWriter>& writer,
std::unique_ptr<JsonbToJson>&
formater,
- const char* l_raw, int l_size,
const char* r_raw,
- int r_size, bool&
is_invalid_json_path) {
- String path(r_raw, r_size);
-
+ const char* l_raw, int l_size,
JsonbPath& path) {
if (null_map[i]) {
StringOP::push_null_string(i, res_data, res_offsets, null_map);
return;
@@ -453,9 +450,9 @@ private:
}
// value is NOT necessary to be deleted since JsonbValue will not
allocate memory
- JsonbValue* value = doc->getValue()->findPath(r_raw, r_size,
is_invalid_json_path, nullptr);
+ JsonbValue* value = doc->getValue()->findValue(path, nullptr);
- if (UNLIKELY(!value) || is_invalid_json_path) {
+ if (UNLIKELY(!value)) {
StringOP::push_null_string(i, res_data, res_offsets, null_map);
return;
}
@@ -535,8 +532,18 @@ public:
size_t r_off = roffsets[index_check_const(i, path_const[0]) -
1];
size_t r_size = roffsets[index_check_const(i, path_const[0])]
- r_off;
const char* r_raw = reinterpret_cast<const
char*>(&rdata[r_off]);
+
+ JsonbPath path;
+ if (!path.seek(r_raw, r_size)) {
+ return Status::InvalidArgument(
+ "Json path error: {} for value: {}",
+
JsonbErrMsg::getErrMsg(JsonbErrType::E_INVALID_JSON_PATH),
+ std::string_view(reinterpret_cast<const
char*>(rdata.data()),
+ rdata.size()));
+ }
+
inner_loop_impl(i, res_data, res_offsets, null_map, writer,
formater, l_raw, l_size,
- r_raw, r_size, is_invalid_json_path);
+ path);
} else { // will make array string to user
writer->reset();
writer->writeStartArray();
@@ -553,9 +560,16 @@ public:
writer->writeNull();
continue;
}
- // value is NOT necessary to be deleted since JsonbValue
will not allocate memory
- JsonbValue* value =
- doc->getValue()->findPath(r_raw, r_size,
is_invalid_json_path, nullptr);
+
+ JsonbPath path;
+ if (!path.seek(r_raw, r_size)) {
+ return Status::InvalidArgument(
+ "Json path error: {} for value: {}",
+
JsonbErrMsg::getErrMsg(JsonbErrType::E_INVALID_JSON_PATH),
+ std::string_view(reinterpret_cast<const
char*>(rdata.data()),
+ rdata.size()));
+ }
+
// if not valid json path , should return error message to
user
if (is_invalid_json_path) {
return Status::InvalidArgument(
@@ -564,6 +578,10 @@ public:
std::string_view(reinterpret_cast<const
char*>(rdata.data()),
rdata.size()));
}
+
+ // value is NOT necessary to be deleted since JsonbValue
will not allocate memory
+ JsonbValue* value = doc->getValue()->findValue(path,
nullptr);
+
if (UNLIKELY(!value)) {
writer->writeNull();
} else {
@@ -602,8 +620,15 @@ public:
int r_size = roffsets[i] - roffsets[i - 1];
const char* r_raw = reinterpret_cast<const
char*>(&rdata[roffsets[i - 1]]);
+ JsonbPath path;
+ if (!path.seek(r_raw, r_size)) {
+ is_invalid_json_path = true;
+ StringOP::push_null_string(i, res_data, res_offsets, null_map);
+ return;
+ }
+
inner_loop_impl(i, res_data, res_offsets, null_map, writer,
formater, l_raw, l_size,
- r_raw, r_size, is_invalid_json_path);
+ path);
} //for
} //function
static void vector_scalar(FunctionContext* context, const
ColumnString::Chars& ldata,
@@ -620,12 +645,18 @@ public:
std::unique_ptr<JsonbToJson> formater;
+ JsonbPath path;
+ if (!path.seek(rdata.data, rdata.size)) {
+ is_invalid_json_path = true;
+ return;
+ }
+
for (size_t i = 0; i < input_rows_count; ++i) {
int l_size = loffsets[i] - loffsets[i - 1];
const char* l_raw = reinterpret_cast<const
char*>(&ldata[loffsets[i - 1]]);
inner_loop_impl(i, res_data, res_offsets, null_map, writer,
formater, l_raw, l_size,
- rdata.data, rdata.size, is_invalid_json_path);
+ path);
} //for
} //function
static void scalar_vector(FunctionContext* context, const StringRef& ldata,
@@ -647,8 +678,15 @@ public:
int r_size = roffsets[i] - roffsets[i - 1];
const char* r_raw = reinterpret_cast<const
char*>(&rdata[roffsets[i - 1]]);
+ JsonbPath path;
+ if (!path.seek(r_raw, r_size)) {
+ is_invalid_json_path = true;
+ StringOP::push_null_string(i, res_data, res_offsets, null_map);
+ return;
+ }
+
inner_loop_impl(i, res_data, res_offsets, null_map, writer,
formater, ldata.data,
- ldata.size, r_raw, r_size, is_invalid_json_path);
+ ldata.size, path);
} //for
} //function
};
@@ -663,8 +701,7 @@ struct JsonbExtractImpl {
private:
static ALWAYS_INLINE void inner_loop_impl(size_t i, Container& res,
NullMap& null_map,
const char* l_raw_str, int
l_str_size,
- const char* r_raw_str, int
r_str_size,
- bool& is_invalid_json_path) {
+ JsonbPath& path) {
if (null_map[i]) {
res[i] = 0;
return;
@@ -679,10 +716,9 @@ private:
}
// value is NOT necessary to be deleted since JsonbValue will not
allocate memory
- JsonbValue* value =
- doc->getValue()->findPath(r_raw_str, r_str_size,
is_invalid_json_path, nullptr);
+ JsonbValue* value = doc->getValue()->findValue(path, nullptr);
- if (UNLIKELY(!value) || is_invalid_json_path) {
+ if (UNLIKELY(!value)) {
if constexpr (!only_check_exists) {
null_map[i] = 1;
}
@@ -761,8 +797,14 @@ public:
const char* r_raw_str = reinterpret_cast<const
char*>(&rdata[roffsets[i - 1]]);
int r_str_size = roffsets[i] - roffsets[i - 1];
- inner_loop_impl(i, res, null_map, l_raw_str, l_str_size,
r_raw_str, r_str_size,
- is_invalid_json_path);
+ JsonbPath path;
+ if (!path.seek(r_raw_str, r_str_size)) {
+ is_invalid_json_path = true;
+ res[i] = 0;
+ return;
+ }
+
+ inner_loop_impl(i, res, null_map, l_raw_str, l_str_size, path);
} //for
} //function
static void scalar_vector(FunctionContext* context, const StringRef& ldata,
@@ -780,8 +822,14 @@ public:
const char* r_raw_str = reinterpret_cast<const
char*>(&rdata[roffsets[i - 1]]);
int r_str_size = roffsets[i] - roffsets[i - 1];
- inner_loop_impl(i, res, null_map, ldata.data, ldata.size,
r_raw_str, r_str_size,
- is_invalid_json_path);
+ JsonbPath path;
+ if (!path.seek(r_raw_str, r_str_size)) {
+ is_invalid_json_path = true;
+ res[i] = 0;
+ return;
+ }
+
+ inner_loop_impl(i, res, null_map, ldata.data, ldata.size, path);
} //for
} //function
static void vector_scalar(FunctionContext* context, const
ColumnString::Chars& ldata,
@@ -790,6 +838,12 @@ public:
size_t size = loffsets.size();
res.resize(size);
+ JsonbPath path;
+ if (!path.seek(rdata.data, rdata.size)) {
+ is_invalid_json_path = true;
+ return;
+ }
+
for (size_t i = 0; i < loffsets.size(); i++) {
if constexpr (only_check_exists) {
res[i] = 0;
@@ -798,8 +852,7 @@ public:
const char* l_raw_str = reinterpret_cast<const
char*>(&ldata[loffsets[i - 1]]);
int l_str_size = loffsets[i] - loffsets[i - 1];
- inner_loop_impl(i, res, null_map, l_raw_str, l_str_size,
rdata.data, rdata.size,
- is_invalid_json_path);
+ inner_loop_impl(i, res, null_map, l_raw_str, l_str_size, path);
} //for
} //function
};
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]