lgbo-ustc commented on code in PR #7034:
URL: https://github.com/apache/incubator-gluten/pull/7034#discussion_r1734245954
##########
cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h:
##########
@@ -66,6 +68,261 @@ struct GetJsonObject
static constexpr auto name{"get_json_object"};
};
+class JSONTextNormalizer
+{
+public:
+ // simd json fails to parse the json text in some cases, see #7014, #3750, #3337, #5303.
+ // To keep the result the same as vanilla, we normalize the json string when simd json fails.
+ // It returns null when normalizing the json text fails; otherwise it returns a position
+ // between `pos` and `end` which points to the end of the whole json object.
+ // `dst` refers to a memory buffer that is used to store the normalization result.
+ static const char * normalize(const char * pos, const char * end, char *&
dst)
+ {
+ pos = normalizeWhitespace(pos, end, dst);
+ if (!pos || pos >= end)
+ return nullptr;
+ if (*pos == '[')
+ return normalizeArray(pos, end, dst);
+ else if (*pos == '{')
+ return normalizeObject(pos, end, dst);
+ return nullptr;
+ }
+
+private:
+ inline static void copyToDst(char *& p, char c)
+ {
+ *p = c;
+ p++;
+ }
+
+ inline static void copyToDst(char *& p, const char * src, size_t len)
+ {
+ memcpy(p, src, len);
+ p += len;
+ }
+
+ inline static bool isExpectedChar(char c, const char * pos, const char *
end) { return pos && pos < end && *pos == c; }
+
+ inline static const char * normalizeWhitespace(const char * pos, const
char * end, char *& dst)
+ {
+ const auto * start_pos = pos;
+ while (pos && pos < end)
+ {
+ if (isWhitespaceASCII(*pos))
+ pos++;
+ else
+ break;
+ }
+ if (pos != start_pos)
+ copyToDst(dst, start_pos, pos - start_pos);
+ return pos;
+ }
+
+ inline static const char * normalizeComma(const char * pos, const char *
end, char *& dst)
+ {
+ pos = normalizeWhitespace(pos, end, dst);
+ if (!isExpectedChar(',', pos, end)) [[unlikely]]
+ {
+ // LOG_DEBUG(getLogger("GetJsonObject"), "xxx normalizeComma. not
,");
+ return nullptr;
+ }
+ pos += 1;
+ copyToDst(dst, ',');
+ return normalizeWhitespace(pos, end, dst);
+ }
+
+ inline static const char * normalizeColon(const char * pos, const char *
end, char *& dst)
+ {
+ pos = normalizeWhitespace(pos, end, dst);
+ if (!isExpectedChar(':', pos, end))
+ {
+ // LOG_DEBUG(getLogger("GetJsonObject"), "xxx normalizeColon. not
:");
+ return nullptr;
+ }
+ pos += 1;
+ copyToDst(dst, ':');
+ return normalizeWhitespace(pos, end, dst);
+ }
+
+ inline static const char * normalizeField(const char * pos, const char *
end, char *& dst)
+ {
+ const auto * start_pos = pos;
+ pos = find_first_symbols<',', '}', ']'>(pos, end);
+ if (pos >= end) [[unlikely]]
+ {
+ // LOG_DEBUG(getLogger("GetJsonObject"), "xxx normalizeField. not
field");
+ return nullptr;
+ }
+ copyToDst(dst, start_pos, pos - start_pos);
+ return pos;
+ ;
+ }
+
+ inline static const char * normalizeString(const char * pos, const char *
end, char *& dst)
+ {
+ const auto * start_pos = pos;
+ if (!isExpectedChar('"', pos, end)) [[unlikely]]
+ {
+ // LOG_DEBUG(getLogger("GetJsonObject"), "xxx normalizeString. not
\"");
+ return nullptr;
+ }
+ pos += 1;
+
+ do
+ {
+ pos = find_first_symbols<'\\', '"'>(pos, end);
+ if (pos != end && *pos == '\\')
+ {
+ // escape characters. e.g. '\"', '\\'
+ pos += 2;
+ if (pos >= end)
+ return nullptr;
+ }
+ else
+ break;
+ } while (pos != end);
+
+ pos = find_first_symbols<'"'>(pos, end);
+ if (!isExpectedChar('"', pos, end))
+ return nullptr;
+ pos += 1;
+
+ size_t n = 0;
+ for (; start_pos != pos; ++start_pos)
+ {
+ if ((*start_pos >= 0x00 && *start_pos <= 0x1f) || *start_pos ==
0x7f)
+ {
+ if (n)
+ {
+ copyToDst(dst, start_pos - n, n);
+ n = 0;
+ }
+ continue;
+ }
+ else
+ {
+ n += 1;
+ }
+ }
+ if (n)
+ copyToDst(dst, start_pos - n, n);
+
+ return normalizeWhitespace(pos, end, dst);
+ }
+
+ inline static const char * normalizeArray(const char * pos, const char *
end, char *& dst)
+ {
+ if (!isExpectedChar('[', pos, end)) [[unlikely]]
+ {
+ // LOG_DEBUG(getLogger("GetJsonObject"), "xxx normalizeArray. not
[");
+ return nullptr;
+ }
+ pos += 1;
+ copyToDst(dst, '[');
+
+ pos = normalizeWhitespace(pos, end, dst);
+
+ bool has_more = false;
+ while (pos && pos < end && *pos != ']')
+ {
+ has_more = false;
+ switch (*pos)
+ {
+ case '{': {
+ pos = normalizeObject(pos, end, dst);
+ break;
+ }
+ case '"': {
+ pos = normalizeString(pos, end, dst);
+ break;
+ }
+ case '[': {
+ pos = normalizeArray(pos, end, dst);
+ break;
+ }
+ default: {
+ pos = normalizeField(pos, end, dst);
+ break;
+ }
+ }
+ if (!isExpectedChar(',', pos, end))
+ break;
+ pos = normalizeComma(pos, end, dst);
+ has_more = true;
+ }
+
+ if (!isExpectedChar(']', pos, end) || has_more)
+ {
+ // LOG_DEBUG(getLogger("GetJsonObject"), "xxx normalizeArray. not
]");
+ return nullptr;
+ }
+ pos += 1;
+ copyToDst(dst, ']');
+ return normalizeWhitespace(pos, end, dst);
+ }
+
+ inline static const char * normalizeObject(const char * pos, const char *
end, char *& dst)
Review Comment:
只是个提示,倒没什么影响
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]