Re: [PR] [GLUTEN-7014][CH] Fix: different results from `get_json_object` [incubator-gluten]

via GitHub Wed, 28 Aug 2024 01:37:47 -0700


lgbo-ustc commented on code in PR #7034:
URL: https://github.com/apache/incubator-gluten/pull/7034#discussion_r1734245954



##########
cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h:
##########
@@ -66,6 +68,261 @@ struct GetJsonObject
     static constexpr auto name{"get_json_object"};
 };
 
+class JSONTextNormalizer
+{
+public:
+    // simd json will fail to parse the json text in some cases, see #7014, 
#3750, #3337, #5303
+    // To keep the result the same as vanilla, we normalize the json string when 
simd json fails.
+    // It returns null when normalizing the json text fails, otherwise returns a 
position between `pos`
+    // and `end` which points to the end of the whole json object.
+    // `dst` refers to a memory buffer that is used to store the normalization 
result.
+    static const char * normalize(const char * pos, const char * end, char *& 
dst)
+    {
+        pos = normalizeWhitespace(pos, end, dst);
+        if (!pos || pos >= end)
+            return nullptr;
+        if (*pos == '[')
+            return normalizeArray(pos, end, dst);
+        else if (*pos == '{')
+            return normalizeObject(pos, end, dst);
+        return nullptr;
+    }
+
+private:
+    inline static void copyToDst(char *& p, char c)
+    {
+        *p = c;
+        p++;
+    }
+
+    inline static void copyToDst(char *& p, const char * src, size_t len)
+    {
+        memcpy(p, src, len);
+        p += len;
+    }
+
+    inline static bool isExpectedChar(char c, const char * pos, const char * 
end) { return pos && pos < end && *pos == c; }
+
+    inline static const char * normalizeWhitespace(const char * pos, const 
char * end, char *& dst)
+    {
+        const auto * start_pos = pos;
+        while (pos && pos < end)
+        {
+            if (isWhitespaceASCII(*pos))
+                pos++;
+            else
+                break;
+        }
+        if (pos != start_pos)
+            copyToDst(dst, start_pos, pos - start_pos);
+        return pos;
+    }
+
+    inline static const char * normalizeComma(const char * pos, const char * 
end, char *& dst)
+    {
+        pos = normalizeWhitespace(pos, end, dst);
+        if (!isExpectedChar(',', pos, end)) [[unlikely]]
+        {
+            // LOG_DEBUG(getLogger("GetJsonObject"), "xxx normalizeComma. not 
,");
+            return nullptr;
+        }
+        pos += 1;
+        copyToDst(dst, ',');
+        return normalizeWhitespace(pos, end, dst);
+    }
+
+    inline static const char * normalizeColon(const char * pos, const char * 
end, char *& dst)
+    {
+        pos = normalizeWhitespace(pos, end, dst);
+        if (!isExpectedChar(':', pos, end))
+        {
+            // LOG_DEBUG(getLogger("GetJsonObject"), "xxx normalizeColon. not 
:");
+            return nullptr;
+        }
+        pos += 1;
+        copyToDst(dst, ':');
+        return normalizeWhitespace(pos, end, dst);
+    }
+
+    inline static const char * normalizeField(const char * pos, const char * 
end, char *& dst)
+    {
+        const auto * start_pos = pos;
+        pos = find_first_symbols<',', '}', ']'>(pos, end);
+        if (pos >= end) [[unlikely]]
+        {
+            // LOG_DEBUG(getLogger("GetJsonObject"), "xxx normalizeField. not 
field");
+            return nullptr;
+        }
+        copyToDst(dst, start_pos, pos - start_pos);
+        return pos;
+        ;
+    }
+
+    inline static const char * normalizeString(const char * pos, const char * 
end, char *& dst)
+    {
+        const auto * start_pos = pos;
+        if (!isExpectedChar('"', pos, end)) [[unlikely]]
+        {
+            // LOG_DEBUG(getLogger("GetJsonObject"), "xxx normalizeString. not 
\"");
+            return nullptr;
+        }
+        pos += 1;
+
+        do
+        {
+            pos = find_first_symbols<'\\', '"'>(pos, end);
+            if (pos != end && *pos == '\\')
+            {
+                // escape characters. e.g. '\"', '\\'
+                pos += 2;
+                if (pos >= end)
+                    return nullptr;
+            }
+            else
+                break;
+        } while (pos != end);
+
+        pos = find_first_symbols<'"'>(pos, end);
+        if (!isExpectedChar('"', pos, end))
+            return nullptr;
+        pos += 1;
+
+        size_t n = 0;
+        for (; start_pos != pos; ++start_pos)
+        {
+            if ((*start_pos >= 0x00 && *start_pos <= 0x1f) || *start_pos == 
0x7f)
+            {
+                if (n)
+                {
+                    copyToDst(dst, start_pos - n, n);
+                    n = 0;
+                }
+                continue;
+            }
+            else
+            {
+                n += 1;
+            }
+        }
+        if (n)
+            copyToDst(dst, start_pos - n, n);
+
+        return normalizeWhitespace(pos, end, dst);
+    }
+
+    inline static const char * normalizeArray(const char * pos, const char * 
end, char *& dst)
+    {
+        if (!isExpectedChar('[', pos, end)) [[unlikely]]
+        {
+            // LOG_DEBUG(getLogger("GetJsonObject"), "xxx normalizeArray. not 
[");
+            return nullptr;
+        }
+        pos += 1;
+        copyToDst(dst, '[');
+
+        pos = normalizeWhitespace(pos, end, dst);
+
+        bool has_more = false;
+        while (pos && pos < end && *pos != ']')
+        {
+            has_more = false;
+            switch (*pos)
+            {
+                case '{': {
+                    pos = normalizeObject(pos, end, dst);
+                    break;
+                }
+                case '"': {
+                    pos = normalizeString(pos, end, dst);
+                    break;
+                }
+                case '[': {
+                    pos = normalizeArray(pos, end, dst);
+                    break;
+                }
+                default: {
+                    pos = normalizeField(pos, end, dst);
+                    break;
+                }
+            }
+            if (!isExpectedChar(',', pos, end))
+                break;
+            pos = normalizeComma(pos, end, dst);
+            has_more = true;
+        }
+
+        if (!isExpectedChar(']', pos, end) || has_more)
+        {
+            // LOG_DEBUG(getLogger("GetJsonObject"), "xxx normalizeArray. not 
]");
+            return nullptr;
+        }
+        pos += 1;
+        copyToDst(dst, ']');
+        return normalizeWhitespace(pos, end, dst);
+    }
+
+    inline static const char * normalizeObject(const char * pos, const char * 
end, char *& dst)

Review Comment:
   只是个提示，倒没什么影响



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [GLUTEN-7014][CH] Fix: different results from `get_json_object` [incubator-gluten]

Reply via email to