[ovs-dev] [PATCH v2] json: Improve string parsing

Rosemarie O'Riorden Tue, 29 Mar 2022 09:51:51 -0700

To parse a json string prior to this change, json_lex_input was called
with each character of the string. If the character needed to be copied
to the buffer, it was copied individually. This is an expensive
operation, as often there are multiple characters in a row
that need to be copied, and copying memory in blocks is more efficient
than byte by byte. To improve this, the string is now copied
in blocks, using an offset counter that marks the start of the current
token. A block is copied when its token is complete, and any unfinished
token left at the end of the input is copied if the parser is not yet
done.


Functions that are called for each character use a lot of CPU cycles.
Making these functions inline greatly reduces the cycles used and
improves overall performance. Since json_lex_input was only needed in
one place, it doesn't have to be its own function.

There is also a conditional that checks if the current character is a
newline, which is quite unlikely. When this was examined with perf, the
comparison had a very high CPU cycle usage. To improve this, the
OVS_UNLIKELY macro was used, which hints to the compiler that the
condition is rarely true so that it can order the instructions to favor
the common case.

Here is the improvement seen in the json-string-benchmark test:

  SIZE      Q  S         BEFORE       AFTER      CHANGE
--------------------------------------------------------
100000      0  0 :      0.842 ms     0.489 ms   -41.9 %
100000      2  1 :      0.917 ms     0.535 ms   -41.7 %
100000      10 1 :      1.063 ms     0.656 ms   -38.3 %
10000000    0  0 :     85.328 ms    49.878 ms   -41.5 %
10000000    2  1 :     92.555 ms    54.778 ms   -40.8 %
10000000    10 1 :    106.728 ms    66.735 ms   -37.5 %
100000000   0  0 :    955.375 ms   621.950 ms   -34.9 %
100000000   2  1 :   1031.700 ms   665.200 ms   -35.5 %
100000000   10 1 :   1189.300 ms   796.050 ms   -33.0 %

Here Q is the probability (%) for a character to be a '\"' and
S is the probability (%) for it to be a special character (< 32).

Signed-off-by: Rosemarie O'Riorden <[email protected]>
---
Adds a local counter so p->start is no longer needed
Moves contents of json_lex_input into json_parser_feed
Adds function json_parser_account_byte
Performs the copy when a token completes, or at the end of the input
if the parser is not yet done

 lib/json.c | 200 +++++++++++++++++++++++++++++------------------------
 1 file changed, 110 insertions(+), 90 deletions(-)

diff --git a/lib/json.c b/lib/json.c
index 720c73d94..2b18ec550 100644
--- a/lib/json.c
+++ b/lib/json.c
@@ -976,88 +976,6 @@ json_lex_string(struct json_parser *p)
     }
 }
 
-static bool
-json_lex_input(struct json_parser *p, unsigned char c)
-{
-    struct json_token token;
-
-    switch (p->lex_state) {
-    case JSON_LEX_START:
-        switch (c) {
-        case ' ': case '\t': case '\n': case '\r':
-            /* Nothing to do. */
-            return true;
-
-        case 'a': case 'b': case 'c': case 'd': case 'e':
-        case 'f': case 'g': case 'h': case 'i': case 'j':
-        case 'k': case 'l': case 'm': case 'n': case 'o':
-        case 'p': case 'q': case 'r': case 's': case 't':
-        case 'u': case 'v': case 'w': case 'x': case 'y':
-        case 'z':
-            p->lex_state = JSON_LEX_KEYWORD;
-            break;
-
-        case '[': case '{': case ']': case '}': case ':': case ',':
-            token.type = c;
-            json_parser_input(p, &token);
-            return true;
-
-        case '-':
-        case '0': case '1': case '2': case '3': case '4':
-        case '5': case '6': case '7': case '8': case '9':
-            p->lex_state = JSON_LEX_NUMBER;
-            break;
-
-        case '"':
-            p->lex_state = JSON_LEX_STRING;
-            return true;
-
-        default:
-            if (isprint(c)) {
-                json_error(p, "invalid character '%c'", c);
-            } else {
-                json_error(p, "invalid character U+%04x", c);
-            }
-            return true;
-        }
-        break;
-
-    case JSON_LEX_KEYWORD:
-        if (!isalpha((unsigned char) c)) {
-            json_lex_keyword(p);
-            return false;
-        }
-        break;
-
-    case JSON_LEX_NUMBER:
-        if (!strchr(".0123456789eE-+", c)) {
-            json_lex_number(p);
-            return false;
-        }
-        break;
-
-    case JSON_LEX_STRING:
-        if (c == '\\') {
-            p->lex_state = JSON_LEX_ESCAPE;
-        } else if (c == '"') {
-            json_lex_string(p);
-            return true;
-        } else if (c < 0x20) {
-            json_error(p, "U+%04X must be escaped in quoted string", c);
-            return true;
-        }
-        break;
-
-    case JSON_LEX_ESCAPE:
-        p->lex_state = JSON_LEX_STRING;
-        break;
-
-    default:
-        abort();
-    }
-    ds_put_char(&p->buffer, c);
-    return true;
-}
 
 /* Parsing. */
 
@@ -1160,22 +1078,124 @@ json_parser_create(int flags)
     return p;
 }
 
+static inline void ALWAYS_INLINE
+json_parser_account_byte(struct json_parser *p, unsigned char c)
+{
+    p->byte_number++;
+    if (OVS_UNLIKELY(c == '\n')) {
+        p->column_number = 0;
+        p->line_number++;
+    } else {
+        p->column_number++;
+    }
+}
+
 size_t
 json_parser_feed(struct json_parser *p, const char *input, size_t n)
 {
+    size_t token_start = 0;
     size_t i;
+
     for (i = 0; !p->done && i < n; ) {
-        if (json_lex_input(p, input[i])) {
-            p->byte_number++;
-            if (input[i] == '\n') {
-                p->column_number = 0;
-                p->line_number++;
-            } else {
-                p->column_number++;
+        bool consumed = true;
+
+        const char *start_p = &input[token_start];
+        unsigned char c = input[i];
+        struct json_token token;
+
+        switch (p->lex_state) {
+        case JSON_LEX_START:
+            switch (c) {
+            case ' ': case '\t': case '\n': case '\r':
+                /* Nothing to do. */
+
+                token_start = i + 1;
+                break;
+
+            case 'a': case 'b': case 'c': case 'd': case 'e':
+            case 'f': case 'g': case 'h': case 'i': case 'j':
+            case 'k': case 'l': case 'm': case 'n': case 'o':
+            case 'p': case 'q': case 'r': case 's': case 't':
+            case 'u': case 'v': case 'w': case 'x': case 'y':
+            case 'z':
+                p->lex_state = JSON_LEX_KEYWORD;
+                token_start = i;
+                break;
+
+            case '[': case '{': case ']': case '}': case ':': case ',':
+                token.type = c;
+                json_parser_input(p, &token);
+                token_start = i + 1;
+                break;
+
+            case '-':
+            case '0': case '1': case '2': case '3': case '4':
+            case '5': case '6': case '7': case '8': case '9':
+                p->lex_state = JSON_LEX_NUMBER;
+                token_start = i;
+                break;
+
+            case '"':
+                p->lex_state = JSON_LEX_STRING;
+                token_start = i + 1;
+                break;
+
+            default:
+                if (isprint(c)) {
+                    json_error(p, "invalid character '%c'", c);
+                } else {
+                    json_error(p, "invalid character U+%04x", c);
+                }
+                break;
+            }
+            break;
+
+        case JSON_LEX_KEYWORD:
+            if (!isalpha((unsigned char) c)) {
+                ds_put_buffer(&p->buffer, start_p, i - token_start);
+                json_lex_keyword(p);
+                consumed = false;
+                break;
             }
+            break;
+
+        case JSON_LEX_NUMBER:
+            if (!strchr(".0123456789eE-+", c)) {
+                ds_put_buffer(&p->buffer, start_p, i - token_start);
+                json_lex_number(p);
+                consumed = false;
+                break;
+            }
+            break;
+
+        case JSON_LEX_STRING:
+            if (c == '\\') {
+                p->lex_state = JSON_LEX_ESCAPE;
+            } else if (c == '"') {
+                ds_put_buffer(&p->buffer, start_p, i - token_start);
+                json_lex_string(p);
+            } else if (c < 0x20) {
+                json_error(p, "U+%04X must be escaped in quoted string", c);
+            }
+            break;
+
+        case JSON_LEX_ESCAPE:
+            p->lex_state = JSON_LEX_STRING;
+            break;
+
+        default:
+            ovs_abort(0, "unexpected lexer state");
+        }
+
+        if (consumed) {
+            json_parser_account_byte(p, c);
             i++;
         }
     }
+
+    if (!p->done) {
+        ds_put_buffer(&p->buffer, &input[token_start], i - token_start);
+    }
     return i;
 }
 
@@ -1201,7 +1221,7 @@ json_parser_finish(struct json_parser *p)
 
     case JSON_LEX_NUMBER:
     case JSON_LEX_KEYWORD:
-        json_lex_input(p, ' ');
+        json_parser_feed(p, " ", 1);
         break;
     }
 
-- 
2.35.1

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v2] json: Improve string parsing

Reply via email to