Re: [ovs-dev] [PATCH] json: Improve json parsing

Dumitru Ceara Wed, 23 Mar 2022 03:49:30 -0700

On 3/22/22 21:17, Rosemarie O'Riorden wrote:
> Hi Dumitru,
> 
> Thank you for the review! I'm going to send a v2 shortly with some changes.
> 
>> I'm not sure what the benefit is to have 'start' as 'const unsigned char
>> *'; it could easily be 'const char *' and we wouldn't have to cast later
>> on.  Also, a comment might be useful, describing that this points to the
>> start of the input that has been preprocessed but not yet copied to
>> 'buffer'.
> 
> I agree with this and have updated 'start' to be a const char *, which
> took away the need for all those casts. I have also added a comment to
> explain what p->start is.
> 
> 
>> On the other hand, json_lex_keyword(p) calls json_parser_input(p, ..)
>> which clears p->buffer at the end.
>>
>> So I thought that we should set p->start = NULL when we clear p->buffer.
>>
>> But json_parser_input() changes the lexer state back to JSON_LEX_START
>> so p->start will be reset when json_lex_input() is called next.
>>
>> I don't really have a good suggestion about how to improve this, it just
>> took me a bit to figure out why it's working correctly now.
> 
> Maybe I can add a few comments to better explain what is going on.
> SInce p->start is updated at the next json_lex_input call, I guess
> this logic doesn't need to be changed, but if you think it does, let
> me know.
> 
> 
>> I keep wondering if there's a way to avoid the p->start all together.
>> What if we instead pass 'start' as an argument to json_lex_input() along
>> with an offset to indicate the next character to process?
> 
> I thought about this for a while. I originally had this idea too, but
> I found that it made less sense to me. If I removed start and replaced
> it with an offset count, I'd need to track the offset with a pointer
> that needs to be incremented for every character that is processed.
> The way I have it now, p->start is only assigned once at every state
> change, so it makes it quite simple in my opinion. If there is more
> input on how I can improve this, please let me know.
> 
> 
>>> -        json_lex_input(p, ' ');
>>> +        p->start = space;
>>> +        json_lex_input(p, space);
>>
>> This can be:
>>
>> p->start = " ";
>> json_lex_input(p, " ");
>>
> 
> I've updated this the way you suggested.
> 
> 
> Thank you,
> Rosemarie O'Riorden
>


Hi Rosemarie,

What about the alternative approach below (incremental on top of your
patch)?  It might be a bit intrusive because we explicitly inline
json_lex_input() but that function was anyway used in a single place
(well, actually two places but the call in json_parser_finish()
can be replaced with a call to json_parser_feed()).

Like this we avoid the 'start' field in 'struct json_parser' and make it
all contained, in a single place, in json_parser_feed().  No need to
explain how p->start is used and what it does.

Thanks,
Dumitru

diff --git a/lib/json.c b/lib/json.c
index f48cb32415bf..2b18ec550d67 100644
--- a/lib/json.c
+++ b/lib/json.c
@@ -101,7 +101,6 @@ struct json_parser {
     int line_number;
     int column_number;
     int byte_number;
-    const unsigned char *start;
 
     /* Parsing. */
     enum json_parse_state parse_state;
@@ -977,96 +976,6 @@ json_lex_string(struct json_parser *p)
     }
 }
 
-static inline ALWAYS_INLINE bool
-json_lex_input(struct json_parser *p, const unsigned char *ch)
-{
-    struct json_token token;
-    unsigned char c = *ch;
-
-    switch (p->lex_state) {
-    case JSON_LEX_START:
-        switch (c) {
-        case ' ': case '\t': case '\n': case '\r':
-            /* Nothing to do. */
-            p->start = ch + 1;
-            return true;
-
-        case 'a': case 'b': case 'c': case 'd': case 'e':
-        case 'f': case 'g': case 'h': case 'i': case 'j':
-        case 'k': case 'l': case 'm': case 'n': case 'o':
-        case 'p': case 'q': case 'r': case 's': case 't':
-        case 'u': case 'v': case 'w': case 'x': case 'y':
-        case 'z':
-            p->lex_state = JSON_LEX_KEYWORD;
-            p->start = ch;
-            break;
-
-        case '[': case '{': case ']': case '}': case ':': case ',':
-            token.type = c;
-            json_parser_input(p, &token);
-            p->start = ch + 1;
-            return true;
-
-        case '-':
-        case '0': case '1': case '2': case '3': case '4':
-        case '5': case '6': case '7': case '8': case '9':
-            p->lex_state = JSON_LEX_NUMBER;
-            p->start = ch;
-            break;
-
-        case '"':
-            p->lex_state = JSON_LEX_STRING;
-            p->start = ch + 1;
-            return true;
-
-        default:
-            if (isprint(c)) {
-                json_error(p, "invalid character '%c'", c);
-            } else {
-                json_error(p, "invalid character U+%04x", c);
-            }
-            return true;
-        }
-        break;
-
-    case JSON_LEX_KEYWORD:
-        if (!isalpha((unsigned char) c)) {
-            ds_put_buffer(&p->buffer, (const char *) p->start, ch - p->start);
-            json_lex_keyword(p);
-            return false;
-        }
-        break;
-
-    case JSON_LEX_NUMBER:
-        if (!strchr(".0123456789eE-+", c)) {
-            ds_put_buffer(&p->buffer, (const char *) p->start, ch - p->start);
-            json_lex_number(p);
-            return false;
-        }
-        break;
-
-    case JSON_LEX_STRING:
-        if (c == '\\') {
-            p->lex_state = JSON_LEX_ESCAPE;
-        } else if (c == '"') {
-            ds_put_buffer(&p->buffer, (const char *) p->start, ch - p->start);
-            json_lex_string(p);
-            return true;
-        } else if (c < 0x20) {
-            json_error(p, "U+%04X must be escaped in quoted string", c);
-            return true;
-        }
-        break;
-
-    case JSON_LEX_ESCAPE:
-        p->lex_state = JSON_LEX_STRING;
-        break;
-
-    default:
-        abort();
-    }
-    return true;
-}
 

 /* Parsing. */
 
@@ -1169,26 +1078,123 @@ json_parser_create(int flags)
     return p;
 }
 
+static inline void ALWAYS_INLINE
+json_parser_account_byte(struct json_parser *p, unsigned char c)
+{
+    p->byte_number++;
+    if (OVS_UNLIKELY(c == '\n')) {
+        p->column_number = 0;
+        p->line_number++;
+    } else {
+        p->column_number++;
+    }
+}
+
 size_t
 json_parser_feed(struct json_parser *p, const char *input, size_t n)
 {
+    size_t token_start = 0;
     size_t i;
-    p->start = (const unsigned char *) input;
+
     for (i = 0; !p->done && i < n; ) {
-        if (json_lex_input(p, (unsigned const char *) &input[i])) {
-            p->byte_number++;
-            if (OVS_UNLIKELY(input[i] == '\n')) {
-                p->column_number = 0;
-                p->line_number++;
-            } else {
-                p->column_number++;
+        bool consumed = true;
+
+        const char *start_p = &input[token_start];
+        unsigned char c = input[i];
+        struct json_token token;
+
+        switch (p->lex_state) {
+        case JSON_LEX_START:
+            switch (c) {
+            case ' ': case '\t': case '\n': case '\r':
+                /* Nothing to do. */
+
+                token_start = i + 1;
+                break;
+
+            case 'a': case 'b': case 'c': case 'd': case 'e':
+            case 'f': case 'g': case 'h': case 'i': case 'j':
+            case 'k': case 'l': case 'm': case 'n': case 'o':
+            case 'p': case 'q': case 'r': case 's': case 't':
+            case 'u': case 'v': case 'w': case 'x': case 'y':
+            case 'z':
+                p->lex_state = JSON_LEX_KEYWORD;
+                token_start = i;
+                break;
+
+            case '[': case '{': case ']': case '}': case ':': case ',':
+                token.type = c;
+                json_parser_input(p, &token);
+                token_start = i + 1;
+                break;
+
+            case '-':
+            case '0': case '1': case '2': case '3': case '4':
+            case '5': case '6': case '7': case '8': case '9':
+                p->lex_state = JSON_LEX_NUMBER;
+                token_start = i;
+                break;
+
+            case '"':
+                p->lex_state = JSON_LEX_STRING;
+                token_start = i + 1;
+                break;
+
+            default:
+                if (isprint(c)) {
+                    json_error(p, "invalid character '%c'", c);
+                } else {
+                    json_error(p, "invalid character U+%04x", c);
+                }
+                break;
             }
+            break;
+
+        case JSON_LEX_KEYWORD:
+            if (!isalpha((unsigned char) c)) {
+                ds_put_buffer(&p->buffer, start_p, i - token_start);
+                json_lex_keyword(p);
+                consumed = false;
+                break;
+            }
+            break;
+
+        case JSON_LEX_NUMBER:
+            if (!strchr(".0123456789eE-+", c)) {
+                ds_put_buffer(&p->buffer, start_p, i - token_start);
+                json_lex_number(p);
+                consumed = false;
+                break;
+            }
+            break;
+
+        case JSON_LEX_STRING:
+            if (c == '\\') {
+                p->lex_state = JSON_LEX_ESCAPE;
+            } else if (c == '"') {
+                ds_put_buffer(&p->buffer, start_p, i - token_start);
+                json_lex_string(p);
+            } else if (c < 0x20) {
+                json_error(p, "U+%04X must be escaped in quoted string", c);
+            }
+            break;
+
+        case JSON_LEX_ESCAPE:
+            p->lex_state = JSON_LEX_STRING;
+            break;
+
+        default:
+            ovs_abort(0, "unexpected lexer state");
+        }
+
+        if (consumed) {
+            json_parser_account_byte(p, c);
             i++;
         }
     }
+
     if (!p->done) {
-        ds_put_buffer(&p->buffer, (const char *) p->start,
-                (const unsigned char *) &input[i] - p->start);
+        ds_put_buffer(&p->buffer, &input[token_start], i - token_start);
     }
     return i;
 }
@@ -1203,7 +1209,6 @@ struct json *
 json_parser_finish(struct json_parser *p)
 {
     struct json *json;
-    const unsigned char *space = (const unsigned char *) " ";
 
     switch (p->lex_state) {
     case JSON_LEX_START:
@@ -1216,8 +1221,7 @@ json_parser_finish(struct json_parser *p)
 
     case JSON_LEX_NUMBER:
     case JSON_LEX_KEYWORD:
-        p->start = space;
-        json_lex_input(p, space);
+        json_parser_feed(p, " ", 1);
         break;
     }
 
---

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Re: [ovs-dev] [PATCH] json: Improve json parsing

Reply via email to