scovich commented on code in PR #9232:
URL: https://github.com/apache/arrow-rs/pull/9232#discussion_r2710226916


##########
arrow-json/src/reader/tape.rs:
##########
@@ -294,6 +305,45 @@ macro_rules! next {
     };
 }
 
+/// Evaluates to the next non-whitespace byte in the iterator or breaks the 
current loop
+macro_rules! next_non_whitespace {

Review Comment:
   Directly inspired by the `next!` macro



##########
arrow-json/src/reader/tape.rs:
##########
@@ -338,59 +388,94 @@ impl TapeDecoder {
         }
     }
 
+    /// Write the closing elements for an object to the tape
+    fn end_object(&mut self, start_idx: u32) {
+        let end_idx = self.elements.len() as u32;
+        self.elements[start_idx as usize] = TapeElement::StartObject(end_idx);
+        self.elements.push(TapeElement::EndObject(start_idx));
+        self.stack.pop();
+    }
+
+    /// Write the closing elements for a list to the tape
+    fn end_list(&mut self, start_idx: u32) {
+        let end_idx = self.elements.len() as u32;
+        self.elements[start_idx as usize] = TapeElement::StartList(end_idx);
+        self.elements.push(TapeElement::EndList(start_idx));
+        self.stack.pop();
+    }
+
     pub fn decode(&mut self, buf: &[u8]) -> Result<usize, ArrowError> {
         let mut iter = BufIter::new(buf);
 
         while !iter.is_empty() {
             let state = match self.stack.last_mut() {
                 Some(l) => l,
                 None => {
-                    iter.skip_whitespace();
-                    if iter.is_empty() || self.cur_row >= self.batch_size {
+                    if self.cur_row >= self.batch_size {
                         break;
                     }
 
+                    let b = match iter.next_non_whitespace() {

Review Comment:
   This should have used the macro... but we still need to do it here, before 
incrementing the row count.



##########
arrow-json/src/reader/tape.rs:
##########
@@ -338,59 +388,94 @@ impl TapeDecoder {
         }
     }
 
+    /// Write the closing elements for an object to the tape
+    fn end_object(&mut self, start_idx: u32) {
+        let end_idx = self.elements.len() as u32;
+        self.elements[start_idx as usize] = TapeElement::StartObject(end_idx);
+        self.elements.push(TapeElement::EndObject(start_idx));
+        self.stack.pop();
+    }
+
+    /// Write the closing elements for a list to the tape
+    fn end_list(&mut self, start_idx: u32) {
+        let end_idx = self.elements.len() as u32;
+        self.elements[start_idx as usize] = TapeElement::StartList(end_idx);
+        self.elements.push(TapeElement::EndList(start_idx));
+        self.stack.pop();
+    }
+
     pub fn decode(&mut self, buf: &[u8]) -> Result<usize, ArrowError> {
         let mut iter = BufIter::new(buf);
 
         while !iter.is_empty() {
             let state = match self.stack.last_mut() {
                 Some(l) => l,
                 None => {
-                    iter.skip_whitespace();
-                    if iter.is_empty() || self.cur_row >= self.batch_size {
+                    if self.cur_row >= self.batch_size {
                         break;
                     }
 
+                    let b = match iter.next_non_whitespace() {
+                        Some(b) => b,
+                        None => break,
+                    };
+
                     // Start of row
                     self.cur_row += 1;
-                    self.stack.push(DecoderState::Value);
+
+                    // Detect value type and push appropriate state
+                    dispatch_value!(self, b, |s| self.stack.push(s));
+
                     self.stack.last_mut().unwrap()
                 }
             };
 
             match state {
-                // Decoding an object
+                // Expecting object member or close brace
                 DecoderState::Object(start_idx) => {
-                    iter.advance_until(|b| !json_whitespace(b) && b != b',');
-                    match next!(iter) {
+                    let start_idx = *start_idx;
+                    match next_non_whitespace!(iter) {
                         b'"' => {
-                            self.stack.push(DecoderState::Value);
+                            *state = DecoderState::ObjectAfterValue(start_idx);
                             self.stack.push(DecoderState::Colon);
                             self.stack.push(DecoderState::String);
                         }
-                        b'}' => {
-                            let start_idx = *start_idx;
-                            let end_idx = self.elements.len() as u32;
-                            self.elements[start_idx as usize] = 
TapeElement::StartObject(end_idx);
-                            
self.elements.push(TapeElement::EndObject(start_idx));
-                            self.stack.pop();
-                        }
-                        b => return Err(err(b, "parsing object")),
+                        b'}' => self.end_object(start_idx),
+                        b => return Err(err(b, "expected '\"' or '}'")),
+                    }
+                }
+                // After value in object - expecting comma or close brace
+                DecoderState::ObjectAfterValue(start_idx) => {
+                    let start_idx = *start_idx;
+                    match next_non_whitespace!(iter) {
+                        b',' => *state = DecoderState::Object(start_idx),
+                        b'}' => self.end_object(start_idx),
+                        b => return Err(err(b, "expected ',' or '}'")),
                     }
                 }
-                // Decoding a list
+                // Decoding a list - awaiting next element or ']'
                 DecoderState::List(start_idx) => {
-                    iter.advance_until(|b| !json_whitespace(b) && b != b',');
-                    match iter.peek() {
-                        Some(b']') => {
-                            iter.next();
-                            let start_idx = *start_idx;
-                            let end_idx = self.elements.len() as u32;
-                            self.elements[start_idx as usize] = 
TapeElement::StartList(end_idx);
-                            
self.elements.push(TapeElement::EndList(start_idx));
-                            self.stack.pop();
+                    let start_idx = *start_idx;
+                    dispatch_value!(
+                        self,
+                        next_non_whitespace!(iter),
+                        |s| {
+                            *state = DecoderState::ListAfterValue(start_idx);
+                            self.stack.push(s);
+                        },
+                        b']' => {
+                            self.end_list(start_idx);
+                            continue;

Review Comment:
   The `continue` is newly required in order to bypass the `transition` logic 
inside the macro. 
   It's not a behavior change because this `match` is anyway the last statement 
in the loop body.



##########
arrow-json/src/reader/tape.rs:
##########
@@ -447,9 +512,8 @@ impl TapeDecoder {
                     }
                 }
                 DecoderState::Colon => {
-                    iter.skip_whitespace();
-                    match next!(iter) {
-                        b':' => self.stack.pop(),
+                    match next_non_whitespace!(iter) {
+                        b':' => *state = DecoderState::Value,

Review Comment:
   This is the only state transition that requires an actual `Value` on the 
stack, because we don't know the next byte yet. The other two sites that expect 
values already know the next byte and can dispatch directly on it.



##########
arrow-json/src/reader/tape.rs:
##########
@@ -338,59 +388,94 @@ impl TapeDecoder {
         }
     }
 
+    /// Write the closing elements for an object to the tape
+    fn end_object(&mut self, start_idx: u32) {
+        let end_idx = self.elements.len() as u32;
+        self.elements[start_idx as usize] = TapeElement::StartObject(end_idx);
+        self.elements.push(TapeElement::EndObject(start_idx));
+        self.stack.pop();
+    }
+
+    /// Write the closing elements for a list to the tape
+    fn end_list(&mut self, start_idx: u32) {

Review Comment:
   These two helpers were factored out from their respective Object and List 
match arms, because now they have two call sites. It didn't seem helpful to use 
a macro because there are no custom logic tweaks involved.



##########
arrow-json/src/reader/tape.rs:
##########
@@ -294,6 +305,45 @@ macro_rules! next {
     };
 }
 
+/// Evaluates to the next non-whitespace byte in the iterator or breaks the 
current loop
+macro_rules! next_non_whitespace {
+    ($next:ident) => {
+        match $next.next_non_whitespace() {
+            Some(b) => b,
+            None => break,
+        }
+    };
+}
+
+/// Dispatches value type detection with optional special case and custom 
transition function
+macro_rules! dispatch_value {

Review Comment:
   There are three places in the fully expanded/inline JSON grammar that expect 
a value:
   * Starting a new line -- we skip leading whitespace before dispatching on 
the first value byte, in order to know whether we should increment the row 
count or not
   * Expecting an array element -- we skip leading whitespace and check for a 
closing `]` before dispatching as a value
   * Expecting an object field value -- we already saw the `:` and know a value 
should follow, but there could be leading whitespace. To handle this, we push a 
Value on the stack instead of dispatching directly.
   
   Rather than dispatch to a separate Value state that needs to re-examine the 
byte, we use the macro to inline the logic three times, with localized tweaks 
as needed.



##########
arrow-json/src/reader/tape.rs:
##########
@@ -338,59 +388,94 @@ impl TapeDecoder {
         }
     }
 
+    /// Write the closing elements for an object to the tape
+    fn end_object(&mut self, start_idx: u32) {
+        let end_idx = self.elements.len() as u32;
+        self.elements[start_idx as usize] = TapeElement::StartObject(end_idx);
+        self.elements.push(TapeElement::EndObject(start_idx));
+        self.stack.pop();
+    }
+
+    /// Write the closing elements for a list to the tape
+    fn end_list(&mut self, start_idx: u32) {
+        let end_idx = self.elements.len() as u32;
+        self.elements[start_idx as usize] = TapeElement::StartList(end_idx);
+        self.elements.push(TapeElement::EndList(start_idx));
+        self.stack.pop();
+    }
+
     pub fn decode(&mut self, buf: &[u8]) -> Result<usize, ArrowError> {
         let mut iter = BufIter::new(buf);
 
         while !iter.is_empty() {
             let state = match self.stack.last_mut() {
                 Some(l) => l,
                 None => {
-                    iter.skip_whitespace();
-                    if iter.is_empty() || self.cur_row >= self.batch_size {
+                    if self.cur_row >= self.batch_size {

Review Comment:
   After thinking carefully (and asking Claude to double-check my reasoning), I 
don't think it was actually important to skip whitespace before checking row 
count. If anything, that ordering was a slight performance pessimization.



##########
arrow-json/src/reader/tape.rs:
##########
@@ -338,59 +388,94 @@ impl TapeDecoder {
         }
     }
 
+    /// Write the closing elements for an object to the tape
+    fn end_object(&mut self, start_idx: u32) {
+        let end_idx = self.elements.len() as u32;
+        self.elements[start_idx as usize] = TapeElement::StartObject(end_idx);
+        self.elements.push(TapeElement::EndObject(start_idx));
+        self.stack.pop();
+    }
+
+    /// Write the closing elements for a list to the tape
+    fn end_list(&mut self, start_idx: u32) {
+        let end_idx = self.elements.len() as u32;
+        self.elements[start_idx as usize] = TapeElement::StartList(end_idx);
+        self.elements.push(TapeElement::EndList(start_idx));
+        self.stack.pop();
+    }
+
     pub fn decode(&mut self, buf: &[u8]) -> Result<usize, ArrowError> {
         let mut iter = BufIter::new(buf);
 
         while !iter.is_empty() {
             let state = match self.stack.last_mut() {
                 Some(l) => l,
                 None => {
-                    iter.skip_whitespace();
-                    if iter.is_empty() || self.cur_row >= self.batch_size {
+                    if self.cur_row >= self.batch_size {
                         break;
                     }
 
+                    let b = match iter.next_non_whitespace() {
+                        Some(b) => b,
+                        None => break,
+                    };
+
                     // Start of row
                     self.cur_row += 1;
-                    self.stack.push(DecoderState::Value);
+
+                    // Detect value type and push appropriate state
+                    dispatch_value!(self, b, |s| self.stack.push(s));
+
                     self.stack.last_mut().unwrap()
                 }
             };
 
             match state {
-                // Decoding an object
+                // Expecting object member or close brace
                 DecoderState::Object(start_idx) => {
-                    iter.advance_until(|b| !json_whitespace(b) && b != b',');
-                    match next!(iter) {
+                    let start_idx = *start_idx;
+                    match next_non_whitespace!(iter) {
                         b'"' => {
-                            self.stack.push(DecoderState::Value);
+                            *state = DecoderState::ObjectAfterValue(start_idx);

Review Comment:
   The main overhead of the bug fix is two-fold:
   1. The scan for next meaningful byte is split in two (before+after comma)
   2. The second of those scans requires a new state machine state + loop 
iteration
   
   The same applies to `DecoderState::List` below.
   
   I don't know any cheaper way to express the tighter semantics?



##########
arrow-json/src/reader/tape.rs:
##########
@@ -409,29 +494,9 @@ impl TapeDecoder {
                         b => unreachable!("{}", b),
                     }
                 }
-                state @ DecoderState::Value => {

Review Comment:
   The `state @` capture was unnecessary -- `state` was already in scope.



##########
arrow-json/src/reader/tape.rs:
##########
@@ -338,59 +388,94 @@ impl TapeDecoder {
         }
     }
 
+    /// Write the closing elements for an object to the tape
+    fn end_object(&mut self, start_idx: u32) {
+        let end_idx = self.elements.len() as u32;
+        self.elements[start_idx as usize] = TapeElement::StartObject(end_idx);
+        self.elements.push(TapeElement::EndObject(start_idx));
+        self.stack.pop();
+    }
+
+    /// Write the closing elements for a list to the tape
+    fn end_list(&mut self, start_idx: u32) {
+        let end_idx = self.elements.len() as u32;
+        self.elements[start_idx as usize] = TapeElement::StartList(end_idx);
+        self.elements.push(TapeElement::EndList(start_idx));
+        self.stack.pop();
+    }
+
     pub fn decode(&mut self, buf: &[u8]) -> Result<usize, ArrowError> {
         let mut iter = BufIter::new(buf);
 
         while !iter.is_empty() {
             let state = match self.stack.last_mut() {
                 Some(l) => l,
                 None => {
-                    iter.skip_whitespace();
-                    if iter.is_empty() || self.cur_row >= self.batch_size {
+                    if self.cur_row >= self.batch_size {
                         break;
                     }
 
+                    let b = match iter.next_non_whitespace() {
+                        Some(b) => b,
+                        None => break,
+                    };
+
                     // Start of row
                     self.cur_row += 1;
-                    self.stack.push(DecoderState::Value);
+
+                    // Detect value type and push appropriate state
+                    dispatch_value!(self, b, |s| self.stack.push(s));
+
                     self.stack.last_mut().unwrap()
                 }
             };
 
             match state {
-                // Decoding an object
+                // Expecting object member or close brace
                 DecoderState::Object(start_idx) => {
-                    iter.advance_until(|b| !json_whitespace(b) && b != b',');
-                    match next!(iter) {
+                    let start_idx = *start_idx;
+                    match next_non_whitespace!(iter) {
                         b'"' => {
-                            self.stack.push(DecoderState::Value);
+                            *state = DecoderState::ObjectAfterValue(start_idx);
                             self.stack.push(DecoderState::Colon);
                             self.stack.push(DecoderState::String);
                         }
-                        b'}' => {
-                            let start_idx = *start_idx;
-                            let end_idx = self.elements.len() as u32;
-                            self.elements[start_idx as usize] = 
TapeElement::StartObject(end_idx);
-                            
self.elements.push(TapeElement::EndObject(start_idx));
-                            self.stack.pop();
-                        }
-                        b => return Err(err(b, "parsing object")),
+                        b'}' => self.end_object(start_idx),
+                        b => return Err(err(b, "expected '\"' or '}'")),
+                    }
+                }
+                // After value in object - expecting comma or close brace
+                DecoderState::ObjectAfterValue(start_idx) => {
+                    let start_idx = *start_idx;

Review Comment:
   Note: Have to drop the borrow on `*start_idx` before mutating `*state` or 
calling a method on `self`



##########
arrow-json/src/reader/tape.rs:
##########
@@ -676,18 +740,25 @@ impl<'a> BufIter<'a> {
         }
     }
 
-    fn skip_whitespace(&mut self) {
-        self.advance_until(|b| !json_whitespace(b));
+    // Advance to the next non-whitespace char and consume it
+    fn next_non_whitespace(&mut self) -> Option<u8> {
+        for b in self.as_slice() {
+            self.pos += 1;
+            if !json_whitespace(*b) {
+                return Some(*b);
+            }
+        }
+        None
     }
 }
 
 impl Iterator for BufIter<'_> {
     type Item = u8;
 
     fn next(&mut self) -> Option<Self::Item> {
-        let b = self.peek();
+        let b = self.peek()?;

Review Comment:
   I'm on the fence with this change:
   * Technically, it's needed because somebody repeatedly calling `next` on the 
iterator could cause an integer overflow by blindly incrementing the index
   * But it's performance-neutral at best and may even be a slight slowdown.
   
   I can be convinced to revert it, if we think it's unhelpful.



##########
arrow-json/src/reader/tape.rs:
##########
@@ -676,18 +740,25 @@ impl<'a> BufIter<'a> {
         }
     }
 
-    fn skip_whitespace(&mut self) {
-        self.advance_until(|b| !json_whitespace(b));
+    // Advance to the next non-whitespace char and consume it
+    fn next_non_whitespace(&mut self) -> Option<u8> {
+        for b in self.as_slice() {

Review Comment:
   Imperative code because all declarative monadic chains I could cook up 
produced significantly worse machine code.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to