Re: [PR] Adds additional type support to arrow-avro writer [arrow-rs]

via GitHub Wed, 10 Sep 2025 21:43:19 -0700


jecsand838 commented on code in PR #8298:
URL: https://github.com/apache/arrow-rs/pull/8298#discussion_r2338523112



##########
arrow-avro/src/writer/encoder.rs:
##########
@@ -69,6 +73,77 @@ fn write_bool<W: Write + ?Sized>(out: &mut W, v: bool) -> 
Result<(), ArrowError>
         .map_err(|e| ArrowError::IoError(format!("write bool: {e}"), e))
 }
 
+/// Minimal two's-complement big-endian representation helper for Avro decimal 
(bytes).
+///
+/// For positive numbers, trim leading 0x00 while the next byte's MSB is 0.
+/// For negative numbers, trim leading 0xFF while the next byte's MSB is 1.
+/// The resulting slice still encodes the same signed value.
+///
+/// See Avro spec: decimal over `bytes` uses two's-complement big-endian
+/// representation of the unscaled integer value. 1.11.1 specification.
+#[inline]
+fn minimal_twos_complement(be: &[u8]) -> &[u8] {
+    if be.is_empty() {
+        return be;
+    }
+    let mut i = 0usize;
+    let sign = (be[0] & 0x80) != 0;
+    while i + 1 < be.len() {
+        let b = be[i];
+        let next = be[i + 1];
+        let trim_pos = !sign && b == 0x00 && (next & 0x80) == 0;
+        let trim_neg = sign && b == 0xFF && (next & 0x80) != 0;
+        if trim_pos || trim_neg {
+            i += 1;
+        } else {
+            break;
+        }
+    }
+    &be[i..]
+}
+
+/// Sign-extend (or validate/truncate) big-endian integer bytes to exactly `n` 
bytes.
+///
+/// If `src_be` is longer than `n`, ensure that dropped leading bytes are all 
sign bytes,
+/// and that the MSB of the first kept byte matches the sign; otherwise return 
an overflow error.
+/// If shorter than `n`, left-pad with the sign byte.
+///
+/// Used for Avro decimal over `fixed(N)`.
+#[inline]
+fn sign_extend_to_exact(src_be: &[u8], n: usize) -> Result<Vec<u8>, 
ArrowError> {
+    let len = src_be.len();
+    let sign_byte = if len > 0 && (src_be[0] & 0x80) != 0 {
+        0xFF
+    } else {
+        0x00
+    };
+    if len == n {
+        return Ok(src_be.to_vec());
+    }
+    if len > n {
+        let extra = len - n;
+        if src_be[..extra].iter().any(|&b| b != sign_byte) {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Decimal value with {} bytes cannot be represented in {} bytes 
without overflow",
+                len, n
+            )));
+        }
+        if n > 0 {
+            let first_kept = src_be[extra];
+            if ((first_kept ^ sign_byte) & 0x80) != 0 {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "Decimal value with {} bytes cannot be represented in {} 
bytes without overflow",
+                    len, n
+                )));
+            }
+        }
+        return Ok(src_be[extra..].to_vec());
+    }
+    let mut out = vec![sign_byte; n];
+    out[n - len..].copy_from_slice(src_be);
+    Ok(out)
+}

Review Comment:
   @nathaniel-d-ef Actually, here's a slightly different and even more 
performant approach:
   
   ```rust
   #[inline]
   fn minimal_twos_complement(be: &[u8]) -> &[u8] {
       if be.is_empty() {
           return be;
       }
       let sign_byte = if (be[0] & 0x80) != 0 { 0xFF } else { 0x00 };
       let mut k = 0usize;
       while k < be.len() && be[k] == sign_byte {
           k += 1;
       }
       if k == 0 {
           return be;
       }
       if k == be.len() {
           return &be[be.len() - 1..];
       }
       let drop = if ((be[k] ^ sign_byte) & 0x80) == 0 {
           k
       } else {
           k - 1
       };
       &be[drop..]
   }
   
   #[inline]
   fn write_sign_extended<W: Write + ?Sized>(
       out: &mut W,
       src_be: &[u8],
       n: usize,
   ) -> Result<(), ArrowError> {
       let len = src_be.len();
       if len == n {
           return out
               .write_all(src_be)
               .map_err(|e| ArrowError::IoError(format!("write decimal fixed: 
{e}"), e));
       }
       let sign_byte = if len > 0 && (src_be[0] & 0x80) != 0 {
           0xFF
       } else {
           0x00
       };
       if len > n {
           let extra = len - n;
           if n == 0 && src_be.iter().all(|&b| b == sign_byte) {
               return Ok(());
           }
           // All truncated bytes must equal the sign byte, and the MSB of the 
first
           // retained byte must match the sign (otherwise overflow).
           if src_be[..extra].iter().any(|&b| b != sign_byte)
               || ((src_be[extra] ^ sign_byte) & 0x80) != 0
           {
               return Err(ArrowError::InvalidArgumentError(format!(
                   "Decimal value with {len} bytes cannot be represented in {n} 
bytes without overflow",
               )));
           }
           return out
               .write_all(&src_be[extra..])
               .map_err(|e| ArrowError::IoError(format!("write decimal fixed: 
{e}"), e));
       }
       // len < n: prepend sign bytes (sign extension) then the payload
       let pad_len = n - len;
       // Fixed-size stack pads to avoid heap allocation on the hot path
       const ZPAD: [u8; 64] = [0x00; 64];
       const FPAD: [u8; 64] = [0xFF; 64];
       let pad = if sign_byte == 0x00 {
           &ZPAD[..]
       } else {
           &FPAD[..]
       };
       // Emit padding in 64‑byte chunks (minimizes write calls without 
allocating),
       // then write the original bytes.
       let mut rem = pad_len;
       while rem >= pad.len() {
           out.write_all(pad)
               .map_err(|e| ArrowError::IoError(format!("write decimal fixed: 
{e}"), e))?;
           rem -= pad.len();
       }
       if rem > 0 {
           out.write_all(&pad[..rem])
               .map_err(|e| ArrowError::IoError(format!("write decimal fixed: 
{e}"), e))?;
       }
       out.write_all(src_be)
           .map_err(|e| ArrowError::IoError(format!("write decimal fixed: 
{e}"), e))
   }
   ```
   
   Then you'd just call `write_sign_extended` in `DecimalEncoder::encode` like 
this.
   
   ```rust
       fn encode<W: Write + ?Sized>(&mut self, out: &mut W, idx: usize) -> 
Result<(), ArrowError> {
           let be = self.arr.value_be_bytes(idx);
           match self.fixed_size {
               Some(n) => write_sign_extended(out, &be, n),
               None => write_len_prefixed(out, minimal_twos_complement(&be)),
           }
       }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Adds additional type support to arrow-avro writer [arrow-rs]

Reply via email to