alamb commented on code in PR #7875:
URL: https://github.com/apache/arrow-rs/pull/7875#discussion_r2188160176


##########
arrow-array/src/array/byte_view_array.rs:
##########
@@ -605,29 +605,85 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
     /// key("bar\0") = 0x0000000000000000000062617200000004
     /// ⇒ key("bar") < key("bar\0")
     /// ```
+    /// # Inlining and Endianness
+    ///
+    /// - We start by calling `.to_le_bytes()` on the `raw` `u128`, because 
Rust’s native in‑memory
+    ///   representation is little‑endian on x86/ARM.
+    /// - We extract the low 32 bits numerically (`raw as u32`)—this step is 
endianness‑free.
+    /// - We copy the 12 bytes of inline data (original order) into 
`buf[0..12]`.
+    /// - We serialize `length` as big‑endian into `buf[12..16]`.
+    /// - Finally, `u128::from_be_bytes(buf)` treats `buf[0]` as the most 
significant byte
+    ///   and `buf[15]` as the least significant, producing a `u128` whose 
integer value
+    ///   directly encodes “inline data then length” in big‑endian form.
+    ///
+    /// This ensures that a simple `u128` comparison is equivalent to the 
desired
+    /// lexicographical comparison of the inline bytes followed by length.
     #[inline(always)]
     pub fn inline_key_fast(raw: u128) -> u128 {
-        // Convert the raw u128 (little-endian) into bytes for manipulation
+        // 1. Decompose `raw` into little‑endian bytes:
+        //    - raw_bytes[0..4]  = length in LE
+        //    - raw_bytes[4..16] = inline string data
         let raw_bytes = raw.to_le_bytes();
 
-        // Extract the length (first 4 bytes), convert to big-endian u32, and 
promote to u128
-        let len_le = &raw_bytes[0..4];
-        let len_be = u32::from_le_bytes(len_le.try_into().unwrap()).to_be() as 
u128;
-
-        // Extract the inline string bytes (next 12 bytes), place them into 
the lower 12 bytes of a 16-byte array,
-        // padding the upper 4 bytes with zero to form a little-endian u128 
value
-        let mut inline_bytes = [0u8; 16];
-        inline_bytes[4..16].copy_from_slice(&raw_bytes[4..16]);
-
-        // Convert to big-endian to ensure correct lexical ordering
-        let inline_u128 = u128::from_le_bytes(inline_bytes).to_be();
-
-        // Shift right by 32 bits to discard the zero padding (upper 4 bytes),
-        // so that the inline string occupies the high 96 bits
-        let inline_part = inline_u128 >> 32;
-
-        // Combine the inline string part (high 96 bits) and length (low 32 
bits) into the final key
-        (inline_part << 32) | len_be
+        // 2. Numerically truncate to get the low 32‑bit length 
(endianness‑free).
+        let length = raw as u32;
+
+        // 3. Build a 16‑byte buffer in big‑endian order:
+        //    - buf[0..12]  = inline string bytes (in original order)
+        //    - buf[12..16] = length.to_be_bytes() (BE)
+        let mut buf = [0u8; 16];
+        buf[0..12].copy_from_slice(&raw_bytes[4..16]); // inline data
+
+        // Why convert length to big-endian for comparison?
+        //
+        // Rust (on most platforms) stores integers in little-endian format,
+        // meaning the least significant byte is at the lowest memory address.
+        // For example, an u32 value like 0x22345677 is stored in memory as:
+        //
+        //   [0x77, 0x56, 0x34, 0x22]  // little-endian layout
+        //    ^     ^     ^     ^
+        //  LSB   ↑↑↑           MSB

Review Comment:
   this is a very nice comment



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to