alamb commented on code in PR #7875: URL: https://github.com/apache/arrow-rs/pull/7875#discussion_r2188160176
########## arrow-array/src/array/byte_view_array.rs: ########## @@ -605,29 +605,85 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> { /// key("bar\0") = 0x0000000000000000000062617200000004 /// ⇒ key("bar") < key("bar\0") /// ``` + /// # Inlining and Endianness + /// + /// - We start by calling `.to_le_bytes()` on the `raw` `u128`, because Rust’s native in‑memory + /// representation is little‑endian on x86/ARM. + /// - We extract the low 32 bits numerically (`raw as u32`)—this step is endianness‑free. + /// - We copy the 12 bytes of inline data (original order) into `buf[0..12]`. + /// - We serialize `length` as big‑endian into `buf[12..16]`. + /// - Finally, `u128::from_be_bytes(buf)` treats `buf[0]` as the most significant byte + /// and `buf[15]` as the least significant, producing a `u128` whose integer value + /// directly encodes “inline data then length” in big‑endian form. + /// + /// This ensures that a simple `u128` comparison is equivalent to the desired + /// lexicographical comparison of the inline bytes followed by length. #[inline(always)] pub fn inline_key_fast(raw: u128) -> u128 { - // Convert the raw u128 (little-endian) into bytes for manipulation + // 1. Decompose `raw` into little‑endian bytes: + // - raw_bytes[0..4] = length in LE + // - raw_bytes[4..16] = inline string data let raw_bytes = raw.to_le_bytes(); - // Extract the length (first 4 bytes), convert to big-endian u32, and promote to u128 - let len_le = &raw_bytes[0..4]; - let len_be = u32::from_le_bytes(len_le.try_into().unwrap()).to_be() as u128; - - // Extract the inline string bytes (next 12 bytes), place them into the lower 12 bytes of a 16-byte array, - // padding the upper 4 bytes with zero to form a little-endian u128 value - let mut inline_bytes = [0u8; 16]; - inline_bytes[4..16].copy_from_slice(&raw_bytes[4..16]); - - // Convert to big-endian to ensure correct lexical ordering - let inline_u128 = u128::from_le_bytes(inline_bytes).to_be(); - - // Shift right by 32 bits to discard the zero padding (upper 4 bytes), - // so that the inline string occupies the high 96 bits - let inline_part = inline_u128 >> 32; - - // Combine the inline string part (high 96 bits) and length (low 32 bits) into the final key - (inline_part << 32) | len_be + // 2. Numerically truncate to get the low 32‑bit length (endianness‑free). + let length = raw as u32; + + // 3. Build a 16‑byte buffer in big‑endian order: + // - buf[0..12] = inline string bytes (in original order) + // - buf[12..16] = length.to_be_bytes() (BE) + let mut buf = [0u8; 16]; + buf[0..12].copy_from_slice(&raw_bytes[4..16]); // inline data + + // Why convert length to big-endian for comparison? + // + // Rust (on most platforms) stores integers in little-endian format, + // meaning the least significant byte is at the lowest memory address. + // For example, an u32 value like 0x22345677 is stored in memory as: + // + // [0x77, 0x56, 0x34, 0x22] // little-endian layout + // ^ ^ ^ ^ + // LSB ↑↑↑ MSB Review Comment: this is a very nice comment -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org