Re: [PR] Perf: Make sort string view fast(1.5X ~ 3X faster) [arrow-rs]

via GitHub Tue, 01 Jul 2025 12:38:51 -0700


alamb commented on code in PR #7792:
URL: https://github.com/apache/arrow-rs/pull/7792#discussion_r2178390852



##########
arrow-ord/src/sort.rs:
##########
@@ -303,18 +303,118 @@ fn sort_bytes<T: ByteArrayType>(
     sort_impl(options, &mut valids, &nulls, limit, Ord::cmp).into()
 }
 
+/// Builds a 128-bit composite key for an inline value:
+/// - High 96 bits: the inline data in big-endian byte order (for correct 
lexicographical sorting)
+/// - Low  32 bits: the length in big-endian byte order (so shorter strings 
are always numerically smaller)
+///
+/// This function extracts the length and the 12-byte inline string data from 
the
+/// raw little-endian u128 representation, converts them to big-endian 
ordering,
+/// and packs them into a single u128 value suitable for fast comparisons.
+///
+/// # Note
+/// The input `raw` is assumed to be in little-endian format with the 
following layout:
+/// - bytes 0..4: length (u32)
+/// - bytes 4..16: inline string data (padded with zeros if less than 12 bytes)
+///
+/// The output u128 key places the inline string data in the upper 96 bits 
(big-endian)
+/// and the length in the lower 32 bits (big-endian).
+#[inline(always)]
+pub fn inline_key_fast(raw: u128) -> u128 {
+    // Convert the raw u128 (little-endian) into bytes for manipulation
+    let raw_bytes = raw.to_le_bytes();
+
+    // Extract the length (first 4 bytes), convert to big-endian u32, and 
promote to u128
+    let len_le = &raw_bytes[0..4];
+    let len_be = u32::from_le_bytes(len_le.try_into().unwrap()).to_be() as 
u128;
+
+    // Extract the inline string bytes (next 12 bytes), place them into the 
lower 12 bytes of a 16-byte array,
+    // padding the upper 4 bytes with zero to form a little-endian u128 value
+    let mut inline_bytes = [0u8; 16];
+    inline_bytes[4..16].copy_from_slice(&raw_bytes[4..16]);
+
+    // Convert to big-endian to ensure correct lexical ordering
+    let inline_u128 = u128::from_le_bytes(inline_bytes).to_be();
+
+    // Shift right by 32 bits to discard the zero padding (upper 4 bytes),
+    // so that the inline string occupies the high 96 bits
+    let inline_part = inline_u128 >> 32;
+
+    // Combine the inline string part (high 96 bits) and length (low 32 bits) 
into the final key
+    (inline_part << 32) | len_be
+}
+
 fn sort_byte_view<T: ByteViewType>(
     values: &GenericByteViewArray<T>,
     value_indices: Vec<u32>,
     nulls: Vec<u32>,
     options: SortOptions,
     limit: Option<usize>,
 ) -> UInt32Array {
-    let mut valids = value_indices
+    // 1. Build a list of (index, raw_view, length)
+    let mut valids: Vec<_> = value_indices
         .into_iter()
-        .map(|index| (index, values.value(index as usize).as_ref()))
-        .collect::<Vec<(u32, &[u8])>>();
-    sort_impl(options, &mut valids, &nulls, limit, Ord::cmp).into()
+        .map(|idx| {
+            // SAFETY: we know idx < values.len()
+            let raw = unsafe { *values.views().get_unchecked(idx as usize) };
+            let len = raw as u32; // lower 32 bits encode length
+            (idx, raw, len)
+        })
+        .collect();
+
+    // 2. Compute the number of non-null entries to partially sort
+    let vlimit = match (limit, options.nulls_first) {
+        (Some(l), true) => l.saturating_sub(nulls.len()).min(valids.len()),
+        _ => valids.len(),
+    };
+
+    // 3. Mixed comparator: first prefix, then inline vs full comparison
+    let cmp_mixed = |a: &(u32, u128, u32), b: &(u32, u128, u32)| {
+        let (_, raw_a, len_a) = *a;
+        let (_, raw_b, len_b) = *b;
+
+        // 3.1 Both inline (≤12 bytes): compare full 128-bit key including 
length
+        if len_a <= MAX_INLINE_VIEW_LEN && len_b <= MAX_INLINE_VIEW_LEN {
+            return inline_key_fast(raw_a).cmp(&inline_key_fast(raw_b));
+        }
+
+        // 3.2 Compare 4-byte prefix in big-endian order
+        let pref_a = ByteView::from(raw_a).prefix.swap_bytes();
+        let pref_b = ByteView::from(raw_b).prefix.swap_bytes();
+        if pref_a != pref_b {
+            return pref_a.cmp(&pref_b);
+        }
+
+        // 3.3 Fallback to full byte-slice comparison

Review Comment:
   Update: I double checked and [`value_unchecked`  correctly 
handles](https://github.com/apache/arrow-rs/blob/aa960977275f96a42e74569cd4ef833afa38ecf2/arrow-array/src/array/byte_view_array.rs#L319-L318)
 short/long views
   
   Though it makes me wonder if we could squeeze more out of this function by 
handling the three cases explicitly (short, long), (long, short) and (long, 
long)
   
   However, that might have a minimal payback



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Perf: Make sort string view fast(1.5X ~ 3X faster) [arrow-rs]

Reply via email to