Dandandan commented on code in PR #9683: URL: https://github.com/apache/arrow-rs/pull/9683#discussion_r3060751748
########## arrow-row/src/radix.rs: ########## @@ -0,0 +1,660 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! MSD radix sort on row-encoded keys. +//! +//! The Arrow row format produces big-endian, memcmp-comparable byte sequences, +//! making it ideal for MSD (Most Significant Digit) radix sort without any +//! additional encoding. This gives O(n × key_width) performance instead of +//! O(n log n × comparison_cost). +//! +//! # When to use this +//! +//! Radix sort on row-encoded keys is the fastest sort strategy for most +//! multi-column sorts, including: +//! - **Primitive columns** (integers, floats) +//! - **String columns**, especially multiple string columns +//! - **Mixed column types** (primitives, strings, dicts, lists) +//! +//! The advantage over [`lexsort_to_indices`] grows with N and with the +//! number of columns. +//! +//! # When NOT to use this +//! +//! Prefer [`lexsort_to_indices`] when: +//! - **All sort columns are low-cardinality dictionaries** with no +//! high-cardinality column to break ties. The row encoding for +//! dictionary values produces long shared prefixes, and radix sort +//! gains little from its first few byte passes before falling back +//! to comparison sort. +//! - **A leading primitive column discriminates most rows and a trailing +//! column is expensive to encode** (e.g., lists). [`lexsort_to_indices`] +//! avoids encoding the trailing column for rows already resolved by +//! the leading column. +//! +//! [`lexsort_to_indices`]: https://docs.rs/arrow-ord/latest/arrow_ord/sort/fn.lexsort_to_indices.html + +use crate::Rows; + +/// When a bucket has this few elements, the fixed per-level cost of radix +/// sort (256-bucket histogram + scatter) exceeds the O(n log n) cost of +/// comparison sort with small n and warm cache lines. +const FALLBACK_THRESHOLD: usize = 64; + +/// Beyond this depth, comparison sort on the full row handles the +/// remaining discrimination. 8 bytes covers the discriminating prefix +/// of most key layouts; deeper recursion hits diminishing returns as +/// buckets become sparse and the per-level overhead dominates. +const MAX_DEPTH: usize = 8; + +/// Sort row indices using MSD radix sort on row-encoded keys. +/// +/// Takes [`Rows`] produced by [`RowConverter::convert_columns`] and returns +/// a `Vec<u32>` of row indices in sorted order. The caller is responsible for +/// encoding columns into row format and for using the returned indices to +/// reorder the original arrays (e.g., via [`take`]). +/// +/// See the [module-level documentation](self) for guidance on when radix sort +/// is faster than [`lexsort_to_indices`]. +/// +/// # Example +/// +/// ``` +/// # use std::sync::Arc; +/// # use arrow_row::{RowConverter, SortField}; +/// # use arrow_row::radix::radix_sort_to_indices; +/// # use arrow_array::{Int32Array, ArrayRef}; +/// # use arrow_schema::DataType; +/// let array: ArrayRef = Arc::new(Int32Array::from(vec![5, 3, 1, 4, 2])); +/// let converter = RowConverter::new(vec![SortField::new(DataType::Int32)]).unwrap(); +/// let rows = converter.convert_columns(&[array]).unwrap(); +/// let indices = radix_sort_to_indices(&rows); +/// assert_eq!(indices, vec![2, 4, 1, 3, 0]); // points to [1, 2, 3, 4, 5] +/// ``` +/// +/// [`RowConverter::convert_columns`]: crate::RowConverter::convert_columns +/// [`take`]: https://docs.rs/arrow/latest/arrow/compute/fn.take.html +/// [`lexsort_to_indices`]: https://docs.rs/arrow-ord/latest/arrow_ord/sort/fn.lexsort_to_indices.html +pub fn radix_sort_to_indices(rows: &Rows) -> Vec<u32> { + let n = rows.num_rows(); + let mut indices: Vec<u32> = (0..n as u32).collect(); + let mut temp = vec![0u32; n]; + msd_radix_sort(&mut indices, &mut temp, rows, 0); + indices +} + +fn msd_radix_sort(indices: &mut [u32], temp: &mut [u32], rows: &Rows, byte_pos: usize) { + let n = indices.len(); + + if n <= FALLBACK_THRESHOLD || byte_pos >= MAX_DEPTH { + indices.sort_unstable_by(|&a, &b| { + // SAFETY: indices contains a permutation of 0..rows.num_rows() + let ra = unsafe { rows.row_unchecked(a as usize) }; + let rb = unsafe { rows.row_unchecked(b as usize) }; + ra.cmp(&rb) + }); + return; + } + + // Both the histogram and scatter loops read each row's byte via + // row_unchecked. Pre-extracting bytes into a contiguous buffer was + // tried but benchmarked slower — the extra write pass costs more + // than the second read through row offsets already hot in cache. + let mut counts = [0u32; 256]; + for &idx in &*indices { + // SAFETY: indices contains a permutation of 0..rows.num_rows() + let row = unsafe { rows.row_unchecked(idx as usize) }; + let byte = row.data().get(byte_pos).copied().unwrap_or(0); Review Comment: Any way we can make this more efficient? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
