(datafusion-comet) branch main updated: docs: add SAFETY comments to all unsafe blocks in shuffle spark_unsafe module (#3603)

agrove Wed, 25 Feb 2026 18:22:30 -0800

This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git



The following commit(s) were added to refs/heads/main by this push:
     new c91684922 docs: add SAFETY comments to all unsafe blocks in shuffle 
spark_unsafe module (#3603)
c91684922 is described below

commit c916849227b8a961b1d024310d5b9db3ba7620cf
Author: Andy Grove <[email protected]>
AuthorDate: Wed Feb 25 19:21:18 2026 -0700

    docs: add SAFETY comments to all unsafe blocks in shuffle spark_unsafe 
module (#3603)
---
 .../src/execution/shuffle/spark_unsafe/list.rs     |  6 +++-
 .../core/src/execution/shuffle/spark_unsafe/map.rs |  3 +-
 .../core/src/execution/shuffle/spark_unsafe/row.rs | 41 ++++++++++++++++++++++
 3 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/native/core/src/execution/shuffle/spark_unsafe/list.rs 
b/native/core/src/execution/shuffle/spark_unsafe/list.rs
index d8e39e8b0..cc7281394 100644
--- a/native/core/src/execution/shuffle/spark_unsafe/list.rs
+++ b/native/core/src/execution/shuffle/spark_unsafe/list.rs
@@ -51,7 +51,8 @@ impl SparkUnsafeObject for SparkUnsafeArray {
 impl SparkUnsafeArray {
     /// Creates a `SparkUnsafeArray` which points to the given address and 
size in bytes.
     pub fn new(addr: i64) -> Self {
-        // Read the number of elements from the first 8 bytes.
+        // SAFETY: addr points to valid Spark UnsafeArray data from the JVM.
+        // The first 8 bytes contain the element count as a little-endian i64.
         let slice: &[u8] = unsafe { std::slice::from_raw_parts(addr as *const 
u8, 8) };
         let num_elements = i64::from_le_bytes(slice.try_into().unwrap());
 
@@ -83,6 +84,9 @@ impl SparkUnsafeArray {
     /// Returns true if the null bit at the given index of the array is set.
     #[inline]
     pub(crate) fn is_null_at(&self, index: usize) -> bool {
+        // SAFETY: row_addr points to valid Spark UnsafeArray data. The null 
bitset starts
+        // at offset 8 and contains ceil(num_elements/64) * 8 bytes. The 
caller ensures
+        // index < num_elements, so word_offset is within the bitset region.
         unsafe {
             let mask: i64 = 1i64 << (index & 0x3f);
             let word_offset = (self.row_addr + 8 + (((index >> 6) as i64) << 
3)) as *const i64;
diff --git a/native/core/src/execution/shuffle/spark_unsafe/map.rs 
b/native/core/src/execution/shuffle/spark_unsafe/map.rs
index de2b96146..dbb5b404a 100644
--- a/native/core/src/execution/shuffle/spark_unsafe/map.rs
+++ b/native/core/src/execution/shuffle/spark_unsafe/map.rs
@@ -30,7 +30,8 @@ pub struct SparkUnsafeMap {
 impl SparkUnsafeMap {
     /// Creates a `SparkUnsafeMap` which points to the given address and size 
in bytes.
     pub(crate) fn new(addr: i64, size: i32) -> Self {
-        // Read the number of bytes of key array from the first 8 bytes.
+        // SAFETY: addr points to valid Spark UnsafeMap data from the JVM.
+        // The first 8 bytes contain the key array size as a little-endian i64.
         let slice: &[u8] = unsafe { std::slice::from_raw_parts(addr as *const 
u8, 8) };
         let key_array_size = i64::from_le_bytes(slice.try_into().unwrap());
 
diff --git a/native/core/src/execution/shuffle/spark_unsafe/row.rs 
b/native/core/src/execution/shuffle/spark_unsafe/row.rs
index 1c121c506..02daf6a34 100644
--- a/native/core/src/execution/shuffle/spark_unsafe/row.rs
+++ b/native/core/src/execution/shuffle/spark_unsafe/row.rs
@@ -58,6 +58,19 @@ const NESTED_TYPE_BUILDER_CAPACITY: usize = 100;
 /// A common trait for Spark Unsafe classes that can be used to access the 
underlying data,
 /// e.g., `UnsafeRow` and `UnsafeArray`. This defines a set of methods that 
can be used to
 /// access the underlying data with index.
+///
+/// # Safety
+///
+/// Implementations must ensure that:
+/// - `get_row_addr()` returns a valid pointer to JVM-allocated memory
+/// - `get_element_offset()` returns a valid pointer within the row/array data 
region
+/// - The memory layout follows Spark's UnsafeRow/UnsafeArray format
+/// - The memory remains valid for the lifetime of the object (guaranteed by 
JVM ownership)
+///
+/// All accessor methods (get_boolean, get_int, etc.) use unsafe pointer 
operations but are
+/// safe to call as long as:
+/// - The index is within bounds (caller's responsibility)
+/// - The object was constructed from valid Spark UnsafeRow/UnsafeArray data
 pub trait SparkUnsafeObject {
     /// Returns the address of the row.
     fn get_row_addr(&self) -> i64;
@@ -77,12 +90,15 @@ pub trait SparkUnsafeObject {
     /// Returns boolean value at the given index of the object.
     fn get_boolean(&self, index: usize) -> bool {
         let addr = self.get_element_offset(index, 1);
+        // SAFETY: addr points to valid element data within the 
UnsafeRow/UnsafeArray region.
+        // The caller ensures index is within bounds.
         unsafe { *addr != 0 }
     }
 
     /// Returns byte value at the given index of the object.
     fn get_byte(&self, index: usize) -> i8 {
         let addr = self.get_element_offset(index, 1);
+        // SAFETY: addr points to valid element data (1 byte) within the 
row/array region.
         let slice: &[u8] = unsafe { std::slice::from_raw_parts(addr, 1) };
         i8::from_le_bytes(slice.try_into().unwrap())
     }
@@ -90,6 +106,7 @@ pub trait SparkUnsafeObject {
     /// Returns short value at the given index of the object.
     fn get_short(&self, index: usize) -> i16 {
         let addr = self.get_element_offset(index, 2);
+        // SAFETY: addr points to valid element data (2 bytes) within the 
row/array region.
         let slice: &[u8] = unsafe { std::slice::from_raw_parts(addr, 2) };
         i16::from_le_bytes(slice.try_into().unwrap())
     }
@@ -97,6 +114,7 @@ pub trait SparkUnsafeObject {
     /// Returns integer value at the given index of the object.
     fn get_int(&self, index: usize) -> i32 {
         let addr = self.get_element_offset(index, 4);
+        // SAFETY: addr points to valid element data (4 bytes) within the 
row/array region.
         let slice: &[u8] = unsafe { std::slice::from_raw_parts(addr, 4) };
         i32::from_le_bytes(slice.try_into().unwrap())
     }
@@ -104,6 +122,7 @@ pub trait SparkUnsafeObject {
     /// Returns long value at the given index of the object.
     fn get_long(&self, index: usize) -> i64 {
         let addr = self.get_element_offset(index, 8);
+        // SAFETY: addr points to valid element data (8 bytes) within the 
row/array region.
         let slice: &[u8] = unsafe { std::slice::from_raw_parts(addr, 8) };
         i64::from_le_bytes(slice.try_into().unwrap())
     }
@@ -111,6 +130,7 @@ pub trait SparkUnsafeObject {
     /// Returns float value at the given index of the object.
     fn get_float(&self, index: usize) -> f32 {
         let addr = self.get_element_offset(index, 4);
+        // SAFETY: addr points to valid element data (4 bytes) within the 
row/array region.
         let slice: &[u8] = unsafe { std::slice::from_raw_parts(addr, 4) };
         f32::from_le_bytes(slice.try_into().unwrap())
     }
@@ -118,6 +138,7 @@ pub trait SparkUnsafeObject {
     /// Returns double value at the given index of the object.
     fn get_double(&self, index: usize) -> f64 {
         let addr = self.get_element_offset(index, 8);
+        // SAFETY: addr points to valid element data (8 bytes) within the 
row/array region.
         let slice: &[u8] = unsafe { std::slice::from_raw_parts(addr, 8) };
         f64::from_le_bytes(slice.try_into().unwrap())
     }
@@ -126,6 +147,8 @@ pub trait SparkUnsafeObject {
     fn get_string(&self, index: usize) -> &str {
         let (offset, len) = self.get_offset_and_len(index);
         let addr = self.get_row_addr() + offset as i64;
+        // SAFETY: addr points to valid UTF-8 string data within the 
variable-length region.
+        // Offset and length are read from the fixed-length portion of the 
row/array.
         let slice: &[u8] = unsafe { std::slice::from_raw_parts(addr as *const 
u8, len as usize) };
 
         from_utf8(slice).unwrap()
@@ -135,12 +158,15 @@ pub trait SparkUnsafeObject {
     fn get_binary(&self, index: usize) -> &[u8] {
         let (offset, len) = self.get_offset_and_len(index);
         let addr = self.get_row_addr() + offset as i64;
+        // SAFETY: addr points to valid binary data within the variable-length 
region.
+        // Offset and length are read from the fixed-length portion of the 
row/array.
         unsafe { std::slice::from_raw_parts(addr as *const u8, len as usize) }
     }
 
     /// Returns date value at the given index of the object.
     fn get_date(&self, index: usize) -> i32 {
         let addr = self.get_element_offset(index, 4);
+        // SAFETY: addr points to valid element data (4 bytes) within the 
row/array region.
         let slice: &[u8] = unsafe { std::slice::from_raw_parts(addr, 4) };
         i32::from_le_bytes(slice.try_into().unwrap())
     }
@@ -148,6 +174,7 @@ pub trait SparkUnsafeObject {
     /// Returns timestamp value at the given index of the object.
     fn get_timestamp(&self, index: usize) -> i64 {
         let addr = self.get_element_offset(index, 8);
+        // SAFETY: addr points to valid element data (8 bytes) within the 
row/array region.
         let slice: &[u8] = unsafe { std::slice::from_raw_parts(addr, 8) };
         i64::from_le_bytes(slice.try_into().unwrap())
     }
@@ -257,6 +284,9 @@ impl SparkUnsafeRow {
     /// Returns true if the null bit at the given index of the row is set.
     #[inline]
     pub(crate) fn is_null_at(&self, index: usize) -> bool {
+        // SAFETY: row_addr points to valid Spark UnsafeRow data with at least
+        // ceil(num_fields/64) * 8 bytes of null bitset. The caller ensures 
index < num_fields.
+        // word_offset is within the bitset region since (index >> 6) << 3 < 
bitset size.
         unsafe {
             let mask: i64 = 1i64 << (index & 0x3f);
             let word_offset = (self.row_addr + (((index >> 6) as i64) << 3)) 
as *const i64;
@@ -267,6 +297,9 @@ impl SparkUnsafeRow {
 
     /// Unsets the null bit at the given index of the row, i.e., set the bit 
to 0 (not null).
     pub fn set_not_null_at(&mut self, index: usize) {
+        // SAFETY: row_addr points to valid Spark UnsafeRow data with at least
+        // ceil(num_fields/64) * 8 bytes of null bitset. The caller ensures 
index < num_fields.
+        // Writing is safe because we have mutable access and the memory is 
owned by the JVM.
         unsafe {
             let mask: i64 = 1i64 << (index & 0x3f);
             let word_offset = (self.row_addr + (((index >> 6) as i64) << 3)) 
as *mut i64;
@@ -463,6 +496,8 @@ fn append_columns(
             let mut row = SparkUnsafeRow::new(schema);
 
             for i in row_start..row_end {
+                // SAFETY: row_addresses_ptr and row_sizes_ptr are JNI arrays 
with at least
+                // row_end elements. i is in [row_start, row_end) so the 
offset is in bounds.
                 let row_addr = unsafe { *row_addresses_ptr.add(i) };
                 let row_size = unsafe { *row_sizes_ptr.add(i) };
                 row.point_to(row_addr, row_size);
@@ -593,6 +628,8 @@ fn append_columns(
             let mut row = SparkUnsafeRow::new(schema);
 
             for i in row_start..row_end {
+                // SAFETY: row_addresses_ptr and row_sizes_ptr are JNI arrays 
with at least
+                // row_end elements. i is in [row_start, row_end) so the 
offset is in bounds.
                 let row_addr = unsafe { *row_addresses_ptr.add(i) };
                 let row_size = unsafe { *row_sizes_ptr.add(i) };
                 row.point_to(row_addr, row_size);
@@ -613,6 +650,8 @@ fn append_columns(
             let mut row = SparkUnsafeRow::new(schema);
 
             for i in row_start..row_end {
+                // SAFETY: row_addresses_ptr and row_sizes_ptr are JNI arrays 
with at least
+                // row_end elements. i is in [row_start, row_end) so the 
offset is in bounds.
                 let row_addr = unsafe { *row_addresses_ptr.add(i) };
                 let row_size = unsafe { *row_sizes_ptr.add(i) };
                 row.point_to(row_addr, row_size);
@@ -640,6 +679,8 @@ fn append_columns(
             let mut row = SparkUnsafeRow::new(schema);
 
             for i in row_start..row_end {
+                // SAFETY: row_addresses_ptr and row_sizes_ptr are JNI arrays 
with at least
+                // row_end elements. i is in [row_start, row_end) so the 
offset is in bounds.
                 let row_addr = unsafe { *row_addresses_ptr.add(i) };
                 let row_size = unsafe { *row_sizes_ptr.add(i) };
                 row.point_to(row_addr, row_size);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(datafusion-comet) branch main updated: docs: add SAFETY comments to all unsafe blocks in shuffle spark_unsafe module (#3603)

Reply via email to