This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 5dd5418070b Minor: Add doc comments to `GenericByteViewArray` (#5512)
5dd5418070b is described below

commit 5dd5418070bd6284e1ca8a5aed17f7323965b525
Author: Andrew Lamb <[email protected]>
AuthorDate: Thu Mar 14 22:45:31 2024 -0400

    Minor: Add doc comments to `GenericByteViewArray` (#5512)
    
    * Minor: Add doc comments to `GenericByteViewArray`
    
    * Improve docs
---
 arrow-array/src/array/byte_view_array.rs | 70 +++++++++++++++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/arrow-array/src/array/byte_view_array.rs 
b/arrow-array/src/array/byte_view_array.rs
index e22e9b1688b..a3b8a5dcb80 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -34,7 +34,66 @@ use std::sync::Arc;
 /// Different than [`crate::GenericByteArray`] as it stores both an offset and 
length
 /// meaning that take / filter operations can be implemented without copying 
the underlying data.
 ///
+/// See [`StringViewArray`] for storing utf8 encoded string data and
+/// [`BinaryViewArray`] for storing bytes.
+///
 /// [Variable-size Binary View Layout]: 
https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
+///
+/// A `GenericByteViewArray` stores variable length byte strings. An array of
+/// `N` elements is stored as `N` fixed length "views" and a variable number
+/// of variable length "buffers".
+///
+/// Each view is a `u128` value whose layout is different depending on the
+/// length of the string stored at that location:
+///
+/// ```text
+///                         ┌──────┬────────────────────────┐
+///                         │length│      string value      │
+///    Strings (len <= 12)  │      │    (padded with 0)     │
+///                         └──────┴────────────────────────┘
+///                          0    31                      127
+///
+///                         ┌───────┬───────┬───────┬───────┐
+///                         │length │prefix │  buf  │offset │
+///    Strings (len > 12)   │       │       │ index │       │
+///                         └───────┴───────┴───────┴───────┘
+///                          0    31       63      95    127
+/// ```
+///
+/// * Strings with length <= 12 are stored directly in the view.
+///
+/// * Strings with length > 12: The first four bytes are stored inline in the
+/// view and the entire string is stored in one of the buffers.
+///
+/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
+/// than that they must point into a valid buffer. In particular, they can be
+/// out of order, non-contiguous and overlapping.
+///
+/// For example, in the following diagram, the strings "FishWasInTownTodayYay" and
+/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
+/// separate buffer while the string "LavaMonster" is stored inlined in the
+/// view. In this case, the same bytes for "Fish" are used to store both 
strings.
+///
+/// ```text
+///                                                                            
┌───┐
+///                         ┌──────┬──────┬──────┬──────┐               offset 
│...│
+/// "FishWasInTownTodayYay" │  21  │ Fish │  0   │ 115  │─ ─              103  
│Mr.│
+///                         └──────┴──────┴──────┴──────┘   │      ┌ ─ ─ ─ ─ ▶ 
│Cru│
+///                         ┌──────┬──────┬──────┬──────┐                      
│mpl│
+/// "CrumpleFacedFish"      │  16  │ Crum │  0   │ 103  │─ ─│─ ─ ─ ┘           
│eFa│
+///                         └──────┴──────┴──────┴──────┘                      
│ced│
+///                         ┌──────┬────────────────────┐   └ ─ ─ ─ ─ ─ ─ ─ ─ 
▶│Fis│
+/// "LavaMonster"           │  11  │   LavaMonster\0    │                      
│hWa│
+///                         └──────┴────────────────────┘               offset 
│sIn│
+///                                                                       115  
│Tow│
+///                                                                            
│nTo│
+///                                                                            
│day│
+///                                  u128 "views"                              
│Yay│
+///                                                                   buffer 0 
│...│
+///                                                                            
└───┘
+/// ```
+/// [`GenericByteArray`]: crate::array::GenericByteArray
+
 pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
     data_type: DataType,
     views: ScalarBuffer<u128>,
@@ -332,10 +391,19 @@ where
 }
 
 /// A [`GenericByteViewArray`] of `[u8]`
+///
+/// # Example
+/// ```
+/// use arrow_array::BinaryViewArray;
+/// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], 
b"world", b"lulu", b"large payload over 12 bytes"]);
+/// assert_eq!(array.value(0), b"hello");
+/// assert_eq!(array.value(3), b"large payload over 12 bytes");
+/// ```
 pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;
 
-/// A [`GenericByteViewArray`] of `str`
+/// A [`GenericByteViewArray`] that stores utf8 data
 ///
+/// # Example
 /// ```
 /// use arrow_array::StringViewArray;
 /// let array = StringViewArray::from_iter_values(vec!["hello", "world", 
"lulu", "large payload over 12 bytes"]);

Reply via email to