[arrow-rs] branch master updated: Treat DecimalArray as PrimitiveArray in row format (#2866)

tustvold Tue, 18 Oct 2022 12:11:13 -0700

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/master by this push:
     new 07024f6a1 Treat DecimalArray as PrimitiveArray in row format (#2866)
07024f6a1 is described below

commit 07024f6a16b870fda81cba5779b8817b20386ebf
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Wed Oct 19 08:10:58 2022 +1300

    Treat DecimalArray as PrimitiveArray in row format (#2866)
---
 arrow-buffer/src/bigint.rs  |  20 +++++++++
 arrow/src/row/dictionary.rs |  21 ++++------
 arrow/src/row/fixed.rs      |  47 +--------------------
 arrow/src/row/mod.rs        | 100 ++++++++++++++++++++++++++++++++++----------
 4 files changed, 108 insertions(+), 80 deletions(-)

diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs
index 7873064b4..3518b85e4 100644
--- a/arrow-buffer/src/bigint.rs
+++ b/arrow-buffer/src/bigint.rs
@@ -86,6 +86,15 @@ impl i256 {
         }
     }
 
+    /// Create an integer value from its representation as a byte array in 
big-endian byte order.
+    #[inline]
+    pub fn from_be_bytes(b: [u8; 32]) -> Self { // builds an i256 from 32 bytes given most-significant byte first (big-endian)
+        Self {
+            high: i128::from_be_bytes(b[0..16].try_into().unwrap()), // first 16 bytes -> signed `high` half; unwrap cannot fail, the slice is exactly 16 bytes
+            low: u128::from_be_bytes(b[16..32].try_into().unwrap()), // last 16 bytes -> unsigned `low` half; same fixed-length argument
+        }
+    }
+
     pub fn from_i128(v: i128) -> Self {
         let mut bytes = if num::Signed::is_negative(&v) {
             [255_u8; 32]
@@ -130,6 +139,17 @@ impl i256 {
         t
     }
 
+    /// Return the memory representation of this integer as a byte array in 
big-endian byte order.
+    #[inline]
+    pub fn to_be_bytes(self) -> [u8; 32] {
+        let mut t = [0; 32]; // output buffer: bytes 0..16 take `high`, bytes 16..32 take `low`
+        let t_low: &mut [u8; 16] = (&mut t[0..16]).try_into().unwrap(); // NOTE(review): named `t_low`, but this is the leading half that receives `self.high`
+        *t_low = self.high.to_be_bytes(); // most-significant half is written first
+        let t_high: &mut [u8; 16] = (&mut t[16..32]).try_into().unwrap(); // NOTE(review): named `t_high`, but this is the trailing half that receives `self.low`
+        *t_high = self.low.to_be_bytes(); // least-significant half is written last
+        t
+    }
+
     /// Create an i256 from the provided [`BigInt`] returning a bool indicating
     /// if overflow occurred
     fn from_bigint_with_overflow(v: BigInt) -> (Self, bool) {
diff --git a/arrow/src/row/dictionary.rs b/arrow/src/row/dictionary.rs
index b06688224..1ec7c2a21 100644
--- a/arrow/src/row/dictionary.rs
+++ b/arrow/src/row/dictionary.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 use crate::compute::SortOptions;
-use crate::row::fixed::{FixedLengthEncoding, FromSlice, RawDecimal};
+use crate::row::fixed::{FixedLengthEncoding, FromSlice};
 use crate::row::interner::{Interned, OrderPreservingInterner};
 use crate::row::{null_sentinel, Rows};
 use arrow_array::builder::*;
@@ -173,12 +173,8 @@ pub unsafe fn decode_dictionary<K: ArrowDictionaryKeyType>(
         &value_type => (decode_primitive_helper, values),
         DataType::Null => NullArray::new(values.len()).into_data(),
         DataType::Boolean => decode_bool(&values),
-        DataType::Decimal128(p, s) => {
-            decode_decimal::<16, Decimal128Type>(&values, *p, *s)
-        }
-        DataType::Decimal256(p, s) => {
-            decode_decimal::<32, Decimal256Type>(&values, *p, *s)
-        }
+        DataType::Decimal128(p, s) => 
decode_decimal::<Decimal128Type>(&values, *p, *s),
+        DataType::Decimal256(p, s) => 
decode_decimal::<Decimal256Type>(&values, *p, *s),
         DataType::Utf8 => decode_string::<i32>(&values),
         DataType::LargeUtf8 => decode_string::<i64>(&values),
         DataType::Binary => decode_binary::<i32>(&values),
@@ -279,10 +275,9 @@ where
 }
 
 /// Decodes a `DecimalArray` from dictionary values
-fn decode_decimal<const N: usize, T: DecimalType>(
-    values: &[&[u8]],
-    precision: u8,
-    scale: u8,
-) -> ArrayData {
-    decode_fixed::<RawDecimal<N>>(values, T::TYPE_CONSTRUCTOR(precision, 
scale))
+fn decode_decimal<T: DecimalType>(values: &[&[u8]], precision: u8, scale: u8) 
-> ArrayData
+where
+    T::Native: FixedLengthEncoding, // the decimal's native type now supplies the encoding, replacing the former `const N: usize` byte-width parameter
+{
+    decode_fixed::<T::Native>(values, T::TYPE_CONSTRUCTOR(precision, scale)) // decodes as `T::Native` instead of `RawDecimal<N>`; second argument is presumably the decimal data type for the result - confirm against `decode_fixed`
+}
diff --git a/arrow/src/row/fixed.rs b/arrow/src/row/fixed.rs
index ec7afd8e3..d5935cfb6 100644
--- a/arrow/src/row/fixed.rs
+++ b/arrow/src/row/fixed.rs
@@ -19,9 +19,8 @@ use crate::array::PrimitiveArray;
 use crate::compute::SortOptions;
 use crate::datatypes::ArrowPrimitiveType;
 use crate::row::{null_sentinel, Rows};
-use arrow_array::types::DecimalType;
 use arrow_array::BooleanArray;
-use arrow_buffer::{bit_util, MutableBuffer, ToByteSlice};
+use arrow_buffer::{bit_util, i256, MutableBuffer, ToByteSlice};
 use arrow_data::{ArrayData, ArrayDataBuilder};
 use arrow_schema::DataType;
 use half::f16;
@@ -91,6 +90,7 @@ encode_signed!(2, i16);
 encode_signed!(4, i32);
 encode_signed!(8, i64);
 encode_signed!(16, i128);
+encode_signed!(32, i256);
 
 macro_rules! encode_unsigned {
     ($n:expr, $t:ty) => {
@@ -164,38 +164,6 @@ impl FixedLengthEncoding for f64 {
     }
 }
 
-pub type RawDecimal128 = RawDecimal<16>;
-pub type RawDecimal256 = RawDecimal<32>;
-
-/// The raw bytes of a decimal
-#[derive(Copy, Clone)]
-pub struct RawDecimal<const N: usize>(pub [u8; N]);
-
-impl<const N: usize> ToByteSlice for RawDecimal<N> {
-    fn to_byte_slice(&self) -> &[u8] {
-        &self.0
-    }
-}
-
-impl<const N: usize> FixedLengthEncoding for RawDecimal<N> {
-    type Encoded = [u8; N];
-
-    fn encode(self) -> [u8; N] {
-        let mut val = self.0;
-        // Convert to big endian representation
-        val.reverse();
-        // Toggle top "sign" bit to ensure consistent sort order
-        val[0] ^= 0x80;
-        val
-    }
-
-    fn decode(mut encoded: Self::Encoded) -> Self {
-        encoded[0] ^= 0x80;
-        encoded.reverse();
-        Self(encoded)
-    }
-}
-
 /// Returns the total encoded length (including null byte) for a value of type 
`T::Native`
 pub const fn encoded_len<T>(_col: &PrimitiveArray<T>) -> usize
 where
@@ -354,17 +322,6 @@ fn decode_fixed<T: FixedLengthEncoding + ToByteSlice>(
     unsafe { builder.build_unchecked() }
 }
 
-/// Decodes a `DecimalArray` from rows
-pub fn decode_decimal<const N: usize, T: DecimalType + ArrowPrimitiveType>(
-    rows: &mut [&[u8]],
-    options: SortOptions,
-    precision: u8,
-    scale: u8,
-) -> PrimitiveArray<T> {
-    decode_fixed::<RawDecimal<N>>(rows, T::TYPE_CONSTRUCTOR(precision, scale), 
options)
-        .into()
-}
-
 /// Decodes a `PrimitiveArray` from rows
 pub fn decode_primitive<T: ArrowPrimitiveType>(
     rows: &mut [&[u8]],
diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs
index 77c70a5fd..c3aa9ea4c 100644
--- a/arrow/src/row/mod.rs
+++ b/arrow/src/row/mod.rs
@@ -84,6 +84,7 @@ use std::sync::Arc;
 
 use arrow_array::cast::*;
 use arrow_array::*;
+use arrow_buffer::i256;
 
 use crate::compute::SortOptions;
 use crate::datatypes::*;
@@ -91,10 +92,7 @@ use crate::error::{ArrowError, Result};
 use crate::row::dictionary::{
     compute_dictionary_mapping, decode_dictionary, encode_dictionary,
 };
-use crate::row::fixed::{
-    decode_bool, decode_decimal, decode_primitive, RawDecimal, RawDecimal128,
-    RawDecimal256,
-};
+use crate::row::fixed::{decode_bool, decode_primitive};
 use crate::row::interner::OrderPreservingInterner;
 use crate::row::variable::{decode_binary, decode_string};
 use crate::{downcast_dictionary_array, downcast_primitive_array};
@@ -488,8 +486,8 @@ fn new_empty_rows(
             array => lengths.iter_mut().for_each(|x| *x += 
fixed::encoded_len(array)),
             DataType::Null => {},
             DataType::Boolean => lengths.iter_mut().for_each(|x| *x += 
bool::ENCODED_LEN),
-            DataType::Decimal128(_, _) => lengths.iter_mut().for_each(|x| *x 
+= RawDecimal128::ENCODED_LEN),
-            DataType::Decimal256(_, _) => lengths.iter_mut().for_each(|x| *x 
+= RawDecimal256::ENCODED_LEN),
+            DataType::Decimal128(_, _) => lengths.iter_mut().for_each(|x| *x 
+= i128::ENCODED_LEN),
+            DataType::Decimal256(_, _) => lengths.iter_mut().for_each(|x| *x 
+= i256::ENCODED_LEN),
             DataType::Binary => as_generic_binary_array::<i32>(array)
                 .iter()
                 .zip(lengths.iter_mut())
@@ -571,24 +569,20 @@ fn encode_column(
         DataType::Null => {}
         DataType::Boolean => fixed::encode(out, as_boolean_array(column), 
opts),
         DataType::Decimal128(_, _) => {
-            let iter = column
+            let column = column
                 .as_any()
                 .downcast_ref::<Decimal128Array>()
-                .unwrap()
-                .into_iter()
-                .map(|x| x.map(|x| RawDecimal(x.to_le_bytes())));
+                .unwrap();
 
-            fixed::encode(out, iter, opts)
+            fixed::encode(out, column, opts)
         },
         DataType::Decimal256(_, _) => {
-            let iter = column
+            let column = column
                 .as_any()
                 .downcast_ref::<Decimal256Array>()
-                .unwrap()
-                .into_iter()
-                .map(|x| x.map(|x| RawDecimal(x.to_le_bytes())));
+                .unwrap();
 
-            fixed::encode(out, iter, opts)
+            fixed::encode(out, column, opts)
         },
         DataType::Binary => {
             variable::encode(out, 
as_generic_binary_array::<i32>(column).iter(), opts)
@@ -641,12 +635,16 @@ unsafe fn decode_column(
         DataType::LargeBinary => Arc::new(decode_binary::<i64>(rows, options)),
         DataType::Utf8 => Arc::new(decode_string::<i32>(rows, options)),
         DataType::LargeUtf8 => Arc::new(decode_string::<i64>(rows, options)),
-        DataType::Decimal128(p, s) => {
-            Arc::new(decode_decimal::<16, Decimal128Type>(rows, options, *p, 
*s))
-        }
-        DataType::Decimal256(p, s) => {
-            Arc::new(decode_decimal::<32, Decimal256Type>(rows, options, *p, 
*s))
-        }
+        DataType::Decimal128(p, s) => Arc::new(
+            decode_primitive::<Decimal128Type>(rows, options)
+                .with_precision_and_scale(*p, *s)
+                .unwrap(),
+        ),
+        DataType::Decimal256(p, s) => Arc::new(
+            decode_primitive::<Decimal256Type>(rows, options)
+                .with_precision_and_scale(*p, *s)
+                .unwrap(),
+        ),
         DataType::Dictionary(k, v) => match k.as_ref() {
             DataType::Int8 => Arc::new(decode_dictionary::<Int8Type>(
                 interner.unwrap(),
@@ -795,6 +793,64 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_decimal128() { // checks row ordering and round-tripping for a single Decimal128 column
+        let mut converter = RowConverter::new(vec![SortField::new(
+            DataType::Decimal128(DECIMAL128_MAX_PRECISION, 7),
+        )]);
+        let col = Arc::new(
+            Decimal128Array::from_iter([
+                None, // listed first: the loop below requires the null row to compare below every value row
+                Some(i128::MIN),
+                Some(-13),
+                Some(46_i128),
+                Some(5456_i128),
+                Some(i128::MAX),
+            ])
+            .with_precision_and_scale(38, 7) // NOTE(review): literal 38 presumably equals DECIMAL128_MAX_PRECISION used for the converter above - confirm; test_decimal256 uses the constant in both places
+            .unwrap(),
+        ) as ArrayRef;
+
+        let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap();
+        for i in 0..rows.num_rows() - 1 { // inputs are listed in ascending order, so each encoded row must be strictly less than the next
+            assert!(rows.row(i) < rows.row(i + 1));
+        }
+
+        let back = converter.convert_rows(&rows).unwrap(); // decode the rows back into columns
+        assert_eq!(back.len(), 1); // one column in, one column out
+        assert_eq!(col.as_ref(), back[0].as_ref()) // the decoded column must equal the original input
+    }
+
+    #[test]
+    fn test_decimal256() { // checks row ordering and round-tripping for a single Decimal256 column
+        let mut converter = RowConverter::new(vec![SortField::new(
+            DataType::Decimal256(DECIMAL256_MAX_PRECISION, 7),
+        )]);
+        let col = Arc::new(
+            Decimal256Array::from_iter([
+                None, // listed first: the loop below requires the null row to compare below every value row
+                Some(i256::MIN),
+                Some(i256::from_parts(0, -1)), // argument order appears to be (low: u128, high: i128), judging by the literal types used here - confirm
+                Some(i256::from_parts(u128::MAX, -1)), // same second argument as the previous entry, larger first argument
+                Some(i256::from_parts(u128::MAX, 0)), // second argument moves from -1 to 0
+                Some(i256::from_parts(0, 46_i128)),
+                Some(i256::from_parts(5, 46_i128)), // same second argument as the previous entry, larger first argument
+                Some(i256::MAX),
+            ])
+            .with_precision_and_scale(DECIMAL256_MAX_PRECISION, 7) // same precision and scale as the converter's SortField above
+            .unwrap(),
+        ) as ArrayRef;
+
+        let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap();
+        for i in 0..rows.num_rows() - 1 { // inputs are listed in ascending order, so each encoded row must be strictly less than the next
+            assert!(rows.row(i) < rows.row(i + 1));
+        }
+
+        let back = converter.convert_rows(&rows).unwrap(); // decode the rows back into columns
+        assert_eq!(back.len(), 1); // one column in, one column out
+        assert_eq!(col.as_ref(), back[0].as_ref()) // the decoded column must equal the original input
+    }
+
     #[test]
     fn test_bool() {
         let mut converter = 
RowConverter::new(vec![SortField::new(DataType::Boolean)]);

[arrow-rs] branch master updated: Treat DecimalArray as PrimitiveArray in row format (#2866)

Reply via email to