This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 2bcc0cfdb initial commit (#1564)
2bcc0cfdb is described below
commit 2bcc0cfdbf128b94505b7310d680157f6b9f20cb
Author: Chao Sun <[email protected]>
AuthorDate: Fri Apr 15 05:52:03 2022 -0700
initial commit (#1564)
---
.../tests/test_sql.py | 38 +++++-
arrow/src/array/ffi.rs | 116 +++++++++++++++++-
arrow/src/datatypes/ffi.rs | 28 ++++-
arrow/src/ffi.rs | 129 ++++++++++++++++++++-
4 files changed, 302 insertions(+), 9 deletions(-)
diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py
index 058a32ea8..9d5b93679 100644
--- a/arrow-pyarrow-integration-testing/tests/test_sql.py
+++ b/arrow-pyarrow-integration-testing/tests/test_sql.py
@@ -61,9 +61,11 @@ _supported_pyarrow_types = [
pa.decimal128(19, 4),
pa.string(),
pa.binary(),
+ pa.binary(10),
pa.large_string(),
pa.large_binary(),
pa.list_(pa.int32()),
+ pa.list_(pa.int32(), 2),
pa.large_list(pa.uint16()),
pa.struct(
[
@@ -85,8 +87,6 @@ _supported_pyarrow_types = [
_unsupported_pyarrow_types = [
pa.decimal256(76, 38),
pa.duration("s"),
- pa.binary(10),
- pa.list_(pa.int32(), 2),
pa.map_(pa.string(), pa.int32()),
pa.union(
[pa.field("a", pa.binary(10)), pa.field("b", pa.string())],
@@ -190,6 +190,29 @@ def test_time32_python():
del b
del expected
+def test_binary_array():
+ """
+ Python -> Rust -> Python
+ """
+ a = pa.array(["a", None, "bb", "ccc"], pa.binary())
+ b = rust.round_trip_array(a)
+ b.validate(full=True)
+ assert a.to_pylist() == b.to_pylist()
+ assert a.type == b.type
+ del a
+ del b
+
+def test_fixed_len_binary_array():
+ """
+ Python -> Rust -> Python
+ """
+ a = pa.array(["aaa", None, "bbb", "ccc"], pa.binary(3))
+ b = rust.round_trip_array(a)
+ b.validate(full=True)
+ assert a.to_pylist() == b.to_pylist()
+ assert a.type == b.type
+ del a
+ del b
def test_list_array():
"""
@@ -203,6 +226,17 @@ def test_list_array():
del a
del b
+def test_fixed_len_list_array():
+ """
+ Python -> Rust -> Python
+ """
+ a = pa.array([[1, 2], None, [3, 4], [5, 6]], pa.list_(pa.int64(), 2))
+ b = rust.round_trip_array(a)
+ b.validate(full=True)
+ assert a.to_pylist() == b.to_pylist()
+ assert a.type == b.type
+ del a
+ del b
def test_timestamp_python():
"""
diff --git a/arrow/src/array/ffi.rs b/arrow/src/array/ffi.rs
index 976c6b8ce..51da67c59 100644
--- a/arrow/src/array/ffi.rs
+++ b/arrow/src/array/ffi.rs
@@ -45,12 +45,14 @@ impl TryFrom<ArrayData> for ffi::ArrowArray {
#[cfg(test)]
mod tests {
- use crate::array::{DictionaryArray, Int32Array, StringArray};
+ use crate::array::{DictionaryArray, FixedSizeListArray, Int32Array, StringArray};
+ use crate::buffer::Buffer;
use crate::error::Result;
+ use crate::util::bit_util;
use crate::{
array::{
- Array, ArrayData, BooleanArray, Int64Array, StructArray, UInt32Array,
- UInt64Array,
+ Array, ArrayData, BooleanArray, FixedSizeBinaryArray, Int64Array,
+ StructArray, UInt32Array, UInt64Array,
},
datatypes::{DataType, Field},
ffi::ArrowArray,
@@ -149,4 +151,112 @@ mod tests {
let data = array.data();
test_round_trip(data)
}
+
+ #[test]
+ fn test_fixed_size_binary() -> Result<()> {
+ let values = vec![vec![10, 10, 10], vec![20, 20, 20], vec![30, 30, 30]];
+ let array = FixedSizeBinaryArray::try_from_iter(values.into_iter())?;
+
+ let data = array.data();
+ test_round_trip(data)
+ }
+
+ #[test]
+ fn test_fixed_size_binary_with_nulls() -> Result<()> {
+ let values = vec![
+ None,
+ Some(vec![10, 10, 10]),
+ None,
+ Some(vec![20, 20, 20]),
+ Some(vec![30, 30, 30]),
+ None,
+ ];
+ let array = FixedSizeBinaryArray::try_from_sparse_iter(values.into_iter())?;
+
+ let data = array.data();
+ test_round_trip(data)
+ }
+
+ #[test]
+ fn test_fixed_size_list() -> Result<()> {
+ let v: Vec<i64> = (0..9).into_iter().collect();
+ let value_data = ArrayData::builder(DataType::Int64)
+ .len(9)
+ .add_buffer(Buffer::from_slice_ref(&v))
+ .build()?;
+ let list_data_type =
+ DataType::FixedSizeList(Box::new(Field::new("f", DataType::Int64, false)), 3);
+ let list_data = ArrayData::builder(list_data_type)
+ .len(3)
+ .add_child_data(value_data)
+ .build()?;
+ let array = FixedSizeListArray::from(list_data);
+
+ let data = array.data();
+ test_round_trip(data)
+ }
+
+ #[test]
+ fn test_fixed_size_list_with_nulls() -> Result<()> {
+ // 0100 0110
+ let mut validity_bits: [u8; 1] = [0; 1];
+ bit_util::set_bit(&mut validity_bits, 1);
+ bit_util::set_bit(&mut validity_bits, 2);
+ bit_util::set_bit(&mut validity_bits, 6);
+
+ let v: Vec<i16> = (0..16).into_iter().collect();
+ let value_data = ArrayData::builder(DataType::Int16)
+ .len(16)
+ .add_buffer(Buffer::from_slice_ref(&v))
+ .build()?;
+ let list_data_type =
+ DataType::FixedSizeList(Box::new(Field::new("f", DataType::Int16, false)), 2);
+ let list_data = ArrayData::builder(list_data_type)
+ .len(8)
+ .null_bit_buffer(Buffer::from(validity_bits))
+ .add_child_data(value_data)
+ .build()?;
+ let array = FixedSizeListArray::from(list_data);
+
+ let data = array.data();
+ test_round_trip(data)
+ }
+
+ #[test]
+ fn test_fixed_size_list_nested() -> Result<()> {
+ let v: Vec<i32> = (0..16).into_iter().collect();
+ let value_data = ArrayData::builder(DataType::Int32)
+ .len(16)
+ .add_buffer(Buffer::from_slice_ref(&v))
+ .build()?;
+
+ let offsets: Vec<i32> = vec![0, 2, 4, 6, 8, 10, 12, 14, 16];
+ let value_offsets = Buffer::from_slice_ref(&offsets);
+ let inner_list_data_type =
+ DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
+ let inner_list_data = ArrayData::builder(inner_list_data_type.clone())
+ .len(8)
+ .add_buffer(value_offsets)
+ .add_child_data(value_data)
+ .build()?;
+
+ // 0000 0100
+ let mut validity_bits: [u8; 1] = [0; 1];
+ bit_util::set_bit(&mut validity_bits, 2);
+
+ let list_data_type = DataType::FixedSizeList(
+ Box::new(Field::new("f", inner_list_data_type, false)),
+ 2,
+ );
+ let list_data = ArrayData::builder(list_data_type)
+ .len(4)
+ .null_bit_buffer(Buffer::from(validity_bits))
+ .add_child_data(inner_list_data)
+ .build()?;
+
+ let array = FixedSizeListArray::from(list_data);
+
+ let data = array.data();
+ test_round_trip(data)
+ }
}
diff --git a/arrow/src/datatypes/ffi.rs b/arrow/src/datatypes/ffi.rs
index 10645fb68..bc274e2dc 100644
--- a/arrow/src/datatypes/ffi.rs
+++ b/arrow/src/datatypes/ffi.rs
@@ -67,6 +67,23 @@ impl TryFrom<&FFI_ArrowSchema> for DataType {
// Parametrized types, requiring string parse
other => {
match other.splitn(2, ':').collect::<Vec<&str>>().as_slice() {
+ // FixedSizeBinary type in format "w:num_bytes"
+ ["w", num_bytes] => {
+ let parsed_num_bytes = num_bytes.parse::<i32>().map_err(|_| {
+ ArrowError::CDataInterface(
+ "FixedSizeBinary requires an integer parameter representing number of bytes per element".to_string())
+ })?;
+ DataType::FixedSizeBinary(parsed_num_bytes)
+ },
+ // FixedSizeList type in format "+w:num_elems"
+ ["+w", num_elems] => {
+ let c_child = c_schema.child(0);
+ let parsed_num_elems = num_elems.parse::<i32>().map_err(|_| {
+ ArrowError::CDataInterface(
+ "The FixedSizeList type requires an integer parameter representing number of elements per list".to_string())
+ })?;
+ DataType::FixedSizeList(Box::new(Field::try_from(c_child)?), parsed_num_elems)
+ },
// Decimal types in format "d:precision,scale" or "d:precision,scale,bitWidth"
["d", extra] => {
match extra.splitn(3, ',').collect::<Vec<&str>>().as_slice() {
@@ -178,7 +195,9 @@ impl TryFrom<&DataType> for FFI_ArrowSchema {
let format = get_format_string(dtype)?;
// allocate and hold the children
let children = match dtype {
- DataType::List(child) | DataType::LargeList(child) => {
+ DataType::List(child)
+ | DataType::LargeList(child)
+ | DataType::FixedSizeList(child, _) => {
vec![FFI_ArrowSchema::try_from(child.as_ref())?]
}
DataType::Struct(fields) => fields
@@ -215,6 +234,8 @@ fn get_format_string(dtype: &DataType) -> Result<String> {
DataType::LargeBinary => Ok("Z".to_string()),
DataType::Utf8 => Ok("u".to_string()),
DataType::LargeUtf8 => Ok("U".to_string()),
+ DataType::FixedSizeBinary(num_bytes) => Ok(format!("w:{}", num_bytes)),
+ DataType::FixedSizeList(_, num_elems) => Ok(format!("+w:{}", num_elems)),
DataType::Decimal(precision, scale) => Ok(format!("d:{},{}", precision, scale)),
DataType::Date32 => Ok("tdD".to_string()),
DataType::Date64 => Ok("tdm".to_string()),
@@ -325,6 +346,11 @@ mod tests {
round_trip_type(DataType::Float64)?;
round_trip_type(DataType::Date64)?;
round_trip_type(DataType::Time64(TimeUnit::Nanosecond))?;
+ round_trip_type(DataType::FixedSizeBinary(12))?;
+ round_trip_type(DataType::FixedSizeList(
+ Box::new(Field::new("a", DataType::Int64, false)),
+ 5,
+ ))?;
round_trip_type(DataType::Utf8)?;
round_trip_type(DataType::List(Box::new(Field::new(
"a",
diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs
index e50756801..ccd774ac4 100644
--- a/arrow/src/ffi.rs
+++ b/arrow/src/ffi.rs
@@ -333,6 +333,17 @@ fn bit_width(data_type: &DataType, i: usize) -> Result<usize> {
data_type, i
)))
}
+ (DataType::FixedSizeBinary(num_bytes), 1) => size_of::<u8>() * (*num_bytes as usize) * 8,
+ (DataType::FixedSizeList(f, num_elems), 1) => {
+ let child_bit_width = bit_width(f.data_type(), 1)?;
+ child_bit_width * (*num_elems as usize)
+ },
+ (DataType::FixedSizeBinary(_), _) | (DataType::FixedSizeList(_, _), _) => {
+ return Err(ArrowError::CDataInterface(format!(
+ "The datatype \"{:?}\" expects 2 buffers, but requested {}. Please verify that the C data interface is correctly implemented.",
+ data_type, i
+ )))
+ },
// Variable-sized binaries: have two buffers.
// "small": first buffer is i32, second is in bytes
(DataType::Utf8, 1) | (DataType::Binary, 1) | (DataType::List(_), 1) => size_of::<i32>() * 8,
@@ -862,9 +873,10 @@ mod tests {
use super::*;
use crate::array::{
export_array_into_raw, make_array, Array, ArrayData, BinaryOffsetSizeTrait,
- BooleanArray, DecimalArray, DictionaryArray, GenericBinaryArray,
- GenericListArray, GenericStringArray, Int32Array, OffsetSizeTrait,
- StringOffsetSizeTrait, Time32MillisecondArray, TimestampMillisecondArray,
+ BooleanArray, DecimalArray, DictionaryArray, FixedSizeBinaryArray,
+ FixedSizeListArray, GenericBinaryArray, GenericListArray, GenericStringArray,
+ Int32Array, OffsetSizeTrait, StringOffsetSizeTrait, Time32MillisecondArray,
+ TimestampMillisecondArray,
};
use crate::compute::kernels;
use crate::datatypes::{Field, Int8Type};
@@ -1175,6 +1187,117 @@ mod tests {
Ok(())
}
+ #[test]
+ fn test_fixed_size_binary_array() -> Result<()> {
+ let values = vec![
+ None,
+ Some(vec![10, 10, 10]),
+ None,
+ Some(vec![20, 20, 20]),
+ Some(vec![30, 30, 30]),
+ None,
+ ];
+ let array = FixedSizeBinaryArray::try_from_sparse_iter(values.into_iter())?;
+
+ // export it
+ let array = ArrowArray::try_from(array.data().clone())?;
+
+ // (simulate consumer) import it
+ let data = ArrayData::try_from(array)?;
+ let array = make_array(data);
+
+ // perform some operation
+ let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).unwrap();
+ let array = array
+ .as_any()
+ .downcast_ref::<FixedSizeBinaryArray>()
+ .unwrap();
+
+ // verify
+ assert_eq!(
+ array,
+ &FixedSizeBinaryArray::try_from_sparse_iter(
+ vec![
+ None,
+ Some(vec![10, 10, 10]),
+ None,
+ Some(vec![20, 20, 20]),
+ Some(vec![30, 30, 30]),
+ None,
+ None,
+ Some(vec![10, 10, 10]),
+ None,
+ Some(vec![20, 20, 20]),
+ Some(vec![30, 30, 30]),
+ None,
+ ]
+ .into_iter()
+ )?
+ );
+
+ // (drop/release)
+ Ok(())
+ }
+
+ #[test]
+ fn test_fixed_size_list_array() -> Result<()> {
+ // 0000 0100
+ let mut validity_bits: [u8; 1] = [0; 1];
+ bit_util::set_bit(&mut validity_bits, 2);
+
+ let v: Vec<i32> = (0..9).into_iter().collect();
+ let value_data = ArrayData::builder(DataType::Int32)
+ .len(9)
+ .add_buffer(Buffer::from_slice_ref(&v))
+ .build()?;
+
+ let list_data_type =
+ DataType::FixedSizeList(Box::new(Field::new("f", DataType::Int32, false)), 3);
+ let list_data = ArrayData::builder(list_data_type.clone())
+ .len(3)
+ .null_bit_buffer(Buffer::from(validity_bits))
+ .add_child_data(value_data)
+ .build()?;
+
+ // export it
+ let array = ArrowArray::try_from(list_data)?;
+
+ // (simulate consumer) import it
+ let data = ArrayData::try_from(array)?;
+ let array = make_array(data);
+
+ // perform some operation
+ let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).unwrap();
+ let array = array.as_any().downcast_ref::<FixedSizeListArray>().unwrap();
+
+ // 0010 0100
+ let mut expected_validity_bits: [u8; 1] = [0; 1];
+ bit_util::set_bit(&mut expected_validity_bits, 2);
+ bit_util::set_bit(&mut expected_validity_bits, 5);
+
+ let mut w = vec![];
+ w.extend_from_slice(&v);
+ w.extend_from_slice(&v);
+
+ let expected_value_data = ArrayData::builder(DataType::Int32)
+ .len(18)
+ .add_buffer(Buffer::from_slice_ref(&w))
+ .build()?;
+
+ let expected_list_data = ArrayData::builder(list_data_type)
+ .len(6)
+ .null_bit_buffer(Buffer::from(expected_validity_bits))
+ .add_child_data(expected_value_data)
+ .build()?;
+ let expected_array = FixedSizeListArray::from(expected_list_data);
+
+ // verify
+ assert_eq!(array, &expected_array);
+
+ // (drop/release)
+ Ok(())
+ }
+
#[test]
fn test_dictionary() -> Result<()> {
// create an array natively