This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 4adbeee14c Add append_many to dictionary arrays to allow adding
repeated values (#6534)
4adbeee14c is described below
commit 4adbeee14c40c080f8fca3bd42c0c3856bbb151c
Author: Adrian Garcia Badaracco <[email protected]>
AuthorDate: Thu Oct 10 15:55:00 2024 -0500
Add append_many to dictionary arrays to allow adding repeated values (#6534)
* Add append_many to dictionary arrays to allow adding repeated values
* fix merge
* rename func
---
arrow-array/src/array/dictionary_array.rs | 68 ++++++++++++++++++++++
.../builder/generic_bytes_dictionary_builder.rs | 58 +++++++++++++++---
arrow-array/src/builder/primitive_builder.rs | 7 +++
.../src/builder/primitive_dictionary_builder.rs | 62 ++++++++++++++++----
4 files changed, 175 insertions(+), 20 deletions(-)
diff --git a/arrow-array/src/array/dictionary_array.rs
b/arrow-array/src/array/dictionary_array.rs
index d6c5dd4c3e..bdb6f0d4b5 100644
--- a/arrow-array/src/array/dictionary_array.rs
+++ b/arrow-array/src/array/dictionary_array.rs
@@ -1072,6 +1072,74 @@ mod tests {
assert_eq!(dict_array.keys(), &Int16Array::from(vec![3_i16, 4]));
}
+ #[test]
+ fn test_dictionary_builder_append_many() {
+ let mut builder = PrimitiveDictionaryBuilder::<UInt8Type,
UInt32Type>::new();
+
+ builder.append(1).unwrap();
+ builder.append_n(2, 2).unwrap();
+ builder.append_options(None, 2);
+ builder.append_options(Some(3), 3);
+
+ let array = builder.finish();
+
+ let values = array
+ .values()
+ .as_primitive::<UInt32Type>()
+ .iter()
+ .map(Option::unwrap)
+ .collect::<Vec<_>>();
+ assert_eq!(values, &[1, 2, 3]);
+ let keys = array.keys().iter().collect::<Vec<_>>();
+ assert_eq!(
+ keys,
+ &[
+ Some(0),
+ Some(1),
+ Some(1),
+ None,
+ None,
+ Some(2),
+ Some(2),
+ Some(2)
+ ]
+ );
+ }
+
+ #[test]
+ fn test_string_dictionary_builder_append_many() {
+ let mut builder = StringDictionaryBuilder::<Int8Type>::new();
+
+ builder.append("a").unwrap();
+ builder.append_n("b", 2).unwrap();
+ builder.append_options(None::<&str>, 2);
+ builder.append_options(Some("c"), 3);
+
+ let array = builder.finish();
+
+ let values = array
+ .values()
+ .as_string::<i32>()
+ .iter()
+ .map(Option::unwrap)
+ .collect::<Vec<_>>();
+ assert_eq!(values, &["a", "b", "c"]);
+ let keys = array.keys().iter().collect::<Vec<_>>();
+ assert_eq!(
+ keys,
+ &[
+ Some(0),
+ Some(1),
+ Some(1),
+ None,
+ None,
+ Some(2),
+ Some(2),
+ Some(2)
+ ]
+ );
+ }
+
#[test]
fn test_dictionary_array_fmt_debug() {
let mut builder = PrimitiveDictionaryBuilder::<UInt8Type,
UInt32Type>::with_capacity(3, 2);
diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
index 128a4f8206..a327c622a7 100644
--- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
@@ -195,12 +195,7 @@ where
K: ArrowDictionaryKeyType,
T: ByteArrayType,
{
- /// Append a value to the array. Return an existing index
- /// if already present in the values array or a new index if the
- /// value is appended to the values array.
- ///
- /// Returns an error if the new index would overflow the key type.
- pub fn append(&mut self, value: impl AsRef<T::Native>) ->
Result<K::Native, ArrowError> {
+ fn get_or_insert_key(&mut self, value: impl AsRef<T::Native>) ->
Result<K::Native, ArrowError> {
let value_native: &T::Native = value.as_ref();
let value_bytes: &[u8] = value_native.as_ref();
@@ -223,8 +218,32 @@ where
.get();
let key =
K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?;
+
+ Ok(key)
+ }
+
+ /// Append a value to the array. Return an existing index
+ /// if already present in the values array or a new index if the
+ /// value is appended to the values array.
+ ///
+ /// Returns an error if the new index would overflow the key type.
+ pub fn append(&mut self, value: impl AsRef<T::Native>) ->
Result<K::Native, ArrowError> {
+ let key = self.get_or_insert_key(value)?;
self.keys_builder.append_value(key);
+ Ok(key)
+ }
+ /// Append a value multiple times to the array.
+ /// This is the same as `append` but allows appending the same value multiple times without doing multiple lookups.
+ ///
+ /// Returns an error if the new index would overflow the key type.
+ pub fn append_n(
+ &mut self,
+ value: impl AsRef<T::Native>,
+ count: usize,
+ ) -> Result<K::Native, ArrowError> {
+ let key = self.get_or_insert_key(value)?;
+ self.keys_builder.append_value_n(key, count);
Ok(key)
}
@@ -237,6 +256,17 @@ where
self.append(value).expect("dictionary key overflow");
}
+ /// Infallibly append a value to this builder repeatedly `count` times.
+ /// This is the same as `append_value` but allows appending the same value multiple times without doing multiple lookups.
+ ///
+ /// # Panics
+ ///
+ /// Panics if the resulting length of the dictionary values array would exceed `K::Native::MAX`
+ pub fn append_values(&mut self, value: impl AsRef<T::Native>, count:
usize) {
+ self.append_n(value, count)
+ .expect("dictionary key overflow");
+ }
+
/// Appends a null slot into the builder
#[inline]
pub fn append_null(&mut self) {
@@ -256,6 +286,19 @@ where
};
}
+ /// Append an `Option` value into the builder repeatedly `count` times.
+ /// This is the same as `append_option` but allows appending the same value multiple times without doing multiple lookups.
+ ///
+ /// # Panics
+ ///
+ /// Panics if the resulting length of the dictionary values array would exceed `K::Native::MAX`
+ pub fn append_options(&mut self, value: Option<impl AsRef<T::Native>>,
count: usize) {
+ match value {
+ None => self.keys_builder.append_nulls(count),
+ Some(v) => self.append_values(v, count),
+ };
+ }
+
/// Builds the `DictionaryArray` and reset this builder.
pub fn finish(&mut self) -> DictionaryArray<K> {
self.dedup.clear();
@@ -331,8 +374,7 @@ fn get_bytes<T: ByteArrayType>(values:
&GenericByteBuilder<T>, idx: usize) -> &[
/// // The builder builds the dictionary value by value
/// builder.append("abc").unwrap();
/// builder.append_null();
-/// builder.append("def").unwrap();
-/// builder.append("def").unwrap();
+/// builder.append_n("def", 2).unwrap(); // appends "def" twice with a single
lookup
/// builder.append("abc").unwrap();
/// let array = builder.finish();
///
diff --git a/arrow-array/src/builder/primitive_builder.rs
b/arrow-array/src/builder/primitive_builder.rs
index 39b27bfca8..3191fea6e4 100644
--- a/arrow-array/src/builder/primitive_builder.rs
+++ b/arrow-array/src/builder/primitive_builder.rs
@@ -202,6 +202,13 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
self.values_builder.append(v);
}
+ /// Appends a value of type `T` into the builder `n` times
+ #[inline]
+ pub fn append_value_n(&mut self, v: T::Native, n: usize) {
+ self.null_buffer_builder.append_n_non_nulls(n);
+ self.values_builder.append_n(n, v);
+ }
+
/// Appends a null slot into the builder
#[inline]
pub fn append_null(&mut self) {
diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs
b/arrow-array/src/builder/primitive_dictionary_builder.rs
index a764fa4c29..35abe5ba5f 100644
--- a/arrow-array/src/builder/primitive_dictionary_builder.rs
+++ b/arrow-array/src/builder/primitive_dictionary_builder.rs
@@ -21,7 +21,6 @@ use crate::{Array, ArrayRef, ArrowPrimitiveType,
DictionaryArray};
use arrow_buffer::{ArrowNativeType, ToByteSlice};
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
-use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Arc;
@@ -210,26 +209,41 @@ where
K: ArrowDictionaryKeyType,
V: ArrowPrimitiveType,
{
- /// Append a primitive value to the array. Return an existing index
- /// if already present in the values array or a new index if the
- /// value is appended to the values array.
#[inline]
- pub fn append(&mut self, value: V::Native) -> Result<K::Native,
ArrowError> {
- let key = match self.map.entry(Value(value)) {
- Entry::Vacant(vacant) => {
- // Append new value.
+ fn get_or_insert_key(&mut self, value: V::Native) -> Result<K::Native,
ArrowError> {
+ match self.map.get(&Value(value)) {
+ Some(&key) => {
+
Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?)
+ }
+ None => {
let key = self.values_builder.len();
self.values_builder.append_value(value);
- vacant.insert(key);
-
K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?
+ self.map.insert(Value(value), key);
+
Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?)
}
- Entry::Occupied(o) => K::Native::usize_as(*o.get()),
- };
+ }
+ }
+ /// Append a primitive value to the array. Return an existing index
+ /// if already present in the values array or a new index if the
+ /// value is appended to the values array.
+ #[inline]
+ pub fn append(&mut self, value: V::Native) -> Result<K::Native,
ArrowError> {
+ let key = self.get_or_insert_key(value)?;
self.keys_builder.append_value(key);
Ok(key)
}
+ /// Append a value multiple times to the array.
+ /// This is the same as `append` but allows appending the same value multiple times without doing multiple lookups.
+ ///
+ /// Returns an error if the new index would overflow the key type.
+ pub fn append_n(&mut self, value: V::Native, count: usize) ->
Result<K::Native, ArrowError> {
+ let key = self.get_or_insert_key(value)?;
+ self.keys_builder.append_value_n(key, count);
+ Ok(key)
+ }
+
/// Infallibly append a value to this builder
///
/// # Panics
@@ -240,6 +254,17 @@ where
self.append(value).expect("dictionary key overflow");
}
+ /// Infallibly append a value to this builder repeatedly `count` times.
+ /// This is the same as `append_value` but allows appending the same value multiple times without doing multiple lookups.
+ ///
+ /// # Panics
+ ///
+ /// Panics if the resulting length of the dictionary values array would exceed `K::Native::MAX`
+ pub fn append_values(&mut self, value: V::Native, count: usize) {
+ self.append_n(value, count)
+ .expect("dictionary key overflow");
+ }
+
/// Appends a null slot into the builder
#[inline]
pub fn append_null(&mut self) {
@@ -259,6 +284,19 @@ where
};
}
+ /// Append an `Option` value into the builder repeatedly `count` times.
+ /// This is the same as `append_option` but allows appending the same value multiple times without doing multiple lookups.
+ ///
+ /// # Panics
+ ///
+ /// Panics if the resulting length of the dictionary values array would exceed `K::Native::MAX`
+ pub fn append_options(&mut self, value: Option<V::Native>, count: usize) {
+ match value {
+ None => self.keys_builder.append_nulls(count),
+ Some(v) => self.append_values(v, count),
+ };
+ }
+
/// Builds the `DictionaryArray` and reset this builder.
pub fn finish(&mut self) -> DictionaryArray<K> {
self.map.clear();