This is an automated email from the ASF dual-hosted git repository.
paddyhoran pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new fe39a67 ARROW-8591: [Rust] Reverse lookup for a key in DictionaryArray
fe39a67 is described below
commit fe39a672660e172dba89de64f9b3eff9e545d6fc
Author: Mahmut Bulut <[email protected]>
AuthorDate: Tue Apr 28 15:52:43 2020 -0400
ARROW-8591: [Rust] Reverse lookup for a key in DictionaryArray
This PR enables reverse lookup (from a value to its key) on an already-built DictionaryArray.
Closes #7036 from vertexclique/dict-vectorized-lookup
Authored-by: Mahmut Bulut <[email protected]>
Signed-off-by: Paddy Horan <[email protected]>
---
rust/arrow/src/array/array.rs | 91 +++++++++++++++++++++++++++----------------
1 file changed, 58 insertions(+), 33 deletions(-)
diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs
index 7b0b398..71f4783 100644
--- a/rust/arrow/src/array/array.rs
+++ b/rust/arrow/src/array/array.rs
@@ -1786,34 +1786,33 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer, usize)> for
StructArray {
/// This is mostly used to represent strings or a limited set of primitive
types as integers,
/// for example when doing NLP analysis or representing chromosomes by name.
///
-/// Example with nullable data:
+/// Example **with nullable** data:
///
/// ```
-/// use arrow::array::DictionaryArray;
-/// use arrow::datatypes::Int8Type;
-/// let test = vec!["a", "a", "b", "c"];
-/// let array : DictionaryArray<Int8Type> = test.iter().map(|&x| if x ==
"b" {None} else {Some(x)}).collect();
-/// assert_eq!(array.keys().collect::<Vec<Option<i8>>>(), vec![Some(0),
Some(0), None, Some(1)]);
+/// use arrow::array::DictionaryArray;
+/// use arrow::datatypes::Int8Type;
+/// let test = vec!["a", "a", "b", "c"];
+/// let array : DictionaryArray<Int8Type> = test.iter().map(|&x| if x == "b"
{None} else {Some(x)}).collect();
+/// assert_eq!(array.keys().collect::<Vec<Option<i8>>>(), vec![Some(0),
Some(0), None, Some(1)]);
/// ```
///
-/// Example without nullable data:
+/// Example **without nullable** data:
///
/// ```
-///
-/// use arrow::array::DictionaryArray;
-/// use arrow::datatypes::Int8Type;
-/// let test = vec!["a", "a", "b", "c"];
-/// let array : DictionaryArray<Int8Type> = test.into_iter().collect();
-/// assert_eq!(array.keys().collect::<Vec<Option<i8>>>(), vec![Some(0),
Some(0), Some(1), Some(2)]);
+/// use arrow::array::DictionaryArray;
+/// use arrow::datatypes::Int8Type;
+/// let test = vec!["a", "a", "b", "c"];
+/// let array : DictionaryArray<Int8Type> = test.into_iter().collect();
+/// assert_eq!(array.keys().collect::<Vec<Option<i8>>>(), vec![Some(0),
Some(0), Some(1), Some(2)]);
/// ```
pub struct DictionaryArray<K: ArrowPrimitiveType> {
- // Array of keys, much like a PrimitiveArray
+ /// Array of keys, much like a PrimitiveArray
data: ArrayDataRef,
- // Pointer to the key values.
+ /// Pointer to the key values.
raw_values: RawPtrBox<K::Native>,
- // Array of any values.
+ /// Array of any values.
values: ArrayRef,
/// Values are ordered.
@@ -1867,8 +1866,8 @@ where
impl<'a, K: ArrowPrimitiveType> DictionaryArray<K> {
/// Return an iterator to the keys of this dictionary.
- pub fn keys(&'a self) -> NullableIter<'a, K::Native> {
- NullableIter::<'a, K::Native> {
+ pub fn keys(&self) -> NullableIter<'_, K::Native> {
+ NullableIter::<'_, K::Native> {
data: &self.data,
ptr: unsafe { self.raw_values.get().offset(self.data.offset() as
isize) },
i: 0,
@@ -1876,6 +1875,17 @@ impl<'a, K: ArrowPrimitiveType> DictionaryArray<K> {
}
}
+ /// Returns the lookup key by doing reverse dictionary lookup
+ pub fn lookup_key(&self, value: &'static str) -> Option<K::Native> {
+ let rd_buf: &StringArray =
+ self.values.as_any().downcast_ref::<StringArray>().unwrap();
+
+ (0..rd_buf.len())
+ .position(|i| rd_buf.value(i) == value)
+ .map(K::Native::from_usize)
+ .flatten()
+ }
+
/// Returns an `ArrayRef` to the dictionary values.
pub fn values(&self) -> ArrayRef {
self.values.clone()
@@ -1891,6 +1901,7 @@ impl<'a, K: ArrowPrimitiveType> DictionaryArray<K> {
self.data.len()
}
+ // Currently exists for compatibility purposes with Arrow IPC.
pub fn is_ordered(&self) -> bool {
self.is_ordered
}
@@ -1913,15 +1924,14 @@ impl<T: ArrowPrimitiveType> From<ArrayDataRef> for
DictionaryArray<T> {
let raw_values = data.buffers()[0].raw_data();
let dtype: &DataType = data.data_type();
let values = make_array(data.child_data()[0].clone());
- if let DataType::Dictionary(_, _) = dtype {
- Self {
- data: data,
+ match dtype {
+ DataType::Dictionary(_, _) => Self {
+ data,
raw_values: RawPtrBox::new(raw_values as *const T::Native),
- values: values,
+ values,
is_ordered: false,
- }
- } else {
- panic!("DictionaryArray must have Dictionary data type.")
+ },
+ _ => panic!("DictionaryArray must have Dictionary data type."),
}
}
}
@@ -1931,12 +1941,12 @@ impl<T: ArrowPrimitiveType + ArrowDictionaryKeyType>
FromIterator<Option<&'stati
for DictionaryArray<T>
{
fn from_iter<I: IntoIterator<Item = Option<&'static str>>>(iter: I) ->
Self {
- let iter = iter.into_iter();
- let (lower, _) = iter.size_hint();
+ let it = iter.into_iter();
+ let (lower, _) = it.size_hint();
let key_builder = PrimitiveBuilder::<T>::new(lower);
let value_builder = StringBuilder::new(256);
let mut builder = StringDictionaryBuilder::new(key_builder,
value_builder);
- for i in iter {
+ it.for_each(|i| {
if let Some(i) = i {
// Note: impl ... for Result<DictionaryArray<T>> fails with
// error[E0117]: only traits defined in the current crate can
be implemented for arbitrary types
@@ -1948,7 +1958,7 @@ impl<T: ArrowPrimitiveType + ArrowDictionaryKeyType>
FromIterator<Option<&'stati
.append_null()
.expect("Unable to append a null value to a dictionary
array.");
}
- }
+ });
builder.finish()
}
@@ -1959,16 +1969,16 @@ impl<T: ArrowPrimitiveType + ArrowDictionaryKeyType>
FromIterator<&'static str>
for DictionaryArray<T>
{
fn from_iter<I: IntoIterator<Item = &'static str>>(iter: I) -> Self {
- let iter = iter.into_iter();
- let (lower, _) = iter.size_hint();
+ let it = iter.into_iter();
+ let (lower, _) = it.size_hint();
let key_builder = PrimitiveBuilder::<T>::new(lower);
let value_builder = StringBuilder::new(256);
let mut builder = StringDictionaryBuilder::new(key_builder,
value_builder);
- for i in iter {
+ it.for_each(|i| {
builder
.append(i)
.expect("Unable to append a value to a dictionary array.");
- }
+ });
builder.finish()
}
@@ -3415,4 +3425,19 @@ mod tests {
format!("{:?}", array)
);
}
+
+ #[test]
+ fn test_dictionary_array_reverse_lookup_key() {
+ let test = vec!["a", "a", "b", "c"];
+ let array: DictionaryArray<Int8Type> = test.into_iter().collect();
+
+ assert_eq!(array.lookup_key("c"), Some(2));
+
+ // Direction of building a dictionary is the iterator direction
+ let test = vec!["t3", "t3", "t2", "t2", "t1", "t3", "t4", "t1", "t0"];
+ let array: DictionaryArray<Int8Type> = test.into_iter().collect();
+
+ assert_eq!(array.lookup_key("t1"), Some(2));
+ assert_eq!(array.lookup_key("non-existent"), None);
+ }
}