This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new a1a53cafc arrow-array::builder: support more dictionary keys (#6845)
a1a53cafc is described below

commit a1a53cafc2170389b5de1af94d5ddc2cf53f2d79
Author: ajwerner <[email protected]>
AuthorDate: Sat Dec 7 15:05:04 2024 -0500

    arrow-array::builder: support more dictionary keys (#6845)
    
    The spec says that the keys in dictionaries are [0]:
    
    > (4) The index type of a Dictionary type can only be an integer type, 
preferably signed, with width 8 to 64 bits.
    
    In my use case I have a very small number of values so wasting bits on a
    wider key is wasteful.
    
    [0]: 
https://github.com/apache/arrow/blob/fe32a7dfe5e22e7737198476fe1ac0e8a5dccef2/docs/source/format/Columnar.rst?plain=1#L182-L183
---
 arrow-array/src/builder/struct_builder.rs | 114 +++++++++++++++++++++---------
 1 file changed, 79 insertions(+), 35 deletions(-)

diff --git a/arrow-array/src/builder/struct_builder.rs 
b/arrow-array/src/builder/struct_builder.rs
index f1ce5fa85..2b288445c 100644
--- a/arrow-array/src/builder/struct_builder.rs
+++ b/arrow-array/src/builder/struct_builder.rs
@@ -15,9 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::builder::*;
-use crate::types::Int32Type;
 use crate::StructArray;
+use crate::{
+    builder::*,
+    types::{Int16Type, Int32Type, Int64Type, Int8Type},
+};
 use arrow_buffer::NullBufferBuilder;
 use arrow_schema::{DataType, Fields, IntervalUnit, SchemaBuilder, TimeUnit};
 use std::sync::Arc;
@@ -290,29 +292,42 @@ pub fn make_builder(datatype: &DataType, capacity: usize) 
-> Box<dyn ArrayBuilde
             t => panic!("The field of Map data type {t:?} should has a child 
Struct field"),
         },
         DataType::Struct(fields) => 
Box::new(StructBuilder::from_fields(fields.clone(), capacity)),
-        DataType::Dictionary(key_type, value_type) if **key_type == 
DataType::Int32 => {
-            match &**value_type {
-                DataType::Utf8 => {
-                    let dict_builder: StringDictionaryBuilder<Int32Type> =
-                        StringDictionaryBuilder::with_capacity(capacity, 256, 
1024);
-                    Box::new(dict_builder)
-                }
-                DataType::LargeUtf8 => {
-                    let dict_builder: LargeStringDictionaryBuilder<Int32Type> =
-                        LargeStringDictionaryBuilder::with_capacity(capacity, 
256, 1024);
-                    Box::new(dict_builder)
-                }
-                DataType::Binary => {
-                    let dict_builder: BinaryDictionaryBuilder<Int32Type> =
-                        BinaryDictionaryBuilder::with_capacity(capacity, 256, 
1024);
-                    Box::new(dict_builder)
-                }
-                DataType::LargeBinary => {
-                    let dict_builder: LargeBinaryDictionaryBuilder<Int32Type> =
-                        LargeBinaryDictionaryBuilder::with_capacity(capacity, 
256, 1024);
-                    Box::new(dict_builder)
+        t @ DataType::Dictionary(key_type, value_type) => {
+            macro_rules! dict_builder {
+                ($key_type:ty) => {
+                    match &**value_type {
+                        DataType::Utf8 => {
+                            let dict_builder: 
StringDictionaryBuilder<$key_type> =
+                                
StringDictionaryBuilder::with_capacity(capacity, 256, 1024);
+                            Box::new(dict_builder)
+                        }
+                        DataType::LargeUtf8 => {
+                            let dict_builder: 
LargeStringDictionaryBuilder<$key_type> =
+                                
LargeStringDictionaryBuilder::with_capacity(capacity, 256, 1024);
+                            Box::new(dict_builder)
+                        }
+                        DataType::Binary => {
+                            let dict_builder: 
BinaryDictionaryBuilder<$key_type> =
+                                
BinaryDictionaryBuilder::with_capacity(capacity, 256, 1024);
+                            Box::new(dict_builder)
+                        }
+                        DataType::LargeBinary => {
+                            let dict_builder: 
LargeBinaryDictionaryBuilder<$key_type> =
+                                
LargeBinaryDictionaryBuilder::with_capacity(capacity, 256, 1024);
+                            Box::new(dict_builder)
+                        }
+                        t => panic!("Dictionary value type {t:?} is not 
currently supported"),
+                    }
+                };
+            }
+            match &**key_type {
+                DataType::Int8 => dict_builder!(Int8Type),
+                DataType::Int16 => dict_builder!(Int16Type),
+                DataType::Int32 => dict_builder!(Int32Type),
+                DataType::Int64 => dict_builder!(Int64Type),
+                _ => {
+                    panic!("Data type {t:?} with key type {key_type:?} is not 
currently supported")
                 }
-                t => panic!("Unsupported dictionary value type {t:?} is not 
currently supported"),
             }
         }
         t => panic!("Data type {t:?} is not currently supported"),
@@ -430,12 +445,14 @@ impl StructBuilder {
 
 #[cfg(test)]
 mod tests {
+    use std::any::type_name;
+
     use super::*;
     use arrow_buffer::Buffer;
     use arrow_data::ArrayData;
     use arrow_schema::Field;
 
-    use crate::array::Array;
+    use crate::{array::Array, types::ArrowDictionaryKeyType};
 
     #[test]
     fn test_struct_array_builder() {
@@ -690,10 +707,31 @@ mod tests {
     }
 
     #[test]
-    fn test_struct_array_builder_from_dictionary_type() {
+    fn test_struct_array_builder_from_dictionary_type_int8_key() {
+        
test_struct_array_builder_from_dictionary_type_inner::<Int8Type>(DataType::Int8);
+    }
+
+    #[test]
+    fn test_struct_array_builder_from_dictionary_type_int16_key() {
+        
test_struct_array_builder_from_dictionary_type_inner::<Int16Type>(DataType::Int16);
+    }
+
+    #[test]
+    fn test_struct_array_builder_from_dictionary_type_int32_key() {
+        
test_struct_array_builder_from_dictionary_type_inner::<Int32Type>(DataType::Int32);
+    }
+
+    #[test]
+    fn test_struct_array_builder_from_dictionary_type_int64_key() {
+        
test_struct_array_builder_from_dictionary_type_inner::<Int64Type>(DataType::Int64);
+    }
+
+    fn test_struct_array_builder_from_dictionary_type_inner<K: 
ArrowDictionaryKeyType>(
+        key_type: DataType,
+    ) {
         let dict_field = Field::new(
             "f1",
-            DataType::Dictionary(Box::new(DataType::Int32), 
Box::new(DataType::Utf8)),
+            DataType::Dictionary(Box::new(key_type), Box::new(DataType::Utf8)),
             false,
         );
         let fields = vec![dict_field.clone()];
@@ -701,10 +739,14 @@ mod tests {
         let cloned_dict_field = dict_field.clone();
         let expected_child_dtype = dict_field.data_type();
         let mut struct_builder = 
StructBuilder::from_fields(vec![cloned_dict_field], 5);
-        struct_builder
-            .field_builder::<StringDictionaryBuilder<Int32Type>>(0)
-            .expect("Builder should be StringDictionaryBuilder")
-            .append_value("dict string");
+        let Some(dict_builder) = 
struct_builder.field_builder::<StringDictionaryBuilder<K>>(0)
+        else {
+            panic!(
+                "Builder should be StringDictionaryBuilder<{}>",
+                type_name::<K>()
+            )
+        };
+        dict_builder.append_value("dict string");
         struct_builder.append(true);
         let array = struct_builder.finish();
 
@@ -714,13 +756,15 @@ mod tests {
     }
 
     #[test]
-    #[should_panic(expected = "Data type Dictionary(Int16, Utf8) is not 
currently supported")]
+    #[should_panic(
+        expected = "Data type Dictionary(UInt64, Utf8) with key type UInt64 is 
not currently supported"
+    )]
     fn test_struct_array_builder_from_schema_unsupported_type() {
         let fields = vec![
-            Field::new("f1", DataType::Int16, false),
+            Field::new("f1", DataType::UInt64, false),
             Field::new(
                 "f2",
-                DataType::Dictionary(Box::new(DataType::Int16), 
Box::new(DataType::Utf8)),
+                DataType::Dictionary(Box::new(DataType::UInt64), 
Box::new(DataType::Utf8)),
                 false,
             ),
         ];
@@ -729,7 +773,7 @@ mod tests {
     }
 
     #[test]
-    #[should_panic(expected = "Unsupported dictionary value type Int32 is not 
currently supported")]
+    #[should_panic(expected = "Dictionary value type Int32 is not currently 
supported")]
     fn test_struct_array_builder_from_dict_with_unsupported_value_type() {
         let fields = vec![Field::new(
             "f1",

Reply via email to