alamb commented on code in PR #1861:
URL: https://github.com/apache/arrow-rs/pull/1861#discussion_r910495603


##########
arrow/src/array/builder/string_dictionary_builder.rs:
##########
@@ -122,27 +126,44 @@ where
         keys_builder: PrimitiveBuilder<K>,
         dictionary_values: &StringArray,
     ) -> Result<Self> {
+        let state = ahash::RandomState::default();
         let dict_len = dictionary_values.len();
-        let mut values_builder =
-            StringBuilder::with_capacity(dict_len, 
dictionary_values.value_data().len());
-        let mut map: HashMap<Box<[u8]>, K::Native> = 
HashMap::with_capacity(dict_len);
-        for i in 0..dict_len {
-            if dictionary_values.is_valid(i) {
-                let value = dictionary_values.value(i);
-                map.insert(
-                    value.as_bytes().into(),
-                    K::Native::from_usize(i)
-                        .ok_or(ArrowError::DictionaryKeyOverflowError)?,
-                );
-                values_builder.append_value(value)?;
-            } else {
-                values_builder.append_null()?;
+
+        let mut dedup = HashMap::with_capacity_and_hasher(dict_len, ());
+
+        let values_len = dictionary_values.value_data().len();
+        let mut values_builder = StringBuilder::with_capacity(dict_len, 
values_len);
+
+        for (idx, maybe_value) in dictionary_values.iter().enumerate() {
+            match maybe_value {
+                Some(value) => {
+                    let hash = compute_hash(&state, value.as_bytes());

Review Comment:
   For my own curiosity, I stared at this for a while
   
   I would describie this as  using `HashMap` as a `HashSet`  of  pointers 
(indexes) to canonical string values in `StringBuilder`. It then uses the low 
level hash brown APIs to checks if the same string has previously been inserted.
   
   Very nice
   
   I wondered a little why  we needed to store any value for `key` 🤔 at all, 
but then I realized it is needed so we the code can find the corresponding 
string value



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to