alamb commented on issue #1708:
URL: https://github.com/apache/arrow-datafusion/issues/1708#issuecomment-1026263442


   Here is the old implementation in case it is of value:
   
   ```rust
   /// Appends a sequence of [u8] bytes for the value in `col[row]` to
   /// `vec` to be used as a key into the hash map for a dictionary type
   ///
   /// Note that ideally, for dictionary encoded columns, we would be
   /// able to simply use the dictionary indices themselves (no need to
   /// look up values) or possibly simply build the hash table entirely
   /// on the dictionary indexes.
   ///
   /// This approach would likely work (very) well for the common case,
   /// but it also has to handle the case where the dictionary itself
   /// is not the same across all record batches (and thus indexes in one
   /// record batch may not correspond to the same index in another)
   fn dictionary_create_key_for_col<K: ArrowDictionaryKeyType>(
       col: &ArrayRef,
       row: usize,
       vec: &mut Vec<u8>,
   ) -> Result<()> {
       let dict_col = col.as_any().downcast_ref::<DictionaryArray<K>>().unwrap();
   
       // look up the index in the values dictionary
       let keys_col = dict_col.keys();
       let values_index = keys_col.value(row).to_usize().ok_or_else(|| {
           DataFusionError::Internal(format!(
               "Can not convert index to usize in dictionary of type creating 
group by value {:?}",
               keys_col.data_type()
           ))
       })?;
   
       create_key_for_col(dict_col.values(), values_index, vec)
   }
   
   /// Appends a sequence of [u8] bytes for the value in `col[row]` to
   /// `vec` to be used as a key into the hash map.
   ///
   /// NOTE: This function does not check col.is_valid(). Caller must do so
   fn create_key_for_col(col: &ArrayRef, row: usize, vec: &mut Vec<u8>) -> Result<()> {
       match col.data_type() {
           DataType::Boolean => {
               let array = col.as_any().downcast_ref::<BooleanArray>().unwrap();
               vec.extend_from_slice(&[array.value(row) as u8]);
           }
           DataType::Float32 => {
               let array = col.as_any().downcast_ref::<Float32Array>().unwrap();
               vec.extend_from_slice(&array.value(row).to_le_bytes());
           }
           DataType::Float64 => {
               let array = col.as_any().downcast_ref::<Float64Array>().unwrap();
               vec.extend_from_slice(&array.value(row).to_le_bytes());
           }
           DataType::UInt8 => {
               let array = col.as_any().downcast_ref::<UInt8Array>().unwrap();
               vec.extend_from_slice(&array.value(row).to_le_bytes());
           }
           DataType::UInt16 => {
               let array = col.as_any().downcast_ref::<UInt16Array>().unwrap();
               vec.extend_from_slice(&array.value(row).to_le_bytes());
           }
           DataType::UInt32 => {
               let array = col.as_any().downcast_ref::<UInt32Array>().unwrap();
               vec.extend_from_slice(&array.value(row).to_le_bytes());
           }
           DataType::UInt64 => {
               let array = col.as_any().downcast_ref::<UInt64Array>().unwrap();
               vec.extend_from_slice(&array.value(row).to_le_bytes());
           }
           DataType::Int8 => {
               let array = col.as_any().downcast_ref::<Int8Array>().unwrap();
               vec.extend_from_slice(&array.value(row).to_le_bytes());
           }
           DataType::Int16 => {
               let array = col.as_any().downcast_ref::<Int16Array>().unwrap();
               vec.extend_from_slice(&array.value(row).to_le_bytes());
           }
           DataType::Int32 => {
               let array = col.as_any().downcast_ref::<Int32Array>().unwrap();
               vec.extend_from_slice(&array.value(row).to_le_bytes());
           }
           DataType::Int64 => {
               let array = col.as_any().downcast_ref::<Int64Array>().unwrap();
               vec.extend_from_slice(&array.value(row).to_le_bytes());
           }
           DataType::Timestamp(TimeUnit::Millisecond, None) => {
               let array = col
                   .as_any()
                   .downcast_ref::<TimestampMillisecondArray>()
                   .unwrap();
               vec.extend_from_slice(&array.value(row).to_le_bytes());
           }
           DataType::Timestamp(TimeUnit::Microsecond, None) => {
               let array = col
                   .as_any()
                   .downcast_ref::<TimestampMicrosecondArray>()
                   .unwrap();
               vec.extend_from_slice(&array.value(row).to_le_bytes());
           }
           DataType::Timestamp(TimeUnit::Nanosecond, None) => {
               let array = col
                   .as_any()
                   .downcast_ref::<TimestampNanosecondArray>()
                   .unwrap();
               vec.extend_from_slice(&array.value(row).to_le_bytes());
           }
           DataType::Utf8 => {
               let array = col.as_any().downcast_ref::<StringArray>().unwrap();
               let value = array.value(row);
               // store the size
               vec.extend_from_slice(&value.len().to_le_bytes());
               // store the string value
               vec.extend_from_slice(value.as_bytes());
           }
           DataType::LargeUtf8 => {
               let array = col.as_any().downcast_ref::<LargeStringArray>().unwrap();
               let value = array.value(row);
               // store the size
               vec.extend_from_slice(&value.len().to_le_bytes());
               // store the string value
               vec.extend_from_slice(value.as_bytes());
           }
           DataType::Date32 => {
               let array = col.as_any().downcast_ref::<Date32Array>().unwrap();
               vec.extend_from_slice(&array.value(row).to_le_bytes());
           }
           DataType::Dictionary(index_type, _) => match **index_type {
               DataType::Int8 => {
                   dictionary_create_key_for_col::<Int8Type>(col, row, vec)?;
               }
               DataType::Int16 => {
                   dictionary_create_key_for_col::<Int16Type>(col, row, vec)?;
               }
               DataType::Int32 => {
                   dictionary_create_key_for_col::<Int32Type>(col, row, vec)?;
               }
               DataType::Int64 => {
                   dictionary_create_key_for_col::<Int64Type>(col, row, vec)?;
               }
               DataType::UInt8 => {
                   dictionary_create_key_for_col::<UInt8Type>(col, row, vec)?;
               }
               DataType::UInt16 => {
                   dictionary_create_key_for_col::<UInt16Type>(col, row, vec)?;
               }
               DataType::UInt32 => {
                   dictionary_create_key_for_col::<UInt32Type>(col, row, vec)?;
               }
               DataType::UInt64 => {
                   dictionary_create_key_for_col::<UInt64Type>(col, row, vec)?;
               }
               _ => {
                   return Err(DataFusionError::Internal(format!(
                       "Unsupported GROUP BY type (dictionary index type not supported creating key) {}",
                       col.data_type(),
                   )))
               }
           },
           _ => {
               // This is internal because we should have caught this before.
               return Err(DataFusionError::Internal(format!(
                   "Unsupported GROUP BY type creating key {}",
                   col.data_type(),
               )));
           }
       }
       Ok(())
   }
   
   /// Create a key `Vec<u8>` that is used as key for the hashmap
   ///
   /// This looks like
   /// [null_byte][col_value_bytes][null_byte][col_value_bytes]
   ///
   /// Note that relatively uncommon patterns (e.g. not 0x00) are chosen
   /// for the null_byte to make debugging easier. The actual values are
   /// arbitrary.
   ///
   /// For a NULL value in a column, the key looks like
   /// [0xFE]
   ///
   /// For a Non-NULL value in a column, this looks like:
   /// [0xFF][byte representation of column value]
   ///
   /// Example of a key with no NULL values:
   /// ```text
   ///                        0xFF byte at the start of each column
   ///                           signifies the value is non-null
   ///                                          │
   ///
   ///                      ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┴ ─ ─ ─ ─ ─ ─ ─ ┐
   ///
   ///                      │        string len                 │  0x1234
   /// {                    ▼       (as usize le)      "foo"    ▼(as u16 le)
   ///   k1: "foo"        ╔ ═┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──╦ ═┌──┬──┐
   ///   k2: 0x1234u16     FF║03│00│00│00│00│00│00│00│"f│"o│"o│FF║34│12│
   /// }                  ╚ ═└──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──╩ ═└──┴──┘
   ///                     0  1  2  3  4  5  6  7  8  9  10 11 12 13 14
   /// ```
   ///
   ///  Example of a key with NULL values:
   ///
   ///```text
   ///                         0xFE byte at the start of k1 column
   ///                     ┌ ─     signifies the value is NULL
   ///
   ///                     └ ┐
   ///                              0x1234
   /// {                     ▼    (as u16 le)
   ///   k1: NULL          ╔ ═╔ ═┌──┬──┐
   ///   k2: 0x1234u16      FE║FF║34│12│
   /// }                   ╚ ═╚ ═└──┴──┘
   ///                       0  1  2  3
   ///```
   pub(crate) fn create_key(
       group_by_keys: &[ArrayRef],
       row: usize,
       vec: &mut Vec<u8>,
   ) -> Result<()> {
       vec.clear();
       for col in group_by_keys {
           if !col.is_valid(row) {
               vec.push(0xFE);
           } else {
               vec.push(0xFF);
               create_key_for_col(col, row, vec)?
           }
       }
       Ok(())
   }
   ```
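
   For reference, here is a minimal usage sketch (not part of the original code) showing how these helpers could be used to bucket rows by their encoded keys. It assumes the functions above are in scope along with DataFusion's `Result` alias and the arrow array types, and it assumes a 64-bit target so the `usize` length prefix is 8 bytes:
   
   ```rust
   use std::collections::HashMap;
   use std::sync::Arc;
   
   use arrow::array::{ArrayRef, StringArray, UInt16Array};
   
   fn example() -> Result<()> {
       // Two group-by columns: a Utf8 column (with one NULL) and a UInt16 column
       let k1: ArrayRef = Arc::new(StringArray::from(vec![Some("foo"), None, Some("foo")]));
       let k2: ArrayRef = Arc::new(UInt16Array::from(vec![0x1234u16, 0x1234, 0x1234]));
       let group_by_keys = vec![k1, k2];
   
       // Bucket row indexes by their encoded composite key
       let mut groups: HashMap<Vec<u8>, Vec<usize>> = HashMap::new();
       let mut key = vec![];
       for row in 0..3 {
           create_key(&group_by_keys, row, &mut key)?;
           groups.entry(key.clone()).or_default().push(row);
       }
   
       // Rows 0 and 2 encode identically; the NULL in row 1 yields a distinct key
       assert_eq!(groups.len(), 2);
   
       // Row 0 matches the first layout diagram:
       // [0xFF][3usize le]["foo"][0xFF][0x1234u16 le]
       create_key(&group_by_keys, 0, &mut key)?;
       assert_eq!(
           key,
           vec![0xFF, 3, 0, 0, 0, 0, 0, 0, 0, b'f', b'o', b'o', 0xFF, 0x34, 0x12]
       );
   
       // Row 1 matches the NULL diagram: [0xFE][0xFF][0x1234u16 le]
       create_key(&group_by_keys, 1, &mut key)?;
       assert_eq!(key, vec![0xFE, 0xFF, 0x34, 0x12]);
   
       Ok(())
   }
   ```
   
   The resulting per-row `Vec<u8>` is what was used as the hashmap key, with the 0xFF/0xFE prefix keeping NULL and non-NULL values from colliding.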

