alamb commented on issue #1708: URL: https://github.com/apache/arrow-datafusion/issues/1708#issuecomment-1026263442
Here is the old implementation in case it is of value:

```rust
/// Appends a sequence of [u8] bytes for the value in `col[row]` to
/// `vec` to be used as a key into the hash map for a dictionary type
///
/// Note that ideally, for dictionary encoded columns, we would be
/// able to simply use the dictionary indices themselves (no need to
/// look up values) or possibly simply build the hash table entirely
/// on the dictionary indexes.
///
/// This approach would likely work (very) well for the common case,
/// but it also has to handle the case where the dictionary itself
/// is not the same across all record batches (and thus indexes in one
/// record batch may not correspond to the same index in another)
///
/// NOTE: The key at `row` is read without a validity check; as with
/// `create_key_for_col`, the caller is expected to have checked
/// `col.is_valid(row)` first.
///
/// # Errors
///
/// Returns `DataFusionError::Internal` if the dictionary key at `row`
/// cannot be converted to `usize`, and propagates any error returned
/// by `create_key_for_col` for the dictionary's value type.
///
/// # Panics
///
/// Panics if `col` is not a `DictionaryArray<K>`.
fn dictionary_create_key_for_col<K: ArrowDictionaryKeyType>(
    col: &ArrayRef,
    row: usize,
    vec: &mut Vec<u8>,
) -> Result<()> {
    let dict_col = col.as_any().downcast_ref::<DictionaryArray<K>>().unwrap();

    // look up the index in the values dictionary
    let keys_col = dict_col.keys();
    let values_index = keys_col.value(row).to_usize().ok_or_else(|| {
        DataFusionError::Internal(format!(
            "Can not convert index to usize in dictionary of type creating group by value {:?}",
            keys_col.data_type()
        ))
    })?;

    // Encode the referenced *value* (not the index), so that equal values
    // produce equal keys even when batches use different dictionaries.
    create_key_for_col(dict_col.values(), values_index, vec)
}

/// Appends a sequence of [u8] bytes for the value in `col[row]` to
/// `vec` to be used as a key into the hash map.
///
/// Encoding per data type:
/// * `Boolean`: a single byte (`value as u8`)
/// * integers, floats, `Date32` and the supported timestamps: the
///   value's little-endian bytes (`to_le_bytes`)
/// * `Utf8` / `LargeUtf8`: the string's byte length as little-endian
///   `usize`, followed by the string's bytes
/// * `Dictionary`: the encoding of the value the dictionary key at
///   `row` refers to (see `dictionary_create_key_for_col`)
///
/// Only `Timestamp(Millisecond | Microsecond | Nanosecond, None)` is
/// matched; timestamps with any other unit or with a timezone fall
/// through to the "unsupported" error.
///
/// NOTE: This function does not check col.is_valid(). Caller must do so
///
/// # Errors
///
/// Returns `DataFusionError::Internal` if the column's data type (or,
/// for dictionaries, the dictionary index type) is not supported.
///
/// # Panics
///
/// Panics if the concrete array type of `col` does not match the type
/// reported by `col.data_type()` (the downcasts are unwrapped).
fn create_key_for_col(col: &ArrayRef, row: usize, vec: &mut Vec<u8>) -> Result<()> {
    match col.data_type() {
        DataType::Boolean => {
            let array = col.as_any().downcast_ref::<BooleanArray>().unwrap();
            vec.extend_from_slice(&[array.value(row) as u8]);
        }
        DataType::Float32 => {
            let array = col.as_any().downcast_ref::<Float32Array>().unwrap();
            vec.extend_from_slice(&array.value(row).to_le_bytes());
        }
        DataType::Float64 => {
            let array = col.as_any().downcast_ref::<Float64Array>().unwrap();
            vec.extend_from_slice(&array.value(row).to_le_bytes());
        }
        DataType::UInt8 => {
            let array = col.as_any().downcast_ref::<UInt8Array>().unwrap();
            vec.extend_from_slice(&array.value(row).to_le_bytes());
        }
        DataType::UInt16 => {
            let array = col.as_any().downcast_ref::<UInt16Array>().unwrap();
            vec.extend_from_slice(&array.value(row).to_le_bytes());
        }
        DataType::UInt32 => {
            let array = col.as_any().downcast_ref::<UInt32Array>().unwrap();
            vec.extend_from_slice(&array.value(row).to_le_bytes());
        }
        DataType::UInt64 => {
            let array = col.as_any().downcast_ref::<UInt64Array>().unwrap();
            vec.extend_from_slice(&array.value(row).to_le_bytes());
        }
        DataType::Int8 => {
            let array = col.as_any().downcast_ref::<Int8Array>().unwrap();
            vec.extend_from_slice(&array.value(row).to_le_bytes());
        }
        DataType::Int16 => {
            let array = col.as_any().downcast_ref::<Int16Array>().unwrap();
            vec.extend_from_slice(&array.value(row).to_le_bytes());
        }
        DataType::Int32 => {
            let array = col.as_any().downcast_ref::<Int32Array>().unwrap();
            vec.extend_from_slice(&array.value(row).to_le_bytes());
        }
        DataType::Int64 => {
            let array = col.as_any().downcast_ref::<Int64Array>().unwrap();
            vec.extend_from_slice(&array.value(row).to_le_bytes());
        }
        DataType::Timestamp(TimeUnit::Millisecond, None) => {
            let array = col
                .as_any()
                .downcast_ref::<TimestampMillisecondArray>()
                .unwrap();
            vec.extend_from_slice(&array.value(row).to_le_bytes());
        }
        DataType::Timestamp(TimeUnit::Microsecond, None) => {
            let array = col
                .as_any()
                .downcast_ref::<TimestampMicrosecondArray>()
                .unwrap();
            vec.extend_from_slice(&array.value(row).to_le_bytes());
        }
        DataType::Timestamp(TimeUnit::Nanosecond, None) => {
            let array = col
                .as_any()
                .downcast_ref::<TimestampNanosecondArray>()
                .unwrap();
            vec.extend_from_slice(&array.value(row).to_le_bytes());
        }
        DataType::Utf8 => {
            let array = col.as_any().downcast_ref::<StringArray>().unwrap();
            let value = array.value(row);
            // store the size
            vec.extend_from_slice(&value.len().to_le_bytes());
            // store the string value
            vec.extend_from_slice(value.as_bytes());
        }
        DataType::LargeUtf8 => {
            let array = col.as_any().downcast_ref::<LargeStringArray>().unwrap();
            let value = array.value(row);
            // store the size
            vec.extend_from_slice(&value.len().to_le_bytes());
            // store the string value
            vec.extend_from_slice(value.as_bytes());
        }
        DataType::Date32 => {
            let array = col.as_any().downcast_ref::<Date32Array>().unwrap();
            vec.extend_from_slice(&array.value(row).to_le_bytes());
        }
        // Dispatch on the dictionary's index type; the value type is
        // handled by the recursive call inside the helper.
        DataType::Dictionary(index_type, _) => match **index_type {
            DataType::Int8 => {
                dictionary_create_key_for_col::<Int8Type>(col, row, vec)?;
            }
            DataType::Int16 => {
                dictionary_create_key_for_col::<Int16Type>(col, row, vec)?;
            }
            DataType::Int32 => {
                dictionary_create_key_for_col::<Int32Type>(col, row, vec)?;
            }
            DataType::Int64 => {
                dictionary_create_key_for_col::<Int64Type>(col, row, vec)?;
            }
            DataType::UInt8 => {
                dictionary_create_key_for_col::<UInt8Type>(col, row, vec)?;
            }
            DataType::UInt16 => {
                dictionary_create_key_for_col::<UInt16Type>(col, row, vec)?;
            }
            DataType::UInt32 => {
                dictionary_create_key_for_col::<UInt32Type>(col, row, vec)?;
            }
            DataType::UInt64 => {
                dictionary_create_key_for_col::<UInt64Type>(col, row, vec)?;
            }
            _ => {
                return Err(DataFusionError::Internal(format!(
                    "Unsupported GROUP BY type (dictionary index type not supported creating key) {}",
                    col.data_type(),
                )))
            }
        },
        _ => {
            // This is internal because we should have caught this before.
            return Err(DataFusionError::Internal(format!(
                "Unsupported GROUP BY type creating key {}",
                col.data_type(),
            )));
        }
    }
    Ok(())
}

/// Create a key `Vec<u8>` that is used as key for the hashmap
///
/// `vec` is cleared first, so the same buffer can be reused from one
/// row to the next.
///
/// This looks like
/// [null_byte][col_value_bytes][null_byte][col_value_bytes]
///
/// Note that relatively uncommon patterns (e.g. not 0x00) are chosen
/// for the null_byte to make debugging easier. The actual values are
/// arbitrary.
///
/// For a NULL value in a column, the key looks like
/// [0xFE]
///
/// For a Non-NULL value in a column, this looks like:
/// [0xFF][byte representation of column value]
///
/// Example of a key with no NULL values (the string length is written
/// as a `usize`, shown here as 8 bytes, i.e. for a 64-bit target):
/// ```text
///                        0xFF byte at the start of each column
///                           signifies the value is non-null
///                                          │
///
///                      ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┴ ─ ─ ─ ─ ─ ─ ─ ┐
///
///                      │        string len                 │  0x1234
/// {                    ▼       (as usize le)      "foo"    ▼(as u16 le)
///   k1: "foo"        ╔ ═┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──╦ ═┌──┬──┐
///   k2: 0x1234u16     FF║03│00│00│00│00│00│00│00│"f│"o│"o│FF║34│12│
/// }                  ╚ ═└──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──╩ ═└──┴──┘
///                     0  1  2  3  4  5  6  7  8  9  10 11 12 13 14
/// ```
///
/// Example of a key with NULL values:
///
/// ```text
///                         0xFE byte at the start of k1 column
///                     ┌ ─     signifies the value is NULL
///
///                     └ ┐
///                              0x1234
/// {                     ▼    (as u16 le)
///   k1: NULL          ╔ ═╔ ═┌──┬──┐
///   k2: 0x1234u16      FE║FF║34│12│
/// }                   ╚ ═╚ ═└──┴──┘
///                      0  1  2  3
/// ```
///
/// # Errors
///
/// Propagates any error returned by `create_key_for_col` (e.g. an
/// unsupported GROUP BY column type).
pub(crate) fn create_key(
    group_by_keys: &[ArrayRef],
    row: usize,
    vec: &mut Vec<u8>,
) -> Result<()> {
    vec.clear();
    for col in group_by_keys {
        if !col.is_valid(row) {
            // NULL: marker byte only, no value bytes follow
            vec.push(0xFE);
        } else {
            // non-NULL: marker byte followed by the encoded value
            vec.push(0xFF);
            create_key_for_col(col, row, vec)?;
        }
    }
    Ok(())
}
```

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org