Dandandan commented on a change in pull request #793: URL: https://github.com/apache/arrow-datafusion/pull/793#discussion_r679379622
########## File path: datafusion/src/physical_plan/hash_aggregate.rs ########## @@ -644,14 +645,63 @@ fn create_key_for_col(col: &ArrayRef, row: usize, vec: &mut Vec<u8>) -> Result<( } /// Create a key `Vec<u8>` that is used as key for the hashmap +/// +/// This looks like +/// [null_byte][col_value_bytes][null_byte][col_value_bytes] +/// +/// Note that relatively uncommon patterns (e.g. not 0x00) are chosen +/// for the null_byte to make debugging easier. The actual values are +/// arbitrary. +/// +/// For a NULL value in a column, the key looks like +/// [0xFE] +/// +/// For a Non-NULL value in a column, this looks like: +/// [0xFF][byte representation of column value] +/// +/// Example of a key with no NULL values: +/// ```text +/// 0xFF byte at the start of each column +/// signifies the value is non-null +/// │ +/// +/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┴ ─ ─ ─ ─ ─ ─ ─ ┐ +/// +/// │ string len │ 0x1234 +/// { ▼ (as usize le) "foo" ▼(as u16 le) +/// k1: "foo" ╔ ═┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──╦ ═┌──┬──┐ +/// k2: 0x1234u16 FF║03│00│00│00│00│00│00│00│"f│"o│"o│FF║34│12│ +/// } ╚ ═└──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──╩ ═└──┴──┘ +/// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 +/// ``` +/// +/// Example of a key with NULL values: +/// +///```text +/// 0xFE byte at the start of k1 column +/// ┌ ─ signifies the value is NULL +/// +/// └ ┐ +/// 0x1234 +/// { ▼ (as u16 le) +/// k1: NULL ╔ ═╔ ═┌──┬──┐ +/// k2: 0x1234u16 FE║FF║12│34│ +/// } ╚ ═╚ ═└──┴──┘ +/// 0 1 2 3 +///``` pub(crate) fn create_key( group_by_keys: &[ArrayRef], row: usize, vec: &mut Vec<u8>, ) -> Result<()> { vec.clear(); for col in group_by_keys { - create_key_for_col(col, row, vec)? + if !col.is_valid(row) { Review comment: Not sure if it makes sense to improve performance here, but an optimization might be to check on `null-count==0` outside of this function to avoid the `is_valid` call and just always add an `0xFF` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org