jayzhan211 commented on issue #7064: URL: https://github.com/apache/arrow-datafusion/issues/7064#issuecomment-1884047429
I drafted the idea, does this make sense?

```rust
// Short String Optimized HashSet for String
// Equivalent to HashSet<String> but with better memory usage (Speed unsure)
struct SSOStringHashSet {
    // header: u128
    // short string: length(4bytes) + data(12bytes)
    // long string: length(4bytes) + prefix(4bytes) + offset(8bytes)
    header_set: HashSet<u128, RandomState>,
    // map<hash of long string w/o 4 bytes prefix, offset in buffer>
    long_string_map: HashMap<u64, u64, RandomState>,
    buffer: BufferBuilder<u8>,
}

impl SSOStringHashSet {
    fn insert(&mut self, value: &str) {
        let value_len = value.len();
        if value_len <= 12 {
            let mut short_string_header = 0u128;
            short_string_header |= (value_len << 96) as u128;
            short_string_header |= value
                .as_bytes()
                .iter()
                .fold(0u128, |acc, &x| acc << 8 | x as u128);
            self.header_set.insert(short_string_header);
        } else {
            // 1) hash the string w/o 4 bytes prefix
            // 2) check if the hash exists in the map
            // 3) if exists, insert the offset into the header
            // 4) if not exists, insert the hash and offset into the map
            let mut long_string_header = 0u128;
            long_string_header |= (value_len << 96) as u128;
            long_string_header |= (value
                .as_bytes()
                .iter()
                .take(4)
                .fold(0u128, |acc, &x| acc << 8 | x as u128)
                << 64) as u128;
            let suffix = value
                .as_bytes()
                .iter()
                .skip(4)
                .collect::<Vec<_>>();
            // NYI hash_bytes: hash &[u8] to u64
            let hashed_suffix = hash_bytes(suffix);
            if let Some(offset) = self.long_string_map.get(&hashed_suffix) {
                long_string_header |= *offset as u128;
            } else {
                let offset = self.buffer.len();
                self.long_string_map.insert(hashed_suffix, offset as u64);
                long_string_header |= offset as u128;
                // convert suffix: Vec<&u8> to &[u8]
                self.buffer.append_slice(suffix);
            }
        }
    }
}
```

-- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.
To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org