notfilippo commented on code in PR #11978: URL: https://github.com/apache/datafusion/pull/11978#discussion_r1716741382
########## datafusion/core/src/datasource/physical_plan/file_scan_config.rs: ########## @@ -480,155 +422,17 @@ impl PartitionColumnProjector { } } -#[derive(Debug, Default)] -struct ZeroBufferGenerators { - gen_i8: ZeroBufferGenerator<i8>, - gen_i16: ZeroBufferGenerator<i16>, - gen_i32: ZeroBufferGenerator<i32>, - gen_i64: ZeroBufferGenerator<i64>, - gen_u8: ZeroBufferGenerator<u8>, - gen_u16: ZeroBufferGenerator<u16>, - gen_u32: ZeroBufferGenerator<u32>, - gen_u64: ZeroBufferGenerator<u64>, -} - -/// Generate a arrow [`Buffer`] that contains zero values. -#[derive(Debug, Default)] -struct ZeroBufferGenerator<T> -where - T: ArrowNativeType, -{ - cache: Option<Buffer>, - _t: PhantomData<T>, -} - -impl<T> ZeroBufferGenerator<T> -where - T: ArrowNativeType, -{ - const SIZE: usize = std::mem::size_of::<T>(); - - fn get_buffer(&mut self, n_vals: usize) -> Buffer { - match &mut self.cache { - Some(buf) if buf.len() >= n_vals * Self::SIZE => { - buf.slice_with_length(0, n_vals * Self::SIZE) - } - _ => { - let mut key_buffer_builder = BufferBuilder::<T>::new(n_vals); - key_buffer_builder.advance(n_vals); // keys are all 0 - self.cache.insert(key_buffer_builder.finish()).clone() - } - } - } -} - -fn create_dict_array<T>( - buffer_gen: &mut ZeroBufferGenerator<T>, - dict_val: &ScalarValue, - len: usize, - data_type: DataType, -) -> Result<ArrayRef> -where - T: ArrowNativeType, -{ - let dict_vals = dict_val.to_array()?; - - let sliced_key_buffer = buffer_gen.get_buffer(len); - - // assemble pieces together - let mut builder = ArrayData::builder(data_type) - .len(len) - .add_buffer(sliced_key_buffer); - builder = builder.add_child_data(dict_vals.to_data()); - Ok(Arc::new(DictionaryArray::<UInt16Type>::from( - builder.build().unwrap(), - ))) -} - -fn create_output_array( - key_buffer_cache: &mut ZeroBufferGenerators, - val: &ScalarValue, - len: usize, -) -> Result<ArrayRef> { - if let ScalarValue::Dictionary(key_type, dict_val) = &val { - match key_type.as_ref() { - DataType::Int8 => { - return create_dict_array( - &mut key_buffer_cache.gen_i8, - dict_val, - len, - val.data_type(), - ); - } - DataType::Int16 => { - return create_dict_array( - &mut key_buffer_cache.gen_i16, - dict_val, - len, - val.data_type(), - ); - } - DataType::Int32 => { - return create_dict_array( - &mut key_buffer_cache.gen_i32, - dict_val, - len, - val.data_type(), - ); - } - DataType::Int64 => { - return create_dict_array( - &mut key_buffer_cache.gen_i64, - dict_val, - len, - val.data_type(), - ); - } - DataType::UInt8 => { - return create_dict_array( - &mut key_buffer_cache.gen_u8, - dict_val, - len, - val.data_type(), - ); - } - DataType::UInt16 => { - return create_dict_array( - &mut key_buffer_cache.gen_u16, - dict_val, - len, - val.data_type(), - ); - } - DataType::UInt32 => { - return create_dict_array( - &mut key_buffer_cache.gen_u32, - dict_val, - len, - val.data_type(), - ); - } - DataType::UInt64 => { - return create_dict_array( - &mut key_buffer_cache.gen_u64, - dict_val, - len, - val.data_type(), - ); - } - _ => {} - } - } - +fn create_output_array(val: &ScalarValue, len: usize) -> Result<ArrayRef> { + // TODO(@notfilippo): should we reintroduce a way to encode as dictionaries? Review Comment: I'm missing some context for partitions so I'm not really sure if it's the right call to remove the Dictionary output array. IIUC it seems like it's very useful to save space and optimise columns added this way. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org