alamb commented on code in PR #14754: URL: https://github.com/apache/datafusion/pull/14754#discussion_r1965394266
########## datafusion/core/src/datasource/physical_plan/avro.rs: ########## @@ -255,10 +255,15 @@ impl FileSource for AvroSource { fn file_type(&self) -> &str { "avro" } - fn supports_repartition(&self, config: &FileScanConfig) -> bool { - !(config.file_compression_type.is_compressed() - || config.new_lines_in_values - || self.as_any().downcast_ref::<AvroSource>().is_some()) + + fn repartitioned( Review Comment: This is so much nicer ❤️ ########## datafusion/core/src/datasource/data_source.rs: ########## @@ -62,9 +64,33 @@ pub trait FileSource: Send + Sync { fn fmt_extra(&self, _t: DisplayFormatType, _f: &mut Formatter) -> fmt::Result { Ok(()) } - /// Return true if the file format supports repartition + + /// If supported by the [`FileSource`], redistribute files across partitions according to their size. + /// Allows custom file formats to implement their own repartitioning logic. /// - /// If this returns true, the DataSourceExec may repartition the data - /// by breaking up the input files into multiple smaller groups. - fn supports_repartition(&self, config: &FileScanConfig) -> bool; + /// Provides a default repartitioning behavior, see comments on [`FileGroupPartitioner`] for more detail. + fn repartitioned( + &self, + target_partitions: usize, + repartition_file_min_size: usize, + output_ordering: Option<LexOrdering>, + config: &FileScanConfig, + ) -> datafusion_common::Result<Option<FileScanConfig>> { + if config.file_compression_type.is_compressed() || config.new_lines_in_values { Review Comment: I agree that, in general, `new_lines_in_values` is pretty CSV-specific, but I also think this code is backwards compatible and can now be overridden by `FileSource` implementations. We can revise the default behavior in the future if needed. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org