This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new cc3978e4a0 Minor: add ticket references to parallel parquet writing code (#7592)
cc3978e4a0 is described below
commit cc3978e4a0da8a5f010cf6e6271e4bc1cf691911
Author: Andrew Lamb <[email protected]>
AuthorDate: Thu Sep 28 12:52:45 2023 -0400
Minor: add ticket references to parallel parquet writing code (#7592)
---
datafusion/core/src/datasource/file_format/parquet.rs | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs
index d7234a5375..ebdf3ea444 100644
--- a/datafusion/core/src/datasource/file_format/parquet.rs
+++ b/datafusion/core/src/datasource/file_format/parquet.rs
@@ -844,6 +844,8 @@ async fn output_single_parquet_file_parallelized(
parquet_props: &WriterProperties,
) -> Result<usize> {
let mut row_count = 0;
+ // TODO decrease parallelism / buffering:
+ // https://github.com/apache/arrow-datafusion/issues/7591
let parallelism = data.len();
let mut join_handles: Vec<JoinHandle<ParquetFileSerializedResult>> =
Vec::with_capacity(parallelism);
@@ -877,6 +879,8 @@ async fn output_single_parquet_file_parallelized(
>,
> = tokio::task::spawn(async move {
while let Some(data) = rx.recv().await {
+ // TODO write incrementally
+ // https://github.com/apache/arrow-datafusion/issues/7591
object_store_writer.write_all(data.as_slice()).await?;
}
Ok(object_store_writer)
@@ -913,6 +917,8 @@ async fn output_single_parquet_file_parallelized(
bytes_written: column.compressed_size() as _,
rows_written: rg.num_rows() as _,
metadata: column.clone(),
+ // TODO need to populate the indexes when writing final file
+ // see https://github.com/apache/arrow-datafusion/issues/7589
bloom_filter: None,
column_index: None,
offset_index: None,