alamb commented on code in PR #6987:
URL: https://github.com/apache/arrow-datafusion/pull/6987#discussion_r1270610963
##########
datafusion/core/src/datasource/physical_plan/csv.rs:
##########
@@ -566,30 +564,32 @@ pub async fn plan_to_csv(
path: impl AsRef<str>,
) -> Result<()> {
let path = path.as_ref();
- // create directory to contain the CSV files (one per partition)
- let fs_path = Path::new(path);
- if let Err(e) = fs::create_dir(fs_path) {
- return Err(DataFusionError::Execution(format!(
- "Could not create directory {path}: {e:?}"
- )));
- }
-
+ let parsed = ListingTableUrl::parse(path)?;
+ let object_store_url = parsed.object_store();
+ let store = task_ctx.runtime_env().object_store(&object_store_url)?;
+ let mut buffer;
let mut join_set = JoinSet::new();
for i in 0..plan.output_partitioning().partition_count() {
- let plan = plan.clone();
- let filename = format!("part-{i}.csv");
- let path = fs_path.join(filename);
- let file = fs::File::create(path)?;
- let mut writer = csv::Writer::new(file);
- let stream = plan.execute(i, task_ctx.clone())?;
+ let storeref = store.clone();
+ let plan: Arc<dyn ExecutionPlan> = plan.clone();
+ let filename = format!("{}/part-{i}.csv", parsed.prefix());
+ let file = object_store::path::Path::parse(filename)?;
+ buffer = Vec::new();
Review Comment:
> If we must choose only one or the other, I would also favor multipart
> upload, since large files could fail in the current implementation, whereas
> small files would at worst be slower in a multipart implementation. I will
> work on a multipart implementation of this!
Thank you -- I think if the overhead of doing a multi-part upload is too
big, it would make sense to add (optional) buffering in the object_store crate
(if it doesn't have one already) so that multiple small multi-part writes are
coalesced into a smaller number of larger writes.
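
To make the coalescing idea concrete, here is a minimal sketch, not the PR's
implementation, assuming the object_store 0.6-era API where `put_multipart`
returns a `MultipartId` plus an `AsyncWrite` handle. The helper name
`write_coalesced` and the threshold value are illustrative only:

```rust
use std::io;
use std::sync::Arc;

use object_store::{path::Path, ObjectStore};
use tokio::io::AsyncWriteExt;

/// Hypothetical helper (not part of object_store or this PR): accumulate
/// small writes in a local Vec and only hand them to the multipart writer
/// once `threshold` bytes have built up, so many tiny multi-part writes
/// become a few larger ones.
async fn write_coalesced(
    store: Arc<dyn ObjectStore>,
    location: &Path,
    chunks: impl IntoIterator<Item = Vec<u8>>,
    threshold: usize, // e.g. 10 * 1024 * 1024 for ~10 MiB parts
) -> io::Result<()> {
    // object_store 0.6-era signature: (MultipartId, Box<dyn AsyncWrite + ...>)
    let (_id, mut writer) = store
        .put_multipart(location)
        .await
        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

    let mut buffer: Vec<u8> = Vec::with_capacity(threshold);
    for chunk in chunks {
        buffer.extend_from_slice(&chunk);
        if buffer.len() >= threshold {
            // Flush one coalesced chunk instead of many small writes
            writer.write_all(&buffer).await?;
            buffer.clear();
        }
    }
    if !buffer.is_empty() {
        writer.write_all(&buffer).await?;
    }
    // shutdown() completes the multipart upload
    writer.shutdown().await?;
    Ok(())
}
```

If object_store already performs comparable buffering internally, a wrapper
like this would be redundant, which is exactly the "(if it doesn't have one
already)" caveat above.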