tustvold commented on code in PR #4859:
URL: https://github.com/apache/arrow-rs/pull/4859#discussion_r1338290631
##########
parquet/src/arrow/arrow_writer/mod.rs:
##########
@@ -347,13 +349,22 @@ impl PageWriter for ArrowPageWriter {
}
}
-/// Encodes a leaf column to [`ArrowPageWriter`]
-enum ArrowColumnWriter {
+/// Serializes [ArrayRef]s to [ArrowColumnChunk]s which can be concatenated
+/// to form a parquet row group
+pub enum ArrowColumnWriter {
Review Comment:
Can we make the enum variants private as well, e.g. by wrapping the enum in a public newtype struct, something like:
```
pub struct ArrowColumnWriter(ArrowColumnWriterImpl);
enum ArrowColumnWriterImpl {
...
}
```
##########
parquet/src/arrow/arrow_writer/mod.rs:
##########
@@ -376,31 +388,56 @@ impl ArrowRowGroupWriter {
props: &WriterPropertiesPtr,
arrow: &SchemaRef,
) -> Result<Self> {
- let mut writers = Vec::with_capacity(arrow.fields.len());
+ let mut writers_and_buffers = Vec::with_capacity(arrow.fields.len());
let mut leaves = parquet.columns().iter();
for field in &arrow.fields {
- get_arrow_column_writer(field.data_type(), props, &mut leaves,
&mut writers)?;
+ get_arrow_column_writer(
+ field.data_type(),
+ props,
+ &mut leaves,
+ &mut writers_and_buffers,
+ )?;
}
+ let (shared_buffers, writers): (Vec<_>, Vec<_>) =
+ writers_and_buffers.into_iter().unzip();
Ok(Self {
writers,
+ shared_buffers,
schema: arrow.clone(),
buffered_rows: 0,
})
}
pub fn write(&mut self, batch: &RecordBatch) -> Result<()> {
self.buffered_rows += batch.num_rows();
- let mut writers = self.writers.iter_mut().map(|(_, x)| x);
+ let mut writers = self.writers.iter_mut();
for (array, field) in batch.columns().iter().zip(&self.schema.fields) {
let mut levels = calculate_array_levels(array, field)?.into_iter();
write_leaves(&mut writers, &mut levels, array.as_ref())?;
}
Ok(())
}
+ pub fn schema(&self) -> &Arc<Schema> {
+ &self.schema
+ }
+
+ /// Takes ownership of all [ArrowColumnWriter]s from this
[ArrowRowGroupWriter]
+ /// Caller must restore ownership with give_col_writers before calling
close method.
+ pub fn take_col_writers(&mut self) -> Vec<ArrowColumnWriter> {
+ self.writers.drain(..).collect()
+ }
+
+ /// Restores ownership of all [ArrowColumnWriter]s. Caller is responsible
for
+ /// returning the [Vec] in the same order returned by take_col_writers
method.
+ pub fn give_col_writers(&mut self, writers: Vec<ArrowColumnWriter>) {
+ self.writers = writers;
+ }
Review Comment:
I'm not a massive fan of this API, to be honest. I'll have a play and see what I can
come up with.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]