tustvold commented on code in PR #4269:
URL: https://github.com/apache/arrow-rs/pull/4269#discussion_r1204114796


##########
parquet/src/file/writer.rs:
##########
@@ -475,28 +464,107 @@ impl<'a, W: Write> SerializedRowGroupWriter<'a, W> {
 
             Ok(())
         };
+        (self.buf, Box::new(on_close))
+    }
 
-        let column = self.descr.column(self.column_index);
-        self.column_index += 1;
-
-        Ok(Some(factory(
-            column,
-            &self.props,
-            page_writer,
-            Box::new(on_close),
-        )?))
+    /// Returns the next column writer, if available, using the factory 
function;
+    /// otherwise returns `None`.
+    pub(crate) fn next_column_with_factory<'b, F, C>(
+        &'b mut self,
+        factory: F,
+    ) -> Result<Option<C>>
+    where
+        F: FnOnce(
+            ColumnDescPtr,
+            WriterPropertiesPtr,
+            Box<dyn PageWriter + 'b>,
+            OnCloseColumnChunk<'b>,
+        ) -> Result<C>,
+    {
+        self.assert_previous_writer_closed()?;
+        Ok(match self.next_column_desc() {
+            Some(column) => {
+                let props = self.props.clone();
+                let (buf, on_close) = self.get_on_close();
+                let page_writer = Box::new(SerializedPageWriter::new(buf));
+                Some(factory(column, props, page_writer, Box::new(on_close))?)
+            }
+            None => None,
+        })
     }
 
     /// Returns the next column writer, if available; otherwise returns `None`.
     /// In case of any IO error or Thrift error, or if row group writer has 
already been
     /// closed returns `Err`.
     pub fn next_column(&mut self) -> 
Result<Option<SerializedColumnWriter<'_>>> {
         self.next_column_with_factory(|descr, props, page_writer, on_close| {
-            let column_writer = get_column_writer(descr, props.clone(), 
page_writer);
+            let column_writer = get_column_writer(descr, props, page_writer);
             Ok(SerializedColumnWriter::new(column_writer, Some(on_close)))
         })
     }
 
+    /// Append a column chunk from another source without decoding it
+    ///
+    /// This can be used for efficiently concatenating or projecting parquet 
data,
+    /// or encoding parquet data to temporary in-memory buffers
+    pub fn splice_column<R: ChunkReader>(

Review Comment:
   > For example, perhaps we can at least make sure metadata.column_descr_ptr() 
matches the target column
   
   This check already exists - 
https://github.com/apache/arrow-rs/pull/4269/files#diff-3b307348aabe465890fa39973e9fda0243bd2344cb7cb9cdf02ac2d39521d7caR522
   
   > Explain that the close is the result from closing the previous column in 
this writer
   
   It need not be; `ColumnCloseResult` is just a struct of column data, and 
there are various ways a user could conceivably construct it.
   
   > Perhaps you plan to do that as a follow on PR
   
   I have a PR almost ready that adds a `parquet-concat` binary that will show 
how to use this.
   
   > Are we happy enough with this API to mark it pub
   
   In this case I would rather expose it so that people can explore the various 
use-cases it unlocks. I also have a PR lined up that uses it to efficiently 
concatenate parquet files, and it will need to be public for that.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to