alamb commented on code in PR #5268:
URL: https://github.com/apache/arrow-datafusion/pull/5268#discussion_r1105847268
##########
datafusion/core/src/datasource/file_format/parquet.rs:
##########
@@ -550,50 +550,63 @@ pub(crate) mod test_util {
use parquet::file::properties::WriterProperties;
use tempfile::NamedTempFile;
+ /// How many rows per page should be written
+ const ROWS_PER_PAGE: usize = 2;
+
/// Writes `batches` to a temporary parquet file
///
- /// If multi_page is set to `true`, all batches are written into
- /// one temporary parquet file and the parquet file is written
+ /// If multi_page is set to `true`, the parquet file(s) are written
/// with 2 rows per data page (used to test page filtering and
/// boundaries).
pub async fn store_parquet(
batches: Vec<RecordBatch>,
multi_page: bool,
) -> Result<(Vec<ObjectMeta>, Vec<NamedTempFile>)> {
- if multi_page {
- // All batches write in to one file, each batch must have same schema.
- let mut output = NamedTempFile::new().expect("creating temp file");
- let mut builder = WriterProperties::builder();
- builder = builder.set_data_page_row_count_limit(2);
- let proper = builder.build();
- let mut writer =
- ArrowWriter::try_new(&mut output, batches[0].schema(), Some(proper))
- .expect("creating writer");
- for b in batches {
- writer.write(&b).expect("Writing batch");
- }
- writer.close().unwrap();
- Ok((vec![local_unpartitioned_file(&output)], vec![output]))
- } else {
- // Each batch writes to their own file
- let files: Vec<_> = batches
- .into_iter()
- .map(|batch| {
- let mut output = NamedTempFile::new().expect("creating temp file");
+ // Each batch writes to their own file
+ let files: Vec<_> = batches
+ .into_iter()
+ .map(|batch| {
+ let mut output = NamedTempFile::new().expect("creating temp file");
+
+ let builder = WriterProperties::builder();
+ let props = if multi_page {
+ builder.set_data_page_row_count_limit(2)
Review Comment:
👍 you are totally right -- good catch
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]