jonded94 commented on issue #7489:
URL: https://github.com/apache/arrow-rs/issues/7489#issuecomment-2877156966
@alamb I tried setting `statistics_truncate_length` as well as `column_index_truncate_length`, but for some reason this did not make the resulting file readable with `pyarrow`.

I used this Rust code to initialize a Parquet writer:
```Rust
#[pyclass]
pub struct ParquetFileWriter {
    writer: Mutex<Option<ArrowWriter<FileWriter>>>,
}

impl ParquetFileWriter {
    #[allow(clippy::too_many_arguments)]
    fn try_new(
        file: FileWriter,
        schema: Schema,
        target_rows_per_row_group: NonZeroUsize,
        column_compression: Option<HashMap<ColumnPath, Compression>>,
        compression: Option<Compression>,
        statistics_enabled: Option<EnabledStatistics>,
        statistics_truncate_length: Option<NonZeroUsize>,
        column_index_truncate_length: Option<NonZeroUsize>,
    ) -> Result<Self, DiscoParquetError> {
        let props = {
            let mut builder = WriterProperties::builder()
                .set_max_row_group_size(target_rows_per_row_group.get())
                .set_key_value_metadata(Some(convert_arrow_metadata_to_parquet_metadata(
                    schema.metadata.clone(),
                )))
                .set_statistics_truncate_length(statistics_truncate_length.map(|value| value.get()))
                .set_column_index_truncate_length(
                    column_index_truncate_length.map(|value| value.get()),
                );
            // Confirm which truncation lengths actually reach the builder.
            dbg!(statistics_truncate_length);
            dbg!(column_index_truncate_length);
            if let Some(compression) = compression {
                builder = builder.set_compression(compression);
            }
            if let Some(column_compression) = column_compression {
                for (column_path, compression) in column_compression.into_iter() {
                    builder = builder.set_column_compression(column_path, compression);
                }
            }
            if let Some(statistics_enabled) = statistics_enabled {
                builder = builder.set_statistics_enabled(statistics_enabled);
            }
            builder.build()
        };
        Ok(Self {
            writer: Mutex::new(Some(ArrowWriter::try_new_with_options(
                file,
                SchemaRef::new(schema),
                ArrowWriterOptions::new().with_properties(props),
            )?)),
        })
    }

    fn write_batch(&self, batch: RecordBatch) -> Result<(), DiscoParquetError> {
        if let Some(file) = self.writer.lock()?.as_mut() {
            file.write(&batch)?;
            Ok(())
        } else {
            Err(PyValueError::new_err("File is already closed.").into())
        }
    }
}
```
And I used this pytest code to verify which combinations produce files that `pyarrow` can or cannot read:
```Python
import contextlib
from pathlib import Path

import pyarrow
import pyarrow.parquet
import pytest

# ParquetFileWriter and EnabledStatistics are provided by the Rust extension module shown above.


@pytest.mark.parametrize(
    "statistics_level,statistics_truncate_length,expected_fail",
    [
        (EnabledStatistics.NONE, 1, False),
        (EnabledStatistics.CHUNK, 1024, False),
        (EnabledStatistics.CHUNK, 16 * 1024 * 1024, False),
        (EnabledStatistics.CHUNK, None, False),
        (EnabledStatistics.PAGE, 1024, False),
        (EnabledStatistics.PAGE, 16 * 1024 * 1024, True),
        (EnabledStatistics.PAGE, None, True),
    ],
)
def test_page_statistics_pyarrow_compatibility(
    statistics_level: EnabledStatistics,
    statistics_truncate_length: int | None,
    expected_fail: bool,
    tmp_path: Path,
) -> None:
    # 16MiB values, to get statistics headers definitely above 16MiB,
    # which pyarrow doesn't support right now (version 20.0.0)
    length = 16 * 1024 * 1024
    schema = pyarrow.schema([pyarrow.field("data", pyarrow.binary())])
    data = [{"data": b"0" * length} for _ in range(2)]
    b = pyarrow.RecordBatch.from_pylist(data, schema=schema)

    path = tmp_path / "test.parquet"
    with ParquetFileWriter(
        path,
        schema,
        statistics_enabled=statistics_level,
        statistics_truncate_length=statistics_truncate_length,
        column_index_truncate_length=statistics_truncate_length,
    ) as writer:
        writer.write_batch(b)

    reader = pyarrow.parquet.ParquetFile(path)
    with pytest.raises(OSError) if expected_fail else contextlib.nullcontext():
        reader.read_row_group(0)
```
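As a follow-up diagnostic (not part of the test above), one could also check with `pyarrow` whether the truncation at least shows up in the footer. A minimal sketch, assuming `path` points at one of the files written by the test; it only exposes the row-group/column-chunk level statistics, since statistics embedded in the data page headers are not visible through this API:
```Python
import pyarrow.parquet as pq

# Hypothetical path to one of the files written by the test above.
path = "test.parquet"

# Reading only the footer worked even in the failing test case
# (ParquetFile construction succeeded there).
md = pq.read_metadata(path)
col = md.row_group(0).column(0)

# Chunk-level min/max, if written; with truncation applied these should be short.
print(col.statistics)
print(col.total_compressed_size)
print(md.row_group(0).num_rows)
```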
### Expectation
- With `EnabledStatistics` == `None`, it should never fail, since no statistics are written at all
- With `EnabledStatistics` == `Chunk`, the truncation length should not matter, since `pyarrow` appears to be able to read arbitrarily large ColumnChunk/RowGroup level statistics?
- With `EnabledStatistics` == `Page`, it should fail whenever the statistics are not truncated at all or only truncated to a large value (16MiB, for example), but it should **not** fail when they are truncated to a much smaller value (1024)
### Reality
Every expectation holds true except one: it **still** fails with `EnabledStatistics` == `Page` when truncation is set to a low value. Note that I added `dbg!` statements in the Rust code to print what `statistics_truncate_length` and `column_index_truncate_length` are actually set to (and yes, it doesn't matter if I set these to a very low value such as 1 or so):
```
[_lib/parquet/parquet_writer.rs:168:13] statistics_truncate_length = Some(
    1024,
)
[_lib/parquet/parquet_writer.rs:169:13] column_index_truncate_length = Some(
    1024,
)

tests/test_writers.py:307 (test_page_statistics_pyarrow_compatibility[statistics_level4-1024-False])
statistics_level = EnabledStatistics.PAGE, statistics_truncate_length = 1024
expected_fail = False
tmp_path = PosixPath('/tmp/pytest-of-[...]/pytest-5/test_page_statistics_pyarrow_c4')

[...]

        reader = pyarrow.parquet.ParquetFile(path)
        with pytest.raises(OSError) if expected_fail else contextlib.nullcontext():
>           reader.read_row_group(0)

test_writers.py:343:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../.venv/lib/python3.12/site-packages/pyarrow/parquet/core.py:467: in read_row_group
    return self.reader.read_row_group(i, column_indices=column_indices,
pyarrow/_parquet.pyx:1655: in pyarrow._parquet.ParquetReader.read_row_group
    ???
pyarrow/_parquet.pyx:1691: in pyarrow._parquet.ParquetReader.read_row_groups
    ???
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
>   ???
E   OSError: Couldn't deserialize thrift: No more data to read.
E   Deserializing page header failed.

pyarrow/error.pxi:92: OSError
```
Here is a Parquet file that is still unreadable with `pyarrow`:
https://limewire.com/d/Cb2YQ#T76kY0beY7
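For anyone looking at that file: a minimal sketch for localizing the failure, assuming it has been downloaded locally as `test.parquet`. Consistent with the traceback above, constructing the `ParquetFile` (i.e. parsing the footer) succeeds, and only reading the row group raises the thrift error:
```Python
import pyarrow.parquet as pq

path = "test.parquet"  # hypothetical local copy of the linked file

# The footer and schema should parse fine
# (ParquetFile construction already succeeded in the test) ...
print(pq.read_metadata(path))
pf = pq.ParquetFile(path)
print(pf.schema_arrow)

# ... while decoding the row group is what raises
# "OSError: Couldn't deserialize thrift: ... Deserializing page header failed."
pf.read_row_group(0)
```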