jonded94 commented on issue #7489:
URL: https://github.com/apache/arrow-rs/issues/7489#issuecomment-2877156966
@alamb I tried setting `statistics_truncate_length` as well as `column_index_truncate_length`, but for some reason this did not make the resulting file readable with `pyarrow`.

I used this Rust code to initialize a Parquet writer:
```Rust
#[pyclass]
pub struct ParquetFileWriter {
    writer: Mutex<Option<ArrowWriter<FileWriter>>>,
}

impl ParquetFileWriter {
    #[allow(clippy::too_many_arguments)]
    fn try_new(
        file: FileWriter,
        schema: Schema,
        target_rows_per_row_group: NonZeroUsize,
        column_compression: Option<HashMap<ColumnPath, Compression>>,
        compression: Option<Compression>,
        statistics_enabled: Option<EnabledStatistics>,
        statistics_truncate_length: Option<NonZeroUsize>,
        column_index_truncate_length: Option<NonZeroUsize>,
    ) -> Result<Self, DiscoParquetError> {
        let props = {
            let mut builder = WriterProperties::builder()
                .set_max_row_group_size(target_rows_per_row_group.get())
                .set_key_value_metadata(Some(convert_arrow_metadata_to_parquet_metadata(
                    schema.metadata.clone(),
                )))
                .set_statistics_truncate_length(statistics_truncate_length.map(|value| value.get()))
                .set_column_index_truncate_length(
                    column_index_truncate_length.map(|value| value.get()),
                );
            // Confirm which truncation lengths actually reach the builder.
            dbg!(statistics_truncate_length);
            dbg!(column_index_truncate_length);
            if let Some(compression) = compression {
                builder = builder.set_compression(compression);
            }
            if let Some(column_compression) = column_compression {
                for (column_path, compression) in column_compression.into_iter() {
                    builder = builder.set_column_compression(column_path, compression);
                }
            }
            if let Some(statistics_enabled) = statistics_enabled {
                builder = builder.set_statistics_enabled(statistics_enabled);
            }
            builder.build()
        };
        Ok(Self {
            writer: Mutex::new(Some(ArrowWriter::try_new_with_options(
                file,
                SchemaRef::new(schema),
                ArrowWriterOptions::new().with_properties(props),
            )?)),
        })
    }

    fn write_batch(&self, batch: RecordBatch) -> Result<(), DiscoParquetError> {
        if let Some(file) = self.writer.lock()?.as_mut() {
            file.write(&batch)?;
            Ok(())
        } else {
            Err(PyValueError::new_err("File is already closed.").into())
        }
    }
}
```
And I used this pytest code to verify which combinations produce files that `pyarrow` can or cannot read:
```Python
import contextlib
from pathlib import Path

import pyarrow
import pyarrow.parquet
import pytest

# ParquetFileWriter and EnabledStatistics are provided by the Rust extension module shown above.


@pytest.mark.parametrize(
    "statistics_level,statistics_truncate_length,expected_fail",
    [
        (EnabledStatistics.NONE, 1, False),
        (EnabledStatistics.CHUNK, 1024, False),
        (EnabledStatistics.CHUNK, 16 * 1024 * 1024, False),
        (EnabledStatistics.CHUNK, None, False),
        (EnabledStatistics.PAGE, 1024, False),
        (EnabledStatistics.PAGE, 16 * 1024 * 1024, True),
        (EnabledStatistics.PAGE, None, True),
    ],
)
def test_page_statistics_pyarrow_compatibility(
    statistics_level: EnabledStatistics,
    statistics_truncate_length: int | None,
    expected_fail: bool,
    tmp_path: Path,
) -> None:
    # 16MiB values, to get statistics headers definitely above 16MiB,
    # which pyarrow doesn't support right now (version 20.0.0)
    length = 16 * 1024 * 1024
    schema = pyarrow.schema([pyarrow.field("data", pyarrow.binary())])
    data = [{"data": b"0" * length} for _ in range(2)]
    b = pyarrow.RecordBatch.from_pylist(data, schema=schema)

    path = tmp_path / "test.parquet"
    with ParquetFileWriter(
        path,
        schema,
        statistics_enabled=statistics_level,
        statistics_truncate_length=statistics_truncate_length,
        column_index_truncate_length=statistics_truncate_length,
    ) as writer:
        writer.write_batch(b)

    reader = pyarrow.parquet.ParquetFile(path)
    with pytest.raises(OSError) if expected_fail else contextlib.nullcontext():
        reader.read_row_group(0)
```
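As a follow-up diagnostic (not part of the test above), one could also check with `pyarrow` whether the truncation at least shows up in the footer. A minimal sketch, assuming `path` points at one of the files written by the test; it only exposes the row-group/column-chunk level statistics, since statistics embedded in the data page headers are not visible through this API:
```Python
import pyarrow.parquet as pq

# Hypothetical path to one of the files written by the test above.
path = "test.parquet"

# Reading only the footer worked even in the failing test case
# (ParquetFile construction succeeded there).
md = pq.read_metadata(path)
col = md.row_group(0).column(0)

# Chunk-level min/max, if written; with truncation applied these should be short.
print(col.statistics)
print(col.total_compressed_size)
print(md.row_group(0).num_rows)
```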
### Expectation
- With `EnabledStatistics` == `None`, it should never fail, since no statistics are written at all
- With `EnabledStatistics` == `Chunk`, the truncation length should not matter, since `pyarrow` appears to be able to read arbitrarily large ColumnChunk/RowGroup level statistics?
- With `EnabledStatistics` == `Page`, it should fail whenever the statistics are not truncated at all or only truncated to a large value (16MiB, for example), but it should **not** fail when they are truncated to a much smaller value (1024)
### Reality
Every expectation holds true except one: it **still** fails with `EnabledStatistics` == `Page` when truncation is set to a low value. Note that I added `dbg!` statements in the Rust code to print what `statistics_truncate_length` and `column_index_truncate_length` are actually set to (and yes, it doesn't matter if I set these to a very low value such as 1 or so):
```
[_lib/parquet/parquet_writer.rs:168:13] statistics_truncate_length = Some(
    1024,
)
[_lib/parquet/parquet_writer.rs:169:13] column_index_truncate_length = Some(
    1024,
)

tests/test_writers.py:307 (test_page_statistics_pyarrow_compatibility[statistics_level4-1024-False])
statistics_level = EnabledStatistics.PAGE, statistics_truncate_length = 1024
expected_fail = False
tmp_path = PosixPath('/tmp/pytest-of-[...]/pytest-5/test_page_statistics_pyarrow_c4')

[...]

        reader = pyarrow.parquet.ParquetFile(path)
        with pytest.raises(OSError) if expected_fail else contextlib.nullcontext():
>           reader.read_row_group(0)

test_writers.py:343:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../.venv/lib/python3.12/site-packages/pyarrow/parquet/core.py:467: in read_row_group
    return self.reader.read_row_group(i, column_indices=column_indices,
pyarrow/_parquet.pyx:1655: in pyarrow._parquet.ParquetReader.read_row_group
    ???
pyarrow/_parquet.pyx:1691: in pyarrow._parquet.ParquetReader.read_row_groups
    ???
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
>   ???
E   OSError: Couldn't deserialize thrift: No more data to read.
E   Deserializing page header failed.

pyarrow/error.pxi:92: OSError
```
Here is a Parquet file that is still unreadable with `pyarrow`:
https://limewire.com/d/Cb2YQ#T76kY0beY7
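For anyone looking at that file: a minimal sketch for localizing the failure, assuming it has been downloaded locally as `test.parquet`. Consistent with the traceback above, constructing the `ParquetFile` (i.e. parsing the footer) succeeds, and only reading the row group raises the thrift error:
```Python
import pyarrow.parquet as pq

path = "test.parquet"  # hypothetical local copy of the linked file

# The footer and schema should parse fine
# (ParquetFile construction already succeeded in the test) ...
print(pq.read_metadata(path))
pf = pq.ParquetFile(path)
print(pf.schema_arrow)

# ... while decoding the row group is what raises
# "OSError: Couldn't deserialize thrift: ... Deserializing page header failed."
pf.read_row_group(0)
```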