This is an automated email from the ASF dual-hosted git repository.
maplefu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 9015a81d0e GH-44769: [C++][Parquet] Fix read/write of metadata length
footer on big-endian systems (#44787)
9015a81d0e is described below
commit 9015a81d0e9f9a861509e5e1b6f96c0d8c01a999
Author: Elliott Sales de Andrade <[email protected]>
AuthorDate: Wed Nov 20 10:20:52 2024 -0500
GH-44769: [C++][Parquet] Fix read/write of metadata length footer on
big-endian systems (#44787)
### Rationale for this change
See issue.
### What changes are included in this PR?
- [Fix writing Parquet metadata length
footer](https://github.com/apache/arrow/commit/fd3cf89d3c978078d96300f770eccc8e7de4bc40)
Converting the `uint32_t` length to little-endian before casting it to a
`uint8_t*` ensures the bytes written to the output file are always little-endian.
- [Fix reading Parquet metadata length
footer](https://github.com/apache/arrow/commit/4b1dd1b6e5533096764d2541842b4f5fdf858a4e)
### Are these changes tested?
Yes.
### Are there any user-facing changes?
On big-endian systems, reading a Parquet file no longer fails the footer
metadata size check, though that doesn't guarantee anything else will work yet.
* GitHub Issue: #44769
Authored-by: Elliott Sales de Andrade <[email protected]>
Signed-off-by: mwish <[email protected]>
---
cpp/src/parquet/file_reader.cc | 7 ++++---
cpp/src/parquet/file_writer.cc | 14 +++++++++++---
2 files changed, 15 insertions(+), 6 deletions(-)
diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc
index 3e9eeea6c6..3cc42ae370 100644
--- a/cpp/src/parquet/file_reader.cc
+++ b/cpp/src/parquet/file_reader.cc
@@ -497,9 +497,10 @@ class SerializedFile : public ParquetFileReader::Contents {
"is not a parquet file.");
}
// Both encrypted/unencrypted footers have the same footer length check.
- uint32_t metadata_len = ::arrow::util::SafeLoadAs<uint32_t>(
- reinterpret_cast<const uint8_t*>(footer_buffer->data()) + footer_read_size -
- kFooterSize);
+ uint32_t metadata_len =
+ ::arrow::bit_util::FromLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(
+ reinterpret_cast<const uint8_t*>(footer_buffer->data()) + footer_read_size -
+ kFooterSize));
if (metadata_len > source_size_ - kFooterSize) {
throw ParquetInvalidOrCorruptedFileException(
"Parquet file size is ", source_size_,
diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc
index baa9e00da2..854e2161f8 100644
--- a/cpp/src/parquet/file_writer.cc
+++ b/cpp/src/parquet/file_writer.cc
@@ -426,8 +426,10 @@ class FileSerializer : public ParquetFileWriter::Contents {
WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_encryptor, true);
PARQUET_ASSIGN_OR_THROW(position, sink_->Tell());
uint32_t footer_and_crypto_len = static_cast<uint32_t>(position - metadata_start);
+ uint32_t footer_and_crypto_len_le =
+ ::arrow::bit_util::ToLittleEndian(footer_and_crypto_len);
PARQUET_THROW_NOT_OK(
- sink_->Write(reinterpret_cast<uint8_t*>(&footer_and_crypto_len), 4));
+ sink_->Write(reinterpret_cast<uint8_t*>(&footer_and_crypto_len_le), 4));
PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4));
} else { // Encrypted file with plaintext footer
file_metadata_ = metadata_->Finish(key_value_metadata_);
@@ -539,7 +541,10 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin
metadata_len = static_cast<uint32_t>(position) - metadata_len;
// Write Footer
- PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<uint8_t*>(&metadata_len), 4));
+ {
+ uint32_t metadata_len_le = ::arrow::bit_util::ToLittleEndian(metadata_len);
+ PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<uint8_t*>(&metadata_len_le), 4));
+ }
PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
}
@@ -562,7 +567,10 @@ void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
PARQUET_ASSIGN_OR_THROW(position, sink->Tell());
metadata_len = static_cast<uint32_t>(position) - metadata_len;
- PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<uint8_t*>(&metadata_len), 4));
+ {
+ uint32_t metadata_len_le = ::arrow::bit_util::ToLittleEndian(metadata_len);
+ PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<uint8_t*>(&metadata_len_le), 4));
+ }
PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
}
}