This is an automated email from the ASF dual-hosted git repository.
maplefu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 9b584547c7 GH-43427: [C++][Parquet] Deprecate ColumnChunk::file_offset
field and no longer write Metadata at end of Chunk (#43428)
9b584547c7 is described below
commit 9b584547c768fb09b2e33b4ad8797cf45c3b3b97
Author: mwish <[email protected]>
AuthorDate: Wed Aug 7 20:01:04 2024 +0800
GH-43427: [C++][Parquet] Deprecate ColumnChunk::file_offset field and no
longer write Metadata at end of Chunk (#43428)
### Rationale for this change
See github issue
### What changes are included in this PR?
Force `ColumnChunk::file_offset` tobe 0
### Are these changes tested?
No
### Are there any user-facing changes?
Might affect user using legacy `ColumnChunk::file_offset`
* GitHub Issue: #43427
Authored-by: mwish <[email protected]>
Signed-off-by: mwish <[email protected]>
---
c_glib/test/parquet/test-column-chunk-metadata.rb | 2 +-
cpp/src/parquet/column_writer.cc | 5 -----
cpp/src/parquet/metadata.cc | 6 +++---
cpp/src/parquet/metadata.h | 7 ++++++-
python/pyarrow/tests/parquet/test_metadata.py | 2 +-
5 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/c_glib/test/parquet/test-column-chunk-metadata.rb
b/c_glib/test/parquet/test-column-chunk-metadata.rb
index f0012f0124..4612e5bf0c 100644
--- a/c_glib/test/parquet/test-column-chunk-metadata.rb
+++ b/c_glib/test/parquet/test-column-chunk-metadata.rb
@@ -77,7 +77,7 @@ class TestParquetColumnChunkMetadata < Test::Unit::TestCase
test("#file_offset") do
assert do
- @metadata.file_offset > 0
+ @metadata.file_offset == 0
end
end
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 90e0102b42..f859ec9653 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -353,8 +353,6 @@ class SerializedPageWriter : public PageWriter {
total_compressed_size_, total_uncompressed_size_,
has_dictionary,
fallback, dict_encoding_stats_, data_encoding_stats_,
meta_encryptor_);
- // Write metadata at end of column chunk
- metadata_->WriteTo(sink_.get());
}
/**
@@ -667,9 +665,6 @@ class BufferedPageWriter : public PageWriter {
has_dictionary, fallback, pager_->dict_encoding_stats_,
pager_->data_encoding_stats_, pager_->meta_encryptor_);
- // Write metadata at end of column chunk
- metadata_->WriteTo(in_memory_sink_.get());
-
// Buffered page writer needs to adjust page offsets.
pager_->FinishPageIndexes(final_position);
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index fe16f5b76b..10c8afaf37 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -1536,10 +1536,10 @@ class
ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
const std::shared_ptr<Encryptor>& encryptor) {
if (dictionary_page_offset > 0) {
column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset);
- column_chunk_->__set_file_offset(dictionary_page_offset +
compressed_size);
- } else {
- column_chunk_->__set_file_offset(data_page_offset + compressed_size);
}
+ // The `file_offset` field is deprecated and should be set to 0.
+ // See https://github.com/apache/parquet-format/pull/440 for detail.
+ column_chunk_->__set_file_offset(0);
column_chunk_->__isset.meta_data = true;
column_chunk_->meta_data.__set_num_values(num_values);
if (index_page_offset >= 0) {
diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h
index e02d2e7c85..e46297540b 100644
--- a/cpp/src/parquet/metadata.h
+++ b/cpp/src/parquet/metadata.h
@@ -146,7 +146,12 @@ class PARQUET_EXPORT ColumnChunkMetaData {
bool Equals(const ColumnChunkMetaData& other) const;
- // column chunk
+ // Byte offset of `ColumnMetaData` in `file_path()`.
+ //
+ // Note that the meaning of this field has been inconsistent among
implementations
+ // so its use has since been deprecated in the Parquet specification. Modern
+ // implementations will set this to `0` to indicate that the
`ColumnMetaData` is solely
+ // contained in the `ColumnChunk` struct.
int64_t file_offset() const;
// parameter is only used when a dataset is spread across multiple files
diff --git a/python/pyarrow/tests/parquet/test_metadata.py
b/python/pyarrow/tests/parquet/test_metadata.py
index 52ab59a961..528cf0110d 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -122,7 +122,7 @@ def test_parquet_metadata_api():
col_meta = rg_meta.column(ncols + 2)
col_meta = rg_meta.column(0)
- assert col_meta.file_offset > 0
+ assert col_meta.file_offset == 0
assert col_meta.file_path == '' # created from BytesIO
assert col_meta.physical_type == 'BOOLEAN'
assert col_meta.num_values == 10000