This is an automated email from the ASF dual-hosted git repository.
maplefu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 2767dc55cb GH-41579: [C++][Python][Parquet] Support reading/writing
key-value metadata from/to ColumnChunkMetaData (#41580)
2767dc55cb is described below
commit 2767dc55cb41377af6895141f717475d73b2892d
Author: Chungmin Lee <[email protected]>
AuthorDate: Thu Aug 15 09:32:22 2024 -0700
GH-41579: [C++][Python][Parquet] Support reading/writing key-value metadata
from/to ColumnChunkMetaData (#41580)
### Rationale for this change
The Parquet standard allows reading/writing key-value metadata from/to
ColumnChunkMetaData, but there is no way to do that with Parquet C++.
### What changes are included in this PR?
Support reading/writing key-value metadata from/to ColumnChunkMetaData with
Parquet C++ reader/writer. Support reading key-value metadata from
ColumnChunkMetaData with pyarrow.parquet.
### Are these changes tested?
Yes, unit tests are added.
### Are there any user-facing changes?
Yes.
- Users can read or write key-value metadata for column chunks with Parquet
C++.
- Users can read key-value metadata for column chunks with PyArrow.
- parquet-reader tool prints key-value metadata in column chunks when
`--print-key-value-metadata` option is used.
* GitHub Issue: #41579
Lead-authored-by: Chungmin Lee <[email protected]>
Co-authored-by: mwish <[email protected]>
Signed-off-by: mwish <[email protected]>
---
cpp/src/parquet/column_writer.cc | 24 ++++++++
cpp/src/parquet/column_writer.h | 12 ++++
cpp/src/parquet/column_writer_test.cc | 69 ++++++++++++++++++++++
cpp/src/parquet/metadata.cc | 84 +++++++++++++++++++++------
cpp/src/parquet/metadata.h | 5 ++
cpp/src/parquet/printer.cc | 32 +++++++---
python/pyarrow/_parquet.pxd | 1 +
python/pyarrow/_parquet.pyx | 13 +++++
python/pyarrow/tests/parquet/conftest.py | 12 ++++
python/pyarrow/tests/parquet/test_metadata.py | 9 +++
10 files changed, 235 insertions(+), 26 deletions(-)
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index f859ec9653..40d19d38e1 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -40,6 +40,7 @@
#include "arrow/util/crc32.h"
#include "arrow/util/endian.h"
#include "arrow/util/float16.h"
+#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
#include "arrow/util/rle_encoding_internal.h"
#include "arrow/util/type_traits.h"
@@ -832,6 +833,9 @@ class ColumnWriterImpl {
void FlushBufferedDataPages();
ColumnChunkMetaDataBuilder* metadata_;
+ // key_value_metadata_ for the column chunk
+ // It would be nullptr if there is no KeyValueMetadata set.
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
const ColumnDescriptor* descr_;
// scratch buffer if validity bits need to be recalculated.
std::shared_ptr<ResizableBuffer> bits_buffer_;
@@ -1100,6 +1104,7 @@ int64_t ColumnWriterImpl::Close() {
if (rows_written_ > 0 && chunk_statistics.is_set()) {
metadata_->SetStatistics(chunk_statistics);
}
+ metadata_->SetKeyValueMetadata(key_value_metadata_);
pager_->Close(has_dictionary_, fallback_);
}
@@ -1397,6 +1402,25 @@ class TypedColumnWriterImpl : public ColumnWriterImpl,
public TypedColumnWriter<
return pages_change_on_record_boundaries_;
}
+ void AddKeyValueMetadata(
+ const std::shared_ptr<const KeyValueMetadata>& key_value_metadata)
override {
+ if (closed_) {
+ throw ParquetException("Cannot add key-value metadata to closed column");
+ }
+ if (key_value_metadata_ == nullptr) {
+ key_value_metadata_ = key_value_metadata;
+ } else if (key_value_metadata != nullptr) {
+ key_value_metadata_ = key_value_metadata_->Merge(*key_value_metadata);
+ }
+ }
+
+ void ResetKeyValueMetadata() override {
+ if (closed_) {
+ throw ParquetException("Cannot add key-value metadata to closed column");
+ }
+ key_value_metadata_ = nullptr;
+ }
+
private:
using ValueEncoderType = typename EncodingTraits<DType>::Encoder;
using TypedStats = TypedStatistics<DType>;
diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h
index a278670fa8..845bf9aa89 100644
--- a/cpp/src/parquet/column_writer.h
+++ b/cpp/src/parquet/column_writer.h
@@ -21,6 +21,7 @@
#include <cstring>
#include <memory>
+#include "arrow/type_fwd.h"
#include "arrow/util/compression.h"
#include "parquet/exception.h"
#include "parquet/platform.h"
@@ -181,6 +182,17 @@ class PARQUET_EXPORT ColumnWriter {
/// \brief The file-level writer properties
virtual const WriterProperties* properties() = 0;
+ /// \brief Add key-value metadata to the ColumnChunk.
+ /// \param[in] key_value_metadata the metadata to add.
+ /// \note This will overwrite any existing metadata with the same key.
+ /// \throw ParquetException if Close() has been called.
+ virtual void AddKeyValueMetadata(
+ const std::shared_ptr<const ::arrow::KeyValueMetadata>&
key_value_metadata) = 0;
+
+ /// \brief Reset the ColumnChunk key-value metadata.
+ /// \throw ParquetException if Close() has been called.
+ virtual void ResetKeyValueMetadata() = 0;
+
/// \brief Write Apache Arrow columnar data directly to ColumnWriter. Returns
/// error status if the array data type is not compatible with the concrete
/// writer type.
diff --git a/cpp/src/parquet/column_writer_test.cc
b/cpp/src/parquet/column_writer_test.cc
index c99efd1796..d2b3aa0dff 100644
--- a/cpp/src/parquet/column_writer_test.cc
+++ b/cpp/src/parquet/column_writer_test.cc
@@ -23,10 +23,12 @@
#include <gtest/gtest.h>
#include "arrow/io/buffered.h"
+#include "arrow/io/file.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_builders.h"
#include "arrow/util/config.h"
+#include "arrow/util/key_value_metadata.h"
#include "parquet/column_page.h"
#include "parquet/column_reader.h"
@@ -51,6 +53,9 @@ using schema::PrimitiveNode;
namespace test {
+using ::testing::IsNull;
+using ::testing::NotNull;
+
// The default size used in most tests.
const int SMALL_SIZE = 100;
#ifdef PARQUET_VALGRIND
@@ -385,6 +390,15 @@ class TestPrimitiveWriter : public
PrimitiveTypedTest<TestType> {
return metadata_accessor->encoding_stats();
}
+ std::shared_ptr<const KeyValueMetadata> metadata_key_value_metadata() {
+ // Metadata accessor must be created lazily.
+ // This is because the ColumnChunkMetaData semantics dictate the metadata
object is
+ // complete (no changes to the metadata buffer can be made after
instantiation)
+ auto metadata_accessor =
+ ColumnChunkMetaData::Make(metadata_->contents(), this->descr_);
+ return metadata_accessor->key_value_metadata();
+ }
+
protected:
int64_t values_read_;
// Keep the reader alive as for ByteArray the lifetime of the ByteArray
@@ -1705,5 +1719,60 @@ TEST(TestColumnWriter, WriteDataPageV2HeaderNullCount) {
}
}
+using TestInt32Writer = TestPrimitiveWriter<Int32Type>;
+
+TEST_F(TestInt32Writer, NoWriteKeyValueMetadata) {
+ auto writer = this->BuildWriter();
+ writer->Close();
+ auto key_value_metadata = metadata_key_value_metadata();
+ ASSERT_THAT(key_value_metadata, IsNull());
+}
+
+TEST_F(TestInt32Writer, WriteKeyValueMetadata) {
+ auto writer = this->BuildWriter();
+ writer->AddKeyValueMetadata(
+ KeyValueMetadata::Make({"hello", "bye"}, {"world", "earth"}));
+ // overwrite the previous value
+ writer->AddKeyValueMetadata(KeyValueMetadata::Make({"bye"}, {"moon"}));
+ writer->Close();
+ auto key_value_metadata = metadata_key_value_metadata();
+ ASSERT_THAT(key_value_metadata, NotNull());
+ ASSERT_EQ(2, key_value_metadata->size());
+ ASSERT_OK_AND_ASSIGN(auto value, key_value_metadata->Get("hello"));
+ ASSERT_EQ("world", value);
+ ASSERT_OK_AND_ASSIGN(value, key_value_metadata->Get("bye"));
+ ASSERT_EQ("moon", value);
+}
+
+TEST_F(TestInt32Writer, ResetKeyValueMetadata) {
+ auto writer = this->BuildWriter();
+ writer->AddKeyValueMetadata(KeyValueMetadata::Make({"hello"}, {"world"}));
+ writer->ResetKeyValueMetadata();
+ writer->Close();
+ auto key_value_metadata = metadata_key_value_metadata();
+ ASSERT_THAT(key_value_metadata, IsNull());
+}
+
+TEST_F(TestInt32Writer, WriteKeyValueMetadataEndToEnd) {
+ auto sink = CreateOutputStream();
+ {
+ auto file_writer = ParquetFileWriter::Open(
+ sink,
std::dynamic_pointer_cast<schema::GroupNode>(schema_.schema_root()));
+ auto rg_writer = file_writer->AppendRowGroup();
+ auto col_writer = rg_writer->NextColumn();
+ col_writer->AddKeyValueMetadata(KeyValueMetadata::Make({"foo"}, {"bar"}));
+ file_writer->Close();
+ }
+ ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish());
+ auto file_reader =
+
ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer));
+ auto key_value_metadata =
+
file_reader->metadata()->RowGroup(0)->ColumnChunk(0)->key_value_metadata();
+ ASSERT_THAT(key_value_metadata, NotNull());
+ ASSERT_EQ(1U, key_value_metadata->size());
+ ASSERT_OK_AND_ASSIGN(auto value, key_value_metadata->Get("foo"));
+ ASSERT_EQ("bar", value);
+}
+
} // namespace test
} // namespace parquet
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index 10c8afaf37..4f2aa6e373 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -135,6 +135,39 @@ std::shared_ptr<Statistics> MakeColumnStats(const
format::ColumnMetaData& meta_d
throw ParquetException("Can't decode page statistics for selected column
type");
}
+// Get KeyValueMetadata from parquet Thrift RowGroup or ColumnChunk metadata.
+//
+// Returns nullptr if the metadata is not set.
+template <typename Metadata>
+std::shared_ptr<KeyValueMetadata> FromThriftKeyValueMetadata(const Metadata&
source) {
+ std::shared_ptr<KeyValueMetadata> metadata = nullptr;
+ if (source.__isset.key_value_metadata) {
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+ keys.reserve(source.key_value_metadata.size());
+ values.reserve(source.key_value_metadata.size());
+ for (const auto& it : source.key_value_metadata) {
+ keys.push_back(it.key);
+ values.push_back(it.value);
+ }
+ metadata = std::make_shared<KeyValueMetadata>(std::move(keys),
std::move(values));
+ }
+ return metadata;
+}
+
+template <typename Metadata>
+void ToThriftKeyValueMetadata(const KeyValueMetadata& source, Metadata*
metadata) {
+ std::vector<format::KeyValue> key_value_metadata;
+ key_value_metadata.reserve(static_cast<size_t>(source.size()));
+ for (int64_t i = 0; i < source.size(); ++i) {
+ format::KeyValue kv_pair;
+ kv_pair.__set_key(source.key(i));
+ kv_pair.__set_value(source.value(i));
+ key_value_metadata.emplace_back(std::move(kv_pair));
+ }
+ metadata->__set_key_value_metadata(std::move(key_value_metadata));
+}
+
// MetaData Accessor
// ColumnCryptoMetaData
@@ -233,6 +266,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
encoding_stats.count});
}
possible_stats_ = nullptr;
+ InitKeyValueMetadata();
}
bool Equals(const ColumnChunkMetaDataImpl& other) const {
@@ -343,7 +377,15 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
return std::nullopt;
}
+ const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
+ return key_value_metadata_;
+ }
+
private:
+ void InitKeyValueMetadata() {
+ key_value_metadata_ = FromThriftKeyValueMetadata(*column_metadata_);
+ }
+
mutable std::shared_ptr<Statistics> possible_stats_;
std::vector<Encoding::type> encodings_;
std::vector<PageEncodingStats> encoding_stats_;
@@ -353,6 +395,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
const ColumnDescriptor* descr_;
const ReaderProperties properties_;
const ApplicationVersion* writer_version_;
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
};
std::unique_ptr<ColumnChunkMetaData> ColumnChunkMetaData::Make(
@@ -471,6 +514,11 @@ bool ColumnChunkMetaData::Equals(const
ColumnChunkMetaData& other) const {
return impl_->Equals(*other.impl_);
}
+const std::shared_ptr<const KeyValueMetadata>&
ColumnChunkMetaData::key_value_metadata()
+ const {
+ return impl_->key_value_metadata();
+}
+
// row-group metadata
class RowGroupMetaData::RowGroupMetaDataImpl {
public:
@@ -913,7 +961,7 @@ class FileMetaData::FileMetaDataImpl {
std::vector<parquet::ColumnOrder> column_orders;
if (metadata_->__isset.column_orders) {
column_orders.reserve(metadata_->column_orders.size());
- for (auto column_order : metadata_->column_orders) {
+ for (auto& column_order : metadata_->column_orders) {
if (column_order.__isset.TYPE_ORDER) {
column_orders.push_back(ColumnOrder::type_defined_);
} else {
@@ -928,14 +976,7 @@ class FileMetaData::FileMetaDataImpl {
}
void InitKeyValueMetadata() {
- std::shared_ptr<KeyValueMetadata> metadata = nullptr;
- if (metadata_->__isset.key_value_metadata) {
- metadata = std::make_shared<KeyValueMetadata>();
- for (const auto& it : metadata_->key_value_metadata) {
- metadata->Append(it.key, it.value);
- }
- }
- key_value_metadata_ = std::move(metadata);
+ key_value_metadata_ = FromThriftKeyValueMetadata(*metadata_);
}
};
@@ -1590,6 +1631,10 @@ class
ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
column_chunk_->meta_data.__set_encodings(std::move(thrift_encodings));
column_chunk_->meta_data.__set_encoding_stats(std::move(thrift_encoding_stats));
+ if (key_value_metadata_) {
+ ToThriftKeyValueMetadata(*key_value_metadata_,
&column_chunk_->meta_data);
+ }
+
const auto& encrypt_md =
properties_->column_encryption_properties(column_->path()->ToDotString());
// column is encrypted
@@ -1656,6 +1701,10 @@ class
ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
return column_chunk_->meta_data.total_compressed_size;
}
+ void SetKeyValueMetadata(std::shared_ptr<const KeyValueMetadata>
key_value_metadata) {
+ key_value_metadata_ = std::move(key_value_metadata);
+ }
+
private:
void Init(format::ColumnChunk* column_chunk) {
column_chunk_ = column_chunk;
@@ -1670,6 +1719,7 @@ class
ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
std::unique_ptr<format::ColumnChunk> owned_column_chunk_;
const std::shared_ptr<WriterProperties> properties_;
const ColumnDescriptor* column_;
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
};
std::unique_ptr<ColumnChunkMetaDataBuilder> ColumnChunkMetaDataBuilder::Make(
@@ -1727,6 +1777,11 @@ void ColumnChunkMetaDataBuilder::SetStatistics(const
EncodedStatistics& result)
impl_->SetStatistics(result);
}
+void ColumnChunkMetaDataBuilder::SetKeyValueMetadata(
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata) {
+ impl_->SetKeyValueMetadata(std::move(key_value_metadata));
+}
+
int64_t ColumnChunkMetaDataBuilder::total_compressed_size() const {
return impl_->total_compressed_size();
}
@@ -1925,16 +1980,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl {
} else if (key_value_metadata) {
key_value_metadata_ = key_value_metadata_->Merge(*key_value_metadata);
}
- metadata_->key_value_metadata.clear();
- metadata_->key_value_metadata.reserve(
- static_cast<size_t>(key_value_metadata_->size()));
- for (int64_t i = 0; i < key_value_metadata_->size(); ++i) {
- format::KeyValue kv_pair;
- kv_pair.__set_key(key_value_metadata_->key(i));
- kv_pair.__set_value(key_value_metadata_->value(i));
- metadata_->key_value_metadata.push_back(std::move(kv_pair));
- }
- metadata_->__isset.key_value_metadata = true;
+ ToThriftKeyValueMetadata(*key_value_metadata_, metadata_.get());
}
int32_t file_version = 0;
diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h
index e46297540b..d1e2d1904a 100644
--- a/cpp/src/parquet/metadata.h
+++ b/cpp/src/parquet/metadata.h
@@ -184,6 +184,7 @@ class PARQUET_EXPORT ColumnChunkMetaData {
std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const;
std::optional<IndexLocation> GetColumnIndexLocation() const;
std::optional<IndexLocation> GetOffsetIndexLocation() const;
+ const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
private:
explicit ColumnChunkMetaData(
@@ -466,8 +467,12 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
// column chunk
// Used when a dataset is spread across multiple files
void set_file_path(const std::string& path);
+
// column metadata
void SetStatistics(const EncodedStatistics& stats);
+
+ void SetKeyValueMetadata(std::shared_ptr<const KeyValueMetadata>
key_value_metadata);
+
// get the column descriptor
const ColumnDescriptor* descr() const;
diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc
index 33df5925a1..60adfc697f 100644
--- a/cpp/src/parquet/printer.cc
+++ b/cpp/src/parquet/printer.cc
@@ -64,6 +64,25 @@ void PrintPageEncodingStats(std::ostream& stream,
// the fixed initial size is just for an example
#define COL_WIDTH 30
+void PutChars(std::ostream& stream, char c, int n) {
+ for (int i = 0; i < n; ++i) {
+ stream.put(c);
+ }
+}
+
+void PrintKeyValueMetadata(std::ostream& stream,
+ const KeyValueMetadata& key_value_metadata,
+ int indent_level = 0, int indent_width = 1) {
+ const int64_t size_of_key_value_metadata = key_value_metadata.size();
+ PutChars(stream, ' ', indent_level * indent_width);
+ stream << "Key Value Metadata: " << size_of_key_value_metadata << "
entries\n";
+ for (int64_t i = 0; i < size_of_key_value_metadata; i++) {
+ PutChars(stream, ' ', (indent_level + 1) * indent_width);
+ stream << "Key nr " << i << " " << key_value_metadata.key(i) << ": "
+ << key_value_metadata.value(i) << "\n";
+ }
+}
+
void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int>
selected_columns,
bool print_values, bool format_dump,
bool print_key_value_metadata, const char*
filename) {
@@ -76,12 +95,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream,
std::list<int> selecte
if (print_key_value_metadata && file_metadata->key_value_metadata()) {
auto key_value_metadata = file_metadata->key_value_metadata();
- int64_t size_of_key_value_metadata = key_value_metadata->size();
- stream << "Key Value File Metadata: " << size_of_key_value_metadata << "
entries\n";
- for (int64_t i = 0; i < size_of_key_value_metadata; i++) {
- stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": "
- << key_value_metadata->value(i) << "\n";
- }
+ PrintKeyValueMetadata(stream, *key_value_metadata);
}
stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n";
@@ -136,7 +150,11 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream,
std::list<int> selecte
std::shared_ptr<Statistics> stats = column_chunk->statistics();
const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
- stream << "Column " << i << std::endl << " Values: " <<
column_chunk->num_values();
+ stream << "Column " << i << std::endl;
+ if (print_key_value_metadata && column_chunk->key_value_metadata()) {
+ PrintKeyValueMetadata(stream, *column_chunk->key_value_metadata(), 1,
2);
+ }
+ stream << " Values: " << column_chunk->num_values();
if (column_chunk->is_stats_set()) {
std::string min = stats->EncodeMin(), max = stats->EncodeMax();
stream << ", Null Values: " << stats->null_count()
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index 35d15227ee..d6aebd8284 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -328,6 +328,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet"
nogil:
unique_ptr[CColumnCryptoMetaData] crypto_metadata() const
optional[ParquetIndexLocation] GetColumnIndexLocation() const
optional[ParquetIndexLocation] GetOffsetIndexLocation() const
+ shared_ptr[const CKeyValueMetadata] key_value_metadata() const
struct CSortingColumn" parquet::SortingColumn":
int column_idx
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 41b15b633d..254bfe3b09 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -508,6 +508,19 @@ cdef class ColumnChunkMetaData(_Weakrefable):
"""Whether the column chunk has a column index"""
return self.metadata.GetColumnIndexLocation().has_value()
+ @property
+ def metadata(self):
+ """Additional metadata as key value pairs (dict[bytes, bytes])."""
+ cdef:
+ unordered_map[c_string, c_string] metadata
+ const CKeyValueMetadata* underlying_metadata
+ underlying_metadata = self.metadata.key_value_metadata().get()
+ if underlying_metadata != NULL:
+ underlying_metadata.ToUnorderedMap(&metadata)
+ return metadata
+ else:
+ return None
+
cdef class SortingColumn:
"""
diff --git a/python/pyarrow/tests/parquet/conftest.py
b/python/pyarrow/tests/parquet/conftest.py
index 767e7f6b69..80605e973c 100644
--- a/python/pyarrow/tests/parquet/conftest.py
+++ b/python/pyarrow/tests/parquet/conftest.py
@@ -15,6 +15,9 @@
# specific language governing permissions and limitations
# under the License.
+import os
+import pathlib
+
import pytest
from pyarrow.util import guid
@@ -25,6 +28,15 @@ def datadir(base_datadir):
return base_datadir / 'parquet'
[email protected](scope='module')
+def parquet_test_datadir():
+ result = os.environ.get('PARQUET_TEST_DATA')
+ if not result:
+ raise RuntimeError('Please point the PARQUET_TEST_DATA environment '
+ 'variable to the test data directory')
+ return pathlib.Path(result)
+
+
@pytest.fixture
def s3_bucket(s3_server):
boto3 = pytest.importorskip('boto3')
diff --git a/python/pyarrow/tests/parquet/test_metadata.py
b/python/pyarrow/tests/parquet/test_metadata.py
index 528cf0110d..c29213ebc3 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -782,3 +782,12 @@ def test_write_metadata_fs_file_combinations(tempdir,
s3_example_s3fs):
assert meta1.read_bytes() == meta2.read_bytes() \
== meta3.read_bytes() == meta4.read_bytes() \
== s3_fs.open(meta5).read()
+
+
+def test_column_chunk_key_value_metadata(parquet_test_datadir):
+ metadata = pq.read_metadata(parquet_test_datadir /
+ 'column_chunk_key_value_metadata.parquet')
+ key_value_metadata1 = metadata.row_group(0).column(0).metadata
+ assert key_value_metadata1 == {b'foo': b'bar', b'thisiskeywithoutvalue':
b''}
+ key_value_metadata2 = metadata.row_group(0).column(1).metadata
+ assert key_value_metadata2 is None