pitrou commented on a change in pull request #10289:
URL: https://github.com/apache/arrow/pull/10289#discussion_r639561783
##########
File path: python/pyarrow/tests/parquet/test_metadata.py
##########
@@ -303,28 +320,30 @@ def test_field_id_metadata():
pf = pq.ParquetFile(pa.BufferReader(contents))
schema = pf.schema_arrow
- # Expected Parquet schema for reference
- #
- # required group field_id=0 schema {
- # optional int32 field_id=1 f0;
- # optional group field_id=2 f1 (List) {
- # repeated group field_id=3 list {
- # optional int32 field_id=4 item;
- # }
- # }
- # optional binary field_id=5 f2;
- # }
-
field_name = b'PARQUET:field_id'
Review comment:
You already have it named `field_id` above.
##########
File path: cpp/src/parquet/schema_test.cc
##########
@@ -171,17 +167,16 @@ TEST_F(TestPrimitiveNode, Attrs) {
}
TEST_F(TestPrimitiveNode, FromParquet) {
- SchemaElement elt =
- NewPrimitive(name_, FieldRepetitionType::OPTIONAL, Type::INT32,
field_id_);
+ SchemaElement elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL,
Type::INT32);
Review comment:
I'm not sure I understand this change in the tests. The user should
still be able to pass an explicit `field_id` when creating a schema node, no?
##########
File path: cpp/src/parquet/schema.h
##########
@@ -268,7 +268,7 @@ class PARQUET_EXPORT GroupNode : public Node {
public:
// The field_id here is the default to use if it is not set in the
SchemaElement
static std::unique_ptr<Node> FromParquet(const void* opaque_element,
- NodeVector fields = {}, int
field_id = -1);
+ NodeVector fields = {});
Review comment:
While we won't use the optional `field_id` argument anymore in Arrow,
consumers of this API (that read Parquet files without going through Arrow) may
still want to use it for themselves, no?
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1157,6 +1160,94 @@ TEST_F(TestConvertArrowSchema, ParquetFlatDecimals) {
ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields));
}
+class TestConvertRoundTrip : public ::testing::Test {
+ public:
+ ::arrow::Status ConvertSchema(
+ const std::vector<std::shared_ptr<Field>>& fields,
+ std::shared_ptr<::parquet::ArrowWriterProperties> arrow_properties =
+ ::parquet::default_arrow_writer_properties()) {
+ arrow_schema_ = ::arrow::schema(fields);
+ std::shared_ptr<::parquet::WriterProperties> properties =
+ ::parquet::default_writer_properties();
+ RETURN_NOT_OK(ToParquetSchema(arrow_schema_.get(), *properties.get(),
+ *arrow_properties, &parquet_schema_));
+ ::parquet::schema::ToParquet(parquet_schema_->group_node(),
&parquet_format_schema_);
+ auto parquet_schema =
::parquet::schema::FromParquet(parquet_format_schema_);
+ return FromParquetSchema(parquet_schema.get(), &result_schema_);
+ }
+
+ protected:
+ std::shared_ptr<::arrow::Schema> arrow_schema_;
+ std::shared_ptr<SchemaDescriptor> parquet_schema_;
+ std::vector<SchemaElement> parquet_format_schema_;
+ std::shared_ptr<::arrow::Schema> result_schema_;
+};
+
+int GetFieldId(const ::arrow::Field& field) {
+ if (field.metadata() == nullptr) {
+ return -1;
+ }
+ auto maybe_field = field.metadata()->Get("PARQUET:field_id");
+ if (!maybe_field.ok()) {
+ return -1;
+ }
+ return std::stoi(maybe_field.ValueOrDie());
+}
+
+void GetFieldIdsDfs(const ::arrow::FieldVector& fields, std::vector<int>*
field_ids) {
+ for (const auto& field : fields) {
+ field_ids->push_back(GetFieldId(*field));
+ GetFieldIdsDfs(field->type()->fields(), field_ids);
+ }
+}
+
+std::vector<int> GetFieldIdsDfs(const ::arrow::FieldVector& fields) {
+ std::vector<int> field_ids;
+ GetFieldIdsDfs(fields, &field_ids);
+ return field_ids;
+}
+
+TEST_F(TestConvertRoundTrip, GroupIdMissingIfNotSpecified) {
+ std::vector<std::shared_ptr<Field>> arrow_fields;
+ arrow_fields.push_back(::arrow::field("simple", ::arrow::int32(), false));
+ /// { "nested": { "outer": { "inner" }, "sibling" }
Review comment:
Missing closing brace here?
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1157,6 +1160,94 @@ TEST_F(TestConvertArrowSchema, ParquetFlatDecimals) {
ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields));
}
+class TestConvertRoundTrip : public ::testing::Test {
+ public:
+ ::arrow::Status ConvertSchema(
+ const std::vector<std::shared_ptr<Field>>& fields,
+ std::shared_ptr<::parquet::ArrowWriterProperties> arrow_properties =
+ ::parquet::default_arrow_writer_properties()) {
+ arrow_schema_ = ::arrow::schema(fields);
+ std::shared_ptr<::parquet::WriterProperties> properties =
+ ::parquet::default_writer_properties();
+ RETURN_NOT_OK(ToParquetSchema(arrow_schema_.get(), *properties.get(),
+ *arrow_properties, &parquet_schema_));
+ ::parquet::schema::ToParquet(parquet_schema_->group_node(),
&parquet_format_schema_);
+ auto parquet_schema =
::parquet::schema::FromParquet(parquet_format_schema_);
+ return FromParquetSchema(parquet_schema.get(), &result_schema_);
+ }
+
+ protected:
+ std::shared_ptr<::arrow::Schema> arrow_schema_;
+ std::shared_ptr<SchemaDescriptor> parquet_schema_;
+ std::vector<SchemaElement> parquet_format_schema_;
+ std::shared_ptr<::arrow::Schema> result_schema_;
+};
+
+int GetFieldId(const ::arrow::Field& field) {
+ if (field.metadata() == nullptr) {
+ return -1;
+ }
+ auto maybe_field = field.metadata()->Get("PARQUET:field_id");
+ if (!maybe_field.ok()) {
+ return -1;
+ }
+ return std::stoi(maybe_field.ValueOrDie());
+}
+
+void GetFieldIdsDfs(const ::arrow::FieldVector& fields, std::vector<int>*
field_ids) {
+ for (const auto& field : fields) {
+ field_ids->push_back(GetFieldId(*field));
+ GetFieldIdsDfs(field->type()->fields(), field_ids);
+ }
+}
+
+std::vector<int> GetFieldIdsDfs(const ::arrow::FieldVector& fields) {
+ std::vector<int> field_ids;
+ GetFieldIdsDfs(fields, &field_ids);
+ return field_ids;
+}
+
+TEST_F(TestConvertRoundTrip, GroupIdMissingIfNotSpecified) {
+ std::vector<std::shared_ptr<Field>> arrow_fields;
+ arrow_fields.push_back(::arrow::field("simple", ::arrow::int32(), false));
+ /// { "nested": { "outer": { "inner" }, "sibling" }
+ arrow_fields.push_back(::arrow::field(
+ "nested",
+ ::arrow::struct_({::arrow::field("outer",
::arrow::struct_({::arrow::field(
+ "inner",
::arrow::utf8())})),
+ ::arrow::field("sibling", ::arrow::date32())}),
+ false));
+
+ ASSERT_OK(ConvertSchema(arrow_fields));
+ auto field_ids = GetFieldIdsDfs(result_schema_->fields());
+ for (int actual_id : field_ids) {
+ ASSERT_EQ(actual_id, -1);
+ }
+}
+
+std::shared_ptr<::arrow::KeyValueMetadata> FieldIdMetadata(int field_id) {
+ return ::arrow::key_value_metadata({"PARQUET:field_id"},
{std::to_string(field_id)});
+}
+
+TEST_F(TestConvertRoundTrip, GroupIdPreserveExisting) {
Review comment:
Well... this test only checks that Arrow metadata is preserved, right?
It doesn't test that the metadata is actually converted into a field_id on the
Parquet schema node.
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1157,6 +1160,94 @@ TEST_F(TestConvertArrowSchema, ParquetFlatDecimals) {
ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields));
}
+class TestConvertRoundTrip : public ::testing::Test {
+ public:
+ ::arrow::Status ConvertSchema(
+ const std::vector<std::shared_ptr<Field>>& fields,
+ std::shared_ptr<::parquet::ArrowWriterProperties> arrow_properties =
+ ::parquet::default_arrow_writer_properties()) {
+ arrow_schema_ = ::arrow::schema(fields);
+ std::shared_ptr<::parquet::WriterProperties> properties =
+ ::parquet::default_writer_properties();
+ RETURN_NOT_OK(ToParquetSchema(arrow_schema_.get(), *properties.get(),
+ *arrow_properties, &parquet_schema_));
+ ::parquet::schema::ToParquet(parquet_schema_->group_node(),
&parquet_format_schema_);
+ auto parquet_schema =
::parquet::schema::FromParquet(parquet_format_schema_);
+ return FromParquetSchema(parquet_schema.get(), &result_schema_);
+ }
+
+ protected:
+ std::shared_ptr<::arrow::Schema> arrow_schema_;
+ std::shared_ptr<SchemaDescriptor> parquet_schema_;
+ std::vector<SchemaElement> parquet_format_schema_;
+ std::shared_ptr<::arrow::Schema> result_schema_;
+};
+
+int GetFieldId(const ::arrow::Field& field) {
+ if (field.metadata() == nullptr) {
+ return -1;
+ }
+ auto maybe_field = field.metadata()->Get("PARQUET:field_id");
+ if (!maybe_field.ok()) {
+ return -1;
+ }
+ return std::stoi(maybe_field.ValueOrDie());
+}
+
+void GetFieldIdsDfs(const ::arrow::FieldVector& fields, std::vector<int>*
field_ids) {
+ for (const auto& field : fields) {
+ field_ids->push_back(GetFieldId(*field));
+ GetFieldIdsDfs(field->type()->fields(), field_ids);
+ }
+}
+
+std::vector<int> GetFieldIdsDfs(const ::arrow::FieldVector& fields) {
+ std::vector<int> field_ids;
+ GetFieldIdsDfs(fields, &field_ids);
+ return field_ids;
+}
+
+TEST_F(TestConvertRoundTrip, GroupIdMissingIfNotSpecified) {
Review comment:
Why "GroupId"? Should this be "FieldId"?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]