pitrou commented on code in PR #13901:
URL: https://github.com/apache/arrow/pull/13901#discussion_r1735943503


##########
cpp/src/arrow/array/validate.cc:
##########
@@ -985,10 +985,23 @@ Status ValidateArrayFull(const Array& array) { return 
ValidateArrayFull(*array.d
 
 ARROW_EXPORT
 Status ValidateUTF8(const ArrayData& data) {
-  DCHECK(data.type->id() == Type::STRING || data.type->id() == 
Type::STRING_VIEW ||
-         data.type->id() == Type::LARGE_STRING);
-  UTF8DataValidator validator{data};
-  return VisitTypeInline(*data.type, &validator);
+  if (data.type->id() == Type::EXTENSION) {
+    const auto& storage_type =
+        checked_pointer_cast<ExtensionType>(data.type)->storage_type();

Review Comment:
   ```suggestion
       const auto& storage_type =
           checked_cast<const ExtensionType&>(*data.type).storage_type();
   ```



##########
cpp/src/parquet/arrow/arrow_schema_test.cc:
##########
@@ -724,6 +727,85 @@ TEST_F(TestConvertParquetSchema, 
ParquetRepeatedNestedSchema) {
   ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
 }
 
+TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) {
+  std::vector<NodePtr> parquet_fields;
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "json_1", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, 
ConvertedType::JSON));
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "json_2", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, 
ConvertedType::JSON));
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // By default, both fields should be treated as utf8() fields in Arrow.
+    auto arrow_schema = ::arrow::schema(
+        {::arrow::field("json_1", UTF8, true), ::arrow::field("json_2", UTF8, 
true)});
+    std::shared_ptr<KeyValueMetadata> metadata = 
::arrow::key_value_metadata({}, {});

Review Comment:
   Is this useful?



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -984,21 +992,51 @@ Result<bool> ApplyOriginalMetadata(const Field& 
origin_field, SchemaField* infer
   bool modified = false;
 
   auto& origin_type = origin_field.type();
+  auto inferred_type = inferred->field->type();
 
   if (origin_type->id() == ::arrow::Type::EXTENSION) {
     const auto& ex_type = checked_cast<const 
::arrow::ExtensionType&>(*origin_type);
-    auto origin_storage_field = origin_field.WithType(ex_type.storage_type());
+    if (inferred_type->id() != ::arrow::Type::EXTENSION &&
+        ex_type.extension_name() == 
::arrow::extension::JsonExtensionType::type_name()) {
+      // Schema mismatch.
+      //
+      // Arrow extensions are DISABLED in Parquet.
+      // origin_type is ::arrow::extension::json()
+      // inferred_type is ::arrow::binary()
+      //
+      // Origin type is restored as Arrow should be considered the source of 
truth.
+      DCHECK_EQ(inferred_type->id(), ::arrow::Type::BINARY);
+      inferred->field = inferred->field->WithType(origin_type);
+      RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred));
+    } else {
+      auto origin_storage_field = 
origin_field.WithType(ex_type.storage_type());
 
-    // Apply metadata recursively to storage type
-    RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, 
inferred));
+      // Apply metadata recursively to storage type
+      RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, 
inferred));
 
-    // Restore extension type, if the storage type is the same as inferred
-    // from the Parquet type
-    if (ex_type.storage_type()->Equals(*inferred->field->type())) {
-      inferred->field = inferred->field->WithType(origin_type);
+      // Restore extension type, if the storage type is the same as inferred
+      // from the Parquet type
+      if (ex_type.storage_type()->Equals(*inferred->field->type())) {
+        inferred->field = inferred->field->WithType(origin_type);
+      }
     }
     modified = true;
   } else {
+    if (inferred_type->id() == ::arrow::Type::EXTENSION) {
+      const auto& ex_type = checked_cast<const 
::arrow::ExtensionType&>(*inferred_type);
+      if (ex_type.extension_name() ==
+          ::arrow::extension::JsonExtensionType::type_name()) {
+        // Schema mismatch.
+        //
+        // Arrow extensions are ENABLED in Parquet.
+        // origin_type is ::arrow::binary()
+        // inferred_type is ::arrow::extension::json()
+        //
+        // Origin type is restored as Arrow should be considered the source of 
truth.

Review Comment:
   We can just ignore this case here IMHO.



##########
cpp/src/arrow/array/validate.cc:
##########
@@ -985,10 +985,23 @@ Status ValidateArrayFull(const Array& array) { return 
ValidateArrayFull(*array.d
 
 ARROW_EXPORT
 Status ValidateUTF8(const ArrayData& data) {
-  DCHECK(data.type->id() == Type::STRING || data.type->id() == 
Type::STRING_VIEW ||
-         data.type->id() == Type::LARGE_STRING);
-  UTF8DataValidator validator{data};
-  return VisitTypeInline(*data.type, &validator);
+  if (data.type->id() == Type::EXTENSION) {

Review Comment:
   Did you add tests for this change?



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -1428,6 +1435,42 @@ TEST_F(TestLargeStringParquetIO, Basics) {
   this->RoundTripSingleColumn(large_array, large_array, arrow_properties);
 }
 
+using TestJsonParquetIO = TestParquetIO<::arrow::extension::JsonExtensionType>;
+
+TEST_F(TestJsonParquetIO, JsonExtension) {
+  const char* json = R"([
+    "null",
+    "1234",
+    "3.14159",
+    "true",
+    "false",
+    "\"a json string\"",
+    "[\"a\", \"json\", \"array\"]",
+    "{\"obj\": \"a simple json object\"}"
+  ])";
+
+  const auto json_type = ::arrow::extension::json();
+  const auto json_string_array = ::arrow::ArrayFromJSON(::arrow::utf8(), json);
+  const auto json_array = ::arrow::ExtensionType::WrapArray(json_type, 
json_string_array);
+
+  // When the original Arrow schema isn't stored and Arrow extensions are 
disabled,
+  // LogicalType::JSON is read as utf8.
+  const auto utf8_array = ::arrow::ArrayFromJSON(::arrow::utf8(), json);

Review Comment:
   This is the same as `json_string_array`, right?



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -1410,8 +1417,8 @@ using TestLargeStringParquetIO = 
TestParquetIO<::arrow::LargeStringType>;
 TEST_F(TestLargeStringParquetIO, Basics) {
   const char* json = R"(["foo", "", null, "bar"])";
 
-  const auto large_type = ::arrow::large_utf8();
-  const auto narrow_type = ::arrow::utf8();
+  const auto large_type = ::arrow::large_binary();
+  const auto narrow_type = ::arrow::binary();

Review Comment:
   Why the change?



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -984,21 +990,51 @@ Result<bool> ApplyOriginalMetadata(const Field& 
origin_field, SchemaField* infer
   bool modified = false;
 
   auto& origin_type = origin_field.type();
+  const auto& inferred_type = inferred->field->type();
 
   if (origin_type->id() == ::arrow::Type::EXTENSION) {
     const auto& ex_type = checked_cast<const 
::arrow::ExtensionType&>(*origin_type);
-    auto origin_storage_field = origin_field.WithType(ex_type.storage_type());
+    if (inferred_type->id() != ::arrow::Type::EXTENSION &&
+        ex_type.extension_name() == std::string("arrow.json")) {
+      // Schema mismatch.
+      //
+      // Arrow extensions are DISABLED in Parquet.
+      // origin_type is ::arrow::extension::json()
+      // inferred_type is ::arrow::binary()
+      //
+      // Origin type is restored as Arrow should be considered the source of 
truth.
+      DCHECK_EQ(inferred_type->id(), ::arrow::Type::STRING);

Review Comment:
   This is coming from an arbitrary Parquet file, so we should not use DCHECK. 
Instead, let's make it part of the condition above:
   ```c++
       if (inferred_type->id() == ::arrow::Type::STRING &&
           ex_type.extension_name() == std::string("arrow.json")) {
         // Schema mismatch. [...]
   ```



##########
cpp/src/arrow/extension/json_test.cc:
##########
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/extension/json.h"
+
+#include "arrow/ipc/test_common.h"
+#include "arrow/record_batch.h"
+#include "arrow/testing/gtest_util.h"
+
+namespace arrow {
+
+using arrow::ipc::test::RoundtripBatch;
+using extension::json;
+
+class TestJsonExtensionType : public ::testing::Test {};
+
+std::shared_ptr<Array> ExampleJson(const std::shared_ptr<DataType>& 
storage_type) {
+  std::shared_ptr<Array> arr = ArrayFromJSON(storage_type, R"([
+    "null",
+    "1234",
+    "3.14159",
+    "true",
+    "false",
+    "\"a json string\"",
+    "[\"a\", \"json\", \"array\"]",
+    "{\"obj\": \"a simple json object\"}"
+   ])");
+  return ExtensionType::WrapArray(arrow::extension::json(storage_type), arr);
+}
+
+TEST_F(TestJsonExtensionType, JsonRoundtrip) {
+  for (const auto& storage_type : {utf8(), large_utf8(), utf8_view()}) {
+    auto ext_arr = ExampleJson(storage_type);
+
+    auto batch =
+        RecordBatch::Make(schema({field("f0", json(storage_type))}), 8, 
{ext_arr});
+    std::shared_ptr<RecordBatch> read_batch;
+    RoundtripBatch(batch, &read_batch);
+    ASSERT_OK(read_batch->ValidateFull());
+    CompareBatch(*batch, *read_batch, /*compare_metadata*/ true);
+  }
+}

Review Comment:
   Can you add a test where you call `ValidateFull` and `ValidateUTF8` on a 
JSON array?
   Also check with invalid UTF-8 data.



##########
cpp/src/arrow/extension/json.h:
##########
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdexcept>
+#include <string>
+
+#include "arrow/extension_type.h"
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow::extension {
+
+/// \brief Concrete type class for variable-size JSON data, utf8-encoded.
+class ARROW_EXPORT JsonExtensionType : public ExtensionType {
+ public:
+  explicit JsonExtensionType(const std::shared_ptr<DataType>& storage_type)
+      : ExtensionType(storage_type), storage_type_(storage_type) {
+    if (storage_type->id() != Type::STRING && storage_type->id() != 
Type::STRING_VIEW &&
+        storage_type->id() != Type::LARGE_STRING) {
+      throw std::invalid_argument("Invalid storage type for JsonExtensionType: 
" +

Review Comment:
   We don't throw exceptions in Arrow C++. Can you just do the check in 
`json.cc`?



##########
cpp/src/parquet/arrow/arrow_schema_test.cc:
##########
@@ -724,6 +727,85 @@ TEST_F(TestConvertParquetSchema, 
ParquetRepeatedNestedSchema) {
   ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
 }
 
+TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) {
+  std::vector<NodePtr> parquet_fields;
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "json_1", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, 
ConvertedType::JSON));
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "json_2", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, 
ConvertedType::JSON));
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // By default, both fields should be treated as utf8() fields in Arrow.
+    auto arrow_schema = ::arrow::schema(
+        {::arrow::field("json_1", UTF8, true), ::arrow::field("json_2", UTF8, 
true)});
+    std::shared_ptr<KeyValueMetadata> metadata = 
::arrow::key_value_metadata({}, {});
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // If Arrow extensions are enabled, both fields should be treated as 
json() extension
+    // fields.
+    ArrowReaderProperties props;
+    props.set_arrow_extensions_enabled();
+    auto arrow_schema =
+        ::arrow::schema({::arrow::field("json_1", ::arrow::extension::json(), 
true),
+                         ::arrow::field("json_2", ::arrow::extension::json(), 
true)});
+    std::shared_ptr<KeyValueMetadata> metadata = 
::arrow::key_value_metadata({}, {});
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file contains Arrow schema.
+    // Arrow schema has precedence. json_1 should be returned as a json() 
field even
+    // though extensions are not enabled.
+    std::shared_ptr<KeyValueMetadata> field_metadata =
+        ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"});
+    auto arrow_schema = ::arrow::schema(
+        {::arrow::field("json_1", ::arrow::extension::json(), true, 
field_metadata),
+         ::arrow::field("json_2", UTF8, true)});
+
+    ASSERT_OK_AND_ASSIGN(
+        std::shared_ptr<Buffer> serialized,
+        ::arrow::ipc::SerializeSchema(*arrow_schema, 
::arrow::default_memory_pool()));
+    std::string schema_as_string = serialized->ToString();
+    std::string schema_base64 = ::arrow::util::base64_encode(schema_as_string);
+    std::shared_ptr<KeyValueMetadata> metadata =
+        ::arrow::key_value_metadata({"ARROW:schema"}, {schema_base64});

Review Comment:
   Please make a helper function for the Arrow schema -> Parquet metadata 
conversion.



##########
cpp/src/arrow/array/validate.cc:
##########
@@ -985,10 +985,23 @@ Status ValidateArrayFull(const Array& array) { return 
ValidateArrayFull(*array.d
 
 ARROW_EXPORT
 Status ValidateUTF8(const ArrayData& data) {
-  DCHECK(data.type->id() == Type::STRING || data.type->id() == 
Type::STRING_VIEW ||
-         data.type->id() == Type::LARGE_STRING);
-  UTF8DataValidator validator{data};
-  return VisitTypeInline(*data.type, &validator);
+  if (data.type->id() == Type::EXTENSION) {
+    const auto& storage_type =
+        checked_pointer_cast<ExtensionType>(data.type)->storage_type();
+    DCHECK(storage_type->id() == Type::STRING ||
+           storage_type->id() == Type::STRING_VIEW ||
+           storage_type->id() == Type::LARGE_STRING);
+    auto ext_array_data =
+        std::make_shared<ArrayData>(storage_type, data.length, data.buffers,
+                                    data.child_data, data.null_count, 
data.offset);
+    UTF8DataValidator validator{*ext_array_data};

Review Comment:
   What happens if you simply construct the validator with `data`?



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -1428,6 +1435,42 @@ TEST_F(TestLargeStringParquetIO, Basics) {
   this->RoundTripSingleColumn(large_array, large_array, arrow_properties);
 }
 
+using TestJsonParquetIO = TestParquetIO<::arrow::extension::JsonExtensionType>;
+
+TEST_F(TestJsonParquetIO, JsonExtension) {
+  const char* json = R"([
+    "null",
+    "1234",
+    "3.14159",
+    "true",
+    "false",
+    "\"a json string\"",
+    "[\"a\", \"json\", \"array\"]",
+    "{\"obj\": \"a simple json object\"}"
+  ])";
+
+  const auto json_type = ::arrow::extension::json();
+  const auto json_string_array = ::arrow::ArrayFromJSON(::arrow::utf8(), json);
+  const auto json_array = ::arrow::ExtensionType::WrapArray(json_type, 
json_string_array);
+
+  // When the original Arrow schema isn't stored and Arrow extensions are 
disabled,
+  // LogicalType::JSON is read as utf8.
+  const auto utf8_array = ::arrow::ArrayFromJSON(::arrow::utf8(), json);
+  this->RoundTripSingleColumn(json_array, utf8_array, 
default_arrow_writer_properties());
+
+  // When the original Arrow schema isn't stored and Arrow extensions are 
enabled,
+  // LogicalType::JSON is read as JsonExtensionType.
+  ::parquet::ArrowReaderProperties reader_properties;
+  reader_properties.set_arrow_extensions_enabled();
+  this->RoundTripSingleColumn(json_array, json_array, 
default_arrow_writer_properties(),
+                              reader_properties);
+
+  // When the original Arrow schema is stored, the stored Arrow type is always 
respected.
+  const auto arrow_properties =

Review Comment:
   Please call this `writer_properties`



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -1387,8 +1394,8 @@ using TestLargeBinaryParquetIO = 
TestParquetIO<::arrow::LargeBinaryType>;
 TEST_F(TestLargeBinaryParquetIO, Basics) {
   const char* json = "[\"foo\", \"\", null, \"\xff\"]";
 
-  const auto large_type = ::arrow::large_binary();
-  const auto narrow_type = ::arrow::binary();
+  const auto large_type = ::arrow::large_utf8();
+  const auto narrow_type = ::arrow::utf8();
   const auto large_array = ::arrow::ArrayFromJSON(large_type, json);

Review Comment:
   Why the change? 



##########
cpp/src/parquet/arrow/arrow_schema_test.cc:
##########
@@ -724,6 +727,85 @@ TEST_F(TestConvertParquetSchema, 
ParquetRepeatedNestedSchema) {
   ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
 }
 
+TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) {
+  std::vector<NodePtr> parquet_fields;
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "json_1", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, 
ConvertedType::JSON));
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "json_2", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, 
ConvertedType::JSON));
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // By default, both fields should be treated as utf8() fields in Arrow.
+    auto arrow_schema = ::arrow::schema(
+        {::arrow::field("json_1", UTF8, true), ::arrow::field("json_2", UTF8, 
true)});
+    std::shared_ptr<KeyValueMetadata> metadata = 
::arrow::key_value_metadata({}, {});
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // If Arrow extensions are enabled, both fields should be treated as 
json() extension
+    // fields.
+    ArrowReaderProperties props;
+    props.set_arrow_extensions_enabled();
+    auto arrow_schema =
+        ::arrow::schema({::arrow::field("json_1", ::arrow::extension::json(), 
true),
+                         ::arrow::field("json_2", ::arrow::extension::json(), 
true)});
+    std::shared_ptr<KeyValueMetadata> metadata = 
::arrow::key_value_metadata({}, {});

Review Comment:
   Same question.



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -984,21 +990,51 @@ Result<bool> ApplyOriginalMetadata(const Field& 
origin_field, SchemaField* infer
   bool modified = false;
 
   auto& origin_type = origin_field.type();
+  const auto& inferred_type = inferred->field->type();
 
   if (origin_type->id() == ::arrow::Type::EXTENSION) {
     const auto& ex_type = checked_cast<const 
::arrow::ExtensionType&>(*origin_type);
-    auto origin_storage_field = origin_field.WithType(ex_type.storage_type());
+    if (inferred_type->id() != ::arrow::Type::EXTENSION &&
+        ex_type.extension_name() == std::string("arrow.json")) {
+      // Schema mismatch.
+      //
+      // Arrow extensions are DISABLED in Parquet.
+      // origin_type is ::arrow::extension::json()
+      // inferred_type is ::arrow::binary()

Review Comment:
   ```suggestion
         // inferred_type is ::arrow::utf8()
   ```



##########
cpp/src/parquet/arrow/arrow_schema_test.cc:
##########
@@ -724,6 +727,85 @@ TEST_F(TestConvertParquetSchema, 
ParquetRepeatedNestedSchema) {
   ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
 }
 
+TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) {
+  std::vector<NodePtr> parquet_fields;
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "json_1", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, 
ConvertedType::JSON));
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "json_2", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, 
ConvertedType::JSON));
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // By default, both fields should be treated as utf8() fields in Arrow.
+    auto arrow_schema = ::arrow::schema(
+        {::arrow::field("json_1", UTF8, true), ::arrow::field("json_2", UTF8, 
true)});
+    std::shared_ptr<KeyValueMetadata> metadata = 
::arrow::key_value_metadata({}, {});
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // If Arrow extensions are enabled, both fields should be treated as 
json() extension
+    // fields.
+    ArrowReaderProperties props;
+    props.set_arrow_extensions_enabled();
+    auto arrow_schema =
+        ::arrow::schema({::arrow::field("json_1", ::arrow::extension::json(), 
true),
+                         ::arrow::field("json_2", ::arrow::extension::json(), 
true)});
+    std::shared_ptr<KeyValueMetadata> metadata = 
::arrow::key_value_metadata({}, {});
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file contains Arrow schema.
+    // Arrow schema has precedence. json_1 should be returned as a json() 
field even
+    // though extensions are not enabled.
+    std::shared_ptr<KeyValueMetadata> field_metadata =
+        ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"});
+    auto arrow_schema = ::arrow::schema(
+        {::arrow::field("json_1", ::arrow::extension::json(), true, 
field_metadata),
+         ::arrow::field("json_2", UTF8, true)});
+
+    ASSERT_OK_AND_ASSIGN(
+        std::shared_ptr<Buffer> serialized,
+        ::arrow::ipc::SerializeSchema(*arrow_schema, 
::arrow::default_memory_pool()));
+    std::string schema_as_string = serialized->ToString();
+    std::string schema_base64 = ::arrow::util::base64_encode(schema_as_string);
+    std::shared_ptr<KeyValueMetadata> metadata =
+        ::arrow::key_value_metadata({"ARROW:schema"}, {schema_base64});
+
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata));
+    CheckFlatSchema(arrow_schema, true /* check_metadata */);
+  }
+
+  {
+    // Parquet file contains Arrow schema.
+    // A contrived example. Parquet believes both columns are JSON. Arrow 
believes json_1
+    // is a JSON column and json_2 is a utf8 column. json_2 should be treated 
as a
+    // utf8 column even if the get_arrow_extensions_enabled is true.

Review Comment:
   It seems like this could only happen if an Arrow writer decided to serialize 
a regular utf8 array as a Parquet JSON column. Why would it do that?
   
   What do you think @jorisvandenbossche ?



##########
cpp/src/parquet/properties.h:
##########
@@ -941,6 +942,15 @@ class PARQUET_EXPORT ArrowReaderProperties {
     return coerce_int96_timestamp_unit_;
   }
 
+  /// Enable Parquet supported Arrow Extension Types.
+  ///
+  /// When enabled, Parquet will use supported Arrow ExtensionTypes in mapping 
to Arrow
+  /// schema. Currently only arrow::extension::json() extension type is 
supported. This
+  /// will be used for utf8 columns whose LogicalType is JSON.
+  void set_arrow_extensions_enabled() { arrow_extensions_enabled_ = true; }
+  void set_arrow_extensions_disabled() { arrow_extensions_enabled_ = false; }
+  bool get_arrow_extensions_enabled() const { return 
arrow_extensions_enabled_; }
+

Review Comment:
   Contrast with:
   ```c++
     void set_pre_buffer(bool pre_buffer) { pre_buffer_ = pre_buffer; }
     /// Return whether read coalescing is enabled.
     bool pre_buffer() const { return pre_buffer_; }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to