This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 03a1867571 GH-46676: [C++][Python][Parquet] Allow reading Parquet LIST
data as LargeList directly (#46678)
03a1867571 is described below
commit 03a1867571ef56bdb8c3842a5c26ed7e1ed94c52
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Jun 4 18:30:04 2025 +0200
GH-46676: [C++][Python][Parquet] Allow reading Parquet LIST data as
LargeList directly (#46678)
### Rationale for this change
When reading a Parquet LIST logical type (or a repeated field without a
logical type), Parquet C++ automatically reads it as an Arrow List array.
However, this can in some cases run into the 32-bit offsets limit. We'd
like to be able to choose to read as LargeList instead, even if there is no
serialized Arrow schema in the Parquet file.
### What changes are included in this PR?
* Add a Parquet read option `list_type` to select which Arrow type to read
LIST / repeated Parquet columns into
* Fix an index truncation bug when writing a huge single chunk of data to
Parquet
### Are these changes tested?
Yes, the functionality is tested. However, I wasn't able to write a unit
test that wouldn't consume a horrendous amount of time or memory
writing/reading a list with offsets larger than 2**32.
### Are there any user-facing changes?
No, only an API improvement.
* GitHub Issue: #46676
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/dataset/file_parquet.cc | 4 +-
cpp/src/arrow/dataset/file_parquet.h | 1 +
cpp/src/parquet/CMakeLists.txt | 7 +-
cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 110 ++-
cpp/src/parquet/arrow/arrow_schema_test.cc | 734 +++++++++++----------
cpp/src/parquet/arrow/schema.cc | 28 +-
cpp/src/parquet/column_writer.cc | 8 +-
cpp/src/parquet/properties.h | 15 +
python/pyarrow/_dataset_parquet.pyx | 39 +-
python/pyarrow/_parquet.pxd | 5 +
python/pyarrow/_parquet.pyx | 17 +-
.../pyarrow/includes/libarrow_dataset_parquet.pxd | 1 +
python/pyarrow/includes/libparquet.pxd | 2 +
python/pyarrow/parquet/core.py | 32 +-
python/pyarrow/tests/parquet/common.py | 2 +
python/pyarrow/tests/parquet/test_data_types.py | 24 +
python/pyarrow/tests/test_dataset.py | 15 +
17 files changed, 632 insertions(+), 412 deletions(-)
diff --git a/cpp/src/arrow/dataset/file_parquet.cc
b/cpp/src/arrow/dataset/file_parquet.cc
index d1a7350007..62b8f57ba1 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -117,6 +117,7 @@ parquet::ArrowReaderProperties MakeArrowReaderProperties(
properties.set_coerce_int96_timestamp_unit(
format.reader_options.coerce_int96_timestamp_unit);
properties.set_binary_type(format.reader_options.binary_type);
+ properties.set_list_type(format.reader_options.list_type);
return properties;
}
@@ -445,7 +446,8 @@ bool ParquetFileFormat::Equals(const FileFormat& other)
const {
return (reader_options.dict_columns == other_reader_options.dict_columns &&
reader_options.coerce_int96_timestamp_unit ==
other_reader_options.coerce_int96_timestamp_unit &&
- reader_options.binary_type == other_reader_options.binary_type);
+ reader_options.binary_type == other_reader_options.binary_type &&
+ reader_options.list_type == other_reader_options.list_type);
}
ParquetFileFormat::ParquetFileFormat(const parquet::ReaderProperties&
reader_properties)
diff --git a/cpp/src/arrow/dataset/file_parquet.h
b/cpp/src/arrow/dataset/file_parquet.h
index 234f4ca6f4..1811a96bf9 100644
--- a/cpp/src/arrow/dataset/file_parquet.h
+++ b/cpp/src/arrow/dataset/file_parquet.h
@@ -91,6 +91,7 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat {
std::unordered_set<std::string> dict_columns;
arrow::TimeUnit::type coerce_int96_timestamp_unit = arrow::TimeUnit::NANO;
Type::type binary_type = Type::BINARY;
+ Type::type list_type = Type::LIST;
/// @}
} reader_options;
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
index dadc5ec51d..dc7d40d2a3 100644
--- a/cpp/src/parquet/CMakeLists.txt
+++ b/cpp/src/parquet/CMakeLists.txt
@@ -402,17 +402,18 @@ add_parquet_test(writer-test
add_parquet_test(chunker-test SOURCES chunker_internal_test.cc)
-add_parquet_test(arrow-test
+add_parquet_test(arrow-reader-writer-test
SOURCES
- arrow/arrow_metadata_test.cc
arrow/arrow_reader_writer_test.cc
- arrow/arrow_schema_test.cc
arrow/arrow_statistics_test.cc
arrow/variant_test.cc)
add_parquet_test(arrow-internals-test SOURCES arrow/path_internal_test.cc
arrow/reconstruct_internal_test.cc)
+add_parquet_test(arrow-metadata-test SOURCES arrow/arrow_metadata_test.cc
+ arrow/arrow_schema_test.cc)
+
if(PARQUET_REQUIRE_ENCRYPTION)
add_parquet_test(encryption-test
SOURCES
diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 32d901ce78..9c74abab53 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -124,6 +124,19 @@ static constexpr int LARGE_SIZE = 10000;
static constexpr uint32_t kDefaultSeed = 0;
+struct ListCase {
+ ::arrow::Type::type type_id;
+
std::function<std::shared_ptr<::arrow::DataType>(std::shared_ptr<::arrow::Field>)>
+ type_factory;
+};
+
+static const std::vector<ListCase> kListCases = {
+ {::arrow::Type::LIST,
+ [](std::shared_ptr<::arrow::Field> field) { return ::arrow::list(field);
}},
+ {::arrow::Type::LARGE_LIST,
+ [](std::shared_ptr<::arrow::Field> field) { return
::arrow::large_list(field); }},
+};
+
std::shared_ptr<const LogicalType> get_logical_type(const DataType& type) {
switch (type.id()) {
case ArrowId::UINT8:
@@ -426,10 +439,13 @@ void CheckConfiguredRoundtrip(
const std::shared_ptr<::parquet::WriterProperties>& writer_properties =
::parquet::default_writer_properties(),
const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
- default_arrow_writer_properties()) {
+ default_arrow_writer_properties(),
+ const ArrowReaderProperties& arrow_reader_properties =
+ default_arrow_reader_properties()) {
std::shared_ptr<Table> actual_table;
ASSERT_NO_FATAL_FAILURE(DoRoundtrip(input_table, input_table->num_rows(),
&actual_table,
- writer_properties,
arrow_writer_properties));
+ writer_properties,
arrow_writer_properties,
+ arrow_reader_properties));
if (expected_table) {
ASSERT_NO_FATAL_FAILURE(::arrow::AssertSchemaEqual(*actual_table->schema(),
*expected_table->schema(),
@@ -446,14 +462,18 @@ void CheckConfiguredRoundtrip(
void DoSimpleRoundtrip(const std::shared_ptr<Table>& table, bool use_threads,
int64_t row_group_size, const std::vector<int>&
column_subset,
std::shared_ptr<Table>* out,
- const std::shared_ptr<ArrowWriterProperties>&
arrow_properties =
- default_arrow_writer_properties()) {
+ const std::shared_ptr<ArrowWriterProperties>&
+ arrow_writer_properties =
default_arrow_writer_properties(),
+ const ArrowReaderProperties& arrow_reader_properties =
+ default_arrow_reader_properties()) {
std::shared_ptr<Buffer> buffer;
ASSERT_NO_FATAL_FAILURE(
- WriteTableToBuffer(table, row_group_size, arrow_properties, &buffer));
+ WriteTableToBuffer(table, row_group_size, arrow_writer_properties,
&buffer));
- ASSERT_OK_AND_ASSIGN(auto reader,
OpenFile(std::make_shared<BufferReader>(buffer),
- ::arrow::default_memory_pool()));
+ std::unique_ptr<FileReader> reader;
+ FileReaderBuilder builder;
+ ASSERT_OK_NO_THROW(builder.Open(std::make_shared<BufferReader>(buffer)));
+ ASSERT_OK(builder.properties(arrow_reader_properties)->Build(&reader));
reader->set_use_threads(use_threads);
if (column_subset.size() > 0) {
@@ -468,7 +488,8 @@ void DoRoundTripWithBatches(
const std::shared_ptr<Table>& table, bool use_threads, int64_t
row_group_size,
const std::vector<int>& column_subset, std::shared_ptr<Table>* out,
const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
- default_arrow_writer_properties()) {
+ default_arrow_writer_properties(),
+ ArrowReaderProperties arrow_reader_properties =
default_arrow_reader_properties()) {
std::shared_ptr<Buffer> buffer;
ASSERT_NO_FATAL_FAILURE(
WriteTableToBuffer(table, row_group_size, arrow_writer_properties,
&buffer));
@@ -476,7 +497,6 @@ void DoRoundTripWithBatches(
std::unique_ptr<FileReader> reader;
FileReaderBuilder builder;
ASSERT_OK_NO_THROW(builder.Open(std::make_shared<BufferReader>(buffer)));
- ArrowReaderProperties arrow_reader_properties;
arrow_reader_properties.set_batch_size(row_group_size - 1);
ASSERT_OK_NO_THROW(builder.memory_pool(::arrow::default_memory_pool())
->properties(arrow_reader_properties)
@@ -497,23 +517,24 @@ void DoRoundTripWithBatches(
ASSERT_OK_AND_ASSIGN(*out, Table::FromRecordBatchReader(batch_reader.get()));
}
-void CheckSimpleRoundtrip(
- const std::shared_ptr<Table>& table, int64_t row_group_size,
- const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
- default_arrow_writer_properties()) {
+void CheckSimpleRoundtrip(const std::shared_ptr<Table>& table, int64_t
row_group_size,
+ const std::shared_ptr<ArrowWriterProperties>&
+ arrow_writer_properties =
default_arrow_writer_properties(),
+ const ArrowReaderProperties& arrow_reader_properties
=
+ default_arrow_reader_properties()) {
std::shared_ptr<Table> result;
- ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip(table, false /* use_threads */,
- row_group_size, {}, &result,
- arrow_writer_properties));
+ ASSERT_NO_FATAL_FAILURE(
+ DoSimpleRoundtrip(table, false /* use_threads */, row_group_size, {},
&result,
+ arrow_writer_properties, arrow_reader_properties));
::arrow::AssertSchemaEqual(*table->schema(), *result->schema(),
/*check_metadata=*/false);
ASSERT_OK(result->ValidateFull());
::arrow::AssertTablesEqual(*table, *result, false);
- ASSERT_NO_FATAL_FAILURE(DoRoundTripWithBatches(table, false /* use_threads
*/,
- row_group_size, {}, &result,
- arrow_writer_properties));
+ ASSERT_NO_FATAL_FAILURE(
+ DoRoundTripWithBatches(table, false /* use_threads */, row_group_size,
{}, &result,
+ arrow_writer_properties,
arrow_reader_properties));
::arrow::AssertSchemaEqual(*table->schema(), *result->schema(),
/*check_metadata=*/false);
ASSERT_OK(result->ValidateFull());
@@ -3198,8 +3219,22 @@ TEST(ArrowReadWrite, LargeList) {
[7, 8, 9]])";
auto array = ::arrow::ArrayFromJSON(type, json);
auto table = ::arrow::Table::Make(::arrow::schema({field("root", type)}),
{array});
- auto props_store_schema =
ArrowWriterProperties::Builder().store_schema()->build();
- CheckSimpleRoundtrip(table, 2, props_store_schema);
+ {
+ // If the schema is stored, the large_list is restored regardless of
+ // the list_type setting
+ for (auto list_type : {::arrow::Type::LIST, ::arrow::Type::LARGE_LIST}) {
+ ArrowReaderProperties reader_props;
+ reader_props.set_list_type(list_type);
+ auto writer_props =
ArrowWriterProperties::Builder().store_schema()->build();
+ CheckSimpleRoundtrip(table, 2, writer_props, reader_props);
+ }
+ }
+ {
+ // If the schema is not stored, large_list is read depending on the
list_type setting
+ ArrowReaderProperties reader_props;
+ reader_props.set_list_type(::arrow::Type::LARGE_LIST);
+ CheckSimpleRoundtrip(table, 2, default_arrow_writer_properties(),
reader_props);
+ }
}
TEST(ArrowReadWrite, FixedSizeList) {
@@ -3224,20 +3259,25 @@ TEST(ArrowReadWrite, ListOfStructOfList2) {
using ::arrow::list;
using ::arrow::struct_;
- auto type =
- list(field("item",
- struct_({field("a", ::arrow::int16(), /*nullable=*/false),
- field("b", list(::arrow::int64()),
/*nullable=*/false)}),
- /*nullable=*/false));
-
- const char* json = R"([
- [{"a": 123, "b": [1, 2, 3]}],
- null,
- [],
- [{"a": 456, "b": []}, {"a": 789, "b": [null]}, {"a": 876, "b": [4, 5,
6]}]])";
- auto array = ::arrow::ArrayFromJSON(type, json);
- auto table = ::arrow::Table::Make(::arrow::schema({field("root", type)}),
{array});
- CheckSimpleRoundtrip(table, 2);
+ for (const auto& list_case : kListCases) {
+ auto type = list_case.type_factory(
+ field("item",
+ struct_({field("a", ::arrow::int16(), /*nullable=*/false),
+ field("b", list_case.type_factory(field("item",
::arrow::int64())),
+ /*nullable=*/false)}),
+ /*nullable=*/false));
+
+ const char* json = R"([
+ [{"a": 123, "b": [1, 2, 3]}],
+ null,
+ [],
+ [{"a": 456, "b": []}, {"a": 789, "b": [null]}, {"a": 876, "b": [4, 5,
6]}]])";
+ auto array = ::arrow::ArrayFromJSON(type, json);
+ auto table = ::arrow::Table::Make(::arrow::schema({field("root", type)}),
{array});
+ ArrowReaderProperties reader_props;
+ reader_props.set_list_type(list_case.type_id);
+ CheckSimpleRoundtrip(table, 2, default_arrow_writer_properties(),
reader_props);
+ }
}
TEST(ArrowReadWrite, StructOfLists) {
diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc
b/cpp/src/parquet/arrow/arrow_schema_test.cc
index f55631e8c6..390d8bc77e 100644
--- a/cpp/src/parquet/arrow/arrow_schema_test.cc
+++ b/cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.
+#include <functional>
#include <memory>
#include <vector>
@@ -70,6 +71,19 @@ const auto TIMESTAMP_NS = ::arrow::timestamp(TimeUnit::NANO);
const auto BINARY = ::arrow::binary();
const auto DECIMAL_8_4 = std::make_shared<::arrow::Decimal128Type>(8, 4);
+struct ListCase {
+ ::arrow::Type::type type_id;
+
std::function<std::shared_ptr<::arrow::DataType>(std::shared_ptr<::arrow::Field>)>
+ type_factory;
+};
+
+static const std::vector<ListCase> kListCases = {
+ {::arrow::Type::LIST,
+ [](std::shared_ptr<::arrow::Field> field) { return ::arrow::list(field);
}},
+ {::arrow::Type::LARGE_LIST,
+ [](std::shared_ptr<::arrow::Field> field) { return
::arrow::large_list(field); }},
+};
+
class TestConvertParquetSchema : public ::testing::Test {
public:
virtual void SetUp() {}
@@ -360,309 +374,320 @@ TEST_F(TestConvertParquetSchema, ParquetFlatDecimals) {
}
TEST_F(TestConvertParquetSchema, ParquetMaps) {
- std::vector<NodePtr> parquet_fields;
- std::vector<std::shared_ptr<Field>> arrow_fields;
-
// MAP encoding example taken from parquet-format/LogicalTypes.md
- // Two column map.
- {
- auto key = PrimitiveNode::Make("key", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY,
- ConvertedType::UTF8);
- auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ for (const auto& list_case : kListCases) {
+ std::vector<NodePtr> parquet_fields;
+ std::vector<std::shared_ptr<Field>> arrow_fields;
- auto list = GroupNode::Make("key_value", Repetition::REPEATED, {key,
value});
- parquet_fields.push_back(
- GroupNode::Make("my_map", Repetition::REQUIRED, {list},
LogicalType::Map()));
- auto arrow_key = ::arrow::field("key", UTF8, /*nullable=*/false);
- auto arrow_value = ::arrow::field("value", UTF8, /*nullable=*/true);
- auto arrow_map = std::make_shared<::arrow::MapType>(
- ::arrow::field("my_map", ::arrow::struct_({arrow_key, arrow_value}),
- /*nullable=*/false),
- /*nullable=*/false);
+ // Two column map.
+ {
+ auto key = PrimitiveNode::Make("key", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY,
+ ConvertedType::UTF8);
+ auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- arrow_fields.push_back(::arrow::field("my_map", arrow_map,
/*nullable=*/false));
- }
- // Single column map (i.e. set) gets converted to list of struct.
- {
- auto key = PrimitiveNode::Make("key", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY,
- ConvertedType::UTF8);
+ auto list = GroupNode::Make("key_value", Repetition::REPEATED, {key,
value});
+ parquet_fields.push_back(
+ GroupNode::Make("my_map", Repetition::REQUIRED, {list},
LogicalType::Map()));
+ auto arrow_key = ::arrow::field("key", UTF8, /*nullable=*/false);
+ auto arrow_value = ::arrow::field("value", UTF8, /*nullable=*/true);
+ auto arrow_map = std::make_shared<::arrow::MapType>(
+ ::arrow::field("my_map", ::arrow::struct_({arrow_key, arrow_value}),
+ /*nullable=*/false),
+ /*nullable=*/false);
+
+ arrow_fields.push_back(::arrow::field("my_map", arrow_map,
/*nullable=*/false));
+ }
+ // Single column map (i.e. set) gets converted to list of struct.
+ {
+ auto key = PrimitiveNode::Make("key", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY,
+ ConvertedType::UTF8);
+
+ auto list = GroupNode::Make("key_value", Repetition::REPEATED, {key});
+ parquet_fields.push_back(
+ GroupNode::Make("my_set", Repetition::REQUIRED, {list},
LogicalType::Map()));
+ auto arrow_list =
+ list_case.type_factory(::arrow::field("key", UTF8,
/*nullable=*/false));
+ arrow_fields.push_back(::arrow::field("my_set", arrow_list, false));
+ }
+ // Two column map with non-standard field names.
+ {
+ auto key = PrimitiveNode::Make("int_key", Repetition::REQUIRED,
ParquetType::INT32,
+ ConvertedType::INT_32);
+ auto value = PrimitiveNode::Make("str_value", Repetition::OPTIONAL,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto list = GroupNode::Make("key_value", Repetition::REPEATED, {key});
- parquet_fields.push_back(
- GroupNode::Make("my_set", Repetition::REQUIRED, {list},
LogicalType::Map()));
- auto arrow_list = ::arrow::list({::arrow::field("key", UTF8,
/*nullable=*/false)});
- arrow_fields.push_back(::arrow::field("my_set", arrow_list, false));
- }
- // Two column map with non-standard field names.
- {
- auto key = PrimitiveNode::Make("int_key", Repetition::REQUIRED,
ParquetType::INT32,
- ConvertedType::INT_32);
- auto value = PrimitiveNode::Make("str_value", Repetition::OPTIONAL,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto list = GroupNode::Make("items", Repetition::REPEATED, {key, value});
+ parquet_fields.push_back(
+ GroupNode::Make("items", Repetition::REQUIRED, {list},
LogicalType::Map()));
+ auto arrow_value = ::arrow::field("str_value", UTF8, /*nullable=*/true);
+ auto arrow_key = ::arrow::field("int_key", INT32, /*nullable=*/false);
+ auto arrow_map = std::make_shared<::arrow::MapType>(
+ ::arrow::field("items", ::arrow::struct_({arrow_key, arrow_value}),
false),
+ false);
- auto list = GroupNode::Make("items", Repetition::REPEATED, {key, value});
- parquet_fields.push_back(
- GroupNode::Make("items", Repetition::REQUIRED, {list},
LogicalType::Map()));
- auto arrow_value = ::arrow::field("str_value", UTF8, /*nullable=*/true);
- auto arrow_key = ::arrow::field("int_key", INT32, /*nullable=*/false);
- auto arrow_map = std::make_shared<::arrow::MapType>(
- ::arrow::field("items", ::arrow::struct_({arrow_key, arrow_value}),
false),
- false);
-
- arrow_fields.push_back(::arrow::field("items", arrow_map, false));
- }
+ arrow_fields.push_back(::arrow::field("items", arrow_map, false));
+ }
- auto arrow_schema = ::arrow::schema(arrow_fields);
- ASSERT_OK(ConvertSchema(parquet_fields));
+ auto arrow_schema = ::arrow::schema(arrow_fields);
- ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
- for (int i = 0; i < arrow_schema->num_fields(); ++i) {
- auto result_field = result_schema_->field(i);
- auto expected_field = arrow_schema->field(i);
- if (expected_field->type()->id() == ::arrow::Type::MAP) {
- EXPECT_TRUE(
-
expected_field->type()->field(0)->Equals(result_field->type()->field(0)))
- << "Map's struct in field " << i
- << "\n result: " << result_field->type()->field(0)->ToString() << " "
- << "\n expected: " << expected_field->type()->field(0)->ToString()
<< "\n";
+ ArrowReaderProperties props;
+ props.set_list_type(list_case.type_id);
+ ASSERT_OK(ConvertSchema(parquet_fields, /*key_value_metadata=*/{}, props));
+
+ ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
+ for (int i = 0; i < arrow_schema->num_fields(); ++i) {
+ auto result_field = result_schema_->field(i);
+ auto expected_field = arrow_schema->field(i);
+ if (expected_field->type()->id() == ::arrow::Type::MAP) {
+ EXPECT_TRUE(
+
expected_field->type()->field(0)->Equals(result_field->type()->field(0)))
+ << "Map's struct in field " << i
+ << "\n result: " << result_field->type()->field(0)->ToString() <<
" "
+ << "\n expected: " << expected_field->type()->field(0)->ToString()
<< "\n";
+ }
}
}
}
TEST_F(TestConvertParquetSchema, ParquetLists) {
- std::vector<NodePtr> parquet_fields;
- std::vector<std::shared_ptr<Field>> arrow_fields;
-
// LIST encoding example taken from parquet-format/LogicalTypes.md
- // // List<String> (list non-null, elements nullable)
- // required group my_list (LIST) {
- // repeated group list {
- // optional binary element (UTF8);
- // }
- // }
- {
- auto element = PrimitiveNode::Make("string", Repetition::OPTIONAL,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::REQUIRED, {list},
ConvertedType::LIST));
- auto arrow_element = ::arrow::field("string", UTF8, true);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, false));
- }
+ for (const auto& list_case : kListCases) {
+ std::vector<NodePtr> parquet_fields;
+ std::vector<std::shared_ptr<Field>> arrow_fields;
- // // List<String> (list nullable, elements non-null)
- // optional group my_list (LIST) {
- // repeated group list {
- // required binary element (UTF8);
- // }
- // }
- {
- auto element = PrimitiveNode::Make("string", Repetition::REQUIRED,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::OPTIONAL, {list},
ConvertedType::LIST));
- auto arrow_element = ::arrow::field("string", UTF8, false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
- }
+ // // List<String> (list non-null, elements nullable)
+ // required group my_list (LIST) {
+ // repeated group list {
+ // optional binary element (UTF8);
+ // }
+ // }
+ {
+ auto element = PrimitiveNode::Make("string", Repetition::OPTIONAL,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::REQUIRED, {list},
ConvertedType::LIST));
+ auto arrow_element = ::arrow::field("string", UTF8, true);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, false));
+ }
- // Element types can be nested structures. For example, a list of lists:
- //
- // // List<List<Integer>>
- // optional group array_of_arrays (LIST) {
- // repeated group list {
- // required group element (LIST) {
- // repeated group list {
- // required int32 element;
- // }
- // }
- // }
- // }
- {
- auto inner_element =
- PrimitiveNode::Make("int32", Repetition::REQUIRED, ParquetType::INT32);
- auto inner_list = GroupNode::Make("list", Repetition::REPEATED,
{inner_element});
- auto element = GroupNode::Make("element", Repetition::REQUIRED,
{inner_list},
- ConvertedType::LIST);
- auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
- parquet_fields.push_back(GroupNode::Make("array_of_arrays",
Repetition::OPTIONAL,
- {list}, ConvertedType::LIST));
- auto arrow_inner_element = ::arrow::field("int32", INT32, false);
- auto arrow_inner_list = ::arrow::list(arrow_inner_element);
- auto arrow_element = ::arrow::field("element", arrow_inner_list, false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("array_of_arrays", arrow_list,
true));
- }
+ // // List<String> (list nullable, elements non-null)
+ // optional group my_list (LIST) {
+ // repeated group list {
+ // required binary element (UTF8);
+ // }
+ // }
+ {
+ auto element = PrimitiveNode::Make("string", Repetition::REQUIRED,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::OPTIONAL, {list},
ConvertedType::LIST));
+ auto arrow_element = ::arrow::field("string", UTF8, false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+ }
- // // List<String> (list nullable, elements non-null)
- // optional group my_list (LIST) {
- // repeated group element {
- // required binary str (UTF8);
- // };
- // }
- {
- auto element = PrimitiveNode::Make("str", Repetition::REQUIRED,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto list = GroupNode::Make("element", Repetition::REPEATED, {element});
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::OPTIONAL, {list},
ConvertedType::LIST));
- auto arrow_element = ::arrow::field("str", UTF8, false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
- }
+ // Element types can be nested structures. For example, a list of lists:
+ //
+ // // List<List<Integer>>
+ // optional group array_of_arrays (LIST) {
+ // repeated group list {
+ // required group element (LIST) {
+ // repeated group list {
+ // required int32 element;
+ // }
+ // }
+ // }
+ // }
+ {
+ auto inner_element =
+ PrimitiveNode::Make("int32", Repetition::REQUIRED,
ParquetType::INT32);
+ auto inner_list = GroupNode::Make("list", Repetition::REPEATED,
{inner_element});
+ auto element = GroupNode::Make("element", Repetition::REQUIRED,
{inner_list},
+ ConvertedType::LIST);
+ auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
+ parquet_fields.push_back(GroupNode::Make("array_of_arrays",
Repetition::OPTIONAL,
+ {list}, ConvertedType::LIST));
+ auto arrow_inner_element = ::arrow::field("int32", INT32, false);
+ auto arrow_inner_list = list_case.type_factory(arrow_inner_element);
+ auto arrow_element = ::arrow::field("element", arrow_inner_list, false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("array_of_arrays", arrow_list,
true));
+ }
- // // List<Integer> (nullable list, non-null elements)
- // optional group my_list (LIST) {
- // repeated int32 element;
- // }
- {
- auto element =
- PrimitiveNode::Make("element", Repetition::REPEATED,
ParquetType::INT32);
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::OPTIONAL, {element},
ConvertedType::LIST));
- auto arrow_element = ::arrow::field("element", INT32, false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
- }
+ // // List<String> (list nullable, elements non-null)
+ // optional group my_list (LIST) {
+ // repeated group element {
+ // required binary str (UTF8);
+ // };
+ // }
+ {
+ auto element = PrimitiveNode::Make("str", Repetition::REQUIRED,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto list = GroupNode::Make("element", Repetition::REPEATED, {element});
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::OPTIONAL, {list},
ConvertedType::LIST));
+ auto arrow_element = ::arrow::field("str", UTF8, false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+ }
- // // List<Tuple<String, Integer>> (nullable list, non-null elements)
- // optional group my_list (LIST) {
- // repeated group element {
- // required binary str (UTF8);
- // required int32 num;
- // };
- // }
- {
- auto str_element = PrimitiveNode::Make("str", Repetition::REQUIRED,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto num_element =
- PrimitiveNode::Make("num", Repetition::REQUIRED, ParquetType::INT32);
- auto element =
- GroupNode::Make("element", Repetition::REPEATED, {str_element,
num_element});
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::OPTIONAL, {element},
ConvertedType::LIST));
- auto arrow_str = ::arrow::field("str", UTF8, false);
- auto arrow_num = ::arrow::field("num", INT32, false);
- std::vector<std::shared_ptr<Field>> fields({arrow_str, arrow_num});
- auto arrow_struct = ::arrow::struct_(fields);
- auto arrow_element = ::arrow::field("element", arrow_struct, false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
- }
+ // // List<Integer> (nullable list, non-null elements)
+ // optional group my_list (LIST) {
+ // repeated int32 element;
+ // }
+ {
+ auto element =
+ PrimitiveNode::Make("element", Repetition::REPEATED,
ParquetType::INT32);
+ parquet_fields.push_back(GroupNode::Make("my_list",
Repetition::OPTIONAL, {element},
+ ConvertedType::LIST));
+ auto arrow_element = ::arrow::field("element", INT32, false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+ }
- // // List<OneTuple<String>> (nullable list, non-null elements)
- // optional group my_list (LIST) {
- // repeated group array {
- // required binary str (UTF8);
- // };
- // }
- // Special case: group is named array
- {
- auto element = PrimitiveNode::Make("str", Repetition::REQUIRED,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto array = GroupNode::Make("array", Repetition::REPEATED, {element});
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::OPTIONAL, {array},
ConvertedType::LIST));
- auto arrow_str = ::arrow::field("str", UTF8, false);
- std::vector<std::shared_ptr<Field>> fields({arrow_str});
- auto arrow_struct = ::arrow::struct_(fields);
- auto arrow_element = ::arrow::field("array", arrow_struct, false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
- }
+ // // List<Tuple<String, Integer>> (nullable list, non-null elements)
+ // optional group my_list (LIST) {
+ // repeated group element {
+ // required binary str (UTF8);
+ // required int32 num;
+ // };
+ // }
+ {
+ auto str_element = PrimitiveNode::Make(
+ "str", Repetition::REQUIRED, ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto num_element =
+ PrimitiveNode::Make("num", Repetition::REQUIRED, ParquetType::INT32);
+ auto element =
+ GroupNode::Make("element", Repetition::REPEATED, {str_element,
num_element});
+ parquet_fields.push_back(GroupNode::Make("my_list",
Repetition::OPTIONAL, {element},
+ ConvertedType::LIST));
+ auto arrow_str = ::arrow::field("str", UTF8, false);
+ auto arrow_num = ::arrow::field("num", INT32, false);
+ std::vector<std::shared_ptr<Field>> fields({arrow_str, arrow_num});
+ auto arrow_struct = ::arrow::struct_(fields);
+ auto arrow_element = ::arrow::field("element", arrow_struct, false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+ }
- // // List<OneTuple<String>> (nullable list, non-null elements)
- // optional group my_list (LIST) {
- // repeated group my_list_tuple {
- // required binary str (UTF8);
- // };
- // }
- // Special case: group named ends in _tuple
- {
- auto element = PrimitiveNode::Make("str", Repetition::REQUIRED,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto array = GroupNode::Make("my_list_tuple", Repetition::REPEATED,
{element});
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::OPTIONAL, {array},
ConvertedType::LIST));
- auto arrow_str = ::arrow::field("str", UTF8, false);
- std::vector<std::shared_ptr<Field>> fields({arrow_str});
- auto arrow_struct = ::arrow::struct_(fields);
- auto arrow_element = ::arrow::field("my_list_tuple", arrow_struct, false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
- }
+ // // List<OneTuple<String>> (nullable list, non-null elements)
+ // optional group my_list (LIST) {
+ // repeated group array {
+ // required binary str (UTF8);
+ // };
+ // }
+ // Special case: group is named array
+ {
+ auto element = PrimitiveNode::Make("str", Repetition::REQUIRED,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto array = GroupNode::Make("array", Repetition::REPEATED, {element});
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::OPTIONAL, {array},
ConvertedType::LIST));
+ auto arrow_str = ::arrow::field("str", UTF8, false);
+ std::vector<std::shared_ptr<Field>> fields({arrow_str});
+ auto arrow_struct = ::arrow::struct_(fields);
+ auto arrow_element = ::arrow::field("array", arrow_struct, false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+ }
- // One-level encoding: Only allows required lists with required cells
- // repeated value_type name
- {
- parquet_fields.push_back(
- PrimitiveNode::Make("name", Repetition::REPEATED, ParquetType::INT32));
- auto arrow_element = ::arrow::field("name", INT32, false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("name", arrow_list, false));
- }
+ // // List<OneTuple<String>> (nullable list, non-null elements)
+ // optional group my_list (LIST) {
+ // repeated group my_list_tuple {
+ // required binary str (UTF8);
+ // };
+ // }
+ // Special case: group named ends in _tuple
+ {
+ auto element = PrimitiveNode::Make("str", Repetition::REQUIRED,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto array = GroupNode::Make("my_list_tuple", Repetition::REPEATED,
{element});
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::OPTIONAL, {array},
ConvertedType::LIST));
+ auto arrow_str = ::arrow::field("str", UTF8, false);
+ std::vector<std::shared_ptr<Field>> fields({arrow_str});
+ auto arrow_struct = ::arrow::struct_(fields);
+ auto arrow_element = ::arrow::field("my_list_tuple", arrow_struct,
false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+ }
- // Two-level encoding List<List<Integer>>:
- // optional group my_list (LIST) {
- // repeated group array (LIST) {
- // repeated int32 array;
- // }
- // }
- {
- auto inner_array =
- PrimitiveNode::Make("array", Repetition::REPEATED, ParquetType::INT32);
- auto outer_array = GroupNode::Make("array", Repetition::REPEATED,
{inner_array},
- ConvertedType::LIST);
- parquet_fields.push_back(GroupNode::Make("my_list", Repetition::OPTIONAL,
- {outer_array},
ConvertedType::LIST));
- auto arrow_inner_array = ::arrow::field("array", INT32,
/*nullable=*/false);
- auto arrow_outer_array =
- ::arrow::field("array", ::arrow::list(arrow_inner_array),
/*nullable=*/false);
- auto arrow_list = ::arrow::list(arrow_outer_array);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
- }
+ // One-level encoding: Only allows required lists with required cells
+ // repeated value_type name
+ {
+ parquet_fields.push_back(
+ PrimitiveNode::Make("name", Repetition::REPEATED,
ParquetType::INT32));
+ auto arrow_element = ::arrow::field("name", INT32, false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("name", arrow_list, false));
+ }
- // List<Map<String, String>> in three-level list encoding:
- // optional group my_list (LIST) {
- // repeated group list {
- // required group element (MAP) {
- // repeated group key_value {
- // required binary key (STRING);
- // optional binary value (STRING);
- // }
- // }
- // }
- // }
- {
- auto key = PrimitiveNode::Make("key", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY,
- ConvertedType::UTF8);
- auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, {key,
value});
- auto element =
- GroupNode::Make("element", Repetition::REQUIRED, {key_value},
ConvertedType::MAP);
- auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::OPTIONAL, {list},
ConvertedType::LIST));
-
- auto arrow_key = ::arrow::field("key", UTF8, /*nullable=*/false);
- auto arrow_value = ::arrow::field("value", UTF8, /*nullable=*/true);
- auto arrow_element = ::arrow::field(
- "element", std::make_shared<::arrow::MapType>(arrow_key, arrow_value),
- /*nullable=*/false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list,
/*nullable=*/true));
- }
+ // Two-level encoding List<List<Integer>>:
+ // optional group my_list (LIST) {
+ // repeated group array (LIST) {
+ // repeated int32 array;
+ // }
+ // }
+ {
+ auto inner_array =
+ PrimitiveNode::Make("array", Repetition::REPEATED,
ParquetType::INT32);
+ auto outer_array = GroupNode::Make("array", Repetition::REPEATED,
{inner_array},
+ ConvertedType::LIST);
+ parquet_fields.push_back(GroupNode::Make("my_list", Repetition::OPTIONAL,
+ {outer_array},
ConvertedType::LIST));
+ auto arrow_inner_array = ::arrow::field("array", INT32,
/*nullable=*/false);
+ auto arrow_outer_array = ::arrow::field(
+ "array", list_case.type_factory(arrow_inner_array),
/*nullable=*/false);
+ auto arrow_list = list_case.type_factory(arrow_outer_array);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+ }
- auto arrow_schema = ::arrow::schema(arrow_fields);
- ASSERT_OK(ConvertSchema(parquet_fields));
+ // List<Map<String, String>> in three-level list encoding:
+ // optional group my_list (LIST) {
+ // repeated group list {
+ // required group element (MAP) {
+ // repeated group key_value {
+ // required binary key (STRING);
+ // optional binary value (STRING);
+ // }
+ // }
+ // }
+ // }
+ {
+ auto key = PrimitiveNode::Make("key", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY,
+ ConvertedType::UTF8);
+ auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto key_value = GroupNode::Make("key_value", Repetition::REPEATED,
{key, value});
+ auto element = GroupNode::Make("element", Repetition::REQUIRED,
{key_value},
+ ConvertedType::MAP);
+ auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::OPTIONAL, {list},
ConvertedType::LIST));
+
+ auto arrow_key = ::arrow::field("key", UTF8, /*nullable=*/false);
+ auto arrow_value = ::arrow::field("value", UTF8, /*nullable=*/true);
+ auto arrow_element = ::arrow::field(
+ "element", std::make_shared<::arrow::MapType>(arrow_key,
arrow_value),
+ /*nullable=*/false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list,
/*nullable=*/true));
+ }
- ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
+ auto arrow_schema = ::arrow::schema(arrow_fields);
+
+ ArrowReaderProperties props;
+ props.set_list_type(list_case.type_id);
+ ASSERT_OK(ConvertSchema(parquet_fields, /*key_value_metadata=*/{}, props));
+
+ ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
+ }
}
TEST_F(TestConvertParquetSchema, UnsupportedThings) {
@@ -769,43 +794,52 @@ TEST_F(TestConvertParquetSchema, ParquetUndefinedType) {
}
TEST_F(TestConvertParquetSchema, ParquetRepeatedNestedSchema) {
- std::vector<NodePtr> parquet_fields;
- std::vector<std::shared_ptr<Field>> arrow_fields;
- {
- // optional int32 leaf1;
- // repeated group outerGroup {
- // optional int32 leaf2;
- // repeated group innerGroup {
- // optional int32 leaf3;
- // }
- // }
- parquet_fields.push_back(
- PrimitiveNode::Make("leaf1", Repetition::OPTIONAL,
ParquetType::INT32));
- parquet_fields.push_back(GroupNode::Make(
- "outerGroup", Repetition::REPEATED,
- {PrimitiveNode::Make("leaf2", Repetition::OPTIONAL,
ParquetType::INT32),
- GroupNode::Make(
- "innerGroup", Repetition::REPEATED,
- {PrimitiveNode::Make("leaf3", Repetition::OPTIONAL,
ParquetType::INT32)})}));
-
- auto inner_group_fields = {::arrow::field("leaf3", INT32, true)};
- auto inner_group_type = ::arrow::struct_(inner_group_fields);
- auto outer_group_fields = {
- ::arrow::field("leaf2", INT32, true),
- ::arrow::field(
- "innerGroup",
- ::arrow::list(::arrow::field("innerGroup", inner_group_type,
false)), false)};
- auto outer_group_type = ::arrow::struct_(outer_group_fields);
-
- arrow_fields.push_back(::arrow::field("leaf1", INT32, true));
- arrow_fields.push_back(::arrow::field(
- "outerGroup",
- ::arrow::list(::arrow::field("outerGroup", outer_group_type, false)),
false));
- }
- auto arrow_schema = ::arrow::schema(arrow_fields);
- ASSERT_OK(ConvertSchema(parquet_fields));
+ for (const auto& list_case : kListCases) {
+ std::vector<NodePtr> parquet_fields;
+ std::vector<std::shared_ptr<Field>> arrow_fields;
- ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
+ {
+ // optional int32 leaf1;
+ // repeated group outerGroup {
+ // optional int32 leaf2;
+ // repeated group innerGroup {
+ // optional int32 leaf3;
+ // }
+ // }
+ parquet_fields.push_back(
+ PrimitiveNode::Make("leaf1", Repetition::OPTIONAL,
ParquetType::INT32));
+ parquet_fields.push_back(GroupNode::Make(
+ "outerGroup", Repetition::REPEATED,
+ {PrimitiveNode::Make("leaf2", Repetition::OPTIONAL,
ParquetType::INT32),
+ GroupNode::Make("innerGroup", Repetition::REPEATED,
+ {PrimitiveNode::Make("leaf3", Repetition::OPTIONAL,
+ ParquetType::INT32)})}));
+
+ auto inner_group_fields = {::arrow::field("leaf3", INT32, true)};
+ auto inner_group_type = ::arrow::struct_(inner_group_fields);
+ auto outer_group_fields = {
+ ::arrow::field("leaf2", INT32, true),
+ ::arrow::field("innerGroup",
+ list_case.type_factory(::arrow::field(
+ "innerGroup", inner_group_type,
/*nullable=*/false)),
+ /*nullable=*/false)};
+ auto outer_group_type = ::arrow::struct_(outer_group_fields);
+
+ arrow_fields.push_back(::arrow::field("leaf1", INT32, true));
+ arrow_fields.push_back(
+ ::arrow::field("outerGroup",
+ list_case.type_factory(::arrow::field(
+ "outerGroup", outer_group_type,
/*nullable=*/false)),
+ /*nullable=*/false));
+ }
+ auto arrow_schema = ::arrow::schema(arrow_fields);
+
+ ArrowReaderProperties props;
+ props.set_list_type(list_case.type_id);
+ ASSERT_OK(ConvertSchema(parquet_fields, /*key_value_metadata=*/{}, props));
+
+ ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
+ }
}
TEST_F(TestConvertParquetSchema, IllegalParquetNestedSchema) {
@@ -1514,48 +1548,50 @@ TEST_F(TestConvertArrowSchema,
ParquetGeoArrowCrsProjjson) {
}
TEST_F(TestConvertArrowSchema, ParquetLists) {
- std::vector<NodePtr> parquet_fields;
- std::vector<std::shared_ptr<Field>> arrow_fields;
+ for (const auto& list_case : kListCases) {
+ std::vector<NodePtr> parquet_fields;
+ std::vector<std::shared_ptr<Field>> arrow_fields;
- // parquet_arrow will always generate 3-level LIST encodings
+ // parquet_arrow will always generate 3-level LIST encodings
- // // List<String> (list non-null, elements nullable)
- // required group my_list (LIST) {
- // repeated group list {
- // optional binary element (UTF8);
- // }
- // }
- {
- auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::REQUIRED, {list},
ConvertedType::LIST));
- auto arrow_element = ::arrow::field("string", UTF8, true);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, false));
- }
+ // // List<String> (list non-null, elements nullable)
+ // required group my_list (LIST) {
+ // repeated group list {
+ // optional binary element (UTF8);
+ // }
+ // }
+ {
+ auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::REQUIRED, {list},
ConvertedType::LIST));
+ auto arrow_element = ::arrow::field("string", UTF8, true);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, false));
+ }
- // // List<String> (list nullable, elements non-null)
- // optional group my_list (LIST) {
- // repeated group list {
- // required binary element (UTF8);
- // }
- // }
- {
- auto element = PrimitiveNode::Make("element", Repetition::REQUIRED,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::OPTIONAL, {list},
ConvertedType::LIST));
- auto arrow_element = ::arrow::field("string", UTF8, false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
- }
+ // // List<String> (list nullable, elements non-null)
+ // optional group my_list (LIST) {
+ // repeated group list {
+ // required binary element (UTF8);
+ // }
+ // }
+ {
+ auto element = PrimitiveNode::Make("element", Repetition::REQUIRED,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::OPTIONAL, {list},
ConvertedType::LIST));
+ auto arrow_element = ::arrow::field("string", UTF8, false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+ }
- ASSERT_OK(ConvertSchema(arrow_fields));
+ ASSERT_OK(ConvertSchema(arrow_fields));
- ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields));
+ ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields));
+ }
}
TEST_F(TestConvertArrowSchema, ParquetMaps) {
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index 0488e08d05..cd026b3bea 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -78,6 +78,19 @@ Repetition::type RepetitionFromNullable(bool is_nullable) {
return is_nullable ? Repetition::OPTIONAL : Repetition::REQUIRED;
}
+Result<std::shared_ptr<::arrow::DataType>> MakeArrowList(
+ std::shared_ptr<Field> field, const ArrowReaderProperties& props) {
+ switch (props.list_type()) {
+ case ::arrow::Type::LIST:
+ return ::arrow::list(std::move(field));
+ case ::arrow::Type::LARGE_LIST:
+ return ::arrow::large_list(std::move(field));
+ default:
+ return Status::TypeError("Invalid list_type: " +
+ ::arrow::internal::ToString(props.list_type()));
+ }
+}
+
Status FieldToNode(const std::string& name, const std::shared_ptr<Field>&
field,
const WriterProperties& properties,
const ArrowWriterProperties& arrow_properties, NodePtr*
out);
@@ -776,8 +789,10 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo
current_levels,
RETURN_NOT_OK(
PopulateLeaf(column_index, item_field, current_levels, ctx, out,
child_field));
}
- out->field = ::arrow::field(group.name(), ::arrow::list(child_field->field),
- group.is_optional(),
FieldIdMetadata(group.field_id()));
+ ARROW_ASSIGN_OR_RAISE(auto list_type,
+ MakeArrowList(child_field->field, ctx->properties));
+ out->field = ::arrow::field(group.name(), std::move(list_type),
group.is_optional(),
+ FieldIdMetadata(group.field_id()));
out->level_info = current_levels;
// At this point current levels contains the def level for this list,
// we need to reset to the prior parent.
@@ -805,7 +820,9 @@ Status GroupToSchemaField(const GroupNode& node, LevelInfo
current_levels,
int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
RETURN_NOT_OK(GroupToStruct(node, current_levels, ctx, out,
&out->children[0]));
- out->field = ::arrow::field(node.name(),
::arrow::list(out->children[0].field),
+ ARROW_ASSIGN_OR_RAISE(auto list_type,
+ MakeArrowList(out->children[0].field,
ctx->properties));
+ out->field = ::arrow::field(node.name(), std::move(list_type),
/*nullable=*/false,
FieldIdMetadata(node.field_id()));
ctx->LinkParent(&out->children[0], out);
@@ -856,7 +873,9 @@ Status NodeToSchemaField(const Node& node, LevelInfo
current_levels,
RETURN_NOT_OK(PopulateLeaf(column_index, child_field, current_levels,
ctx, out,
&out->children[0]));
- out->field = ::arrow::field(node.name(), ::arrow::list(child_field),
+ ARROW_ASSIGN_OR_RAISE(auto list_type,
+ MakeArrowList(out->children[0].field,
ctx->properties));
+ out->field = ::arrow::field(node.name(), std::move(list_type),
/*nullable=*/false,
FieldIdMetadata(node.field_id()));
out->level_info = current_levels;
// At this point current_levels has consider this list the ancestor so
restore
@@ -934,6 +953,7 @@
std::function<std::shared_ptr<::arrow::DataType>(FieldVector)> GetNestedFactory(
}
break;
case ::arrow::Type::LIST:
+ case ::arrow::Type::LARGE_LIST:
if (origin_type.id() == ::arrow::Type::LIST) {
return [](FieldVector fields) {
DCHECK_EQ(fields.size(), 1);
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 2c4a072284..acce754c87 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -2008,7 +2008,7 @@ struct SerializeFunctor {
Status Serialize(const ArrayType& array, ArrowWriteContext*, ParquetCType*
out) {
const ArrowCType* input = array.raw_values();
if (array.null_count() > 0) {
- for (int i = 0; i < array.length(); i++) {
+ for (int64_t i = 0; i < array.length(); i++) {
out[i] = static_cast<ParquetCType>(input[i]);
}
} else {
@@ -2067,7 +2067,7 @@ Status
TypedColumnWriterImpl<ParquetType>::WriteArrowSerialize(
template <>
struct SerializeFunctor<BooleanType, ::arrow::BooleanType> {
Status Serialize(const ::arrow::BooleanArray& data, ArrowWriteContext*,
bool* out) {
- for (int i = 0; i < data.length(); i++) {
+ for (int64_t i = 0; i < data.length(); i++) {
*out++ = data.Value(i);
}
return Status::OK();
@@ -2092,7 +2092,7 @@ template <>
struct SerializeFunctor<Int32Type, ::arrow::Date64Type> {
Status Serialize(const ::arrow::Date64Array& array, ArrowWriteContext*,
int32_t* out) {
const int64_t* input = array.raw_values();
- for (int i = 0; i < array.length(); i++) {
+ for (int64_t i = 0; i < array.length(); i++) {
*out++ = static_cast<int32_t>(*input++ / 86400000);
}
return Status::OK();
@@ -2146,7 +2146,7 @@ struct SerializeFunctor<Int32Type, ::arrow::Time32Type> {
const int32_t* input = array.raw_values();
const auto& type = static_cast<const ::arrow::Time32Type&>(*array.type());
if (type.unit() == ::arrow::TimeUnit::SECOND) {
- for (int i = 0; i < array.length(); i++) {
+ for (int64_t i = 0; i < array.length(); i++) {
out[i] = input[i] * 1000;
}
} else {
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index aec9edd093..bbaf6b5e71 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -991,6 +991,7 @@ static constexpr bool kArrowDefaultUseThreads = false;
static constexpr int64_t kArrowDefaultBatchSize = 64 * 1024;
constexpr inline ::arrow::Type::type kArrowDefaultBinaryType =
::arrow::Type::BINARY;
+constexpr inline ::arrow::Type::type kArrowDefaultListType =
::arrow::Type::LIST;
/// EXPERIMENTAL: Properties for configuring FileReader behavior.
class PARQUET_EXPORT ArrowReaderProperties {
@@ -1003,6 +1004,7 @@ class PARQUET_EXPORT ArrowReaderProperties {
cache_options_(::arrow::io::CacheOptions::LazyDefaults()),
coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO),
binary_type_(kArrowDefaultBinaryType),
+ list_type_(kArrowDefaultListType),
arrow_extensions_enabled_(false),
should_load_statistics_(false) {}
@@ -1051,6 +1053,18 @@ class PARQUET_EXPORT ArrowReaderProperties {
/// Return the Arrow binary type to read BYTE_ARRAY columns as.
::arrow::Type::type binary_type() const { return binary_type_; }
+ /// \brief Set the Arrow list type to read Parquet list columns as.
+ ///
+ /// Allowed values are Type::LIST and Type::LARGE_LIST.
+ /// Default is Type::LIST.
+ ///
+ /// However, if a serialized Arrow schema is found in the Parquet metadata,
+ /// this setting is ignored and the Arrow schema takes precedence
+ /// (see ArrowWriterProperties::store_schema).
+ void set_list_type(::arrow::Type::type value) { list_type_ = value; }
+ /// Return the Arrow list type to read Parquet list columns as.
+ ::arrow::Type::type list_type() const { return list_type_; }
+
/// \brief Set the maximum number of rows to read into a record batch.
///
/// Will only be fewer rows when there are no more rows in the file.
@@ -1121,6 +1135,7 @@ class PARQUET_EXPORT ArrowReaderProperties {
::arrow::io::CacheOptions cache_options_;
::arrow::TimeUnit::type coerce_int96_timestamp_unit_;
::arrow::Type::type binary_type_;
+ ::arrow::Type::type list_type_;
bool arrow_extensions_enabled_;
bool should_load_statistics_;
};
diff --git a/python/pyarrow/_dataset_parquet.pyx
b/python/pyarrow/_dataset_parquet.pyx
index f5e501bdc8..9405b5d8c5 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -49,7 +49,7 @@ from pyarrow._dataset cimport (
from pyarrow._parquet cimport (
_create_writer_properties, _create_arrow_writer_properties,
- FileMetaData,
+ _unwrap_list_type, FileMetaData,
)
@@ -145,6 +145,7 @@ cdef class ParquetFileFormat(FileFormat):
options.coerce_int96_timestamp_unit = \
read_options._coerce_int96_timestamp_unit
options.binary_type = read_options._binary_type
+ options.list_type = read_options._list_type
self.init(<shared_ptr[CFileFormat]> wrapped)
self.default_fragment_scan_options = default_fragment_scan_options
@@ -186,6 +187,7 @@ cdef class ParquetFileFormat(FileFormat):
parquet_read_options._coerce_int96_timestamp_unit = \
options.coerce_int96_timestamp_unit
parquet_read_options._binary_type = options.binary_type
+ parquet_read_options._list_type = options.list_type
return parquet_read_options
def make_write_options(self, **kwargs):
@@ -516,20 +518,27 @@ cdef class ParquetReadOptions(_Weakrefable):
If given, Parquet binary columns will be read as this datatype.
This setting is ignored if a serialized Arrow schema is found in
the Parquet metadata.
+ list_type : subclass of pyarrow.DataType, default None
+ If given, non-MAP repeated columns will be read as an instance of
+ this datatype (either pyarrow.ListType or pyarrow.LargeListType).
+ This setting is ignored if a serialized Arrow schema is found in
+ the Parquet metadata.
"""
cdef public:
set dictionary_columns
TimeUnit _coerce_int96_timestamp_unit
Type _binary_type
+ Type _list_type
# Also see _PARQUET_READ_OPTIONS
def __init__(self, dictionary_columns=None,
coerce_int96_timestamp_unit=None,
- binary_type=None):
+ binary_type=None, list_type=None):
self.dictionary_columns = set(dictionary_columns or set())
self.coerce_int96_timestamp_unit = coerce_int96_timestamp_unit
self.binary_type = binary_type
+ self.list_type = list_type
@property
def binary_type(self):
@@ -542,6 +551,18 @@ cdef class ParquetReadOptions(_Weakrefable):
else:
self._binary_type = _Type_BINARY
+ @property
+ def list_type(self):
+ return (pa.LargeListType if self._list_type == _Type_LARGE_LIST
+ else pa.ListType)
+
+ @list_type.setter
+ def list_type(self, ty):
+ if ty is not None:
+ self._list_type = _unwrap_list_type(ty)
+ else:
+ self._list_type = _Type_LIST
+
@property
def coerce_int96_timestamp_unit(self):
return timeunit_to_string(self._coerce_int96_timestamp_unit)
@@ -564,9 +585,10 @@ cdef class ParquetReadOptions(_Weakrefable):
bool
"""
return (self.dictionary_columns == other.dictionary_columns and
- self.coerce_int96_timestamp_unit ==
- other.coerce_int96_timestamp_unit and
- self.binary_type == other.binary_type)
+ self._coerce_int96_timestamp_unit ==
+ other._coerce_int96_timestamp_unit and
+ self._binary_type == other._binary_type and
+ self._list_type == other._list_type)
def __eq__(self, other):
try:
@@ -578,7 +600,10 @@ cdef class ParquetReadOptions(_Weakrefable):
return (
f"<ParquetReadOptions"
f" dictionary_columns={self.dictionary_columns}"
- f" coerce_int96_timestamp_unit={self.coerce_int96_timestamp_unit}>"
+ f" coerce_int96_timestamp_unit={self.coerce_int96_timestamp_unit}"
+ f" binary_type={self.binary_type}"
+ f" list_type={self.list_type}"
+ f">"
)
@@ -696,7 +721,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
cdef set _PARQUET_READ_OPTIONS = {
- 'dictionary_columns', 'coerce_int96_timestamp_unit', 'binary_type'
+ 'dictionary_columns', 'coerce_int96_timestamp_unit', 'binary_type',
'list_type',
}
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index 1ef575d18f..94365f0f7c 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -68,6 +68,11 @@ cdef shared_ptr[ArrowWriterProperties]
_create_arrow_writer_properties(
store_schema=*,
) except *
+
+# Unwrap the "list_type" argument for ArrowReaderProperties
+cdef Type _unwrap_list_type(obj) except *
+
+
cdef class ParquetSchema(_Weakrefable):
cdef:
FileMetaData parent # the FileMetaData owning the SchemaDescriptor
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index b7b90c09db..d59c70a274 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -42,6 +42,7 @@ from pyarrow.lib cimport (_Weakrefable, Buffer, Schema,
string_to_timeunit)
from pyarrow.lib import (ArrowException, NativeFile, BufferOutputStream,
+ ListType, LargeListType,
_stringify_path,
tobytes, frombytes, is_threading_enabled)
@@ -50,6 +51,16 @@ cimport cpython as cp
_DEFAULT_ROW_GROUP_SIZE = 1024*1024
_MAX_ROW_GROUP_SIZE = 64*1024*1024
+
+cdef Type _unwrap_list_type(obj) except *:
+ if obj is ListType:
+ return _Type_LIST
+ elif obj is LargeListType:
+ return _Type_LARGE_LIST
+ else:
+ raise TypeError(f"Unexpected list_type: {obj!r}")
+
+
cdef class Statistics(_Weakrefable):
"""Statistics for a single column in a single row group."""
@@ -1552,7 +1563,7 @@ cdef class ParquetReader(_Weakrefable):
self._metadata = None
def open(self, object source not None, *, bint use_memory_map=False,
- read_dictionary=None, binary_type=None,
+ read_dictionary=None, binary_type=None, list_type=None,
FileMetaData metadata=None,
int buffer_size=0, bint pre_buffer=False,
coerce_int96_timestamp_unit=None,
@@ -1570,6 +1581,7 @@ cdef class ParquetReader(_Weakrefable):
use_memory_map : bool, default False
read_dictionary : iterable[int or str], optional
binary_type : pyarrow.DataType, optional
+ list_type : subclass of pyarrow.DataType, optional
metadata : FileMetaData, optional
buffer_size : int, default 0
pre_buffer : bool, default False
@@ -1625,6 +1637,9 @@ cdef class ParquetReader(_Weakrefable):
c_binary_type = pyarrow_unwrap_data_type(binary_type)
arrow_props.set_binary_type(c_binary_type.get().id())
+ if list_type is not None:
+ arrow_props.set_list_type(_unwrap_list_type(list_type))
+
if coerce_int96_timestamp_unit is None:
# use the default defined in default_arrow_reader_properties()
pass
diff --git a/python/pyarrow/includes/libarrow_dataset_parquet.pxd
b/python/pyarrow/includes/libarrow_dataset_parquet.pxd
index 9e44d8aa06..8f4917f6c0 100644
--- a/python/pyarrow/includes/libarrow_dataset_parquet.pxd
+++ b/python/pyarrow/includes/libarrow_dataset_parquet.pxd
@@ -64,6 +64,7 @@ cdef extern from "arrow/dataset/api.h" namespace
"arrow::dataset" nogil:
unordered_set[c_string] dict_columns
TimeUnit coerce_int96_timestamp_unit
Type binary_type
+ Type list_type
cdef cppclass CParquetFileFormat "arrow::dataset::ParquetFileFormat"(
CFileFormat):
diff --git a/python/pyarrow/includes/libparquet.pxd
b/python/pyarrow/includes/libparquet.pxd
index 6de37d4c24..d9dd9d1aec 100644
--- a/python/pyarrow/includes/libparquet.pxd
+++ b/python/pyarrow/includes/libparquet.pxd
@@ -442,6 +442,8 @@ cdef extern from "parquet/api/reader.h" namespace "parquet"
nogil:
ArrowReaderProperties()
void set_binary_type(Type binary_type)
Type binary_type()
+ void set_list_type(Type list_type)
+ Type list_type()
void set_read_dictionary(int column_index, c_bool read_dict)
c_bool read_dictionary(int column_index)
void set_batch_size(int64_t batch_size)
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 15dae27896..a84fd5e8b7 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -225,6 +225,11 @@ class ParquetFile:
If given, Parquet binary columns will be read as this datatype.
This setting is ignored if a serialized Arrow schema is found in
the Parquet metadata.
+ list_type : subclass of pyarrow.DataType, default None
+ If given, non-MAP repeated columns will be read as an instance of
+ this datatype (either pyarrow.ListType or pyarrow.LargeListType).
+ This setting is ignored if a serialized Arrow schema is found in
+ the Parquet metadata.
memory_map : bool, default False
If the source is a file path, use a memory map to read file, which can
improve performance in some environments.
@@ -304,8 +309,9 @@ class ParquetFile:
"""
def __init__(self, source, *, metadata=None, common_metadata=None,
- read_dictionary=None, binary_type=None, memory_map=False,
- buffer_size=0, pre_buffer=False,
coerce_int96_timestamp_unit=None,
+ read_dictionary=None, binary_type=None, list_type=None,
+ memory_map=False, buffer_size=0, pre_buffer=False,
+ coerce_int96_timestamp_unit=None,
decryption_properties=None, thrift_string_size_limit=None,
thrift_container_size_limit=None, filesystem=None,
page_checksum_verification=False,
arrow_extensions_enabled=False):
@@ -323,7 +329,7 @@ class ParquetFile:
source, use_memory_map=memory_map,
buffer_size=buffer_size, pre_buffer=pre_buffer,
read_dictionary=read_dictionary, metadata=metadata,
- binary_type=binary_type,
+ binary_type=binary_type, list_type=list_type,
coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
decryption_properties=decryption_properties,
thrift_string_size_limit=thrift_string_size_limit,
@@ -1202,6 +1208,11 @@ binary_type : pyarrow.DataType, default None
If given, Parquet binary columns will be read as this datatype.
This setting is ignored if a serialized Arrow schema is found in
the Parquet metadata.
+list_type : subclass of pyarrow.DataType, default None
+ If given, non-MAP repeated columns will be read as an instance of
+ this datatype (either pyarrow.ListType or pyarrow.LargeListType).
+ This setting is ignored if a serialized Arrow schema is found in
+ the Parquet metadata.
memory_map : bool, default False
If the source is a file path, use a memory map to read file, which can
improve performance in some environments.
@@ -1321,8 +1332,9 @@ Examples
"""
def __init__(self, path_or_paths, filesystem=None, schema=None, *,
filters=None,
- read_dictionary=None, binary_type=None, memory_map=False,
- buffer_size=None, partitioning="hive", ignore_prefixes=None,
+ read_dictionary=None, binary_type=None, list_type=None,
+ memory_map=False, buffer_size=None, partitioning="hive",
+ ignore_prefixes=None,
pre_buffer=True, coerce_int96_timestamp_unit=None,
decryption_properties=None, thrift_string_size_limit=None,
thrift_container_size_limit=None,
@@ -1339,6 +1351,7 @@ Examples
"page_checksum_verification": page_checksum_verification,
"arrow_extensions_enabled": arrow_extensions_enabled,
"binary_type": binary_type,
+ "list_type": list_type,
}
if buffer_size:
read_options.update(use_buffered_stream=True,
@@ -1819,9 +1832,10 @@ Read data from a single Parquet file:
def read_table(source, *, columns=None, use_threads=True,
schema=None, use_pandas_metadata=False, read_dictionary=None,
- binary_type=None, memory_map=False, buffer_size=0,
partitioning="hive",
- filesystem=None, filters=None, ignore_prefixes=None,
- pre_buffer=True, coerce_int96_timestamp_unit=None,
+ binary_type=None, list_type=None, memory_map=False,
buffer_size=0,
+ partitioning="hive", filesystem=None, filters=None,
+ ignore_prefixes=None, pre_buffer=True,
+ coerce_int96_timestamp_unit=None,
decryption_properties=None, thrift_string_size_limit=None,
thrift_container_size_limit=None,
page_checksum_verification=False,
@@ -1836,6 +1850,7 @@ def read_table(source, *, columns=None, use_threads=True,
memory_map=memory_map,
read_dictionary=read_dictionary,
binary_type=binary_type,
+ list_type=list_type,
buffer_size=buffer_size,
filters=filters,
ignore_prefixes=ignore_prefixes,
@@ -1872,6 +1887,7 @@ def read_table(source, *, columns=None, use_threads=True,
dataset = ParquetFile(
source, read_dictionary=read_dictionary,
binary_type=binary_type,
+ list_type=list_type,
memory_map=memory_map, buffer_size=buffer_size,
pre_buffer=pre_buffer,
coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
diff --git a/python/pyarrow/tests/parquet/common.py b/python/pyarrow/tests/parquet/common.py
index fd6ad94fbd..4f5946649b 100644
--- a/python/pyarrow/tests/parquet/common.py
+++ b/python/pyarrow/tests/parquet/common.py
@@ -67,9 +67,11 @@ def _check_roundtrip(table, expected=None, read_table_kwargs=None,
# intentionally check twice
result = _roundtrip_table(table, read_table_kwargs=read_table_kwargs,
write_table_kwargs=write_table_kwargs)
+ assert result.schema == expected.schema
assert result.equals(expected)
result = _roundtrip_table(result, read_table_kwargs=read_table_kwargs,
write_table_kwargs=write_table_kwargs)
+ assert result.schema == expected.schema
assert result.equals(expected)
diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py
index 6553a5217e..351221f64d 100644
--- a/python/pyarrow/tests/parquet/test_data_types.py
+++ b/python/pyarrow/tests/parquet/test_data_types.py
@@ -351,6 +351,30 @@ def test_large_list_records():
_check_roundtrip(table)
+list_types = [
+ (pa.ListType, pa.list_),
+ (pa.LargeListType, pa.large_list),
+]
+
+
+def test_list_types():
+ data = [[1, 2, None]] * 50
+ for _, in_factory in list_types:
+ array = pa.array(data, type=in_factory(pa.int32()))
+ table = pa.Table.from_arrays([array], ['lists'])
+ for out_type, out_factory in list_types:
+ for store_schema in (True, False):
+ if store_schema:
+ expected_table = table
+ else:
+ expected_table = pa.Table.from_arrays(
+ [pa.array(data, type=out_factory(pa.int32()))], ['lists'])
+ result = _roundtrip_table(
+ table, write_table_kwargs=dict(store_schema=store_schema),
+ read_table_kwargs=dict(list_type=out_type))
+ assert result == expected_table
+
+
@pytest.mark.pandas
def test_parquet_nested_convenience(tempdir):
# ARROW-1684
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index e01c919b42..4af0f914eb 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -846,6 +846,7 @@ def test_parquet_read_options():
opts2 = ds.ParquetReadOptions(dictionary_columns=['a', 'b'])
opts3 = ds.ParquetReadOptions(coerce_int96_timestamp_unit="ms")
opts4 = ds.ParquetReadOptions(binary_type=pa.binary_view())
+ opts5 = ds.ParquetReadOptions(list_type=pa.LargeListType)
assert opts1.dictionary_columns == set()
@@ -857,10 +858,14 @@ def test_parquet_read_options():
assert opts1.binary_type == pa.binary()
assert opts4.binary_type == pa.binary_view()
+ assert opts1.list_type is pa.ListType
+ assert opts5.list_type is pa.LargeListType
+
assert opts1 == opts1
assert opts1 != opts2
assert opts1 != opts3
assert opts1 != opts4
+ assert opts1 != opts5
opts4.binary_type = None
assert opts4.binary_type == pa.binary()
@@ -869,6 +874,13 @@ def test_parquet_read_options():
assert opts4.binary_type == pa.large_binary()
assert opts1 != opts4
+ opts5.list_type = pa.ListType
+ assert opts5.list_type is pa.ListType
+ assert opts5 == opts1
+ opts5.list_type = pa.LargeListType
+ assert opts5.list_type is pa.LargeListType
+ assert opts5 != opts1
+
@pytest.mark.parquet
def test_parquet_file_format_read_options():
@@ -876,6 +888,7 @@ def test_parquet_file_format_read_options():
pff2 = ds.ParquetFileFormat(dictionary_columns={'a'})
pff3 = ds.ParquetFileFormat(coerce_int96_timestamp_unit="s")
pff4 = ds.ParquetFileFormat(binary_type=pa.binary_view())
+ pff5 = ds.ParquetFileFormat(list_type=pa.LargeListType)
assert pff1.read_options == ds.ParquetReadOptions()
assert pff2.read_options == ds.ParquetReadOptions(dictionary_columns=['a'])
@@ -883,6 +896,8 @@ def test_parquet_file_format_read_options():
coerce_int96_timestamp_unit="s")
assert pff4.read_options == ds.ParquetReadOptions(
binary_type=pa.binary_view())
+ assert pff5.read_options == ds.ParquetReadOptions(
+ list_type=pa.LargeListType)
@pytest.mark.parquet