This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 03a1867571 GH-46676: [C++][Python][Parquet] Allow reading Parquet LIST
data as LargeList directly (#46678)
03a1867571 is described below
commit 03a1867571ef56bdb8c3842a5c26ed7e1ed94c52
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Jun 4 18:30:04 2025 +0200
GH-46676: [C++][Python][Parquet] Allow reading Parquet LIST data as
LargeList directly (#46678)
### Rationale for this change
When reading a Parquet LIST logical type (or a repeated field without a
logical type), Parquet C++ automatically reads it as an Arrow List array.
However, this can in some cases run into the 32-bit offsets limit. We'd
like to be able to choose to read as LargeList instead, even if there is no
serialized Arrow schema in the Parquet file.
### What changes are included in this PR?
* Add a Parquet read option `list_type` to select which Arrow type to read
LIST / repeated Parquet columns into
* Fix an index truncation bug when writing a huge single chunk of data to
Parquet
### Are these changes tested?
Yes, the functionality is tested. However, I wasn't able to write a unit
test that wouldn't consume a horrendous amount of time or memory
writing/reading a list with offsets larger than 2**32.
### Are there any user-facing changes?
No, only an API improvement.
* GitHub Issue: #46676
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/dataset/file_parquet.cc | 4 +-
cpp/src/arrow/dataset/file_parquet.h | 1 +
cpp/src/parquet/CMakeLists.txt | 7 +-
cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 110 ++-
cpp/src/parquet/arrow/arrow_schema_test.cc | 734 +++++++++++----------
cpp/src/parquet/arrow/schema.cc | 28 +-
cpp/src/parquet/column_writer.cc | 8 +-
cpp/src/parquet/properties.h | 15 +
python/pyarrow/_dataset_parquet.pyx | 39 +-
python/pyarrow/_parquet.pxd | 5 +
python/pyarrow/_parquet.pyx | 17 +-
.../pyarrow/includes/libarrow_dataset_parquet.pxd | 1 +
python/pyarrow/includes/libparquet.pxd | 2 +
python/pyarrow/parquet/core.py | 32 +-
python/pyarrow/tests/parquet/common.py | 2 +
python/pyarrow/tests/parquet/test_data_types.py | 24 +
python/pyarrow/tests/test_dataset.py | 15 +
17 files changed, 632 insertions(+), 412 deletions(-)
diff --git a/cpp/src/arrow/dataset/file_parquet.cc
b/cpp/src/arrow/dataset/file_parquet.cc
index d1a7350007..62b8f57ba1 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -117,6 +117,7 @@ parquet::ArrowReaderProperties MakeArrowReaderProperties(
properties.set_coerce_int96_timestamp_unit(
format.reader_options.coerce_int96_timestamp_unit);
properties.set_binary_type(format.reader_options.binary_type);
+ properties.set_list_type(format.reader_options.list_type);
return properties;
}
@@ -445,7 +446,8 @@ bool ParquetFileFormat::Equals(const FileFormat& other)
const {
return (reader_options.dict_columns == other_reader_options.dict_columns &&
reader_options.coerce_int96_timestamp_unit ==
other_reader_options.coerce_int96_timestamp_unit &&
- reader_options.binary_type == other_reader_options.binary_type);
+ reader_options.binary_type == other_reader_options.binary_type &&
+ reader_options.list_type == other_reader_options.list_type);
}
ParquetFileFormat::ParquetFileFormat(const parquet::ReaderProperties&
reader_properties)
diff --git a/cpp/src/arrow/dataset/file_parquet.h
b/cpp/src/arrow/dataset/file_parquet.h
index 234f4ca6f4..1811a96bf9 100644
--- a/cpp/src/arrow/dataset/file_parquet.h
+++ b/cpp/src/arrow/dataset/file_parquet.h
@@ -91,6 +91,7 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat {
std::unordered_set<std::string> dict_columns;
arrow::TimeUnit::type coerce_int96_timestamp_unit = arrow::TimeUnit::NANO;
Type::type binary_type = Type::BINARY;
+ Type::type list_type = Type::LIST;
/// @}
} reader_options;
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
index dadc5ec51d..dc7d40d2a3 100644
--- a/cpp/src/parquet/CMakeLists.txt
+++ b/cpp/src/parquet/CMakeLists.txt
@@ -402,17 +402,18 @@ add_parquet_test(writer-test
add_parquet_test(chunker-test SOURCES chunker_internal_test.cc)
-add_parquet_test(arrow-test
+add_parquet_test(arrow-reader-writer-test
SOURCES
- arrow/arrow_metadata_test.cc
arrow/arrow_reader_writer_test.cc
- arrow/arrow_schema_test.cc
arrow/arrow_statistics_test.cc
arrow/variant_test.cc)
add_parquet_test(arrow-internals-test SOURCES arrow/path_internal_test.cc
arrow/reconstruct_internal_test.cc)
+add_parquet_test(arrow-metadata-test SOURCES arrow/arrow_metadata_test.cc
+ arrow/arrow_schema_test.cc)
+
if(PARQUET_REQUIRE_ENCRYPTION)
add_parquet_test(encryption-test
SOURCES
diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 32d901ce78..9c74abab53 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -124,6 +124,19 @@ static constexpr int LARGE_SIZE = 10000;
static constexpr uint32_t kDefaultSeed = 0;
+struct ListCase {
+ ::arrow::Type::type type_id;
+
std::function<std::shared_ptr<::arrow::DataType>(std::shared_ptr<::arrow::Field>)>
+ type_factory;
+};
+
+static const std::vector<ListCase> kListCases = {
+ {::arrow::Type::LIST,
+ [](std::shared_ptr<::arrow::Field> field) { return ::arrow::list(field);
}},
+ {::arrow::Type::LARGE_LIST,
+ [](std::shared_ptr<::arrow::Field> field) { return
::arrow::large_list(field); }},
+};
+
std::shared_ptr<const LogicalType> get_logical_type(const DataType& type) {
switch (type.id()) {
case ArrowId::UINT8:
@@ -426,10 +439,13 @@ void CheckConfiguredRoundtrip(
const std::shared_ptr<::parquet::WriterProperties>& writer_properties =
::parquet::default_writer_properties(),
const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
- default_arrow_writer_properties()) {
+ default_arrow_writer_properties(),
+ const ArrowReaderProperties& arrow_reader_properties =
+ default_arrow_reader_properties()) {
std::shared_ptr<Table> actual_table;
ASSERT_NO_FATAL_FAILURE(DoRoundtrip(input_table, input_table->num_rows(),
&actual_table,
- writer_properties,
arrow_writer_properties));
+ writer_properties,
arrow_writer_properties,
+ arrow_reader_properties));
if (expected_table) {
ASSERT_NO_FATAL_FAILURE(::arrow::AssertSchemaEqual(*actual_table->schema(),
*expected_table->schema(),
@@ -446,14 +462,18 @@ void CheckConfiguredRoundtrip(
void DoSimpleRoundtrip(const std::shared_ptr<Table>& table, bool use_threads,
int64_t row_group_size, const std::vector<int>&
column_subset,
std::shared_ptr<Table>* out,
- const std::shared_ptr<ArrowWriterProperties>&
arrow_properties =
- default_arrow_writer_properties()) {
+ const std::shared_ptr<ArrowWriterProperties>&
+ arrow_writer_properties =
default_arrow_writer_properties(),
+ const ArrowReaderProperties& arrow_reader_properties =
+ default_arrow_reader_properties()) {
std::shared_ptr<Buffer> buffer;
ASSERT_NO_FATAL_FAILURE(
- WriteTableToBuffer(table, row_group_size, arrow_properties, &buffer));
+ WriteTableToBuffer(table, row_group_size, arrow_writer_properties,
&buffer));
- ASSERT_OK_AND_ASSIGN(auto reader,
OpenFile(std::make_shared<BufferReader>(buffer),
- ::arrow::default_memory_pool()));
+ std::unique_ptr<FileReader> reader;
+ FileReaderBuilder builder;
+ ASSERT_OK_NO_THROW(builder.Open(std::make_shared<BufferReader>(buffer)));
+ ASSERT_OK(builder.properties(arrow_reader_properties)->Build(&reader));
reader->set_use_threads(use_threads);
if (column_subset.size() > 0) {
@@ -468,7 +488,8 @@ void DoRoundTripWithBatches(
const std::shared_ptr<Table>& table, bool use_threads, int64_t
row_group_size,
const std::vector<int>& column_subset, std::shared_ptr<Table>* out,
const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
- default_arrow_writer_properties()) {
+ default_arrow_writer_properties(),
+ ArrowReaderProperties arrow_reader_properties =
default_arrow_reader_properties()) {
std::shared_ptr<Buffer> buffer;
ASSERT_NO_FATAL_FAILURE(
WriteTableToBuffer(table, row_group_size, arrow_writer_properties,
&buffer));
@@ -476,7 +497,6 @@ void DoRoundTripWithBatches(
std::unique_ptr<FileReader> reader;
FileReaderBuilder builder;
ASSERT_OK_NO_THROW(builder.Open(std::make_shared<BufferReader>(buffer)));
- ArrowReaderProperties arrow_reader_properties;
arrow_reader_properties.set_batch_size(row_group_size - 1);
ASSERT_OK_NO_THROW(builder.memory_pool(::arrow::default_memory_pool())
->properties(arrow_reader_properties)
@@ -497,23 +517,24 @@ void DoRoundTripWithBatches(
ASSERT_OK_AND_ASSIGN(*out, Table::FromRecordBatchReader(batch_reader.get()));
}
-void CheckSimpleRoundtrip(
- const std::shared_ptr<Table>& table, int64_t row_group_size,
- const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
- default_arrow_writer_properties()) {
+void CheckSimpleRoundtrip(const std::shared_ptr<Table>& table, int64_t
row_group_size,
+ const std::shared_ptr<ArrowWriterProperties>&
+ arrow_writer_properties =
default_arrow_writer_properties(),
+ const ArrowReaderProperties& arrow_reader_properties
=
+ default_arrow_reader_properties()) {
std::shared_ptr<Table> result;
- ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip(table, false /* use_threads */,
- row_group_size, {}, &result,
- arrow_writer_properties));
+ ASSERT_NO_FATAL_FAILURE(
+ DoSimpleRoundtrip(table, false /* use_threads */, row_group_size, {},
&result,
+ arrow_writer_properties, arrow_reader_properties));
::arrow::AssertSchemaEqual(*table->schema(), *result->schema(),
/*check_metadata=*/false);
ASSERT_OK(result->ValidateFull());
::arrow::AssertTablesEqual(*table, *result, false);
- ASSERT_NO_FATAL_FAILURE(DoRoundTripWithBatches(table, false /* use_threads
*/,
- row_group_size, {}, &result,
- arrow_writer_properties));
+ ASSERT_NO_FATAL_FAILURE(
+ DoRoundTripWithBatches(table, false /* use_threads */, row_group_size,
{}, &result,
+ arrow_writer_properties,
arrow_reader_properties));
::arrow::AssertSchemaEqual(*table->schema(), *result->schema(),
/*check_metadata=*/false);
ASSERT_OK(result->ValidateFull());
@@ -3198,8 +3219,22 @@ TEST(ArrowReadWrite, LargeList) {
[7, 8, 9]])";
auto array = ::arrow::ArrayFromJSON(type, json);
auto table = ::arrow::Table::Make(::arrow::schema({field("root", type)}),
{array});
- auto props_store_schema =
ArrowWriterProperties::Builder().store_schema()->build();
- CheckSimpleRoundtrip(table, 2, props_store_schema);
+ {
+ // If the schema is stored, the large_list is restored regardless of
+ // the list_type setting
+ for (auto list_type : {::arrow::Type::LIST, ::arrow::Type::LARGE_LIST}) {
+ ArrowReaderProperties reader_props;
+ reader_props.set_list_type(list_type);
+ auto writer_props =
ArrowWriterProperties::Builder().store_schema()->build();
+ CheckSimpleRoundtrip(table, 2, writer_props, reader_props);
+ }
+ }
+ {
+ // If the schema is not stored, large_list is read depending on the
list_type setting
+ ArrowReaderProperties reader_props;
+ reader_props.set_list_type(::arrow::Type::LARGE_LIST);
+ CheckSimpleRoundtrip(table, 2, default_arrow_writer_properties(),
reader_props);
+ }
}
TEST(ArrowReadWrite, FixedSizeList) {
@@ -3224,20 +3259,25 @@ TEST(ArrowReadWrite, ListOfStructOfList2) {
using ::arrow::list;
using ::arrow::struct_;
- auto type =
- list(field("item",
- struct_({field("a", ::arrow::int16(), /*nullable=*/false),
- field("b", list(::arrow::int64()),
/*nullable=*/false)}),
- /*nullable=*/false));
-
- const char* json = R"([
- [{"a": 123, "b": [1, 2, 3]}],
- null,
- [],
- [{"a": 456, "b": []}, {"a": 789, "b": [null]}, {"a": 876, "b": [4, 5,
6]}]])";
- auto array = ::arrow::ArrayFromJSON(type, json);
- auto table = ::arrow::Table::Make(::arrow::schema({field("root", type)}),
{array});
- CheckSimpleRoundtrip(table, 2);
+ for (const auto& list_case : kListCases) {
+ auto type = list_case.type_factory(
+ field("item",
+ struct_({field("a", ::arrow::int16(), /*nullable=*/false),
+ field("b", list_case.type_factory(field("item",
::arrow::int64())),
+ /*nullable=*/false)}),
+ /*nullable=*/false));
+
+ const char* json = R"([
+ [{"a": 123, "b": [1, 2, 3]}],
+ null,
+ [],
+ [{"a": 456, "b": []}, {"a": 789, "b": [null]}, {"a": 876, "b": [4, 5,
6]}]])";
+ auto array = ::arrow::ArrayFromJSON(type, json);
+ auto table = ::arrow::Table::Make(::arrow::schema({field("root", type)}),
{array});
+ ArrowReaderProperties reader_props;
+ reader_props.set_list_type(list_case.type_id);
+ CheckSimpleRoundtrip(table, 2, default_arrow_writer_properties(),
reader_props);
+ }
}
TEST(ArrowReadWrite, StructOfLists) {
diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc
b/cpp/src/parquet/arrow/arrow_schema_test.cc
index f55631e8c6..390d8bc77e 100644
--- a/cpp/src/parquet/arrow/arrow_schema_test.cc
+++ b/cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.
+#include <functional>
#include <memory>
#include <vector>
@@ -70,6 +71,19 @@ const auto TIMESTAMP_NS = ::arrow::timestamp(TimeUnit::NANO);
const auto BINARY = ::arrow::binary();
const auto DECIMAL_8_4 = std::make_shared<::arrow::Decimal128Type>(8, 4);
+struct ListCase {
+ ::arrow::Type::type type_id;
+
std::function<std::shared_ptr<::arrow::DataType>(std::shared_ptr<::arrow::Field>)>
+ type_factory;
+};
+
+static const std::vector<ListCase> kListCases = {
+ {::arrow::Type::LIST,
+ [](std::shared_ptr<::arrow::Field> field) { return ::arrow::list(field);
}},
+ {::arrow::Type::LARGE_LIST,
+ [](std::shared_ptr<::arrow::Field> field) { return
::arrow::large_list(field); }},
+};
+
class TestConvertParquetSchema : public ::testing::Test {
public:
virtual void SetUp() {}
@@ -360,309 +374,320 @@ TEST_F(TestConvertParquetSchema, ParquetFlatDecimals) {
}
TEST_F(TestConvertParquetSchema, ParquetMaps) {
- std::vector<NodePtr> parquet_fields;
- std::vector<std::shared_ptr<Field>> arrow_fields;
-
// MAP encoding example taken from parquet-format/LogicalTypes.md
- // Two column map.
- {
- auto key = PrimitiveNode::Make("key", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY,
- ConvertedType::UTF8);
- auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ for (const auto& list_case : kListCases) {
+ std::vector<NodePtr> parquet_fields;
+ std::vector<std::shared_ptr<Field>> arrow_fields;
- auto list = GroupNode::Make("key_value", Repetition::REPEATED, {key,
value});
- parquet_fields.push_back(
- GroupNode::Make("my_map", Repetition::REQUIRED, {list},
LogicalType::Map()));
- auto arrow_key = ::arrow::field("key", UTF8, /*nullable=*/false);
- auto arrow_value = ::arrow::field("value", UTF8, /*nullable=*/true);
- auto arrow_map = std::make_shared<::arrow::MapType>(
- ::arrow::field("my_map", ::arrow::struct_({arrow_key, arrow_value}),
- /*nullable=*/false),
- /*nullable=*/false);
+ // Two column map.
+ {
+ auto key = PrimitiveNode::Make("key", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY,
+ ConvertedType::UTF8);
+ auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- arrow_fields.push_back(::arrow::field("my_map", arrow_map,
/*nullable=*/false));
- }
- // Single column map (i.e. set) gets converted to list of struct.
- {
- auto key = PrimitiveNode::Make("key", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY,
- ConvertedType::UTF8);
+ auto list = GroupNode::Make("key_value", Repetition::REPEATED, {key,
value});
+ parquet_fields.push_back(
+ GroupNode::Make("my_map", Repetition::REQUIRED, {list},
LogicalType::Map()));
+ auto arrow_key = ::arrow::field("key", UTF8, /*nullable=*/false);
+ auto arrow_value = ::arrow::field("value", UTF8, /*nullable=*/true);
+ auto arrow_map = std::make_shared<::arrow::MapType>(
+ ::arrow::field("my_map", ::arrow::struct_({arrow_key, arrow_value}),
+ /*nullable=*/false),
+ /*nullable=*/false);
+
+ arrow_fields.push_back(::arrow::field("my_map", arrow_map,
/*nullable=*/false));
+ }
+ // Single column map (i.e. set) gets converted to list of struct.
+ {
+ auto key = PrimitiveNode::Make("key", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY,
+ ConvertedType::UTF8);
+
+ auto list = GroupNode::Make("key_value", Repetition::REPEATED, {key});
+ parquet_fields.push_back(
+ GroupNode::Make("my_set", Repetition::REQUIRED, {list},
LogicalType::Map()));
+ auto arrow_list =
+ list_case.type_factory(::arrow::field("key", UTF8,
/*nullable=*/false));
+ arrow_fields.push_back(::arrow::field("my_set", arrow_list, false));
+ }
+ // Two column map with non-standard field names.
+ {
+ auto key = PrimitiveNode::Make("int_key", Repetition::REQUIRED,
ParquetType::INT32,
+ ConvertedType::INT_32);
+ auto value = PrimitiveNode::Make("str_value", Repetition::OPTIONAL,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto list = GroupNode::Make("key_value", Repetition::REPEATED, {key});
- parquet_fields.push_back(
- GroupNode::Make("my_set", Repetition::REQUIRED, {list},
LogicalType::Map()));
- auto arrow_list = ::arrow::list({::arrow::field("key", UTF8,
/*nullable=*/false)});
- arrow_fields.push_back(::arrow::field("my_set", arrow_list, false));
- }
- // Two column map with non-standard field names.
- {
- auto key = PrimitiveNode::Make("int_key", Repetition::REQUIRED,
ParquetType::INT32,
- ConvertedType::INT_32);
- auto value = PrimitiveNode::Make("str_value", Repetition::OPTIONAL,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto list = GroupNode::Make("items", Repetition::REPEATED, {key, value});
+ parquet_fields.push_back(
+ GroupNode::Make("items", Repetition::REQUIRED, {list},
LogicalType::Map()));
+ auto arrow_value = ::arrow::field("str_value", UTF8, /*nullable=*/true);
+ auto arrow_key = ::arrow::field("int_key", INT32, /*nullable=*/false);
+ auto arrow_map = std::make_shared<::arrow::MapType>(
+ ::arrow::field("items", ::arrow::struct_({arrow_key, arrow_value}),
false),
+ false);
- auto list = GroupNode::Make("items", Repetition::REPEATED, {key, value});
- parquet_fields.push_back(
- GroupNode::Make("items", Repetition::REQUIRED, {list},
LogicalType::Map()));
- auto arrow_value = ::arrow::field("str_value", UTF8, /*nullable=*/true);
- auto arrow_key = ::arrow::field("int_key", INT32, /*nullable=*/false);
- auto arrow_map = std::make_shared<::arrow::MapType>(
- ::arrow::field("items", ::arrow::struct_({arrow_key, arrow_value}),
false),
- false);
-
- arrow_fields.push_back(::arrow::field("items", arrow_map, false));
- }
+ arrow_fields.push_back(::arrow::field("items", arrow_map, false));
+ }
- auto arrow_schema = ::arrow::schema(arrow_fields);
- ASSERT_OK(ConvertSchema(parquet_fields));
+ auto arrow_schema = ::arrow::schema(arrow_fields);
- ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
- for (int i = 0; i < arrow_schema->num_fields(); ++i) {
- auto result_field = result_schema_->field(i);
- auto expected_field = arrow_schema->field(i);
- if (expected_field->type()->id() == ::arrow::Type::MAP) {
- EXPECT_TRUE(
-
expected_field->type()->field(0)->Equals(result_field->type()->field(0)))
- << "Map's struct in field " << i
- << "\n result: " << result_field->type()->field(0)->ToString() << " "
- << "\n expected: " << expected_field->type()->field(0)->ToString()
<< "\n";
+ ArrowReaderProperties props;
+ props.set_list_type(list_case.type_id);
+ ASSERT_OK(ConvertSchema(parquet_fields, /*key_value_metadata=*/{}, props));
+
+ ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
+ for (int i = 0; i < arrow_schema->num_fields(); ++i) {
+ auto result_field = result_schema_->field(i);
+ auto expected_field = arrow_schema->field(i);
+ if (expected_field->type()->id() == ::arrow::Type::MAP) {
+ EXPECT_TRUE(
+
expected_field->type()->field(0)->Equals(result_field->type()->field(0)))
+ << "Map's struct in field " << i
+ << "\n result: " << result_field->type()->field(0)->ToString() <<
" "
+ << "\n expected: " << expected_field->type()->field(0)->ToString()
<< "\n";
+ }
}
}
}
TEST_F(TestConvertParquetSchema, ParquetLists) {
- std::vector<NodePtr> parquet_fields;
- std::vector<std::shared_ptr<Field>> arrow_fields;
-
// LIST encoding example taken from parquet-format/LogicalTypes.md
- // // List<String> (list non-null, elements nullable)
- // required group my_list (LIST) {
- // repeated group list {
- // optional binary element (UTF8);
- // }
- // }
- {
- auto element = PrimitiveNode::Make("string", Repetition::OPTIONAL,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::REQUIRED, {list},
ConvertedType::LIST));
- auto arrow_element = ::arrow::field("string", UTF8, true);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, false));
- }
+ for (const auto& list_case : kListCases) {
+ std::vector<NodePtr> parquet_fields;
+ std::vector<std::shared_ptr<Field>> arrow_fields;
- // // List<String> (list nullable, elements non-null)
- // optional group my_list (LIST) {
- // repeated group list {
- // required binary element (UTF8);
- // }
- // }
- {
- auto element = PrimitiveNode::Make("string", Repetition::REQUIRED,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::OPTIONAL, {list},
ConvertedType::LIST));
- auto arrow_element = ::arrow::field("string", UTF8, false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
- }
+ // // List<String> (list non-null, elements nullable)
+ // required group my_list (LIST) {
+ // repeated group list {
+ // optional binary element (UTF8);
+ // }
+ // }
+ {
+ auto element = PrimitiveNode::Make("string", Repetition::OPTIONAL,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::REQUIRED, {list},
ConvertedType::LIST));
+ auto arrow_element = ::arrow::field("string", UTF8, true);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, false));
+ }
- // Element types can be nested structures. For example, a list of lists:
- //
- // // List<List<Integer>>
- // optional group array_of_arrays (LIST) {
- // repeated group list {
- // required group element (LIST) {
- // repeated group list {
- // required int32 element;
- // }
- // }
- // }
- // }
- {
- auto inner_element =
- PrimitiveNode::Make("int32", Repetition::REQUIRED, ParquetType::INT32);
- auto inner_list = GroupNode::Make("list", Repetition::REPEATED,
{inner_element});
- auto element = GroupNode::Make("element", Repetition::REQUIRED,
{inner_list},
- ConvertedType::LIST);
- auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
- parquet_fields.push_back(GroupNode::Make("array_of_arrays",
Repetition::OPTIONAL,
- {list}, ConvertedType::LIST));
- auto arrow_inner_element = ::arrow::field("int32", INT32, false);
- auto arrow_inner_list = ::arrow::list(arrow_inner_element);
- auto arrow_element = ::arrow::field("element", arrow_inner_list, false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("array_of_arrays", arrow_list,
true));
- }
+ // // List<String> (list nullable, elements non-null)
+ // optional group my_list (LIST) {
+ // repeated group list {
+ // required binary element (UTF8);
+ // }
+ // }
+ {
+ auto element = PrimitiveNode::Make("string", Repetition::REQUIRED,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::OPTIONAL, {list},
ConvertedType::LIST));
+ auto arrow_element = ::arrow::field("string", UTF8, false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+ }
- // // List<String> (list nullable, elements non-null)
- // optional group my_list (LIST) {
- // repeated group element {
- // required binary str (UTF8);
- // };
- // }
- {
- auto element = PrimitiveNode::Make("str", Repetition::REQUIRED,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto list = GroupNode::Make("element", Repetition::REPEATED, {element});
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::OPTIONAL, {list},
ConvertedType::LIST));
- auto arrow_element = ::arrow::field("str", UTF8, false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
- }
+ // Element types can be nested structures. For example, a list of lists:
+ //
+ // // List<List<Integer>>
+ // optional group array_of_arrays (LIST) {
+ // repeated group list {
+ // required group element (LIST) {
+ // repeated group list {
+ // required int32 element;
+ // }
+ // }
+ // }
+ // }
+ {
+ auto inner_element =
+ PrimitiveNode::Make("int32", Repetition::REQUIRED,
ParquetType::INT32);
+ auto inner_list = GroupNode::Make("list", Repetition::REPEATED,
{inner_element});
+ auto element = GroupNode::Make("element", Repetition::REQUIRED,
{inner_list},
+ ConvertedType::LIST);
+ auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
+ parquet_fields.push_back(GroupNode::Make("array_of_arrays",
Repetition::OPTIONAL,
+ {list}, ConvertedType::LIST));
+ auto arrow_inner_element = ::arrow::field("int32", INT32, false);
+ auto arrow_inner_list = list_case.type_factory(arrow_inner_element);
+ auto arrow_element = ::arrow::field("element", arrow_inner_list, false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("array_of_arrays", arrow_list,
true));
+ }
- // // List<Integer> (nullable list, non-null elements)
- // optional group my_list (LIST) {
- // repeated int32 element;
- // }
- {
- auto element =
- PrimitiveNode::Make("element", Repetition::REPEATED,
ParquetType::INT32);
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::OPTIONAL, {element},
ConvertedType::LIST));
- auto arrow_element = ::arrow::field("element", INT32, false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
- }
+ // // List<String> (list nullable, elements non-null)
+ // optional group my_list (LIST) {
+ // repeated group element {
+ // required binary str (UTF8);
+ // };
+ // }
+ {
+ auto element = PrimitiveNode::Make("str", Repetition::REQUIRED,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto list = GroupNode::Make("element", Repetition::REPEATED, {element});
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::OPTIONAL, {list},
ConvertedType::LIST));
+ auto arrow_element = ::arrow::field("str", UTF8, false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+ }
- // // List<Tuple<String, Integer>> (nullable list, non-null elements)
- // optional group my_list (LIST) {
- // repeated group element {
- // required binary str (UTF8);
- // required int32 num;
- // };
- // }
- {
- auto str_element = PrimitiveNode::Make("str", Repetition::REQUIRED,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto num_element =
- PrimitiveNode::Make("num", Repetition::REQUIRED, ParquetType::INT32);
- auto element =
- GroupNode::Make("element", Repetition::REPEATED, {str_element,
num_element});
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::OPTIONAL, {element},
ConvertedType::LIST));
- auto arrow_str = ::arrow::field("str", UTF8, false);
- auto arrow_num = ::arrow::field("num", INT32, false);
- std::vector<std::shared_ptr<Field>> fields({arrow_str, arrow_num});
- auto arrow_struct = ::arrow::struct_(fields);
- auto arrow_element = ::arrow::field("element", arrow_struct, false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
- }
+ // // List<Integer> (nullable list, non-null elements)
+ // optional group my_list (LIST) {
+ // repeated int32 element;
+ // }
+ {
+ auto element =
+ PrimitiveNode::Make("element", Repetition::REPEATED,
ParquetType::INT32);
+ parquet_fields.push_back(GroupNode::Make("my_list",
Repetition::OPTIONAL, {element},
+ ConvertedType::LIST));
+ auto arrow_element = ::arrow::field("element", INT32, false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+ }
- // // List<OneTuple<String>> (nullable list, non-null elements)
- // optional group my_list (LIST) {
- // repeated group array {
- // required binary str (UTF8);
- // };
- // }
- // Special case: group is named array
- {
- auto element = PrimitiveNode::Make("str", Repetition::REQUIRED,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto array = GroupNode::Make("array", Repetition::REPEATED, {element});
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::OPTIONAL, {array},
ConvertedType::LIST));
- auto arrow_str = ::arrow::field("str", UTF8, false);
- std::vector<std::shared_ptr<Field>> fields({arrow_str});
- auto arrow_struct = ::arrow::struct_(fields);
- auto arrow_element = ::arrow::field("array", arrow_struct, false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
- }
+ // // List<Tuple<String, Integer>> (nullable list, non-null elements)
+ // optional group my_list (LIST) {
+ // repeated group element {
+ // required binary str (UTF8);
+ // required int32 num;
+ // };
+ // }
+ {
+ auto str_element = PrimitiveNode::Make(
+ "str", Repetition::REQUIRED, ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto num_element =
+ PrimitiveNode::Make("num", Repetition::REQUIRED, ParquetType::INT32);
+ auto element =
+ GroupNode::Make("element", Repetition::REPEATED, {str_element,
num_element});
+ parquet_fields.push_back(GroupNode::Make("my_list",
Repetition::OPTIONAL, {element},
+ ConvertedType::LIST));
+ auto arrow_str = ::arrow::field("str", UTF8, false);
+ auto arrow_num = ::arrow::field("num", INT32, false);
+ std::vector<std::shared_ptr<Field>> fields({arrow_str, arrow_num});
+ auto arrow_struct = ::arrow::struct_(fields);
+ auto arrow_element = ::arrow::field("element", arrow_struct, false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+ }
- // // List<OneTuple<String>> (nullable list, non-null elements)
- // optional group my_list (LIST) {
- // repeated group my_list_tuple {
- // required binary str (UTF8);
- // };
- // }
- // Special case: group named ends in _tuple
- {
- auto element = PrimitiveNode::Make("str", Repetition::REQUIRED,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto array = GroupNode::Make("my_list_tuple", Repetition::REPEATED,
{element});
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::OPTIONAL, {array},
ConvertedType::LIST));
- auto arrow_str = ::arrow::field("str", UTF8, false);
- std::vector<std::shared_ptr<Field>> fields({arrow_str});
- auto arrow_struct = ::arrow::struct_(fields);
- auto arrow_element = ::arrow::field("my_list_tuple", arrow_struct, false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
- }
+ // // List<OneTuple<String>> (nullable list, non-null elements)
+ // optional group my_list (LIST) {
+ // repeated group array {
+ // required binary str (UTF8);
+ // };
+ // }
+ // Special case: group is named array
+ {
+ auto element = PrimitiveNode::Make("str", Repetition::REQUIRED,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto array = GroupNode::Make("array", Repetition::REPEATED, {element});
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::OPTIONAL, {array},
ConvertedType::LIST));
+ auto arrow_str = ::arrow::field("str", UTF8, false);
+ std::vector<std::shared_ptr<Field>> fields({arrow_str});
+ auto arrow_struct = ::arrow::struct_(fields);
+ auto arrow_element = ::arrow::field("array", arrow_struct, false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+ }
- // One-level encoding: Only allows required lists with required cells
- // repeated value_type name
- {
- parquet_fields.push_back(
- PrimitiveNode::Make("name", Repetition::REPEATED, ParquetType::INT32));
- auto arrow_element = ::arrow::field("name", INT32, false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("name", arrow_list, false));
- }
+ // // List<OneTuple<String>> (nullable list, non-null elements)
+ // optional group my_list (LIST) {
+ // repeated group my_list_tuple {
+ // required binary str (UTF8);
+ // };
+ // }
+ // Special case: group named ends in _tuple
+ {
+ auto element = PrimitiveNode::Make("str", Repetition::REQUIRED,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto array = GroupNode::Make("my_list_tuple", Repetition::REPEATED,
{element});
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::OPTIONAL, {array},
ConvertedType::LIST));
+ auto arrow_str = ::arrow::field("str", UTF8, false);
+ std::vector<std::shared_ptr<Field>> fields({arrow_str});
+ auto arrow_struct = ::arrow::struct_(fields);
+ auto arrow_element = ::arrow::field("my_list_tuple", arrow_struct,
false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+ }
- // Two-level encoding List<List<Integer>>:
- // optional group my_list (LIST) {
- // repeated group array (LIST) {
- // repeated int32 array;
- // }
- // }
- {
- auto inner_array =
- PrimitiveNode::Make("array", Repetition::REPEATED, ParquetType::INT32);
- auto outer_array = GroupNode::Make("array", Repetition::REPEATED,
{inner_array},
- ConvertedType::LIST);
- parquet_fields.push_back(GroupNode::Make("my_list", Repetition::OPTIONAL,
- {outer_array},
ConvertedType::LIST));
- auto arrow_inner_array = ::arrow::field("array", INT32,
/*nullable=*/false);
- auto arrow_outer_array =
- ::arrow::field("array", ::arrow::list(arrow_inner_array),
/*nullable=*/false);
- auto arrow_list = ::arrow::list(arrow_outer_array);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
- }
+ // One-level encoding: Only allows required lists with required cells
+ // repeated value_type name
+ {
+ parquet_fields.push_back(
+ PrimitiveNode::Make("name", Repetition::REPEATED,
ParquetType::INT32));
+ auto arrow_element = ::arrow::field("name", INT32, false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("name", arrow_list, false));
+ }
- // List<Map<String, String>> in three-level list encoding:
- // optional group my_list (LIST) {
- // repeated group list {
- // required group element (MAP) {
- // repeated group key_value {
- // required binary key (STRING);
- // optional binary value (STRING);
- // }
- // }
- // }
- // }
- {
- auto key = PrimitiveNode::Make("key", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY,
- ConvertedType::UTF8);
- auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, {key,
value});
- auto element =
- GroupNode::Make("element", Repetition::REQUIRED, {key_value},
ConvertedType::MAP);
- auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::OPTIONAL, {list},
ConvertedType::LIST));
-
- auto arrow_key = ::arrow::field("key", UTF8, /*nullable=*/false);
- auto arrow_value = ::arrow::field("value", UTF8, /*nullable=*/true);
- auto arrow_element = ::arrow::field(
- "element", std::make_shared<::arrow::MapType>(arrow_key, arrow_value),
- /*nullable=*/false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list,
/*nullable=*/true));
- }
+ // Two-level encoding List<List<Integer>>:
+ // optional group my_list (LIST) {
+ // repeated group array (LIST) {
+ // repeated int32 array;
+ // }
+ // }
+ {
+ auto inner_array =
+ PrimitiveNode::Make("array", Repetition::REPEATED,
ParquetType::INT32);
+ auto outer_array = GroupNode::Make("array", Repetition::REPEATED,
{inner_array},
+ ConvertedType::LIST);
+ parquet_fields.push_back(GroupNode::Make("my_list", Repetition::OPTIONAL,
+ {outer_array},
ConvertedType::LIST));
+ auto arrow_inner_array = ::arrow::field("array", INT32,
/*nullable=*/false);
+ auto arrow_outer_array = ::arrow::field(
+ "array", list_case.type_factory(arrow_inner_array),
/*nullable=*/false);
+ auto arrow_list = list_case.type_factory(arrow_outer_array);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+ }
- auto arrow_schema = ::arrow::schema(arrow_fields);
- ASSERT_OK(ConvertSchema(parquet_fields));
+ // List<Map<String, String>> in three-level list encoding:
+ // optional group my_list (LIST) {
+ // repeated group list {
+ // required group element (MAP) {
+ // repeated group key_value {
+ // required binary key (STRING);
+ // optional binary value (STRING);
+ // }
+ // }
+ // }
+ // }
+ {
+ auto key = PrimitiveNode::Make("key", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY,
+ ConvertedType::UTF8);
+ auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto key_value = GroupNode::Make("key_value", Repetition::REPEATED,
{key, value});
+ auto element = GroupNode::Make("element", Repetition::REQUIRED,
{key_value},
+ ConvertedType::MAP);
+ auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::OPTIONAL, {list},
ConvertedType::LIST));
+
+ auto arrow_key = ::arrow::field("key", UTF8, /*nullable=*/false);
+ auto arrow_value = ::arrow::field("value", UTF8, /*nullable=*/true);
+ auto arrow_element = ::arrow::field(
+ "element", std::make_shared<::arrow::MapType>(arrow_key,
arrow_value),
+ /*nullable=*/false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list,
/*nullable=*/true));
+ }
- ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
+ auto arrow_schema = ::arrow::schema(arrow_fields);
+
+ ArrowReaderProperties props;
+ props.set_list_type(list_case.type_id);
+ ASSERT_OK(ConvertSchema(parquet_fields, /*key_value_metadata=*/{}, props));
+
+ ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
+ }
}
TEST_F(TestConvertParquetSchema, UnsupportedThings) {
@@ -769,43 +794,52 @@ TEST_F(TestConvertParquetSchema, ParquetUndefinedType) {
}
TEST_F(TestConvertParquetSchema, ParquetRepeatedNestedSchema) {
- std::vector<NodePtr> parquet_fields;
- std::vector<std::shared_ptr<Field>> arrow_fields;
- {
- // optional int32 leaf1;
- // repeated group outerGroup {
- // optional int32 leaf2;
- // repeated group innerGroup {
- // optional int32 leaf3;
- // }
- // }
- parquet_fields.push_back(
- PrimitiveNode::Make("leaf1", Repetition::OPTIONAL,
ParquetType::INT32));
- parquet_fields.push_back(GroupNode::Make(
- "outerGroup", Repetition::REPEATED,
- {PrimitiveNode::Make("leaf2", Repetition::OPTIONAL,
ParquetType::INT32),
- GroupNode::Make(
- "innerGroup", Repetition::REPEATED,
- {PrimitiveNode::Make("leaf3", Repetition::OPTIONAL,
ParquetType::INT32)})}));
-
- auto inner_group_fields = {::arrow::field("leaf3", INT32, true)};
- auto inner_group_type = ::arrow::struct_(inner_group_fields);
- auto outer_group_fields = {
- ::arrow::field("leaf2", INT32, true),
- ::arrow::field(
- "innerGroup",
- ::arrow::list(::arrow::field("innerGroup", inner_group_type,
false)), false)};
- auto outer_group_type = ::arrow::struct_(outer_group_fields);
-
- arrow_fields.push_back(::arrow::field("leaf1", INT32, true));
- arrow_fields.push_back(::arrow::field(
- "outerGroup",
- ::arrow::list(::arrow::field("outerGroup", outer_group_type, false)),
false));
- }
- auto arrow_schema = ::arrow::schema(arrow_fields);
- ASSERT_OK(ConvertSchema(parquet_fields));
+ for (const auto& list_case : kListCases) {
+ std::vector<NodePtr> parquet_fields;
+ std::vector<std::shared_ptr<Field>> arrow_fields;
- ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
+ {
+ // optional int32 leaf1;
+ // repeated group outerGroup {
+ // optional int32 leaf2;
+ // repeated group innerGroup {
+ // optional int32 leaf3;
+ // }
+ // }
+ parquet_fields.push_back(
+ PrimitiveNode::Make("leaf1", Repetition::OPTIONAL,
ParquetType::INT32));
+ parquet_fields.push_back(GroupNode::Make(
+ "outerGroup", Repetition::REPEATED,
+ {PrimitiveNode::Make("leaf2", Repetition::OPTIONAL,
ParquetType::INT32),
+ GroupNode::Make("innerGroup", Repetition::REPEATED,
+ {PrimitiveNode::Make("leaf3", Repetition::OPTIONAL,
+ ParquetType::INT32)})}));
+
+ auto inner_group_fields = {::arrow::field("leaf3", INT32, true)};
+ auto inner_group_type = ::arrow::struct_(inner_group_fields);
+ auto outer_group_fields = {
+ ::arrow::field("leaf2", INT32, true),
+ ::arrow::field("innerGroup",
+ list_case.type_factory(::arrow::field(
+ "innerGroup", inner_group_type,
/*nullable=*/false)),
+ /*nullable=*/false)};
+ auto outer_group_type = ::arrow::struct_(outer_group_fields);
+
+ arrow_fields.push_back(::arrow::field("leaf1", INT32, true));
+ arrow_fields.push_back(
+ ::arrow::field("outerGroup",
+ list_case.type_factory(::arrow::field(
+ "outerGroup", outer_group_type,
/*nullable=*/false)),
+ /*nullable=*/false));
+ }
+ auto arrow_schema = ::arrow::schema(arrow_fields);
+
+ ArrowReaderProperties props;
+ props.set_list_type(list_case.type_id);
+ ASSERT_OK(ConvertSchema(parquet_fields, /*key_value_metadata=*/{}, props));
+
+ ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
+ }
}
TEST_F(TestConvertParquetSchema, IllegalParquetNestedSchema) {
@@ -1514,48 +1548,50 @@ TEST_F(TestConvertArrowSchema,
ParquetGeoArrowCrsProjjson) {
}
TEST_F(TestConvertArrowSchema, ParquetLists) {
- std::vector<NodePtr> parquet_fields;
- std::vector<std::shared_ptr<Field>> arrow_fields;
+ for (const auto& list_case : kListCases) {
+ std::vector<NodePtr> parquet_fields;
+ std::vector<std::shared_ptr<Field>> arrow_fields;
- // parquet_arrow will always generate 3-level LIST encodings
+ // parquet_arrow will always generate 3-level LIST encodings
- // // List<String> (list non-null, elements nullable)
- // required group my_list (LIST) {
- // repeated group list {
- // optional binary element (UTF8);
- // }
- // }
- {
- auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::REQUIRED, {list},
ConvertedType::LIST));
- auto arrow_element = ::arrow::field("string", UTF8, true);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, false));
- }
+ // // List<String> (list non-null, elements nullable)
+ // required group my_list (LIST) {
+ // repeated group list {
+ // optional binary element (UTF8);
+ // }
+ // }
+ {
+ auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::REQUIRED, {list},
ConvertedType::LIST));
+ auto arrow_element = ::arrow::field("string", UTF8, true);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, false));
+ }
- // // List<String> (list nullable, elements non-null)
- // optional group my_list (LIST) {
- // repeated group list {
- // required binary element (UTF8);
- // }
- // }
- {
- auto element = PrimitiveNode::Make("element", Repetition::REQUIRED,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
- parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::OPTIONAL, {list},
ConvertedType::LIST));
- auto arrow_element = ::arrow::field("string", UTF8, false);
- auto arrow_list = ::arrow::list(arrow_element);
- arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
- }
+ // // List<String> (list nullable, elements non-null)
+ // optional group my_list (LIST) {
+ // repeated group list {
+ // required binary element (UTF8);
+ // }
+ // }
+ {
+ auto element = PrimitiveNode::Make("element", Repetition::REQUIRED,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::OPTIONAL, {list},
ConvertedType::LIST));
+ auto arrow_element = ::arrow::field("string", UTF8, false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+ }
- ASSERT_OK(ConvertSchema(arrow_fields));
+ ASSERT_OK(ConvertSchema(arrow_fields));
- ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields));
+ ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields));
+ }
}
TEST_F(TestConvertArrowSchema, ParquetMaps) {
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index 0488e08d05..cd026b3bea 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -78,6 +78,19 @@ Repetition::type RepetitionFromNullable(bool is_nullable) {
return is_nullable ? Repetition::OPTIONAL : Repetition::REQUIRED;
}
+Result<std::shared_ptr<::arrow::DataType>> MakeArrowList(
+ std::shared_ptr<Field> field, const ArrowReaderProperties& props) {
+ switch (props.list_type()) {
+ case ::arrow::Type::LIST:
+ return ::arrow::list(std::move(field));
+ case ::arrow::Type::LARGE_LIST:
+ return ::arrow::large_list(std::move(field));
+ default:
+ return Status::TypeError("Invalid list_type: " +
+ ::arrow::internal::ToString(props.list_type()));
+ }
+}
+
Status FieldToNode(const std::string& name, const std::shared_ptr<Field>&
field,
const WriterProperties& properties,
const ArrowWriterProperties& arrow_properties, NodePtr*
out);
@@ -776,8 +789,10 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo
current_levels,
RETURN_NOT_OK(
PopulateLeaf(column_index, item_field, current_levels, ctx, out,
child_field));
}
- out->field = ::arrow::field(group.name(), ::arrow::list(child_field->field),
- group.is_optional(),
FieldIdMetadata(group.field_id()));
+ ARROW_ASSIGN_OR_RAISE(auto list_type,
+ MakeArrowList(child_field->field, ctx->properties));
+ out->field = ::arrow::field(group.name(), std::move(list_type),
group.is_optional(),
+ FieldIdMetadata(group.field_id()));
out->level_info = current_levels;
// At this point current levels contains the def level for this list,
// we need to reset to the prior parent.
@@ -805,7 +820,9 @@ Status GroupToSchemaField(const GroupNode& node, LevelInfo
current_levels,
int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
RETURN_NOT_OK(GroupToStruct(node, current_levels, ctx, out,
&out->children[0]));
- out->field = ::arrow::field(node.name(),
::arrow::list(out->children[0].field),
+ ARROW_ASSIGN_OR_RAISE(auto list_type,
+ MakeArrowList(out->children[0].field,
ctx->properties));
+ out->field = ::arrow::field(node.name(), std::move(list_type),
/*nullable=*/false,
FieldIdMetadata(node.field_id()));
ctx->LinkParent(&out->children[0], out);
@@ -856,7 +873,9 @@ Status NodeToSchemaField(const Node& node, LevelInfo
current_levels,
RETURN_NOT_OK(PopulateLeaf(column_index, child_field, current_levels,
ctx, out,
&out->children[0]));
- out->field = ::arrow::field(node.name(), ::arrow::list(child_field),
+ ARROW_ASSIGN_OR_RAISE(auto list_type,
+ MakeArrowList(out->children[0].field,
ctx->properties));
+ out->field = ::arrow::field(node.name(), std::move(list_type),
/*nullable=*/false,
FieldIdMetadata(node.field_id()));
out->level_info = current_levels;
// At this point current_levels has consider this list the ancestor so
restore
@@ -934,6 +953,7 @@
std::function<std::shared_ptr<::arrow::DataType>(FieldVector)> GetNestedFactory(
}
break;
case ::arrow::Type::LIST:
+ case ::arrow::Type::LARGE_LIST:
if (origin_type.id() == ::arrow::Type::LIST) {
return [](FieldVector fields) {
DCHECK_EQ(fields.size(), 1);
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 2c4a072284..acce754c87 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -2008,7 +2008,7 @@ struct SerializeFunctor {
Status Serialize(const ArrayType& array, ArrowWriteContext*, ParquetCType*
out) {
const ArrowCType* input = array.raw_values();
if (array.null_count() > 0) {
- for (int i = 0; i < array.length(); i++) {
+ for (int64_t i = 0; i < array.length(); i++) {
out[i] = static_cast<ParquetCType>(input[i]);
}
} else {
@@ -2067,7 +2067,7 @@ Status
TypedColumnWriterImpl<ParquetType>::WriteArrowSerialize(
template <>
struct SerializeFunctor<BooleanType, ::arrow::BooleanType> {
Status Serialize(const ::arrow::BooleanArray& data, ArrowWriteContext*,
bool* out) {
- for (int i = 0; i < data.length(); i++) {
+ for (int64_t i = 0; i < data.length(); i++) {
*out++ = data.Value(i);
}
return Status::OK();
@@ -2092,7 +2092,7 @@ template <>
struct SerializeFunctor<Int32Type, ::arrow::Date64Type> {
Status Serialize(const ::arrow::Date64Array& array, ArrowWriteContext*,
int32_t* out) {
const int64_t* input = array.raw_values();
- for (int i = 0; i < array.length(); i++) {
+ for (int64_t i = 0; i < array.length(); i++) {
*out++ = static_cast<int32_t>(*input++ / 86400000);
}
return Status::OK();
@@ -2146,7 +2146,7 @@ struct SerializeFunctor<Int32Type, ::arrow::Time32Type> {
const int32_t* input = array.raw_values();
const auto& type = static_cast<const ::arrow::Time32Type&>(*array.type());
if (type.unit() == ::arrow::TimeUnit::SECOND) {
- for (int i = 0; i < array.length(); i++) {
+ for (int64_t i = 0; i < array.length(); i++) {
out[i] = input[i] * 1000;
}
} else {
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index aec9edd093..bbaf6b5e71 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -991,6 +991,7 @@ static constexpr bool kArrowDefaultUseThreads = false;
static constexpr int64_t kArrowDefaultBatchSize = 64 * 1024;
constexpr inline ::arrow::Type::type kArrowDefaultBinaryType =
::arrow::Type::BINARY;
+constexpr inline ::arrow::Type::type kArrowDefaultListType =
::arrow::Type::LIST;
/// EXPERIMENTAL: Properties for configuring FileReader behavior.
class PARQUET_EXPORT ArrowReaderProperties {
@@ -1003,6 +1004,7 @@ class PARQUET_EXPORT ArrowReaderProperties {
cache_options_(::arrow::io::CacheOptions::LazyDefaults()),
coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO),
binary_type_(kArrowDefaultBinaryType),
+ list_type_(kArrowDefaultListType),
arrow_extensions_enabled_(false),
should_load_statistics_(false) {}
@@ -1051,6 +1053,18 @@ class PARQUET_EXPORT ArrowReaderProperties {
/// Return the Arrow binary type to read BYTE_ARRAY columns as.
::arrow::Type::type binary_type() const { return binary_type_; }
+ /// \brief Set the Arrow list type to read Parquet list columns as.
+ ///
+ /// Allowed values are Type::LIST and Type::LARGE_LIST.
+ /// Default is Type::LIST.
+ ///
+ /// However, if a serialized Arrow schema is found in the Parquet metadata,
+ /// this setting is ignored and the Arrow schema takes precedence
+ /// (see ArrowWriterProperties::store_schema).
+ void set_list_type(::arrow::Type::type value) { list_type_ = value; }
+ /// Return the Arrow list type to read Parquet list columns as.
+ ::arrow::Type::type list_type() const { return list_type_; }
+
/// \brief Set the maximum number of rows to read into a record batch.
///
/// Will only be fewer rows when there are no more rows in the file.
@@ -1121,6 +1135,7 @@ class PARQUET_EXPORT ArrowReaderProperties {
::arrow::io::CacheOptions cache_options_;
::arrow::TimeUnit::type coerce_int96_timestamp_unit_;
::arrow::Type::type binary_type_;
+ ::arrow::Type::type list_type_;
bool arrow_extensions_enabled_;
bool should_load_statistics_;
};
diff --git a/python/pyarrow/_dataset_parquet.pyx
b/python/pyarrow/_dataset_parquet.pyx
index f5e501bdc8..9405b5d8c5 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -49,7 +49,7 @@ from pyarrow._dataset cimport (
from pyarrow._parquet cimport (
_create_writer_properties, _create_arrow_writer_properties,
- FileMetaData,
+ _unwrap_list_type, FileMetaData,
)
@@ -145,6 +145,7 @@ cdef class ParquetFileFormat(FileFormat):
options.coerce_int96_timestamp_unit = \
read_options._coerce_int96_timestamp_unit
options.binary_type = read_options._binary_type
+ options.list_type = read_options._list_type
self.init(<shared_ptr[CFileFormat]> wrapped)
self.default_fragment_scan_options = default_fragment_scan_options
@@ -186,6 +187,7 @@ cdef class ParquetFileFormat(FileFormat):
parquet_read_options._coerce_int96_timestamp_unit = \
options.coerce_int96_timestamp_unit
parquet_read_options._binary_type = options.binary_type
+ parquet_read_options._list_type = options.list_type
return parquet_read_options
def make_write_options(self, **kwargs):
@@ -516,20 +518,27 @@ cdef class ParquetReadOptions(_Weakrefable):
If given, Parquet binary columns will be read as this datatype.
This setting is ignored if a serialized Arrow schema is found in
the Parquet metadata.
+ list_type : subclass of pyarrow.DataType, default None
+ If given, non-MAP repeated columns will be read as an instance of
+ this datatype (either pyarrow.ListType or pyarrow.LargeListType).
+ This setting is ignored if a serialized Arrow schema is found in
+ the Parquet metadata.
"""
cdef public:
set dictionary_columns
TimeUnit _coerce_int96_timestamp_unit
Type _binary_type
+ Type _list_type
# Also see _PARQUET_READ_OPTIONS
def __init__(self, dictionary_columns=None,
coerce_int96_timestamp_unit=None,
- binary_type=None):
+ binary_type=None, list_type=None):
self.dictionary_columns = set(dictionary_columns or set())
self.coerce_int96_timestamp_unit = coerce_int96_timestamp_unit
self.binary_type = binary_type
+ self.list_type = list_type
@property
def binary_type(self):
@@ -542,6 +551,18 @@ cdef class ParquetReadOptions(_Weakrefable):
else:
self._binary_type = _Type_BINARY
+ @property
+ def list_type(self):
+ return (pa.LargeListType if self._list_type == _Type_LARGE_LIST
+ else pa.ListType)
+
+ @list_type.setter
+ def list_type(self, ty):
+ if ty is not None:
+ self._list_type = _unwrap_list_type(ty)
+ else:
+ self._list_type = _Type_LIST
+
@property
def coerce_int96_timestamp_unit(self):
return timeunit_to_string(self._coerce_int96_timestamp_unit)
@@ -564,9 +585,10 @@ cdef class ParquetReadOptions(_Weakrefable):
bool
"""
return (self.dictionary_columns == other.dictionary_columns and
- self.coerce_int96_timestamp_unit ==
- other.coerce_int96_timestamp_unit and
- self.binary_type == other.binary_type)
+ self._coerce_int96_timestamp_unit ==
+ other._coerce_int96_timestamp_unit and
+ self._binary_type == other._binary_type and
+ self._list_type == other._list_type)
def __eq__(self, other):
try:
@@ -578,7 +600,10 @@ cdef class ParquetReadOptions(_Weakrefable):
return (
f"<ParquetReadOptions"
f" dictionary_columns={self.dictionary_columns}"
- f" coerce_int96_timestamp_unit={self.coerce_int96_timestamp_unit}>"
+ f" coerce_int96_timestamp_unit={self.coerce_int96_timestamp_unit}"
+ f" binary_type={self.binary_type}"
+ f" list_type={self.list_type}"
+ f">"
)
@@ -696,7 +721,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
cdef set _PARQUET_READ_OPTIONS = {
- 'dictionary_columns', 'coerce_int96_timestamp_unit', 'binary_type'
+ 'dictionary_columns', 'coerce_int96_timestamp_unit', 'binary_type',
'list_type',
}
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index 1ef575d18f..94365f0f7c 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -68,6 +68,11 @@ cdef shared_ptr[ArrowWriterProperties]
_create_arrow_writer_properties(
store_schema=*,
) except *
+
+# Unwrap the "list_type" argument for ArrowReaderProperties
+cdef Type _unwrap_list_type(obj) except *
+
+
cdef class ParquetSchema(_Weakrefable):
cdef:
FileMetaData parent # the FileMetaData owning the SchemaDescriptor
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index b7b90c09db..d59c70a274 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -42,6 +42,7 @@ from pyarrow.lib cimport (_Weakrefable, Buffer, Schema,
string_to_timeunit)
from pyarrow.lib import (ArrowException, NativeFile, BufferOutputStream,
+ ListType, LargeListType,
_stringify_path,
tobytes, frombytes, is_threading_enabled)
@@ -50,6 +51,16 @@ cimport cpython as cp
_DEFAULT_ROW_GROUP_SIZE = 1024*1024
_MAX_ROW_GROUP_SIZE = 64*1024*1024
+
+cdef Type _unwrap_list_type(obj) except *:
+ if obj is ListType:
+ return _Type_LIST
+ elif obj is LargeListType:
+ return _Type_LARGE_LIST
+ else:
+ raise TypeError(f"Unexpected list_type: {obj!r}")
+
+
cdef class Statistics(_Weakrefable):
"""Statistics for a single column in a single row group."""
@@ -1552,7 +1563,7 @@ cdef class ParquetReader(_Weakrefable):
self._metadata = None
def open(self, object source not None, *, bint use_memory_map=False,
- read_dictionary=None, binary_type=None,
+ read_dictionary=None, binary_type=None, list_type=None,
FileMetaData metadata=None,
int buffer_size=0, bint pre_buffer=False,
coerce_int96_timestamp_unit=None,
@@ -1570,6 +1581,7 @@ cdef class ParquetReader(_Weakrefable):
use_memory_map : bool, default False
read_dictionary : iterable[int or str], optional
binary_type : pyarrow.DataType, optional
+ list_type : subclass of pyarrow.DataType, optional
metadata : FileMetaData, optional
buffer_size : int, default 0
pre_buffer : bool, default False
@@ -1625,6 +1637,9 @@ cdef class ParquetReader(_Weakrefable):
c_binary_type = pyarrow_unwrap_data_type(binary_type)
arrow_props.set_binary_type(c_binary_type.get().id())
+ if list_type is not None:
+ arrow_props.set_list_type(_unwrap_list_type(list_type))
+
if coerce_int96_timestamp_unit is None:
# use the default defined in default_arrow_reader_properties()
pass
diff --git a/python/pyarrow/includes/libarrow_dataset_parquet.pxd
b/python/pyarrow/includes/libarrow_dataset_parquet.pxd
index 9e44d8aa06..8f4917f6c0 100644
--- a/python/pyarrow/includes/libarrow_dataset_parquet.pxd
+++ b/python/pyarrow/includes/libarrow_dataset_parquet.pxd
@@ -64,6 +64,7 @@ cdef extern from "arrow/dataset/api.h" namespace
"arrow::dataset" nogil:
unordered_set[c_string] dict_columns
TimeUnit coerce_int96_timestamp_unit
Type binary_type
+ Type list_type
cdef cppclass CParquetFileFormat "arrow::dataset::ParquetFileFormat"(
CFileFormat):
diff --git a/python/pyarrow/includes/libparquet.pxd
b/python/pyarrow/includes/libparquet.pxd
index 6de37d4c24..d9dd9d1aec 100644
--- a/python/pyarrow/includes/libparquet.pxd
+++ b/python/pyarrow/includes/libparquet.pxd
@@ -442,6 +442,8 @@ cdef extern from "parquet/api/reader.h" namespace "parquet"
nogil:
ArrowReaderProperties()
void set_binary_type(Type binary_type)
Type binary_type()
+ void set_list_type(Type list_type)
+ Type list_type()
void set_read_dictionary(int column_index, c_bool read_dict)
c_bool read_dictionary(int column_index)
void set_batch_size(int64_t batch_size)
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 15dae27896..a84fd5e8b7 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -225,6 +225,11 @@ class ParquetFile:
If given, Parquet binary columns will be read as this datatype.
This setting is ignored if a serialized Arrow schema is found in
the Parquet metadata.
+ list_type : subclass of pyarrow.DataType, default None
+ If given, non-MAP repeated columns will be read as an instance of
+ this datatype (either pyarrow.ListType or pyarrow.LargeListType).
+ This setting is ignored if a serialized Arrow schema is found in
+ the Parquet metadata.
memory_map : bool, default False
If the source is a file path, use a memory map to read file, which can
improve performance in some environments.
@@ -304,8 +309,9 @@ class ParquetFile:
"""
def __init__(self, source, *, metadata=None, common_metadata=None,
- read_dictionary=None, binary_type=None, memory_map=False,
- buffer_size=0, pre_buffer=False,
coerce_int96_timestamp_unit=None,
+ read_dictionary=None, binary_type=None, list_type=None,
+ memory_map=False, buffer_size=0, pre_buffer=False,
+ coerce_int96_timestamp_unit=None,
decryption_properties=None, thrift_string_size_limit=None,
thrift_container_size_limit=None, filesystem=None,
page_checksum_verification=False,
arrow_extensions_enabled=False):
@@ -323,7 +329,7 @@ class ParquetFile:
source, use_memory_map=memory_map,
buffer_size=buffer_size, pre_buffer=pre_buffer,
read_dictionary=read_dictionary, metadata=metadata,
- binary_type=binary_type,
+ binary_type=binary_type, list_type=list_type,
coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
decryption_properties=decryption_properties,
thrift_string_size_limit=thrift_string_size_limit,
@@ -1202,6 +1208,11 @@ binary_type : pyarrow.DataType, default None
If given, Parquet binary columns will be read as this datatype.
This setting is ignored if a serialized Arrow schema is found in
the Parquet metadata.
+list_type : subclass of pyarrow.DataType, default None
+ If given, non-MAP repeated columns will be read as an instance of
+ this datatype (either pyarrow.ListType or pyarrow.LargeListType).
+ This setting is ignored if a serialized Arrow schema is found in
+ the Parquet metadata.
memory_map : bool, default False
If the source is a file path, use a memory map to read file, which can
improve performance in some environments.
@@ -1321,8 +1332,9 @@ Examples
"""
def __init__(self, path_or_paths, filesystem=None, schema=None, *,
filters=None,
- read_dictionary=None, binary_type=None, memory_map=False,
- buffer_size=None, partitioning="hive", ignore_prefixes=None,
+ read_dictionary=None, binary_type=None, list_type=None,
+ memory_map=False, buffer_size=None, partitioning="hive",
+ ignore_prefixes=None,
pre_buffer=True, coerce_int96_timestamp_unit=None,
decryption_properties=None, thrift_string_size_limit=None,
thrift_container_size_limit=None,
@@ -1339,6 +1351,7 @@ Examples
"page_checksum_verification": page_checksum_verification,
"arrow_extensions_enabled": arrow_extensions_enabled,
"binary_type": binary_type,
+ "list_type": list_type,
}
if buffer_size:
read_options.update(use_buffered_stream=True,
@@ -1819,9 +1832,10 @@ Read data from a single Parquet file:
def read_table(source, *, columns=None, use_threads=True,
schema=None, use_pandas_metadata=False, read_dictionary=None,
- binary_type=None, memory_map=False, buffer_size=0,
partitioning="hive",
- filesystem=None, filters=None, ignore_prefixes=None,
- pre_buffer=True, coerce_int96_timestamp_unit=None,
+ binary_type=None, list_type=None, memory_map=False,
buffer_size=0,
+ partitioning="hive", filesystem=None, filters=None,
+ ignore_prefixes=None, pre_buffer=True,
+ coerce_int96_timestamp_unit=None,
decryption_properties=None, thrift_string_size_limit=None,
thrift_container_size_limit=None,
page_checksum_verification=False,
@@ -1836,6 +1850,7 @@ def read_table(source, *, columns=None, use_threads=True,
memory_map=memory_map,
read_dictionary=read_dictionary,
binary_type=binary_type,
+ list_type=list_type,
buffer_size=buffer_size,
filters=filters,
ignore_prefixes=ignore_prefixes,
@@ -1872,6 +1887,7 @@ def read_table(source, *, columns=None, use_threads=True,
dataset = ParquetFile(
source, read_dictionary=read_dictionary,
binary_type=binary_type,
+ list_type=list_type,
memory_map=memory_map, buffer_size=buffer_size,
pre_buffer=pre_buffer,
coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
diff --git a/python/pyarrow/tests/parquet/common.py b/python/pyarrow/tests/parquet/common.py
index fd6ad94fbd..4f5946649b 100644
--- a/python/pyarrow/tests/parquet/common.py
+++ b/python/pyarrow/tests/parquet/common.py
@@ -67,9 +67,11 @@ def _check_roundtrip(table, expected=None, read_table_kwargs=None,
# intentionally check twice
result = _roundtrip_table(table, read_table_kwargs=read_table_kwargs,
write_table_kwargs=write_table_kwargs)
+ assert result.schema == expected.schema
assert result.equals(expected)
result = _roundtrip_table(result, read_table_kwargs=read_table_kwargs,
write_table_kwargs=write_table_kwargs)
+ assert result.schema == expected.schema
assert result.equals(expected)
diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py
index 6553a5217e..351221f64d 100644
--- a/python/pyarrow/tests/parquet/test_data_types.py
+++ b/python/pyarrow/tests/parquet/test_data_types.py
@@ -351,6 +351,30 @@ def test_large_list_records():
_check_roundtrip(table)
+list_types = [
+ (pa.ListType, pa.list_),
+ (pa.LargeListType, pa.large_list),
+]
+
+
+def test_list_types():
+ data = [[1, 2, None]] * 50
+ for _, in_factory in list_types:
+ array = pa.array(data, type=in_factory(pa.int32()))
+ table = pa.Table.from_arrays([array], ['lists'])
+ for out_type, out_factory in list_types:
+ for store_schema in (True, False):
+ if store_schema:
+ expected_table = table
+ else:
+ expected_table = pa.Table.from_arrays(
+ [pa.array(data, type=out_factory(pa.int32()))], ['lists'])
+ result = _roundtrip_table(
+ table, write_table_kwargs=dict(store_schema=store_schema),
+ read_table_kwargs=dict(list_type=out_type))
+ assert result == expected_table
+
+
@pytest.mark.pandas
def test_parquet_nested_convenience(tempdir):
# ARROW-1684
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index e01c919b42..4af0f914eb 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -846,6 +846,7 @@ def test_parquet_read_options():
opts2 = ds.ParquetReadOptions(dictionary_columns=['a', 'b'])
opts3 = ds.ParquetReadOptions(coerce_int96_timestamp_unit="ms")
opts4 = ds.ParquetReadOptions(binary_type=pa.binary_view())
+ opts5 = ds.ParquetReadOptions(list_type=pa.LargeListType)
assert opts1.dictionary_columns == set()
@@ -857,10 +858,14 @@ def test_parquet_read_options():
assert opts1.binary_type == pa.binary()
assert opts4.binary_type == pa.binary_view()
+ assert opts1.list_type is pa.ListType
+ assert opts5.list_type is pa.LargeListType
+
assert opts1 == opts1
assert opts1 != opts2
assert opts1 != opts3
assert opts1 != opts4
+ assert opts1 != opts5
opts4.binary_type = None
assert opts4.binary_type == pa.binary()
@@ -869,6 +874,13 @@ def test_parquet_read_options():
assert opts4.binary_type == pa.large_binary()
assert opts1 != opts4
+ opts5.list_type = pa.ListType
+ assert opts5.list_type is pa.ListType
+ assert opts5 == opts1
+ opts5.list_type = pa.LargeListType
+ assert opts5.list_type is pa.LargeListType
+ assert opts5 != opts1
+
@pytest.mark.parquet
def test_parquet_file_format_read_options():
@@ -876,6 +888,7 @@ def test_parquet_file_format_read_options():
pff2 = ds.ParquetFileFormat(dictionary_columns={'a'})
pff3 = ds.ParquetFileFormat(coerce_int96_timestamp_unit="s")
pff4 = ds.ParquetFileFormat(binary_type=pa.binary_view())
+ pff5 = ds.ParquetFileFormat(list_type=pa.LargeListType)
assert pff1.read_options == ds.ParquetReadOptions()
assert pff2.read_options == ds.ParquetReadOptions(dictionary_columns=['a'])
@@ -883,6 +896,8 @@ def test_parquet_file_format_read_options():
coerce_int96_timestamp_unit="s")
assert pff4.read_options == ds.ParquetReadOptions(
binary_type=pa.binary_view())
+ assert pff5.read_options == ds.ParquetReadOptions(
+ list_type=pa.LargeListType)
@pytest.mark.parquet