This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new c91e874 ARROW-3843: [C++][Python] Allow a "degenerate" Parquet file
with no columns
c91e874 is described below
commit c91e8747efd1c1cdcbac02528c6c20462c4574df
Author: Wes McKinney <[email protected]>
AuthorDate: Fri Mar 22 09:41:02 2019 -0500
ARROW-3843: [C++][Python] Allow a "degenerate" Parquet file with no columns
Author: Wes McKinney <[email protected]>
Closes #3985 from wesm/ARROW-3843 and squashes the following commits:
2aba9262d <Wes McKinney> Add C++ unit test for zero-columns. Distinguish
from degenerate root node
1588a08b0 <Wes McKinney> Allow a degenerate Parquet file with no columns
---
cpp/src/parquet/schema-test.cc | 7 +++++++
cpp/src/parquet/schema.cc | 9 +++++++--
python/pyarrow/tests/test_parquet.py | 7 +++++++
3 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/cpp/src/parquet/schema-test.cc b/cpp/src/parquet/schema-test.cc
index 3d7eb0e..ecb7f26 100644
--- a/cpp/src/parquet/schema-test.cc
+++ b/cpp/src/parquet/schema-test.cc
@@ -448,6 +448,13 @@ TEST_F(TestSchemaConverter, NestedExample) {
ASSERT_TRUE(check_for_parent_consistency(group_));
}
+TEST_F(TestSchemaConverter, ZeroColumns) {
+ // ARROW-3843
+ SchemaElement elements[1];
+ elements[0] = NewGroup("schema", FieldRepetitionType::REPEATED, 0, 0);
+ ASSERT_NO_THROW(Convert(elements, 1));
+}
+
TEST_F(TestSchemaConverter, InvalidRoot) {
// According to the Parquet specification, the first element in the
// list<SchemaElement> is a group whose children (and their descendants)
diff --git a/cpp/src/parquet/schema.cc b/cpp/src/parquet/schema.cc
index 431f307..0a5668d 100644
--- a/cpp/src/parquet/schema.cc
+++ b/cpp/src/parquet/schema.cc
@@ -370,9 +370,14 @@ void PrimitiveNode::ToParquet(void* opaque_element) const {
std::unique_ptr<Node> FlatSchemaConverter::Convert() {
const SchemaElement& root = elements_[0];
- // Validate the root node
if (root.num_children == 0) {
- throw ParquetException("Root node did not have children");
+ if (length_ == 1) {
+ // Degenerate case of Parquet file with no columns
+ return GroupNode::FromParquet(static_cast<const void*>(&root), next_id(), {});
+ } else {
+ throw ParquetException(
+ "Parquet schema had multiple nodes but root had no children");
+ }
}
// Relaxing this restriction as some implementations don't set this
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 34d0956..39479e3 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -236,6 +236,13 @@ def test_empty_table_roundtrip():
_check_roundtrip(table, version='2.0')
+@pytest.mark.pandas
+def test_empty_table_no_columns():
+ df = pd.DataFrame()
+ empty = pa.Table.from_pandas(df, preserve_index=False)
+ _check_roundtrip(empty)
+
+
def test_empty_lists_table_roundtrip():
# ARROW-2744: Shouldn't crash when writing an array of empty lists
arr = pa.array([[], []], type=pa.list_(pa.int32()))