[ 
https://issues.apache.org/jira/browse/PARQUET-1245?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16410582#comment-16410582
 ] 

ASF GitHub Bot commented on PARQUET-1245:
-----------------------------------------

wesm closed pull request #447: PARQUET-1245: Fix creating Arrow table with 
duplicate column names
URL: https://github.com/apache/parquet-cpp/pull/447
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc 
b/src/parquet/arrow/arrow-reader-writer-test.cc
index 72e65d47..f06f4a87 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -1669,6 +1669,27 @@ TEST(TestArrowReadWrite, TableWithChunkedColumns) {
   }
 }
 
+TEST(TestArrowReadWrite, TableWithDuplicateColumns) {
+  // See ARROW-1974
+  using ::arrow::ArrayFromVector;
+
+  auto f0 = field("duplicate", ::arrow::int8());
+  auto f1 = field("duplicate", ::arrow::int16());
+  auto schema = ::arrow::schema({f0, f1});
+
+  std::vector<int8_t> a0_values = {1, 2, 3};
+  std::vector<int16_t> a1_values = {14, 15, 16};
+
+  std::shared_ptr<Array> a0, a1;
+
+  ArrayFromVector<::arrow::Int8Type, int8_t>(a0_values, &a0);
+  ArrayFromVector<::arrow::Int16Type, int16_t>(a1_values, &a1);
+
+  auto table = Table::Make(schema, {std::make_shared<Column>(f0->name(), a0),
+                                    std::make_shared<Column>(f1->name(), a1)});
+  CheckSimpleRoundtrip(table, table->num_rows());
+}
+
 TEST(TestArrowWrite, CheckChunkSize) {
   const int num_columns = 2;
   const int num_rows = 128;
diff --git a/src/parquet/arrow/arrow-schema-test.cc 
b/src/parquet/arrow/arrow-schema-test.cc
index d502d243..da6af528 100644
--- a/src/parquet/arrow/arrow-schema-test.cc
+++ b/src/parquet/arrow/arrow-schema-test.cc
@@ -165,6 +165,31 @@ TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) {
   CheckFlatSchema(arrow_schema);
 }
 
+TEST_F(TestConvertParquetSchema, DuplicateFieldNames) {
+  std::vector<NodePtr> parquet_fields;
+  std::vector<std::shared_ptr<Field>> arrow_fields;
+
+  parquet_fields.push_back(
+      PrimitiveNode::Make("xxx", Repetition::REQUIRED, ParquetType::BOOLEAN));
+  auto arrow_field1 = std::make_shared<Field>("xxx", BOOL, false);
+
+  parquet_fields.push_back(
+      PrimitiveNode::Make("xxx", Repetition::REQUIRED, ParquetType::INT32));
+  auto arrow_field2 = std::make_shared<Field>("xxx", INT32, false);
+
+  ASSERT_OK(ConvertSchema(parquet_fields));
+  arrow_fields = {arrow_field1, arrow_field2};
+  CheckFlatSchema(std::make_shared<::arrow::Schema>(arrow_fields));
+
+  ASSERT_OK(ConvertSchema(parquet_fields, std::vector<int>({0, 1})));
+  arrow_fields = {arrow_field1, arrow_field2};
+  CheckFlatSchema(std::make_shared<::arrow::Schema>(arrow_fields));
+
+  ASSERT_OK(ConvertSchema(parquet_fields, std::vector<int>({1, 0})));
+  arrow_fields = {arrow_field2, arrow_field1};
+  CheckFlatSchema(std::make_shared<::arrow::Schema>(arrow_fields));
+}
+
 TEST_F(TestConvertParquetSchema, ParquetKeyValueMetadata) {
   std::vector<NodePtr> parquet_fields;
   std::vector<std::shared_ptr<Field>> arrow_fields;
diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc
index bd68ec32..78c3225a 100644
--- a/src/parquet/arrow/reader.cc
+++ b/src/parquet/arrow/reader.cc
@@ -443,7 +443,7 @@ Status FileReader::Impl::ReadRowGroup(int row_group_index,
 }
 
 Status FileReader::Impl::ReadTable(const std::vector<int>& indices,
-                                   std::shared_ptr<Table>* table) {
+                                   std::shared_ptr<Table>* out) {
   std::shared_ptr<::arrow::Schema> schema;
   RETURN_NOT_OK(GetSchema(indices, &schema));
 
@@ -473,7 +473,9 @@ Status FileReader::Impl::ReadTable(const std::vector<int>& 
indices,
     RETURN_NOT_OK(ParallelFor(nthreads, num_fields, ReadColumnFunc));
   }
 
-  *table = Table::Make(schema, columns);
+  std::shared_ptr<Table> table = Table::Make(schema, columns);
+  RETURN_NOT_OK(table->Validate());
+  *out = table;
   return Status::OK();
 }
 
diff --git a/src/parquet/schema-test.cc b/src/parquet/schema-test.cc
index c8cce9fa..ec9aff42 100644
--- a/src/parquet/schema-test.cc
+++ b/src/parquet/schema-test.cc
@@ -292,6 +292,17 @@ class TestGroupNode : public ::testing::Test {
 
     return fields;
   }
+
+  NodeVector Fields2() {
+    // Fields with a duplicate name
+    NodeVector fields;
+
+    fields.push_back(Int32("duplicate", Repetition::REQUIRED));
+    fields.push_back(Int64("unique"));
+    fields.push_back(Double("duplicate"));
+
+    return fields;
+  }
 };
 
 TEST_F(TestGroupNode, Attrs) {
@@ -346,14 +357,23 @@ TEST_F(TestGroupNode, FieldIndex) {
   GroupNode group("group", Repetition::REQUIRED, fields);
   for (size_t i = 0; i < fields.size(); i++) {
     auto field = group.field(static_cast<int>(i));
-    ASSERT_EQ(i, group.FieldIndex(*field.get()));
+    ASSERT_EQ(i, group.FieldIndex(*field));
   }
 
   // Test a non field node
   auto non_field_alien = Int32("alien", Repetition::REQUIRED);   // other name
   auto non_field_familiar = Int32("one", Repetition::REPEATED);  // other node
-  ASSERT_TRUE(group.FieldIndex(*non_field_alien.get()) < 0);
-  ASSERT_TRUE(group.FieldIndex(*non_field_familiar.get()) < 0);
+  ASSERT_TRUE(group.FieldIndex(*non_field_alien) < 0);
+  ASSERT_TRUE(group.FieldIndex(*non_field_familiar) < 0);
+}
+
+TEST_F(TestGroupNode, FieldIndexDuplicateName) {
+  NodeVector fields = Fields2();
+  GroupNode group("group", Repetition::REQUIRED, fields);
+  for (size_t i = 0; i < fields.size(); i++) {
+    auto field = group.field(static_cast<int>(i));
+    ASSERT_EQ(i, group.FieldIndex(*field));
+  }
 }
 
 // ----------------------------------------------------------------------
@@ -677,14 +697,14 @@ TEST_F(TestSchemaDescriptor, BuildTree) {
 
   for (int i = 0; i < nleaves; ++i) {
     auto col = descr_.Column(i);
-    ASSERT_EQ(i, descr_.ColumnIndex(*col->schema_node().get()));
+    ASSERT_EQ(i, descr_.ColumnIndex(*col->schema_node()));
   }
 
   // Test non-column nodes find
   NodePtr non_column_alien = Int32("alien", Repetition::REQUIRED);  // other 
path
   NodePtr non_column_familiar = Int32("a", Repetition::REPEATED);   // other 
node
-  ASSERT_TRUE(descr_.ColumnIndex(*non_column_alien.get()) < 0);
-  ASSERT_TRUE(descr_.ColumnIndex(*non_column_familiar.get()) < 0);
+  ASSERT_TRUE(descr_.ColumnIndex(*non_column_alien) < 0);
+  ASSERT_TRUE(descr_.ColumnIndex(*non_column_familiar) < 0);
 
   ASSERT_EQ(inta.get(), descr_.GetColumnRoot(0));
   ASSERT_EQ(bag.get(), descr_.GetColumnRoot(3));
diff --git a/src/parquet/schema.cc b/src/parquet/schema.cc
index cbe72c64..5c3958e8 100644
--- a/src/parquet/schema.cc
+++ b/src/parquet/schema.cc
@@ -273,16 +273,14 @@ int GroupNode::FieldIndex(const std::string& name) const {
 }
 
 int GroupNode::FieldIndex(const Node& node) const {
-  int result = FieldIndex(node.name());
-  if (result < 0) {
-    return -1;
-  }
-  DCHECK(result < field_count());
-  if (!node.Equals(field(result).get())) {
-    // Same name but not the same node
-    return -1;
+  auto search = field_name_to_idx_.equal_range(node.name());
+  for (auto it = search.first; it != search.second; ++it) {
+    const int idx = it->second;
+    if (&node == field(idx).get()) {
+      return idx;
+    }
   }
-  return result;
+  return -1;
 }
 
 void GroupNode::Visit(Node::Visitor* visitor) { visitor->Visit(this); }
@@ -721,16 +719,14 @@ int SchemaDescriptor::ColumnIndex(const std::string& 
node_path) const {
 }
 
 int SchemaDescriptor::ColumnIndex(const Node& node) const {
-  int result = ColumnIndex(node.path()->ToDotString());
-  if (result < 0) {
-    return -1;
-  }
-  DCHECK(result < num_columns());
-  if (!node.Equals(Column(result)->schema_node().get())) {
-    // Same path but not the same node
-    return -1;
+  auto search = leaf_to_idx_.equal_range(node.path()->ToDotString());
+  for (auto it = search.first; it != search.second; ++it) {
+    const int idx = it->second;
+    if (&node == Column(idx)->schema_node().get()) {
+      return idx;
+    }
   }
-  return result;
+  return -1;
 }
 
 const schema::Node* SchemaDescriptor::GetColumnRoot(int i) const {
diff --git a/src/parquet/schema.h b/src/parquet/schema.h
index 7b6793b8..b778e51b 100644
--- a/src/parquet/schema.h
+++ b/src/parquet/schema.h
@@ -264,7 +264,11 @@ class PARQUET_EXPORT GroupNode : public Node {
   bool Equals(const Node* other) const override;
 
   NodePtr field(int i) const { return fields_[i]; }
+  // Get the index of a field by its name, or negative value if not found.
+  // If several fields share the same name, it is unspecified which one
+  // is returned.
   int FieldIndex(const std::string& name) const;
+  // Get the index of a field by its node, or negative value if not found.
   int FieldIndex(const Node& node) const;
 
   int field_count() const { return static_cast<int>(fields_.size()); }
@@ -282,7 +286,7 @@ class PARQUET_EXPORT GroupNode : public Node {
     auto field_idx = 0;
     for (NodePtr& field : fields_) {
       field->SetParent(this);
-      field_name_to_idx_[field->name()] = field_idx++;
+      field_name_to_idx_.emplace(field->name(), field_idx++);
     }
   }
 
@@ -290,11 +294,12 @@ class PARQUET_EXPORT GroupNode : public Node {
   bool EqualsInternal(const GroupNode* other) const;
 
   // Mapping between field name to the field index
-  std::unordered_map<std::string, int> field_name_to_idx_;
+  std::unordered_multimap<std::string, int> field_name_to_idx_;
 
   FRIEND_TEST(TestGroupNode, Attrs);
   FRIEND_TEST(TestGroupNode, Equals);
   FRIEND_TEST(TestGroupNode, FieldIndex);
+  FRIEND_TEST(TestGroupNode, FieldIndexDuplicateName);
 };
 
 // ----------------------------------------------------------------------
@@ -393,9 +398,11 @@ class PARQUET_EXPORT SchemaDescriptor {
 
   const ColumnDescriptor* Column(int i) const;
 
-  // Get the index of a column by its dotstring path, or negative value if not 
found
+  // Get the index of a column by its dotstring path, or negative value if not 
found.
+  // If several columns share the same dotstring path, it is unspecified which 
one
+  // is returned.
   int ColumnIndex(const std::string& node_path) const;
-  // Get the index of a column by its node, or negative value if not found
+  // Get the index of a column by its node, or negative value if not found.
   int ColumnIndex(const schema::Node& node) const;
 
   bool Equals(const SchemaDescriptor& other) const;
@@ -442,7 +449,7 @@ class PARQUET_EXPORT SchemaDescriptor {
   std::unordered_map<int, const schema::NodePtr> leaf_to_base_;
 
   // Mapping between ColumnPath DotString to the leaf index
-  std::unordered_map<std::string, int> leaf_to_idx_;
+  std::unordered_multimap<std::string, int> leaf_to_idx_;
 };
 
 }  // namespace parquet
diff --git a/src/parquet/util/schema-util.h b/src/parquet/util/schema-util.h
index 4e31d3ca..1c66f675 100644
--- a/src/parquet/util/schema-util.h
+++ b/src/parquet/util/schema-util.h
@@ -71,7 +71,7 @@ inline bool ColumnIndicesToFieldIndices(const 
SchemaDescriptor& descr,
   out->clear();
   for (auto& column_idx : column_indices) {
     auto field_node = descr.GetColumnRoot(column_idx);
-    auto field_idx = group->FieldIndex(field_node->name());
+    auto field_idx = group->FieldIndex(*field_node);
     if (field_idx < 0) {
       return false;
     }


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> [C++] Segfault when writing Arrow table with duplicate columns
> --------------------------------------------------------------
>
>                 Key: PARQUET-1245
>                 URL: https://issues.apache.org/jira/browse/PARQUET-1245
>             Project: Parquet
>          Issue Type: Bug
>         Environment: Linux Mint 18.2
> Anaconda Python distribution + pyarrow installed from the conda-forge channel
>            Reporter: Alexey Strokach
>            Assignee: Antoine Pitrou
>            Priority: Minor
>              Labels: pull-request-available
>             Fix For: cpp-1.5.0
>
>
> I accidentally created a large number of Parquet files with two 
> __index_level_0__ columns (through a Spark SQL query).
> PyArrow can read these files into tables, but it segfaults when converting 
> the resulting tables to Pandas DataFrames or when saving the tables to 
> Parquet files.
> {code:none}
> # Duplicate columns cause segmentation faults
> table = pq.read_table('/path/to/duplicate_column_file.parquet')
> table.to_pandas()  # Segmentation fault
> pq.write_table(table, '/some/output.parquet') # Segmentation fault
> {code}
> If I remove the duplicate column using table.remove_column(...) everything 
> works without segfaults.
> {code:none}
> # After removing duplicate columns, everything works fine
> table = pq.read_table('/path/to/duplicate_column_file.parquet')
> table.remove_column(34)
> table.to_pandas()  # OK
> pq.write_table(table, '/some/output.parquet')  # OK
> {code}
> For more concrete examples, see `test_segfault_1.py` and `test_segfault_2.py` 
> here: https://gitlab.com/ostrokach/pyarrow_duplicate_column_errors.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to