This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new ef99387 ARROW-4774: [C++] Fix FileWriter::WriteTable segfault
ef99387 is described below
commit ef993877ac05bc840aa4670d6e9aa2da9f4eb8d6
Author: François Saint-Jacques <[email protected]>
AuthorDate: Fri Mar 8 13:21:55 2019 -0600
ARROW-4774: [C++] Fix FileWriter::WriteTable segfault
FileWriter expects a well-formed Table where all columns are of equal length.
- WriteTable checks that the table is valid
- Python ensures that a table is valid before wrapping it (see the sketch below)
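For illustration only (not part of the patch), a minimal pyarrow sketch of the behaviour this commit changes; "out.parquet" is a placeholder path. Before the fix, writing a table whose columns differ in length could segfault inside FileWriter::WriteTable; with the fix, the ill-formed table is rejected with ArrowInvalid before anything is written:

    import pyarrow as pa
    import pyarrow.parquet as pq

    # Two columns of unequal length make the table ill-formed.
    a = pa.array([1, 2], type=pa.int8())
    b = pa.array([1], type=pa.int8())

    try:
        # With this fix, the table is validated when it is wrapped
        # (pyarrow_wrap_table calls Table::Validate), so the invalid
        # table never reaches the Parquet writer.
        table = pa.Table.from_arrays([a, b], names=["a", "b"])
        pq.write_table(table, "out.parquet")
    except pa.lib.ArrowInvalid as exc:
        print("rejected:", exc)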
Author: François Saint-Jacques <[email protected]>
Closes #3846 from fsaintjacques/ARROW-4774-segfault-parquet-struct and squashes the following commits:
4461ba959 <François Saint-Jacques> Python formatting
ae4857091 <François Saint-Jacques> ARROW-4774: Fix FileWriter::WriteTable segfault
---
cpp/src/parquet/arrow/arrow-reader-writer-test.cc | 37 +++++++++++++++++------
cpp/src/parquet/arrow/writer.cc | 2 ++
python/pyarrow/public-api.pxi | 2 ++
python/pyarrow/table.pxi | 1 -
python/pyarrow/tests/test_table.py | 29 ++++++++++++------
5 files changed, 51 insertions(+), 20 deletions(-)
diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
index 0fe4c04..e067838 100644
--- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -1809,6 +1809,27 @@ auto GenerateList = [](int length, std::shared_ptr<::DataType>* type,
MakeListArray(length, 100, type, array);
};
+std::shared_ptr<Table> InvalidTable() {
+ auto type = ::arrow::int8();
+ auto field = ::arrow::field("a", type);
+ auto schema = ::arrow::schema({field, field});
+
+ // Invalid due to array size not matching
+ auto array1 = ArrayFromJSON(type, "[1, 2]");
+ auto array2 = ArrayFromJSON(type, "[1]");
+ return Table::Make(schema, {array1, array2});
+}
+
+TEST(TestArrowReadWrite, InvalidTable) {
+ // ARROW-4774: Shouldn't segfault on writing an invalid table.
+ auto sink = std::make_shared<InMemoryOutputStream>();
+ auto invalid_table = InvalidTable();
+
+ ASSERT_RAISES(Invalid, WriteTable(*invalid_table, ::arrow::default_memory_pool(), sink,
+ 1, default_writer_properties(),
+ default_arrow_writer_properties()));
+}
+
TEST(TestArrowReadWrite, TableWithChunkedColumns) {
std::vector<ArrayFactory> functions = {GenerateInt32, GenerateList};
@@ -1889,17 +1910,17 @@ TEST(TestArrowReadWrite, DictionaryColumnChunkedWrite) {
std::make_shared<::arrow::DictionaryArray>(dict_type, f1_values)};
std::vector<std::shared_ptr<::arrow::Column>> columns;
- auto column = MakeColumn("column", dict_arrays, false);
+ auto column = MakeColumn("dictionary", dict_arrays, true);
columns.emplace_back(column);
auto table = Table::Make(schema, columns);
std::shared_ptr<Table> result;
- DoSimpleRoundtrip(table, 1,
- // Just need to make sure that we make
- // a chunk size that is smaller than the
- // total number of values
- 2, {}, &result);
+ ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip(table, 1,
+ // Just need to make sure that we make
+ // a chunk size that is smaller than the
+ // total number of values
+ 2, {}, &result));
std::vector<std::string> expected_values = {"first", "second", "first", "third",
"second", "third", "first", "second",
@@ -1913,9 +1934,7 @@ TEST(TestArrowReadWrite, DictionaryColumnChunkedWrite) {
// field, and it also turns into a nullable column
columns.emplace_back(MakeColumn("dictionary", expected_array, true));
- fields.clear();
- fields.emplace_back(::arrow::field("dictionary", ::arrow::utf8()));
- schema = ::arrow::schema(fields);
+ schema = ::arrow::schema({::arrow::field("dictionary", ::arrow::utf8())});
auto expected_table = Table::Make(schema, columns);
diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc
index fccf49b..fb43c92 100644
--- a/cpp/src/parquet/arrow/writer.cc
+++ b/cpp/src/parquet/arrow/writer.cc
@@ -1153,6 +1153,8 @@ Status WriteFileMetaData(const FileMetaData& file_metadata,
}
Status FileWriter::WriteTable(const Table& table, int64_t chunk_size) {
+ RETURN_NOT_OK(table.Validate());
+
if (chunk_size <= 0 && table.num_rows() > 0) {
return Status::Invalid("chunk size per row_group must be greater than 0");
} else if (!table.schema()->Equals(*schema_, false)) {
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index 7bd9154..6fac3f6 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -257,6 +257,8 @@ cdef api shared_ptr[CTable] pyarrow_unwrap_table(object table):
cdef api object pyarrow_wrap_table(const shared_ptr[CTable]& ctable):
+ # Ensure that the wrapped table is valid
+ check_status(ctable.get().Validate())
cdef Table table = Table.__new__(Table)
table.init(ctable)
return table
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 1fcbdd6..c46cd6d 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -1172,7 +1172,6 @@ cdef class Table(_PandasConvertible):
vector[shared_ptr[CColumn]] columns
Schema cy_schema
shared_ptr[CSchema] c_schema
- shared_ptr[CTable] table
int i, K = <int> len(arrays)
if schema is None:
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index 847b1a4..5eb6aca 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -822,8 +822,8 @@ def test_table_negative_indexing():
data = [
pa.array(range(5)),
pa.array([-10, -5, 0, 5, 10]),
- pa.array([1.0, 2.0, 3.0]),
- pa.array(['ab', 'bc', 'cd']),
+ pa.array([1.0, 2.0, 3.0, 4.0, 5.0]),
+ pa.array(['ab', 'bc', 'cd', 'de', 'ef']),
]
table = pa.Table.from_arrays(data, names=tuple('abcd'))
@@ -865,16 +865,16 @@ def test_table_safe_casting():
data = [
pa.array(range(5), type=pa.int64()),
pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
- pa.array([1.0, 2.0, 3.0], type=pa.float64()),
- pa.array(['ab', 'bc', 'cd'], type=pa.string())
+ pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
+ pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
]
table = pa.Table.from_arrays(data, names=tuple('abcd'))
expected_data = [
pa.array(range(5), type=pa.int32()),
pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
- pa.array([1, 2, 3], type=pa.int64()),
- pa.array(['ab', 'bc', 'cd'], type=pa.string())
+ pa.array([1, 2, 3, 4, 5], type=pa.int64()),
+ pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
]
expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd'))
@@ -893,16 +893,16 @@ def test_table_unsafe_casting():
data = [
pa.array(range(5), type=pa.int64()),
pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
- pa.array([1.1, 2.2, 3.3], type=pa.float64()),
- pa.array(['ab', 'bc', 'cd'], type=pa.string())
+ pa.array([1.1, 2.2, 3.3, 4.4, 5.5], type=pa.float64()),
+ pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
]
table = pa.Table.from_arrays(data, names=tuple('abcd'))
expected_data = [
pa.array(range(5), type=pa.int32()),
pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
- pa.array([1, 2, 3], type=pa.int64()),
- pa.array(['ab', 'bc', 'cd'], type=pa.string())
+ pa.array([1, 2, 3, 4, 5], type=pa.int64()),
+ pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
]
expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd'))
@@ -919,3 +919,12 @@ def test_table_unsafe_casting():
casted_table = table.cast(target_schema, safe=False)
assert casted_table.equals(expected_table)
+
+
+def test_invalid_table_construct():
+ array = np.array([0, 1], dtype=np.uint8)
+ u8 = pa.uint8()
+ arrays = [pa.array(array, type=u8), pa.array(array[1:], type=u8)]
+
+ with pytest.raises(pa.lib.ArrowInvalid):
+ pa.Table.from_arrays(arrays, names=["a1", "a2"])