This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new ef99387 ARROW-4774: [C++] Fix FileWriter::WriteTable segfault
ef99387 is described below
commit ef993877ac05bc840aa4670d6e9aa2da9f4eb8d6
Author: François Saint-Jacques <[email protected]>
AuthorDate: Fri Mar 8 13:21:55 2019 -0600
ARROW-4774: [C++] Fix FileWriter::WriteTable segfault
FileWriter expects a well-formed Table where all columns are of equal length.
- WriteTable checks that the table is valid
- Python ensures that a table is valid before wrapping it (see the sketch below)
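For illustration only (not part of the patch), a minimal pyarrow sketch of the behaviour this commit changes; "out.parquet" is a placeholder path. Before the fix, writing a table whose columns differ in length could segfault inside FileWriter::WriteTable; with the fix, the ill-formed table is rejected with ArrowInvalid before anything is written:

    import pyarrow as pa
    import pyarrow.parquet as pq

    # Two columns of unequal length make the table ill-formed.
    a = pa.array([1, 2], type=pa.int8())
    b = pa.array([1], type=pa.int8())

    try:
        # With this fix, the table is validated when it is wrapped
        # (pyarrow_wrap_table calls Table::Validate), so the invalid
        # table never reaches the Parquet writer.
        table = pa.Table.from_arrays([a, b], names=["a", "b"])
        pq.write_table(table, "out.parquet")
    except pa.lib.ArrowInvalid as exc:
        print("rejected:", exc)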
Author: François Saint-Jacques <[email protected]>
Closes #3846 from fsaintjacques/ARROW-4774-segfault-parquet-struct and squashes the following commits:
4461ba959 <François Saint-Jacques> Python formatting
ae4857091 <François Saint-Jacques> ARROW-4774: Fix FileWriter::WriteTable segfault
---
cpp/src/parquet/arrow/arrow-reader-writer-test.cc | 37 +++++++++++++++++------
cpp/src/parquet/arrow/writer.cc | 2 ++
python/pyarrow/public-api.pxi | 2 ++
python/pyarrow/table.pxi | 1 -
python/pyarrow/tests/test_table.py | 29 ++++++++++++------
5 files changed, 51 insertions(+), 20 deletions(-)
diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
index 0fe4c04..e067838 100644
--- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -1809,6 +1809,27 @@ auto GenerateList = [](int length, std::shared_ptr<::DataType>* type,
MakeListArray(length, 100, type, array);
};
+std::shared_ptr<Table> InvalidTable() {
+ auto type = ::arrow::int8();
+ auto field = ::arrow::field("a", type);
+ auto schema = ::arrow::schema({field, field});
+
+ // Invalid due to array size not matching
+ auto array1 = ArrayFromJSON(type, "[1, 2]");
+ auto array2 = ArrayFromJSON(type, "[1]");
+ return Table::Make(schema, {array1, array2});
+}
+
+TEST(TestArrowReadWrite, InvalidTable) {
+ // ARROW-4774: Shouldn't segfault on writing an invalid table.
+ auto sink = std::make_shared<InMemoryOutputStream>();
+ auto invalid_table = InvalidTable();
+
+ ASSERT_RAISES(Invalid, WriteTable(*invalid_table, ::arrow::default_memory_pool(), sink,
+ 1, default_writer_properties(),
+ default_arrow_writer_properties()));
+}
+
TEST(TestArrowReadWrite, TableWithChunkedColumns) {
std::vector<ArrayFactory> functions = {GenerateInt32, GenerateList};
@@ -1889,17 +1910,17 @@ TEST(TestArrowReadWrite, DictionaryColumnChunkedWrite) {
std::make_shared<::arrow::DictionaryArray>(dict_type, f1_values)};
std::vector<std::shared_ptr<::arrow::Column>> columns;
- auto column = MakeColumn("column", dict_arrays, false);
+ auto column = MakeColumn("dictionary", dict_arrays, true);
columns.emplace_back(column);
auto table = Table::Make(schema, columns);
std::shared_ptr<Table> result;
- DoSimpleRoundtrip(table, 1,
- // Just need to make sure that we make
- // a chunk size that is smaller than the
- // total number of values
- 2, {}, &result);
+ ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip(table, 1,
+ // Just need to make sure that we make
+ // a chunk size that is smaller than the
+ // total number of values
+ 2, {}, &result));
std::vector<std::string> expected_values = {"first", "second", "first", "third",
"second", "third", "first", "second",
@@ -1913,9 +1934,7 @@ TEST(TestArrowReadWrite, DictionaryColumnChunkedWrite) {
// field, and it also turns into a nullable column
columns.emplace_back(MakeColumn("dictionary", expected_array, true));
- fields.clear();
- fields.emplace_back(::arrow::field("dictionary", ::arrow::utf8()));
- schema = ::arrow::schema(fields);
+ schema = ::arrow::schema({::arrow::field("dictionary", ::arrow::utf8())});
auto expected_table = Table::Make(schema, columns);
diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc
index fccf49b..fb43c92 100644
--- a/cpp/src/parquet/arrow/writer.cc
+++ b/cpp/src/parquet/arrow/writer.cc
@@ -1153,6 +1153,8 @@ Status WriteFileMetaData(const FileMetaData& file_metadata,
}
Status FileWriter::WriteTable(const Table& table, int64_t chunk_size) {
+ RETURN_NOT_OK(table.Validate());
+
if (chunk_size <= 0 && table.num_rows() > 0) {
return Status::Invalid("chunk size per row_group must be greater than 0");
} else if (!table.schema()->Equals(*schema_, false)) {
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index 7bd9154..6fac3f6 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -257,6 +257,8 @@ cdef api shared_ptr[CTable] pyarrow_unwrap_table(object table):
cdef api object pyarrow_wrap_table(const shared_ptr[CTable]& ctable):
+ # Ensure that the wrapped table is valid
+ check_status(ctable.get().Validate())
cdef Table table = Table.__new__(Table)
table.init(ctable)
return table
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 1fcbdd6..c46cd6d 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -1172,7 +1172,6 @@ cdef class Table(_PandasConvertible):
vector[shared_ptr[CColumn]] columns
Schema cy_schema
shared_ptr[CSchema] c_schema
- shared_ptr[CTable] table
int i, K = <int> len(arrays)
if schema is None:
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index 847b1a4..5eb6aca 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -822,8 +822,8 @@ def test_table_negative_indexing():
data = [
pa.array(range(5)),
pa.array([-10, -5, 0, 5, 10]),
- pa.array([1.0, 2.0, 3.0]),
- pa.array(['ab', 'bc', 'cd']),
+ pa.array([1.0, 2.0, 3.0, 4.0, 5.0]),
+ pa.array(['ab', 'bc', 'cd', 'de', 'ef']),
]
table = pa.Table.from_arrays(data, names=tuple('abcd'))
@@ -865,16 +865,16 @@ def test_table_safe_casting():
data = [
pa.array(range(5), type=pa.int64()),
pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
- pa.array([1.0, 2.0, 3.0], type=pa.float64()),
- pa.array(['ab', 'bc', 'cd'], type=pa.string())
+ pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
+ pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
]
table = pa.Table.from_arrays(data, names=tuple('abcd'))
expected_data = [
pa.array(range(5), type=pa.int32()),
pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
- pa.array([1, 2, 3], type=pa.int64()),
- pa.array(['ab', 'bc', 'cd'], type=pa.string())
+ pa.array([1, 2, 3, 4, 5], type=pa.int64()),
+ pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
]
expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd'))
@@ -893,16 +893,16 @@ def test_table_unsafe_casting():
data = [
pa.array(range(5), type=pa.int64()),
pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
- pa.array([1.1, 2.2, 3.3], type=pa.float64()),
- pa.array(['ab', 'bc', 'cd'], type=pa.string())
+ pa.array([1.1, 2.2, 3.3, 4.4, 5.5], type=pa.float64()),
+ pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
]
table = pa.Table.from_arrays(data, names=tuple('abcd'))
expected_data = [
pa.array(range(5), type=pa.int32()),
pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
- pa.array([1, 2, 3], type=pa.int64()),
- pa.array(['ab', 'bc', 'cd'], type=pa.string())
+ pa.array([1, 2, 3, 4, 5], type=pa.int64()),
+ pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
]
expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd'))
@@ -919,3 +919,12 @@ def test_table_unsafe_casting():
casted_table = table.cast(target_schema, safe=False)
assert casted_table.equals(expected_table)
+
+
+def test_invalid_table_construct():
+ array = np.array([0, 1], dtype=np.uint8)
+ u8 = pa.uint8()
+ arrays = [pa.array(array, type=u8), pa.array(array[1:], type=u8)]
+
+ with pytest.raises(pa.lib.ArrowInvalid):
+ pa.Table.from_arrays(arrays, names=["a1", "a2"])