Repository: arrow Updated Branches: refs/heads/master 1571fb436 -> 2f2a0c139
ARROW-1676: [C++] Only pad null bitmap up to a factor of 8 bytes in Feather format cc @rvernica Due to Arrow's buffer padding, the validity bitmap may be larger than the Feather format expects it to be. This can result in off-by-one errors when reading files. It's disappointing that this correctness issue has existed for so long. I am looking to determine if this might have affected users of the Python API Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #1204 from wesm/ARROW-1676 and squashes the following commits: 60a2ed55 [Wes McKinney] Fuzz test more array lengths 540f79d3 [Wes McKinney] Ensure proper padding fa677cc2 [Wes McKinney] Only write necessary bytes from null bitmap in Feather writer Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/2f2a0c13 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/2f2a0c13 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/2f2a0c13 Branch: refs/heads/master Commit: 2f2a0c139a96984086359946a67082f6fb8e5efb Parents: 1571fb4 Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Mon Oct 16 15:35:43 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Mon Oct 16 15:35:43 2017 -0400 ---------------------------------------------------------------------- cpp/src/arrow/ipc/feather.cc | 6 +++--- python/pyarrow/tests/test_feather.py | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/2f2a0c13/cpp/src/arrow/ipc/feather.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index 9d244f1..97ed601 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -554,12 +554,12 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { if (values.null_count() > 0) { // We assume there is one bit for each value in values.nulls, aligned on a // byte boundary, and we write this much data into the stream + int64_t null_bitmap_size = GetOutputLength(BitUtil::BytesForBits(values.length())); if (values.null_bitmap()) { RETURN_NOT_OK(WritePadded(stream_.get(), values.null_bitmap()->data(), - values.null_bitmap()->size(), &bytes_written)); + null_bitmap_size, &bytes_written)); } else { - RETURN_NOT_OK(WritePaddedBlank( - stream_.get(), BitUtil::BytesForBits(values.length()), &bytes_written)); + RETURN_NOT_OK(WritePaddedBlank(stream_.get(), null_bitmap_size, &bytes_written)); } meta->total_bytes += bytes_written; } http://git-wip-us.apache.org/repos/asf/arrow/blob/2f2a0c13/python/pyarrow/tests/test_feather.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index a7013ba..76f0844 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -249,6 +249,26 @@ class TestFeatherReader(unittest.TestCase): result = read_feather(path) assert_frame_equal(result, ex_frame) + def test_buffer_bounds_error(self): + # ARROW-1676 + path = random_path() + self.test_files.append(path) + + for i in range(16, 256): + values = pa.array([None] + list(range(i)), type=pa.float64()) + + writer = FeatherWriter() + writer.open(path) + + writer.write_array('arr', values) + writer.close() + + result = read_feather(path) + expected = pd.DataFrame({'arr': values.to_pandas()}) + assert_frame_equal(result, expected) + + self._check_pandas_roundtrip(expected, null_counts=[1]) + def test_boolean_object_nulls(self): repeats = 100 arr = np.array([False, None, True] * repeats, dtype=object)