Repository: arrow
Updated Branches:
  refs/heads/master 1571fb436 -> 2f2a0c139


ARROW-1676: [C++] Only pad null bitmap up to a factor of 8 bytes in Feather 
format

cc @rvernica

Due to Arrow's buffer padding, the validity bitmap may be larger than the 
Feather format expects it to be. This can result in off-by-one errors when 
reading files.

It's disappointing that this correctness issue has existed for so long. I am 
looking to determine if this might have affected users of the Python API

Author: Wes McKinney <wes.mckin...@twosigma.com>

Closes #1204 from wesm/ARROW-1676 and squashes the following commits:

60a2ed55 [Wes McKinney] Fuzz test more array lengths
540f79d3 [Wes McKinney] Ensure proper padding
fa677cc2 [Wes McKinney] Only write necessary bytes from null bitmap in Feather 
writer


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/2f2a0c13
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/2f2a0c13
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/2f2a0c13

Branch: refs/heads/master
Commit: 2f2a0c139a96984086359946a67082f6fb8e5efb
Parents: 1571fb4
Author: Wes McKinney <wes.mckin...@twosigma.com>
Authored: Mon Oct 16 15:35:43 2017 -0400
Committer: Wes McKinney <wes.mckin...@twosigma.com>
Committed: Mon Oct 16 15:35:43 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/ipc/feather.cc         |  6 +++---
 python/pyarrow/tests/test_feather.py | 20 ++++++++++++++++++++
 2 files changed, 23 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/2f2a0c13/cpp/src/arrow/ipc/feather.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc
index 9d244f1..97ed601 100644
--- a/cpp/src/arrow/ipc/feather.cc
+++ b/cpp/src/arrow/ipc/feather.cc
@@ -554,12 +554,12 @@ class TableWriter::TableWriterImpl : public ArrayVisitor {
     if (values.null_count() > 0) {
       // We assume there is one bit for each value in values.nulls, aligned on 
a
       // byte boundary, and we write this much data into the stream
+      int64_t null_bitmap_size = 
GetOutputLength(BitUtil::BytesForBits(values.length()));
       if (values.null_bitmap()) {
         RETURN_NOT_OK(WritePadded(stream_.get(), values.null_bitmap()->data(),
-                                  values.null_bitmap()->size(), 
&bytes_written));
+                                  null_bitmap_size, &bytes_written));
       } else {
-        RETURN_NOT_OK(WritePaddedBlank(
-            stream_.get(), BitUtil::BytesForBits(values.length()), 
&bytes_written));
+        RETURN_NOT_OK(WritePaddedBlank(stream_.get(), null_bitmap_size, 
&bytes_written));
       }
       meta->total_bytes += bytes_written;
     }

http://git-wip-us.apache.org/repos/asf/arrow/blob/2f2a0c13/python/pyarrow/tests/test_feather.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_feather.py 
b/python/pyarrow/tests/test_feather.py
index a7013ba..76f0844 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -249,6 +249,26 @@ class TestFeatherReader(unittest.TestCase):
         result = read_feather(path)
         assert_frame_equal(result, ex_frame)
 
+    def test_buffer_bounds_error(self):
+        # ARROW-1676
+        path = random_path()
+        self.test_files.append(path)
+
+        for i in range(16, 256):
+            values = pa.array([None] + list(range(i)), type=pa.float64())
+
+            writer = FeatherWriter()
+            writer.open(path)
+
+            writer.write_array('arr', values)
+            writer.close()
+
+            result = read_feather(path)
+            expected = pd.DataFrame({'arr': values.to_pandas()})
+            assert_frame_equal(result, expected)
+
+            self._check_pandas_roundtrip(expected, null_counts=[1])
+
     def test_boolean_object_nulls(self):
         repeats = 100
         arr = np.array([False, None, True] * repeats, dtype=object)

Reply via email to