This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 27417b2  ARROW-2328: [C++] Fixed and unit tested feather writing with 
slice
27417b2 is described below

commit 27417b27e03418826994bb9e6f17e488c9af786f
Author: Adrian Dorr <adrian.d...@genomicsplc.com>
AuthorDate: Mon Apr 9 19:45:28 2018 +0200

    ARROW-2328: [C++] Fixed and unit tested feather writing with slice
    
    When writing a slice of a table to feather, both the scalar data and the 
null bitmap were written out wrongly, i.e. from the beginning of the original 
table instead of from the offset.
    - scalar data is written from the offset
    - null bitmap is written from the offset.
    
    I messed up my original pull request 
https://github.com/apache/arrow/pull/1766, hence this replaces it. Hope that 
helps.
    
    Author: Adrian Dorr <adrian.d...@genomicsplc.com>
    Author: Adrian Dorr <a.d...@me.com>
    
    Closes #1784 from Adriandorr/feather-splice-fix2 and squashes the following 
commits:
    
    a861113 <Adrian Dorr> {AD} Reverted changes in MakeRandomBinaryArray and 
added MakeBinaryArrayWithUniqueValues and used that in 
MakeStringTypesRecordBatch.
    7e18256 <Adrian Dorr> Changed the feather slice tests to run over cartesian 
product of offsets and sizes.
    03db69b <Adrian Dorr> Renamed MakeStringTypesRecordBatchWithoutNulls to its 
correct name.
    31e3521 <Adrian Dorr> Use commom CheckSlice method.
    80d9eed <Adrian Dorr> Fixed format.
    d38fb6d <Adrian Dorr> Fixed formating
    98254d6 <Adrian Dorr> Converted test to parameterized tests.
    de02b60 <Adrian Dorr> Updated comment.
    c259143 <Adrian Dorr> Renamed MakeStringTypesRecordBatchWithNulls and 
MakeStringTypesRecordBatch.
    716650a <Adrian Dorr> ARROW-2328:  On further investigation, exporting of 
booleans failed because of the extra test added. Alos boolean didn't need to be 
treated special, all types data length should be calcutated as the rounded up 
to the nearest multiple of 8 of length*bit-width.
    7ada731 <Adrian Dorr> {AD} Removed unused statement.
    7bd8170 <Adrian Dorr> ARROW-2328:  Fixed more warning/error.
    4bdb3b2 <Adrian Dorr> (ARROW-2328:  Fixed warning/error.
    4eb073c <Adrian Dorr> (ARROW-2328:  Fixed and unit tested feather writing 
with slice of a table: - scalar data is written from the offset - null bitmap 
is written from the offset.
---
 cpp/src/arrow/ipc/feather-test.cc        | 51 ++++++++++++++++++++++++++
 cpp/src/arrow/ipc/feather.cc             | 63 +++++++++++++++++++++++++-------
 cpp/src/arrow/ipc/ipc-json-test.cc       |  6 +--
 cpp/src/arrow/ipc/ipc-read-write-test.cc |  2 +-
 cpp/src/arrow/ipc/test-common.h          | 29 +++++++++++++--
 5 files changed, 131 insertions(+), 20 deletions(-)

diff --git a/cpp/src/arrow/ipc/feather-test.cc 
b/cpp/src/arrow/ipc/feather-test.cc
index 8ec3b0e..ae1489a 100644
--- a/cpp/src/arrow/ipc/feather-test.cc
+++ b/cpp/src/arrow/ipc/feather-test.cc
@@ -406,6 +406,57 @@ TEST_F(TestTableWriter, PrimitiveNullRoundTrip) {
   }
 }
 
+class TestTableWriterSlice : public TestTableWriter,
+                             public 
::testing::WithParamInterface<std::tuple<int, int>> {
+ public:
+  void CheckSlice(std::shared_ptr<RecordBatch> batch) {
+    auto p = GetParam();
+    auto start = std::get<0>(p);
+    auto size = std::get<1>(p);
+
+    batch = batch->Slice(start, size);
+
+    ASSERT_OK(writer_->Append("f0", *batch->column(0)));
+    ASSERT_OK(writer_->Append("f1", *batch->column(1)));
+    Finish();
+
+    std::shared_ptr<Column> col;
+    ASSERT_OK(reader_->GetColumn(0, &col));
+    ASSERT_TRUE(col->data()->chunk(0)->Equals(batch->column(0)));
+    ASSERT_EQ("f0", col->name());
+
+    ASSERT_OK(reader_->GetColumn(1, &col));
+    ASSERT_TRUE(col->data()->chunk(0)->Equals(batch->column(1)));
+    ASSERT_EQ("f1", col->name());
+  }
+};
+
+TEST_P(TestTableWriterSlice, SliceRoundTrip) {
+  std::shared_ptr<RecordBatch> batch;
+  ASSERT_OK(MakeIntBatchSized(600, &batch));
+  CheckSlice(batch);
+}
+
+TEST_P(TestTableWriterSlice, SliceStringsRoundTrip) {
+  auto p = GetParam();
+  auto start = std::get<0>(p);
+  auto with_nulls = start % 2 == 0;
+  std::shared_ptr<RecordBatch> batch;
+  ASSERT_OK(MakeStringTypesRecordBatch(&batch, with_nulls));
+  CheckSlice(batch);
+}
+
+TEST_P(TestTableWriterSlice, SliceBooleanRoundTrip) {
+  std::shared_ptr<RecordBatch> batch;
+  ASSERT_OK(MakeBooleanBatchSized(600, &batch));
+  CheckSlice(batch);
+}
+
+INSTANTIATE_TEST_CASE_P(TestTableWriterSliceOffsets, TestTableWriterSlice,
+                        ::testing::Combine(::testing::Values(0, 1, 300, 301, 
302, 303,
+                                                             304, 305, 306, 
307),
+                                           ::testing::Values(0, 1, 7, 8, 30, 
32, 100)));
+
 }  // namespace feather
 }  // namespace ipc
 }  // namespace arrow
diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc
index faf6a08..df5c799 100644
--- a/cpp/src/arrow/ipc/feather.cc
+++ b/cpp/src/arrow/ipc/feather.cc
@@ -75,6 +75,43 @@ static Status WritePadded(io::OutputStream* stream, const 
uint8_t* data, int64_t
   return Status::OK();
 }
 
+static Status WritePaddedWithOffset(io::OutputStream* stream, const uint8_t* 
data,
+                                    int64_t bit_offset, const int64_t length,
+                                    int64_t* bytes_written) {
+  data = data + bit_offset / 8;
+  uint8_t bit_shift = static_cast<uint8_t>(bit_offset % 8);
+  if (bit_offset == 0) {
+    RETURN_NOT_OK(stream->Write(data, length));
+  } else {
+    constexpr int64_t buffersize = 256;
+    uint8_t buffer[buffersize];
+    const uint8_t lshift = static_cast<uint8_t>(8 - bit_shift);
+    const uint8_t* buffer_end = buffer + buffersize;
+    uint8_t* buffer_it = buffer;
+
+    for (const uint8_t* end = data + length; data != end;) {
+      uint8_t r = static_cast<uint8_t>(*data++ >> bit_shift);
+      uint8_t l = static_cast<uint8_t>(*data << lshift);
+      uint8_t value = l | r;
+      *buffer_it++ = value;
+      if (buffer_it == buffer_end) {
+        RETURN_NOT_OK(stream->Write(buffer, buffersize));
+        buffer_it = buffer;
+      }
+    }
+    if (buffer_it != buffer) {
+      RETURN_NOT_OK(stream->Write(buffer, buffer_it - buffer));
+    }
+  }
+
+  int64_t remainder = PaddedLength(length) - length;
+  if (remainder != 0) {
+    RETURN_NOT_OK(stream->Write(kPaddingBytes, remainder));
+  }
+  *bytes_written = length + remainder;
+  return Status::OK();
+}
+
 /// For compability, we need to write any data sometimes just to keep producing
 /// files that can be read with an older reader.
 static Status WritePaddedBlank(io::OutputStream* stream, int64_t length,
@@ -554,12 +591,14 @@ class TableWriter::TableWriterImpl : public ArrayVisitor {
 
     // Write the null bitmask
     if (values.null_count() > 0) {
-      // We assume there is one bit for each value in values.nulls, aligned on 
a
-      // byte boundary, and we write this much data into the stream
+      // We assume there is one bit for each value in values.nulls,
+      // starting at the zero offset.
       int64_t null_bitmap_size = 
GetOutputLength(BitUtil::BytesForBits(values.length()));
       if (values.null_bitmap()) {
-        RETURN_NOT_OK(WritePadded(stream_.get(), values.null_bitmap()->data(),
-                                  null_bitmap_size, &bytes_written));
+        auto null_bitmap = values.null_bitmap();
+        RETURN_NOT_OK(WritePaddedWithOffset(stream_.get(), null_bitmap->data(),
+                                            values.offset(), null_bitmap_size,
+                                            &bytes_written));
       } else {
         RETURN_NOT_OK(WritePaddedBlank(stream_.get(), null_bitmap_size, 
&bytes_written));
       }
@@ -567,6 +606,7 @@ class TableWriter::TableWriterImpl : public ArrayVisitor {
     }
 
     int64_t values_bytes = 0;
+    int64_t bit_offset = 0;
 
     const uint8_t* values_buffer = nullptr;
 
@@ -595,20 +635,17 @@ class TableWriter::TableWriterImpl : public ArrayVisitor {
       const auto& prim_values = static_cast<const PrimitiveArray&>(values);
       const auto& fw_type = static_cast<const FixedWidthType&>(*values.type());
 
-      if (values.type_id() == Type::BOOL) {
-        // Booleans are bit-packed
-        values_bytes = BitUtil::BytesForBits(values.length());
-      } else {
-        values_bytes = values.length() * fw_type.bit_width() / 8;
-      }
+      values_bytes = BitUtil::BytesForBits(values.length() * 
fw_type.bit_width());
 
       if (prim_values.values()) {
-        values_buffer = prim_values.values()->data();
+        values_buffer = prim_values.values()->data() +
+                        (prim_values.offset() * fw_type.bit_width() / 8);
+        bit_offset = (prim_values.offset() * fw_type.bit_width()) % 8;
       }
     }
     if (values_buffer) {
-      RETURN_NOT_OK(
-          WritePadded(stream_.get(), values_buffer, values_bytes, 
&bytes_written));
+      RETURN_NOT_OK(WritePaddedWithOffset(stream_.get(), values_buffer, 
bit_offset,
+                                          values_bytes, &bytes_written));
     } else {
       RETURN_NOT_OK(WritePaddedBlank(stream_.get(), values_bytes, 
&bytes_written));
     }
diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc 
b/cpp/src/arrow/ipc/ipc-json-test.cc
index 12fa4bf..adf83c3 100644
--- a/cpp/src/arrow/ipc/ipc-json-test.cc
+++ b/cpp/src/arrow/ipc/ipc-json-test.cc
@@ -375,9 +375,9 @@ TEST(TestJsonFileReadWrite, MinimalFormatExample) {
 #define BATCH_CASES()                                                          
         \
   ::testing::Values(&MakeIntRecordBatch, &MakeListRecordBatch, 
&MakeNonNullRecordBatch, \
                     &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList,         
         \
-                    &MakeStringTypesRecordBatch, &MakeStruct, &MakeUnion, 
&MakeDates,   \
-                    &MakeTimestamps, &MakeTimes, &MakeFWBinary, &MakeDecimal,  
         \
-                    &MakeDictionary);
+                    &MakeStringTypesRecordBatchWithNulls, &MakeStruct, 
&MakeUnion,      \
+                    &MakeDates, &MakeTimestamps, &MakeTimes, &MakeFWBinary,    
         \
+                    &MakeDecimal, &MakeDictionary);
 
 class TestJsonRoundTrip : public ::testing::TestWithParam<MakeRecordBatch*> {
  public:
diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc 
b/cpp/src/arrow/ipc/ipc-read-write-test.cc
index c1ff8a4..437585f 100644
--- a/cpp/src/arrow/ipc/ipc-read-write-test.cc
+++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc
@@ -119,7 +119,7 @@ TEST_F(TestSchemaMetadata, NestedFields) {
 #define BATCH_CASES()                                                          
         \
   ::testing::Values(&MakeIntRecordBatch, &MakeListRecordBatch, 
&MakeNonNullRecordBatch, \
                     &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList,         
         \
-                    &MakeStringTypesRecordBatch, &MakeStruct, &MakeUnion,      
         \
+                    &MakeStringTypesRecordBatchWithNulls, &MakeStruct, 
&MakeUnion,      \
                     &MakeDictionary, &MakeDates, &MakeTimestamps, &MakeTimes,  
         \
                     &MakeFWBinary, &MakeNull, &MakeDecimal, &MakeBooleanBatch);
 
diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h
index 6f8a0dc..fb7792e 100644
--- a/cpp/src/arrow/ipc/test-common.h
+++ b/cpp/src/arrow/ipc/test-common.h
@@ -231,7 +231,24 @@ Status MakeRandomBinaryArray(int64_t length, bool 
include_nulls, MemoryPool* poo
   return builder.Finish(out);
 }
 
-Status MakeStringTypesRecordBatch(std::shared_ptr<RecordBatch>* out) {
+template <class Builder, class RawType>
+Status MakeBinaryArrayWithUniqueValues(int64_t length, bool include_nulls,
+                                       MemoryPool* pool, 
std::shared_ptr<Array>* out) {
+  Builder builder(pool);
+  for (int64_t i = 0; i < length; ++i) {
+    if (include_nulls && (i % 7 == 0)) {
+      RETURN_NOT_OK(builder.AppendNull());
+    } else {
+      const std::string value = std::to_string(i);
+      RETURN_NOT_OK(builder.Append(reinterpret_cast<const 
RawType*>(value.data()),
+                                   static_cast<int32_t>(value.size())));
+    }
+  }
+  return builder.Finish(out);
+}
+
+Status MakeStringTypesRecordBatch(std::shared_ptr<RecordBatch>* out,
+                                  bool with_nulls = true) {
   const int64_t length = 500;
   auto string_type = utf8();
   auto binary_type = binary();
@@ -244,18 +261,24 @@ Status 
MakeStringTypesRecordBatch(std::shared_ptr<RecordBatch>* out) {
 
   // Quirk with RETURN_NOT_OK macro and templated functions
   {
-    auto s = MakeRandomBinaryArray<StringBuilder, char>(length, true, pool, 
&a0);
+    auto s = MakeBinaryArrayWithUniqueValues<StringBuilder, char>(length, 
with_nulls,
+                                                                  pool, &a0);
     RETURN_NOT_OK(s);
   }
 
   {
-    auto s = MakeRandomBinaryArray<BinaryBuilder, uint8_t>(length, true, pool, 
&a1);
+    auto s = MakeBinaryArrayWithUniqueValues<BinaryBuilder, uint8_t>(length, 
with_nulls,
+                                                                     pool, 
&a1);
     RETURN_NOT_OK(s);
   }
   *out = RecordBatch::Make(schema, length, {a0, a1});
   return Status::OK();
 }
 
+Status MakeStringTypesRecordBatchWithNulls(std::shared_ptr<RecordBatch>* out) {
+  return MakeStringTypesRecordBatch(out, true);
+}
+
 Status MakeNullRecordBatch(std::shared_ptr<RecordBatch>* out) {
   const int64_t length = 500;
   auto f0 = field("f0", null());

-- 
To stop receiving notification emails like this one, please contact
apit...@apache.org.

Reply via email to