paleolimbot commented on code in PR #596:
URL: https://github.com/apache/arrow-nanoarrow/pull/596#discussion_r1768741591


##########
src/nanoarrow/common/array.c:
##########
@@ -456,10 +470,26 @@ static void ArrowArrayFlushInternalPointers(struct 
ArrowArray* array) {
   struct ArrowArrayPrivateData* private_data =
       (struct ArrowArrayPrivateData*)array->private_data;
 
-  for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
+  const bool is_binary_view = private_data->storage_type == 
NANOARROW_TYPE_STRING_VIEW ||
+                              private_data->storage_type == 
NANOARROW_TYPE_BINARY_VIEW;
+  const int32_t nfixed_buf = is_binary_view ? 2 : NANOARROW_MAX_FIXED_BUFFERS;
+
+  for (int32_t i = 0; i < nfixed_buf; i++) {
     private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data;
   }
 
+  if (is_binary_view) {
+    const int32_t nvirt_buf = private_data->n_variadic_buffers;
+    private_data->buffer_data = (const void**)ArrowRealloc(
+        private_data->buffer_data, sizeof(void*) * (nfixed_buf + nvirt_buf + 
1));

Review Comment:
   It seems at this point like we should have already allocated enough space 
for the extra pointer here (i.e., `buffer_data` should always be allocated up 
front such that it has enough space for the extra buffer at the end).



##########
src/nanoarrow/common/array.c:
##########
@@ -456,10 +470,26 @@ static void ArrowArrayFlushInternalPointers(struct 
ArrowArray* array) {
   struct ArrowArrayPrivateData* private_data =
       (struct ArrowArrayPrivateData*)array->private_data;
 
-  for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
+  const bool is_binary_view = private_data->storage_type == 
NANOARROW_TYPE_STRING_VIEW ||
+                              private_data->storage_type == 
NANOARROW_TYPE_BINARY_VIEW;
+  const int32_t nfixed_buf = is_binary_view ? 2 : NANOARROW_MAX_FIXED_BUFFERS;
+
+  for (int32_t i = 0; i < nfixed_buf; i++) {
     private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data;
   }
 
+  if (is_binary_view) {
+    const int32_t nvirt_buf = private_data->n_variadic_buffers;
+    private_data->buffer_data = (const void**)ArrowRealloc(
+        private_data->buffer_data, sizeof(void*) * (nfixed_buf + nvirt_buf + 
1));
+    for (int32_t i = 0; i < nvirt_buf; ++i) {

Review Comment:
   ```suggestion
       for (int32_t i = 0; i < nvirt_buf; i++) {
   ```



##########
src/nanoarrow/common/array.c:
##########
@@ -720,7 +757,18 @@ static int ArrowArrayViewSetArrayInternal(struct 
ArrowArrayView* array_view,
     }
   }
 
-  // Check the number of buffers
+  if (array_view->storage_type == NANOARROW_TYPE_STRING_VIEW ||
+      array_view->storage_type == NANOARROW_TYPE_BINARY_VIEW) {
+    const int64_t n_buffers = array->n_buffers;
+    const int32_t nfixed_buf = 2;

Review Comment:
   Should there be a `#define NANOARROW_DATA_VIEW_FIXED_BUFFERS 2` (maybe there 
is a better name)? (A bare `2` shows up in a number of places).



##########
src/nanoarrow/common/inline_types.h:
##########
@@ -822,8 +831,8 @@ struct ArrowArrayPrivateData {
 
   // The array of pointers to buffers. This must be updated after a sequence
   // of appends to synchronize its values with the actual buffer addresses
-  // (which may have ben reallocated uring that time)
-  const void* buffer_data[NANOARROW_MAX_FIXED_BUFFERS];
+  // (which may have ben reallocated during that time)

Review Comment:
   ```suggestion
     // (which may have been reallocated during that time)
   ```



##########
src/nanoarrow/common/array_test.cc:
##########
@@ -895,6 +895,146 @@ TEST(ArrayTest, ArrayTestAppendToLargeStringArray) {
   ArrowArrayRelease(&array);
 }
 
+TEST(ArrayTest, ArrayTestAppendToStringViewArray) {
+  struct ArrowArray array;
+
+  ASSERT_EQ(ArrowArrayInitFromType(&array, NANOARROW_TYPE_STRING_VIEW), 
NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+
+  // Check that we can reserve
+  ASSERT_EQ(ArrowArrayReserve(&array, 5), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayBuffer(&array, 1)->capacity_bytes,
+            5 * sizeof(union ArrowBinaryViewType));
+
+  std::string str1{"this_is_a_relatively_long_string"};
+  std::string filler(NANOARROW_BINARY_VIEW_BLOCK_SIZE - 34, 'x');
+  std::string str2{"goes_into_second_variadic_buffer"};
+
+  EXPECT_EQ(ArrowArrayAppendString(&array, "1234"_asv), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayAppendNull(&array, 2), NANOARROW_OK);
+  EXPECT_EQ(
+      ArrowArrayAppendString(&array, {{str1.c_str()}, 
static_cast<int64_t>(str1.size())}),
+      NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayAppendEmpty(&array, 1), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayAppendString(
+                &array, {{filler.c_str()}, 
static_cast<int64_t>(filler.size())}),
+            NANOARROW_OK);
+  EXPECT_EQ(
+      ArrowArrayAppendString(&array, {{str2.c_str()}, 
static_cast<int64_t>(str2.size())}),
+      NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK);
+
+  EXPECT_EQ(array.length, 7);
+  EXPECT_EQ(array.null_count, 2);
+  auto validity_buffer = reinterpret_cast<const uint8_t*>(array.buffers[0]);
+  auto inline_buffer =
+      reinterpret_cast<const union ArrowBinaryViewType*>(array.buffers[1]);
+  auto vbuf1 = reinterpret_cast<const char*>(array.buffers[2]);
+  auto vbuf2 = reinterpret_cast<const char*>(array.buffers[3]);
+  auto sizes_buffer = reinterpret_cast<const int64_t*>(array.buffers[4]);
+
+  EXPECT_EQ(validity_buffer[0], 0b01111001);
+  EXPECT_EQ(memcmp(inline_buffer[0].inlined.data, "1234", 4), 0);
+  EXPECT_EQ(inline_buffer[0].inlined.size, 4);
+  EXPECT_EQ(memcmp(inline_buffer[3].ref.data, str1.data(), 4), 0);
+  EXPECT_EQ(inline_buffer[3].ref.size, str1.size());
+  EXPECT_EQ(inline_buffer[3].ref.buffer_index, 0);
+  EXPECT_EQ(inline_buffer[3].ref.offset, 0);
+
+  EXPECT_EQ(memcmp(inline_buffer[5].ref.data, filler.data(), 4), 0);
+  EXPECT_EQ(inline_buffer[5].ref.size, filler.size());
+  EXPECT_EQ(inline_buffer[5].ref.buffer_index, 0);
+  EXPECT_EQ(inline_buffer[5].ref.offset, str1.size());
+
+  EXPECT_EQ(memcmp(inline_buffer[6].ref.data, str2.data(), 4), 0);
+  EXPECT_EQ(inline_buffer[6].ref.size, str2.size());
+  EXPECT_EQ(inline_buffer[6].ref.buffer_index, 1);
+  EXPECT_EQ(inline_buffer[6].ref.offset, 0);
+
+  EXPECT_EQ(memcmp(vbuf1, str1.c_str(), str1.size()), 0);
+  EXPECT_EQ(sizes_buffer[0], str1.size() + filler.size());
+
+  EXPECT_EQ(memcmp(vbuf2, str2.c_str(), str2.size()), 0);
+  EXPECT_EQ(sizes_buffer[1], str2.size());
+
+  // TODO: need to add overload for ViewArrayAsBytes
+  /*
+  EXPECT_THAT(nanoarrow::ViewArrayAsBytes<64>(&array),
+              ElementsAre("1234"_asv, NA, NA, "56789"_asv, ""_asv));
+  */
+  ArrowArrayRelease(&array);
+}
+
+TEST(ArrayTest, ArrayTestAppendToBinaryViewArray) {
+  struct ArrowArray array;

Review Comment:
   Unless I'm missing something, these two tests can share a common `void 
TestAppendToDataViewArray()` to avoid duplicating this entire test? (I know we 
haven't done a great job of this in the test suite so far :flushed:)



##########
src/nanoarrow/testing/testing.cc:
##########


Review Comment:
   I would remove these changes for now (adding the integration test 
implementation is the next step but I think should be a separate PR with the 
appropriate tests)



##########
src/nanoarrow/common/utils.c:
##########
@@ -179,6 +179,16 @@ void ArrowLayoutInit(struct ArrowLayout* layout, enum 
ArrowType storage_type) {
       layout->buffer_data_type[2] = NANOARROW_TYPE_BINARY;
       break;
 
+    case NANOARROW_TYPE_BINARY_VIEW:
+      layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_VIEW;
+      layout->buffer_data_type[1] = NANOARROW_TYPE_BINARY_VIEW;
+      layout->element_size_bits[1] = 128;
+      break;
+    case NANOARROW_TYPE_STRING_VIEW:
+      layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_VIEW;
+      layout->buffer_data_type[1] = NANOARROW_TYPE_STRING_VIEW;
+      layout->element_size_bits[1] = 128;

Review Comment:
   This can be tested with something like this:
   
   
https://github.com/apache/arrow-nanoarrow/blob/5d53738636cac9d6bf14aebf791a123814a00697/src/nanoarrow/common/schema_test.cc#L826



##########
src/nanoarrow/common/schema.c:
##########
@@ -101,8 +101,12 @@ static const char* ArrowSchemaFormatTemplate(enum 
ArrowType type) {
       return "u";
     case NANOARROW_TYPE_LARGE_STRING:
       return "U";
+    case NANOARROW_TYPE_STRING_VIEW:
+      return "vu";

Review Comment:
   I believe the test for this would just be a line here:
   
   
https://github.com/apache/arrow-nanoarrow/blob/5d53738636cac9d6bf14aebf791a123814a00697/src/nanoarrow/common/schema_test.cc#L110
   
   ...except you'll have to guard it on the Arrow version so that the tests can 
still compile against old versions of Arrow:
   
   ```
   #if defined(ARROW_VERSION_MAJOR) && ARROW_VERSION_MAJOR >= 15
   ...
   #endif
   ```
   
   
   
   



##########
src/nanoarrow/common/inline_array.h:
##########
@@ -810,6 +872,27 @@ static inline int64_t ArrowArrayViewListChildOffset(
   }
 }
 
+static inline struct ArrowBufferView ArrowArrayViewBufferView(
+    const struct ArrowArrayView* array_view, int64_t i) {
+  if (array_view->storage_type == NANOARROW_TYPE_BINARY_VIEW ||
+      array_view->storage_type == NANOARROW_TYPE_STRING_VIEW) {
+    const int32_t nfixed_buf = 2;
+    if (i < nfixed_buf) {
+      return array_view->buffer_views[i];
+    } else {
+      struct ArrowBufferView buf_vw;
+      buf_vw.data.data = array_view->array->buffers[i];
+      if (buf_vw.data.data == NULL) {
+        buf_vw.size_bytes = 0;
+      } else {
+        buf_vw.size_bytes = -1;

Review Comment:
   Can this be `array_view->variadic_buffer_sizes[i - 2]`?



##########
src/nanoarrow/common/inline_array.h:
##########
@@ -467,52 +468,111 @@ static inline ArrowErrorCode 
ArrowArrayAppendDouble(struct ArrowArray* array,
   return NANOARROW_OK;
 }
 
+#define NANOARROW_BINARY_VIEW_INLINE_SIZE 12
+#define NANOARROW_BINARY_VIEW_PREVIEW_SIZE 4
+#define NANOARROW_BINARY_VIEW_BLOCK_SIZE (32 << 10)  // 32KB
+
+// The Arrow C++ implementation uses anonymous structs as members
+// of the ArrowBinaryViewType. For Cython support in this library, we define
+// those structs outside of the ArrowBinaryViewType
+struct ArrowBinaryViewTypeInlinedData {
+  int32_t size;
+  uint8_t data[NANOARROW_BINARY_VIEW_INLINE_SIZE];
+};
+
+struct ArrowBinaryViewTypeRefData {
+  int32_t size;
+  uint8_t data[NANOARROW_BINARY_VIEW_PREVIEW_SIZE];
+  int32_t buffer_index;
+  int32_t offset;
+};
+
+union ArrowBinaryViewType {
+  struct ArrowBinaryViewTypeInlinedData inlined;
+  struct ArrowBinaryViewTypeRefData ref;
+  int64_t alignment_dummy;
+};
+
 static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array,
                                                    struct ArrowBufferView 
value) {
   struct ArrowArrayPrivateData* private_data =
       (struct ArrowArrayPrivateData*)array->private_data;
 
-  struct ArrowBuffer* offset_buffer = ArrowArrayBuffer(array, 1);
-  struct ArrowBuffer* data_buffer = ArrowArrayBuffer(
-      array, 1 + (private_data->storage_type != 
NANOARROW_TYPE_FIXED_SIZE_BINARY));
-  int32_t offset;
-  int64_t large_offset;
-  int64_t fixed_size_bytes = private_data->layout.element_size_bits[1] / 8;
+  if (private_data->storage_type == NANOARROW_TYPE_STRING_VIEW ||
+      private_data->storage_type == NANOARROW_TYPE_BINARY_VIEW) {
+    struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1);
+    union ArrowBinaryViewType bvt;
+    bvt.inlined.size = (int32_t)value.size_bytes;
 
-  switch (private_data->storage_type) {
-    case NANOARROW_TYPE_STRING:
-    case NANOARROW_TYPE_BINARY:
-      offset = ((int32_t*)offset_buffer->data)[array->length];
-      if ((((int64_t)offset) + value.size_bytes) > INT32_MAX) {
-        return EOVERFLOW;
+    if (value.size_bytes <= NANOARROW_BINARY_VIEW_INLINE_SIZE) {
+      memcpy(bvt.inlined.data, value.data.as_char, value.size_bytes);
+    } else {
+      int32_t n_vbufs = private_data->n_variadic_buffers;
+      if (n_vbufs == 0 ||
+          private_data->variadic_buffers[n_vbufs - 1].size_bytes + 
value.size_bytes >
+              NANOARROW_BINARY_VIEW_BLOCK_SIZE) {

Review Comment:
   I think the management of the array of buffers should be abstracted at least 
one level away to make it easier to spot potential problems and that here it 
would be more readable:
   
   ```
   int64_t buffer_index;
   NANOARROW_RETURN_NOT_OK(ArrowArrayAddVariadicBuffers(array, 1, 
&buffer_index));
   ```
   
   (Exposing the ability to reserve variadic buffers also unlocks the ability 
to build by buffer and is probably a good idea anyway).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to