paleolimbot commented on code in PR #596:
URL: https://github.com/apache/arrow-nanoarrow/pull/596#discussion_r1768741591
##########
src/nanoarrow/common/array.c:
##########
@@ -456,10 +470,26 @@ static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) {
struct ArrowArrayPrivateData* private_data =
(struct ArrowArrayPrivateData*)array->private_data;
- for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
+ const bool is_binary_view = private_data->storage_type == NANOARROW_TYPE_STRING_VIEW ||
+                             private_data->storage_type == NANOARROW_TYPE_BINARY_VIEW;
+ const int32_t nfixed_buf = is_binary_view ? 2 : NANOARROW_MAX_FIXED_BUFFERS;
+
+ for (int32_t i = 0; i < nfixed_buf; i++) {
private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data;
}
+ if (is_binary_view) {
+ const int32_t nvirt_buf = private_data->n_variadic_buffers;
+ private_data->buffer_data = (const void**)ArrowRealloc(
+ private_data->buffer_data, sizeof(void*) * (nfixed_buf + nvirt_buf + 1));
Review Comment:
It seems at this point like we should have already allocated enough space
for the extra pointer here (i.e., `buffer_data` should always be allocated up
front such that it has enough space for the extra buffer at the end).
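For example, once that invariant holds, this function could reduce to pointer assignment only (sketch; assumes `buffer_data` is grown wherever `n_variadic_buffers` grows):
```
  for (int32_t i = 0; i < nfixed_buf; i++) {
    private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data;
  }

  for (int32_t i = 0; i < private_data->n_variadic_buffers; i++) {
    private_data->buffer_data[nfixed_buf + i] = private_data->variadic_buffers[i].data;
  }

  // (the final slot pointing at the variadic buffer sizes stays as it is in the PR)
```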
##########
src/nanoarrow/common/array.c:
##########
@@ -456,10 +470,26 @@ static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) {
struct ArrowArrayPrivateData* private_data =
(struct ArrowArrayPrivateData*)array->private_data;
- for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
+ const bool is_binary_view = private_data->storage_type == NANOARROW_TYPE_STRING_VIEW ||
+                             private_data->storage_type == NANOARROW_TYPE_BINARY_VIEW;
+ const int32_t nfixed_buf = is_binary_view ? 2 : NANOARROW_MAX_FIXED_BUFFERS;
+
+ for (int32_t i = 0; i < nfixed_buf; i++) {
private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data;
}
+ if (is_binary_view) {
+ const int32_t nvirt_buf = private_data->n_variadic_buffers;
+ private_data->buffer_data = (const void**)ArrowRealloc(
+ private_data->buffer_data, sizeof(void*) * (nfixed_buf + nvirt_buf + 1));
+ for (int32_t i = 0; i < nvirt_buf; ++i) {
Review Comment:
```suggestion
for (int32_t i = 0; i < nvirt_buf; i++) {
```
##########
src/nanoarrow/common/array.c:
##########
@@ -720,7 +757,18 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view,
}
}
- // Check the number of buffers
+ if (array_view->storage_type == NANOARROW_TYPE_STRING_VIEW ||
+ array_view->storage_type == NANOARROW_TYPE_BINARY_VIEW) {
+ const int64_t n_buffers = array->n_buffers;
+ const int32_t nfixed_buf = 2;
Review Comment:
Should there be a `#define NANOARROW_DATA_VIEW_FIXED_BUFFERS 2` (maybe there
is a better name)? (A bare `2` shows up in a number of places).
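For example:
```
// Name per the suggestion above; there may be a better one
#define NANOARROW_DATA_VIEW_FIXED_BUFFERS 2

// ...and call sites like this one become:
const int32_t nfixed_buf = NANOARROW_DATA_VIEW_FIXED_BUFFERS;
```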
##########
src/nanoarrow/common/inline_types.h:
##########
@@ -822,8 +831,8 @@ struct ArrowArrayPrivateData {
// The array of pointers to buffers. This must be updated after a sequence
// of appends to synchronize its values with the actual buffer addresses
- // (which may have ben reallocated uring that time)
- const void* buffer_data[NANOARROW_MAX_FIXED_BUFFERS];
+ // (which may have ben reallocated during that time)
Review Comment:
```suggestion
// (which may have been reallocated during that time)
```
##########
src/nanoarrow/common/array_test.cc:
##########
@@ -895,6 +895,146 @@ TEST(ArrayTest, ArrayTestAppendToLargeStringArray) {
ArrowArrayRelease(&array);
}
+TEST(ArrayTest, ArrayTestAppendToStringViewArray) {
+ struct ArrowArray array;
+
+ ASSERT_EQ(ArrowArrayInitFromType(&array, NANOARROW_TYPE_STRING_VIEW), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+
+ // Check that we can reserve
+ ASSERT_EQ(ArrowArrayReserve(&array, 5), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayBuffer(&array, 1)->capacity_bytes,
+ 5 * sizeof(union ArrowBinaryViewType));
+
+ std::string str1{"this_is_a_relatively_long_string"};
+ std::string filler(NANOARROW_BINARY_VIEW_BLOCK_SIZE - 34, 'x');
+ std::string str2{"goes_into_second_variadic_buffer"};
+
+ EXPECT_EQ(ArrowArrayAppendString(&array, "1234"_asv), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayAppendNull(&array, 2), NANOARROW_OK);
+ EXPECT_EQ(
+ ArrowArrayAppendString(&array, {{str1.c_str()}, static_cast<int64_t>(str1.size())}),
+ NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayAppendEmpty(&array, 1), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayAppendString(
+ &array, {{filler.c_str()}, static_cast<int64_t>(filler.size())}),
+ NANOARROW_OK);
+ EXPECT_EQ(
+ ArrowArrayAppendString(&array, {{str2.c_str()}, static_cast<int64_t>(str2.size())}),
+ NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK);
+
+ EXPECT_EQ(array.length, 7);
+ EXPECT_EQ(array.null_count, 2);
+ auto validity_buffer = reinterpret_cast<const uint8_t*>(array.buffers[0]);
+ auto inline_buffer =
+ reinterpret_cast<const union ArrowBinaryViewType*>(array.buffers[1]);
+ auto vbuf1 = reinterpret_cast<const char*>(array.buffers[2]);
+ auto vbuf2 = reinterpret_cast<const char*>(array.buffers[3]);
+ auto sizes_buffer = reinterpret_cast<const int64_t*>(array.buffers[4]);
+
+ EXPECT_EQ(validity_buffer[0], 0b01111001);
+ EXPECT_EQ(memcmp(inline_buffer[0].inlined.data, "1234", 4), 0);
+ EXPECT_EQ(inline_buffer[0].inlined.size, 4);
+ EXPECT_EQ(memcmp(inline_buffer[3].ref.data, str1.data(), 4), 0);
+ EXPECT_EQ(inline_buffer[3].ref.size, str1.size());
+ EXPECT_EQ(inline_buffer[3].ref.buffer_index, 0);
+ EXPECT_EQ(inline_buffer[3].ref.offset, 0);
+
+ EXPECT_EQ(memcmp(inline_buffer[5].ref.data, filler.data(), 4), 0);
+ EXPECT_EQ(inline_buffer[5].ref.size, filler.size());
+ EXPECT_EQ(inline_buffer[5].ref.buffer_index, 0);
+ EXPECT_EQ(inline_buffer[5].ref.offset, str1.size());
+
+ EXPECT_EQ(memcmp(inline_buffer[6].ref.data, str2.data(), 4), 0);
+ EXPECT_EQ(inline_buffer[6].ref.size, str2.size());
+ EXPECT_EQ(inline_buffer[6].ref.buffer_index, 1);
+ EXPECT_EQ(inline_buffer[6].ref.offset, 0);
+
+ EXPECT_EQ(memcmp(vbuf1, str1.c_str(), str1.size()), 0);
+ EXPECT_EQ(sizes_buffer[0], str1.size() + filler.size());
+
+ EXPECT_EQ(memcmp(vbuf2, str2.c_str(), str2.size()), 0);
+ EXPECT_EQ(sizes_buffer[1], str2.size());
+
+ // TODO: need to add overload for ViewArrayAsBytes
+ /*
+ EXPECT_THAT(nanoarrow::ViewArrayAsBytes<64>(&array),
+ ElementsAre("1234"_asv, NA, NA, "56789"_asv, ""_asv));
+ */
+ ArrowArrayRelease(&array);
+}
+
+TEST(ArrayTest, ArrayTestAppendToBinaryViewArray) {
+ struct ArrowArray array;
Review Comment:
Unless I'm missing something, these two tests can share a common `void
TestAppendToDataViewArray()` to avoid duplicating this entire test? (I know we
haven't done a great job of this in the test suite so far :flushed:)
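Roughly what I have in mind (untested sketch; the helper name and which assertions move into it are just placeholders):
```
void TestAppendToDataViewArray(enum ArrowType type) {
  struct ArrowArray array;

  ASSERT_EQ(ArrowArrayInitFromType(&array, type), NANOARROW_OK);
  ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
  ASSERT_EQ(ArrowArrayReserve(&array, 5), NANOARROW_OK);
  EXPECT_EQ(ArrowArrayBuffer(&array, 1)->capacity_bytes,
            5 * sizeof(union ArrowBinaryViewType));

  // ...the append calls and buffer assertions shared by the two tests above...

  ArrowArrayRelease(&array);
}

TEST(ArrayTest, ArrayTestAppendToStringViewArray) {
  TestAppendToDataViewArray(NANOARROW_TYPE_STRING_VIEW);
}

TEST(ArrayTest, ArrayTestAppendToBinaryViewArray) {
  TestAppendToDataViewArray(NANOARROW_TYPE_BINARY_VIEW);
}
```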
##########
src/nanoarrow/testing/testing.cc:
##########
Review Comment:
I would remove these changes for now (adding the integration test implementation is the next step, but I think it should be a separate PR with the appropriate tests).
##########
src/nanoarrow/common/utils.c:
##########
@@ -179,6 +179,16 @@ void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type) {
layout->buffer_data_type[2] = NANOARROW_TYPE_BINARY;
break;
+ case NANOARROW_TYPE_BINARY_VIEW:
+ layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_VIEW;
+ layout->buffer_data_type[1] = NANOARROW_TYPE_BINARY_VIEW;
+ layout->element_size_bits[1] = 128;
+ break;
+ case NANOARROW_TYPE_STRING_VIEW:
+ layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_VIEW;
+ layout->buffer_data_type[1] = NANOARROW_TYPE_STRING_VIEW;
+ layout->element_size_bits[1] = 128;
Review Comment:
This can be tested with something like this:
https://github.com/apache/arrow-nanoarrow/blob/5d53738636cac9d6bf14aebf791a123814a00697/src/nanoarrow/common/schema_test.cc#L826
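i.e., a check along these lines (sketch only; the exact structure should follow the linked test):
```
TEST(SchemaViewTest, SchemaViewInitTypeBinaryView) {
  struct ArrowSchema schema;
  struct ArrowSchemaView schema_view;
  struct ArrowError error;

  ASSERT_EQ(ArrowSchemaInitFromType(&schema, NANOARROW_TYPE_BINARY_VIEW), NANOARROW_OK);
  ASSERT_EQ(ArrowSchemaViewInit(&schema_view, &schema, &error), NANOARROW_OK);
  EXPECT_EQ(schema_view.layout.buffer_type[1], NANOARROW_BUFFER_TYPE_DATA_VIEW);
  EXPECT_EQ(schema_view.layout.buffer_data_type[1], NANOARROW_TYPE_BINARY_VIEW);
  EXPECT_EQ(schema_view.layout.element_size_bits[1], 128);

  ArrowSchemaRelease(&schema);
}
```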
##########
src/nanoarrow/common/schema.c:
##########
@@ -101,8 +101,12 @@ static const char* ArrowSchemaFormatTemplate(enum ArrowType type) {
return "u";
case NANOARROW_TYPE_LARGE_STRING:
return "U";
+ case NANOARROW_TYPE_STRING_VIEW:
+ return "vu";
Review Comment:
I believe the test for this would just be a line here:
https://github.com/apache/arrow-nanoarrow/blob/5d53738636cac9d6bf14aebf791a123814a00697/src/nanoarrow/common/schema_test.cc#L110
...except you'll have to guard it on the Arrow version so that the tests can
still compile against old versions of Arrow:
```
#if defined(ARROW_VERSION_MAJOR) && ARROW_VERSION_MAJOR >= 15
...
#endif
```
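i.e., something like this (sketch only; the linked test compares against an Arrow C++ type, which is why the version guard is needed there):
```
#if defined(ARROW_VERSION_MAJOR) && ARROW_VERSION_MAJOR >= 15
  struct ArrowSchema schema;
  ASSERT_EQ(ArrowSchemaInitFromType(&schema, NANOARROW_TYPE_STRING_VIEW), NANOARROW_OK);
  EXPECT_STREQ(schema.format, "vu");
  ArrowSchemaRelease(&schema);
#endif
```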
##########
src/nanoarrow/common/inline_array.h:
##########
@@ -810,6 +872,27 @@ static inline int64_t ArrowArrayViewListChildOffset(
}
}
+static inline struct ArrowBufferView ArrowArrayViewBufferView(
+ const struct ArrowArrayView* array_view, int64_t i) {
+ if (array_view->storage_type == NANOARROW_TYPE_BINARY_VIEW ||
+ array_view->storage_type == NANOARROW_TYPE_STRING_VIEW) {
+ const int32_t nfixed_buf = 2;
+ if (i < nfixed_buf) {
+ return array_view->buffer_views[i];
+ } else {
+ struct ArrowBufferView buf_vw;
+ buf_vw.data.data = array_view->array->buffers[i];
+ if (buf_vw.data.data == NULL) {
+ buf_vw.size_bytes = 0;
+ } else {
+ buf_vw.size_bytes = -1;
Review Comment:
Can this be `array_view->variadic_buffer_sizes[i - 2]`?
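i.e. (assuming the sizes are already populated on the array view at this point):
```
        buf_vw.size_bytes = array_view->variadic_buffer_sizes[i - nfixed_buf];
```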
##########
src/nanoarrow/common/inline_array.h:
##########
@@ -467,52 +468,111 @@ static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array,
return NANOARROW_OK;
}
+#define NANOARROW_BINARY_VIEW_INLINE_SIZE 12
+#define NANOARROW_BINARY_VIEW_PREVIEW_SIZE 4
+#define NANOARROW_BINARY_VIEW_BLOCK_SIZE (32 << 10) // 32KB
+
+// The Arrow C++ implementation uses anonymous structs as members
+// of the ArrowBinaryViewType. For Cython support in this library, we define
+// those structs outside of the ArrowBinaryViewType
+struct ArrowBinaryViewTypeInlinedData {
+ int32_t size;
+ uint8_t data[NANOARROW_BINARY_VIEW_INLINE_SIZE];
+};
+
+struct ArrowBinaryViewTypeRefData {
+ int32_t size;
+ uint8_t data[NANOARROW_BINARY_VIEW_PREVIEW_SIZE];
+ int32_t buffer_index;
+ int32_t offset;
+};
+
+union ArrowBinaryViewType {
+ struct ArrowBinaryViewTypeInlinedData inlined;
+ struct ArrowBinaryViewTypeRefData ref;
+ int64_t alignment_dummy;
+};
+
static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array,
struct ArrowBufferView value) {
struct ArrowArrayPrivateData* private_data =
(struct ArrowArrayPrivateData*)array->private_data;
- struct ArrowBuffer* offset_buffer = ArrowArrayBuffer(array, 1);
- struct ArrowBuffer* data_buffer = ArrowArrayBuffer(
- array, 1 + (private_data->storage_type != NANOARROW_TYPE_FIXED_SIZE_BINARY));
- int32_t offset;
- int64_t large_offset;
- int64_t fixed_size_bytes = private_data->layout.element_size_bits[1] / 8;
+ if (private_data->storage_type == NANOARROW_TYPE_STRING_VIEW ||
+ private_data->storage_type == NANOARROW_TYPE_BINARY_VIEW) {
+ struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1);
+ union ArrowBinaryViewType bvt;
+ bvt.inlined.size = (int32_t)value.size_bytes;
- switch (private_data->storage_type) {
- case NANOARROW_TYPE_STRING:
- case NANOARROW_TYPE_BINARY:
- offset = ((int32_t*)offset_buffer->data)[array->length];
- if ((((int64_t)offset) + value.size_bytes) > INT32_MAX) {
- return EOVERFLOW;
+ if (value.size_bytes <= NANOARROW_BINARY_VIEW_INLINE_SIZE) {
+ memcpy(bvt.inlined.data, value.data.as_char, value.size_bytes);
+ } else {
+ int32_t n_vbufs = private_data->n_variadic_buffers;
+ if (n_vbufs == 0 ||
+ private_data->variadic_buffers[n_vbufs - 1].size_bytes + value.size_bytes >
+ NANOARROW_BINARY_VIEW_BLOCK_SIZE) {
Review Comment:
I think the management of the array of buffers should be abstracted at least one level away to make potential problems easier to spot; here, that would read more clearly as:
```
int64_t buffer_index;
NANOARROW_RETURN_NOT_OK(ArrowArrayAddVariadicBuffers(array, 1,
&buffer_index));
```
(Exposing the ability to reserve variadic buffers also unlocks the ability
to build by buffer and is probably a good idea anyway).
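A rough sketch of what such a helper could look like (untested; it follows the private data fields used in this PR and may need adjusting):
```
// Hypothetical helper matching the call above: appends `nbuffers` empty
// variadic buffers and reports the index of the first new one.
static ArrowErrorCode ArrowArrayAddVariadicBuffers(struct ArrowArray* array,
                                                   int32_t nbuffers,
                                                   int64_t* out_buffer_index) {
  struct ArrowArrayPrivateData* private_data =
      (struct ArrowArrayPrivateData*)array->private_data;

  int32_t new_count = private_data->n_variadic_buffers + nbuffers;
  struct ArrowBuffer* new_buffers = (struct ArrowBuffer*)ArrowRealloc(
      private_data->variadic_buffers, sizeof(struct ArrowBuffer) * new_count);
  if (new_buffers == NULL) {
    return ENOMEM;
  }

  for (int32_t i = private_data->n_variadic_buffers; i < new_count; i++) {
    ArrowBufferInit(&new_buffers[i]);
  }

  *out_buffer_index = private_data->n_variadic_buffers;
  private_data->variadic_buffers = new_buffers;
  private_data->n_variadic_buffers = new_count;

  // This would also be the natural place to grow private_data->buffer_data so
  // that ArrowArrayFlushInternalPointers never has to allocate (see the earlier
  // comment on that function).
  return NANOARROW_OK;
}
```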