This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 6f8badb6 feat: String/Binary View Support (#596)
6f8badb6 is described below
commit 6f8badb649d8416778d81598867adc7263d735ad
Author: William Ayd <[email protected]>
AuthorDate: Mon Sep 23 23:25:25 2024 -0400
feat: String/Binary View Support (#596)
closes https://github.com/apache/arrow-nanoarrow/issues/583
---
python/bootstrap.py | 5 +-
python/src/nanoarrow/__init__.py | 4 +
python/src/nanoarrow/_types.pxd | 4 +
python/src/nanoarrow/schema.py | 39 ++++++++
python/tests/test_schema.py | 2 +
src/nanoarrow/common/array.c | 66 ++++++++++--
src/nanoarrow/common/array_test.cc | 152 ++++++++++++++++++++++++++++
src/nanoarrow/common/inline_array.h | 194 +++++++++++++++++++++++++++++-------
src/nanoarrow/common/inline_types.h | 37 +++++--
src/nanoarrow/common/schema.c | 24 +++++
src/nanoarrow/common/schema_test.cc | 30 ++++++
src/nanoarrow/common/utils.c | 10 ++
src/nanoarrow/testing/testing.cc | 2 +
13 files changed, 518 insertions(+), 51 deletions(-)
diff --git a/python/bootstrap.py b/python/bootstrap.py
index 8fbdf5eb..7745013d 100644
--- a/python/bootstrap.py
+++ b/python/bootstrap.py
@@ -172,7 +172,10 @@ class PxdGenerator:
class NanoarrowPxdGenerator(PxdGenerator):
def _preprocess_content(self, content):
- return re.sub(r"NANOARROW_MAX_FIXED_BUFFERS", "3", content)
+ content = re.sub(r"NANOARROW_MAX_FIXED_BUFFERS", "3", content)
+ content = re.sub(r"NANOARROW_BINARY_VIEW_INLINE_SIZE", "12", content)
+ content = re.sub(r"NANOARROW_BINARY_VIEW_PREFIX_SIZE", "4", content)
+ return content
def _pxd_header(self):
return (
diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py
index 2a97e9f4..1a4b898b 100644
--- a/python/src/nanoarrow/__init__.py
+++ b/python/src/nanoarrow/__init__.py
@@ -48,12 +48,14 @@ from nanoarrow.schema import (
float64,
string,
large_string,
+ string_view,
list_,
large_list,
fixed_size_list,
dictionary,
binary,
large_binary,
+ binary_view,
fixed_size_binary,
date32,
date64,
@@ -82,6 +84,7 @@ __all__ = [
"TimeUnit",
"Type",
"binary",
+ "binary_view",
"bool_",
"c_array",
"c_array_from_buffers",
@@ -117,6 +120,7 @@ __all__ = [
"nulls_forbid",
"nulls_separate",
"string",
+ "string_view",
"struct",
"schema",
"time32",
diff --git a/python/src/nanoarrow/_types.pxd b/python/src/nanoarrow/_types.pxd
index 81b96900..4a53fe31 100644
--- a/python/src/nanoarrow/_types.pxd
+++ b/python/src/nanoarrow/_types.pxd
@@ -67,6 +67,10 @@ cpdef enum CArrowType:
LARGE_BINARY = NANOARROW_TYPE_LARGE_BINARY
LARGE_LIST = NANOARROW_TYPE_LARGE_LIST
INTERVAL_MONTH_DAY_NANO = NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO
+ RUN_END_ENCODED = NANOARROW_TYPE_RUN_END_ENCODED
+ BINARY_VIEW = NANOARROW_TYPE_BINARY_VIEW
+ STRING_VIEW = NANOARROW_TYPE_STRING_VIEW
+
cdef equal(int type_id1, int type_id2)
diff --git a/python/src/nanoarrow/schema.py b/python/src/nanoarrow/schema.py
index c62efc81..2d24bb99 100644
--- a/python/src/nanoarrow/schema.py
+++ b/python/src/nanoarrow/schema.py
@@ -76,6 +76,9 @@ class Type(enum.Enum):
LARGE_BINARY = int(_types.LARGE_BINARY)
LARGE_LIST = int(_types.LARGE_LIST)
INTERVAL_MONTH_DAY_NANO = int(_types.INTERVAL_MONTH_DAY_NANO)
+ RUN_END_ENCODED = int(_types.RUN_END_ENCODED)
+ BINARY_VIEW = int(_types.BINARY_VIEW)
+ STRING_VIEW = int(_types.STRING_VIEW)
def __arrow_c_schema__(self):
# This will only work for parameter-free types
@@ -784,6 +787,24 @@ def large_string(nullable: bool = True) -> Schema:
return Schema(Type.LARGE_STRING, nullable=nullable)
+def string_view(nullable: bool = True) -> Schema:
+ """Create an instance of a string view type.
+
+ The returned schema carries no type parameters other than nullability.
+
+ Parameters
+ ----------
+ nullable : bool, optional
+ Use ``False`` to mark this field as non-nullable.
+
+ Returns
+ -------
+ Schema
+ A schema whose type is ``Type.STRING_VIEW``.
+
+ Examples
+ --------
+
+ >>> import nanoarrow as na
+ >>> na.string_view()
+ <Schema> string_view
+ """
+ return Schema(Type.STRING_VIEW, nullable=nullable)
+
+
def binary(nullable: bool = True) -> Schema:
"""Create an instance of a variable or fixed-width binary type.
@@ -820,6 +841,24 @@ def large_binary(nullable: bool = True) -> Schema:
return Schema(Type.LARGE_BINARY, nullable=nullable)
+def binary_view(nullable: bool = True) -> Schema:
+ """Create an instance of a binary view type.
+
+ The returned schema carries no type parameters other than nullability.
+
+ Parameters
+ ----------
+ nullable : bool, optional
+ Use ``False`` to mark this field as non-nullable.
+
+ Returns
+ -------
+ Schema
+ A schema whose type is ``Type.BINARY_VIEW``.
+
+ Examples
+ --------
+
+ >>> import nanoarrow as na
+ >>> na.binary_view()
+ <Schema> binary_view
+ """
+ return Schema(Type.BINARY_VIEW, nullable=nullable)
+
+
def fixed_size_binary(byte_width: int, nullable: bool = True) -> Schema:
"""Create an instance of a variable or fixed-width binary type.
diff --git a/python/tests/test_schema.py b/python/tests/test_schema.py
index e5fbbcef..770afa4c 100644
--- a/python/tests/test_schema.py
+++ b/python/tests/test_schema.py
@@ -107,6 +107,8 @@ def test_schema_simple():
assert na.interval_months().type == na.Type.INTERVAL_MONTHS
assert na.interval_day_time().type == na.Type.INTERVAL_DAY_TIME
assert na.interval_month_day_nano().type == na.Type.INTERVAL_MONTH_DAY_NANO
+ assert na.binary_view().type == na.Type.BINARY_VIEW
+ assert na.string_view().type == na.Type.STRING_VIEW
def test_schema_fixed_size_binary():
diff --git a/src/nanoarrow/common/array.c b/src/nanoarrow/common/array.c
index 63f65301..3f296dbd 100644
--- a/src/nanoarrow/common/array.c
+++ b/src/nanoarrow/common/array.c
@@ -18,6 +18,7 @@
#include <errno.h>
#include <inttypes.h>
#include <stdarg.h>
+#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -32,6 +33,12 @@ static void ArrowArrayReleaseInternal(struct ArrowArray*
array) {
ArrowBitmapReset(&private_data->bitmap);
ArrowBufferReset(&private_data->buffers[0]);
ArrowBufferReset(&private_data->buffers[1]);
+ ArrowFree(private_data->buffer_data);
+ for (int32_t i = 0; i < private_data->n_variadic_buffers; ++i) {
+ ArrowBufferReset(&private_data->variadic_buffers[i]);
+ }
+ ArrowFree(private_data->variadic_buffers);
+ ArrowFree(private_data->variadic_buffer_sizes);
ArrowFree(private_data);
}
@@ -106,6 +113,10 @@ static ArrowErrorCode ArrowArraySetStorageType(struct
ArrowArray* array,
case NANOARROW_TYPE_DENSE_UNION:
array->n_buffers = 2;
break;
+ case NANOARROW_TYPE_BINARY_VIEW:
+ case NANOARROW_TYPE_STRING_VIEW:
+ array->n_buffers = NANOARROW_BINARY_VIEW_FIXED_BUFFERS;
+ break;
case NANOARROW_TYPE_STRING:
case NANOARROW_TYPE_LARGE_STRING:
case NANOARROW_TYPE_BINARY:
@@ -148,12 +159,17 @@ ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray*
array,
ArrowBitmapInit(&private_data->bitmap);
ArrowBufferInit(&private_data->buffers[0]);
ArrowBufferInit(&private_data->buffers[1]);
- private_data->buffer_data[0] = NULL;
- private_data->buffer_data[1] = NULL;
- private_data->buffer_data[2] = NULL;
+ private_data->buffer_data =
+ (const void**)ArrowMalloc(sizeof(void*) * NANOARROW_MAX_FIXED_BUFFERS);
+ for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; ++i) {
+ private_data->buffer_data[i] = NULL;
+ }
+ private_data->n_variadic_buffers = 0;
+ private_data->variadic_buffers = NULL;
+ private_data->variadic_buffer_sizes = NULL;
array->private_data = private_data;
- array->buffers = (const void**)(&private_data->buffer_data);
+ array->buffers = (const void**)(private_data->buffer_data);
// These are not technically "storage" in the sense that they do not appear
// in the ArrowSchemaView's storage_type member; however, allowing them here
@@ -456,10 +472,26 @@ static void ArrowArrayFlushInternalPointers(struct
ArrowArray* array) {
struct ArrowArrayPrivateData* private_data =
(struct ArrowArrayPrivateData*)array->private_data;
- for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
+ const bool is_binary_view = private_data->storage_type ==
NANOARROW_TYPE_STRING_VIEW ||
+ private_data->storage_type ==
NANOARROW_TYPE_BINARY_VIEW;
+ const int32_t nfixed_buf = is_binary_view ? 2 : NANOARROW_MAX_FIXED_BUFFERS;
+
+ for (int32_t i = 0; i < nfixed_buf; i++) {
private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data;
}
+ if (is_binary_view) {
+ const int32_t nvirt_buf = private_data->n_variadic_buffers;
+ private_data->buffer_data = (const void**)ArrowRealloc(
+ private_data->buffer_data, sizeof(void*) * (nfixed_buf + nvirt_buf +
1));
+ for (int32_t i = 0; i < nvirt_buf; i++) {
+ private_data->buffer_data[nfixed_buf + i] =
private_data->variadic_buffers[i].data;
+ }
+ private_data->buffer_data[nfixed_buf + nvirt_buf] =
+ private_data->variadic_buffer_sizes;
+ array->buffers = (const void**)(private_data->buffer_data);
+ }
+
for (int64_t i = 0; i < array->n_children; i++) {
ArrowArrayFlushInternalPointers(array->children[i]);
}
@@ -664,6 +696,7 @@ void ArrowArrayViewSetLength(struct ArrowArrayView*
array_view, int64_t length)
_ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i]
* length) /
8;
continue;
+ case NANOARROW_BUFFER_TYPE_DATA_VIEW:
case NANOARROW_BUFFER_TYPE_TYPE_ID:
case NANOARROW_BUFFER_TYPE_UNION_OFFSET:
array_view->buffer_views[i].size_bytes = element_size_bytes * length;
@@ -700,9 +733,15 @@ static int ArrowArrayViewSetArrayInternal(struct
ArrowArrayView* array_view,
array_view->offset = array->offset;
array_view->length = array->length;
array_view->null_count = array->null_count;
+ array_view->variadic_buffer_sizes = NULL;
+ array_view->n_variadic_buffers = 0;
int64_t buffers_required = 0;
- for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
+ const int nfixed_buf = array_view->storage_type ==
NANOARROW_TYPE_STRING_VIEW ||
+ array_view->storage_type ==
NANOARROW_TYPE_BINARY_VIEW
+ ? NANOARROW_BINARY_VIEW_FIXED_BUFFERS
+ : NANOARROW_MAX_FIXED_BUFFERS;
+ for (int i = 0; i < nfixed_buf; i++) {
if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) {
break;
}
@@ -720,7 +759,19 @@ static int ArrowArrayViewSetArrayInternal(struct
ArrowArrayView* array_view,
}
}
- // Check the number of buffers
+ if (array_view->storage_type == NANOARROW_TYPE_STRING_VIEW ||
+ array_view->storage_type == NANOARROW_TYPE_BINARY_VIEW) {
+ const int64_t n_buffers = array->n_buffers;
+ const int32_t nfixed_buf = NANOARROW_BINARY_VIEW_FIXED_BUFFERS;
+
+ const int32_t nvariadic_buf = (int32_t)(n_buffers - nfixed_buf - 1);
+ if (nvariadic_buf > 0) {
+ array_view->n_variadic_buffers = nvariadic_buf;
+ buffers_required += nvariadic_buf + 1;
+ array_view->variadic_buffer_sizes = (int64_t*)array->buffers[n_buffers -
1];
+ }
+ }
+
if (buffers_required != array->n_buffers) {
ArrowErrorSet(error,
"Expected array with %" PRId64 " buffer(s) but found %"
PRId64
@@ -814,6 +865,7 @@ static int ArrowArrayViewValidateMinimal(struct
ArrowArrayView* array_view,
break;
case NANOARROW_BUFFER_TYPE_TYPE_ID:
case NANOARROW_BUFFER_TYPE_UNION_OFFSET:
+ case NANOARROW_BUFFER_TYPE_DATA_VIEW:
min_buffer_size_bytes = element_size_bytes * offset_plus_length;
break;
case NANOARROW_BUFFER_TYPE_NONE:
diff --git a/src/nanoarrow/common/array_test.cc
b/src/nanoarrow/common/array_test.cc
index 5c18e6af..b3e72e44 100644
--- a/src/nanoarrow/common/array_test.cc
+++ b/src/nanoarrow/common/array_test.cc
@@ -895,6 +895,84 @@ TEST(ArrayTest, ArrayTestAppendToLargeStringArray) {
ArrowArrayRelease(&array);
}
+// Shared body for the string view / binary view append tests.
+//
+// ArrowT is the storage type used to initialize the array, ValueT is the
+// value struct accepted by AppendFunc (it is brace-initialized below from a
+// pointer and a length), and AppendFunc is the append routine under test.
+// The test appends seven elements and then inspects the raw buffers exported
+// through array.buffers.
+template <enum ArrowType ArrowT, typename ValueT,
+ ArrowErrorCode (*AppendFunc)(struct ArrowArray*, ValueT)>
+void TestAppendToDataViewArray() {
+ struct ArrowArray array;
+
+ ASSERT_EQ(ArrowArrayInitFromType(&array, ArrowT), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+
+ // Check that we can reserve: buffer 1 holds one ArrowBinaryView per element
+ ASSERT_EQ(ArrowArrayReserve(&array, 5), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayBuffer(&array, 1)->capacity_bytes,
+ 5 * sizeof(union ArrowBinaryView));
+
+ // str1 and str2 are 32 bytes each, i.e. longer than
+ // NANOARROW_BINARY_VIEW_INLINE_SIZE, so they are stored out of line.
+ // filler is sized so that str1 + filler ends 2 bytes short of
+ // NANOARROW_BINARY_VIEW_BLOCK_SIZE; str2 then no longer fits in the first
+ // variadic buffer and is expected to start a second one.
+ std::string str1{"this_is_a_relatively_long_string"};
+ std::string filler(NANOARROW_BINARY_VIEW_BLOCK_SIZE - 34, 'x');
+ std::string str2{"goes_into_second_variadic_buffer"};
+
+ // Element layout: [0] "1234" (inlined), [1..2] null, [3] str1, [4] empty,
+ // [5] filler, [6] str2
+ EXPECT_EQ(AppendFunc(&array, ValueT{{"1234"}, 4}), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayAppendNull(&array, 2), NANOARROW_OK);
+ EXPECT_EQ(AppendFunc(&array, {{str1.c_str()},
static_cast<int64_t>(str1.size())}),
+ NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayAppendEmpty(&array, 1), NANOARROW_OK);
+ EXPECT_EQ(AppendFunc(&array, {{filler.c_str()},
static_cast<int64_t>(filler.size())}),
+ NANOARROW_OK);
+ EXPECT_EQ(AppendFunc(&array, {{str2.c_str()},
static_cast<int64_t>(str2.size())}),
+ NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK);
+
+ EXPECT_EQ(array.length, 7);
+ EXPECT_EQ(array.null_count, 2);
+ // Exported buffer order checked here: validity, views, the two variadic
+ // data buffers, and finally the array of variadic buffer sizes.
+ auto validity_buffer = reinterpret_cast<const uint8_t*>(array.buffers[0]);
+ auto inline_buffer = reinterpret_cast<const union
ArrowBinaryView*>(array.buffers[1]);
+ auto vbuf1 = reinterpret_cast<const char*>(array.buffers[2]);
+ auto vbuf2 = reinterpret_cast<const char*>(array.buffers[3]);
+ auto sizes_buffer = reinterpret_cast<const int64_t*>(array.buffers[4]);
+
+ // Bits 1 and 2 are cleared for the two nulls; bits 0 and 3..6 are set
+ EXPECT_EQ(validity_buffer[0], 0b01111001);
+ EXPECT_EQ(memcmp(inline_buffer[0].inlined.data, "1234", 4), 0);
+ EXPECT_EQ(inline_buffer[0].inlined.size, 4);
+ // str1 is the first out-of-line value: variadic buffer 0, offset 0
+ EXPECT_EQ(memcmp(inline_buffer[3].ref.prefix, str1.data(), 4), 0);
+ EXPECT_EQ(inline_buffer[3].ref.size, str1.size());
+ EXPECT_EQ(inline_buffer[3].ref.buffer_index, 0);
+ EXPECT_EQ(inline_buffer[3].ref.offset, 0);
+
+ // filler follows str1 in the same variadic buffer
+ EXPECT_EQ(memcmp(inline_buffer[5].ref.prefix, filler.data(), 4), 0);
+ EXPECT_EQ(inline_buffer[5].ref.size, filler.size());
+ EXPECT_EQ(inline_buffer[5].ref.buffer_index, 0);
+ EXPECT_EQ(inline_buffer[5].ref.offset, str1.size());
+
+ // str2 starts the second variadic buffer
+ EXPECT_EQ(memcmp(inline_buffer[6].ref.prefix, str2.data(), 4), 0);
+ EXPECT_EQ(inline_buffer[6].ref.size, str2.size());
+ EXPECT_EQ(inline_buffer[6].ref.buffer_index, 1);
+ EXPECT_EQ(inline_buffer[6].ref.offset, 0);
+
+ EXPECT_EQ(memcmp(vbuf1, str1.c_str(), str1.size()), 0);
+ EXPECT_EQ(sizes_buffer[0], str1.size() + filler.size());
+
+ EXPECT_EQ(memcmp(vbuf2, str2.c_str(), str2.size()), 0);
+ EXPECT_EQ(sizes_buffer[1], str2.size());
+
+ // TODO: issue #633
+ // NOTE(review): the disabled expectation below lists five elements and a
+ // "56789" value that this test never appends; update it when re-enabling.
+ /*
+ EXPECT_THAT(nanoarrow::ViewArrayAsBytes<64>(&array),
+ ElementsAre("1234"_asv, NA, NA, "56789"_asv, ""_asv));
+ */
+ ArrowArrayRelease(&array);
+};
+
+// Append to a string view array through ArrowArrayAppendString().
+// (The test name now matches the storage type it instantiates; previously the
+// string view and binary view test names were swapped.)
+TEST(ArrayTest, ArrayTestAppendToStringViewArray) {
+  TestAppendToDataViewArray<NANOARROW_TYPE_STRING_VIEW, struct ArrowStringView,
+                            ArrowArrayAppendString>();
+}
+
+// Append to a binary view array through ArrowArrayAppendBytes().
+TEST(ArrayTest, ArrayTestAppendToBinaryViewArray) {
+  TestAppendToDataViewArray<NANOARROW_TYPE_BINARY_VIEW, struct ArrowBufferView,
+                            ArrowArrayAppendBytes>();
+}
+
TEST(ArrayTest, ArrayTestAppendToFixedSizeBinaryArray) {
struct ArrowArray array;
struct ArrowSchema schema;
@@ -3265,6 +3343,80 @@ TEST(ArrayViewTest, ArrayViewTestGetString) {
TestGetFromBinary<FixedSizeBinaryBuilder>(fixed_size_builder);
}
+// Shared body for reading string view / binary view arrays through an
+// ArrowArrayView.
+//
+// The array is built with the supplied builder, exported with ExportArray(),
+// and then read back with ArrowArrayViewGetStringUnsafe() and
+// ArrowArrayViewGetBytesUnsafe(). BuilderClass must provide type(), Append(),
+// AppendNulls() and Finish() as used below.
+template <typename BuilderClass>
+void TestGetFromBinaryView(BuilderClass& builder) {
+ struct ArrowArray array;
+ struct ArrowSchema schema;
+ struct ArrowArrayView array_view;
+ struct ArrowError error;
+
+ // NOTE(review): `type` is not referenced again in this function
+ auto type = builder.type();
+ // Element layout: [0] "1234", [1..2] null, [3] "four", [4] str1,
+ // [5] filler, [6] str2
+ ARROW_EXPECT_OK(builder.Append("1234"));
+ ARROW_EXPECT_OK(builder.AppendNulls(2));
+ ARROW_EXPECT_OK(builder.Append("four"));
+
+ // Same sizing as the append test: str1 + filler is 2 bytes short of
+ // NANOARROW_BINARY_VIEW_BLOCK_SIZE, and the assertions below expect str2 to
+ // be reported in a second variadic buffer.
+ std::string str1{"this_is_a_relatively_long_string"};
+ std::string filler(NANOARROW_BINARY_VIEW_BLOCK_SIZE - 34, 'x');
+ std::string str2{"goes_into_second_variadic_buffer"};
+
+ ARROW_EXPECT_OK(builder.Append(str1));
+ ARROW_EXPECT_OK(builder.Append(filler));
+ ARROW_EXPECT_OK(builder.Append(str2));
+
+ auto maybe_arrow_array = builder.Finish();
+ ARROW_EXPECT_OK(maybe_arrow_array);
+ auto arrow_array = maybe_arrow_array.ValueUnsafe();
+
+ ARROW_EXPECT_OK(ExportArray(*arrow_array, &array, &schema));
+ ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, &error),
NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayViewValidate(&array_view,
NANOARROW_VALIDATION_LEVEL_FULL, &error),
+ NANOARROW_OK);
+
+ // The view must pick up both variadic buffers and their recorded sizes
+ EXPECT_EQ(array_view.n_variadic_buffers, 2);
+ EXPECT_EQ(array_view.variadic_buffer_sizes[0], str1.size() + filler.size());
+ EXPECT_EQ(array_view.variadic_buffer_sizes[1], str2.size());
+
+ EXPECT_EQ(ArrowArrayViewIsNull(&array_view, 2), 1);
+ EXPECT_EQ(ArrowArrayViewIsNull(&array_view, 3), 0);
+
+ // Element 3 ("four") is short enough to be stored inline in the view
+ auto string_view = ArrowArrayViewGetStringUnsafe(&array_view, 3);
+ EXPECT_EQ(string_view.size_bytes, strlen("four"));
+ EXPECT_EQ(memcmp(string_view.data, "four", string_view.size_bytes), 0);
+
+ auto buffer_view = ArrowArrayViewGetBytesUnsafe(&array_view, 3);
+ EXPECT_EQ(buffer_view.size_bytes, strlen("four"));
+ EXPECT_EQ(memcmp(buffer_view.data.as_char, "four", buffer_view.size_bytes),
0);
+
+ // Elements 4 and 6 are stored out of line, in variadic buffers
+ string_view = ArrowArrayViewGetStringUnsafe(&array_view, 4);
+ EXPECT_EQ(string_view.size_bytes, str1.size());
+ EXPECT_EQ(memcmp(string_view.data, str1.c_str(), string_view.size_bytes), 0);
+
+ string_view = ArrowArrayViewGetStringUnsafe(&array_view, 6);
+ EXPECT_EQ(string_view.size_bytes, str2.size());
+ EXPECT_EQ(memcmp(string_view.data, str2.c_str(), string_view.size_bytes), 0);
+
+ buffer_view = ArrowArrayViewGetBytesUnsafe(&array_view, 4);
+ EXPECT_EQ(buffer_view.size_bytes, str1.size());
+ EXPECT_EQ(memcmp(buffer_view.data.as_char, str1.c_str(),
buffer_view.size_bytes), 0);
+
+ buffer_view = ArrowArrayViewGetBytesUnsafe(&array_view, 6);
+ EXPECT_EQ(buffer_view.size_bytes, str2.size());
+ EXPECT_EQ(memcmp(buffer_view.data.as_char, str2.c_str(),
buffer_view.size_bytes), 0);
+
+ ArrowArrayViewReset(&array_view);
+ ArrowArrayRelease(&array);
+ ArrowSchemaRelease(&schema);
+}
+
+// Runs the shared view-reading checks once per builder flavour: first with a
+// StringViewBuilder, then with a BinaryViewBuilder.
+TEST(ArrayViewTest, ArrayViewTestGetStringView) {
+  StringViewBuilder utf8_builder;
+  TestGetFromBinaryView(utf8_builder);
+
+  BinaryViewBuilder bytes_builder;
+  TestGetFromBinaryView(bytes_builder);
+}
+
TEST(ArrayViewTest, ArrayViewTestGetIntervalYearMonth) {
struct ArrowArray array;
struct ArrowSchema schema;
diff --git a/src/nanoarrow/common/inline_array.h
b/src/nanoarrow/common/inline_array.h
index 44f0dd3b..32ff779d 100644
--- a/src/nanoarrow/common/inline_array.h
+++ b/src/nanoarrow/common/inline_array.h
@@ -301,6 +301,7 @@ static inline ArrowErrorCode
_ArrowArrayAppendEmptyInternal(struct ArrowArray* a
i++;
continue;
case NANOARROW_BUFFER_TYPE_DATA:
+ case NANOARROW_BUFFER_TYPE_DATA_VIEW:
// Zero out the next bit of memory
if (private_data->layout.element_size_bits[i] % 8 == 0) {
NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFill(buffer, 0, size_bytes
* n));
@@ -467,52 +468,148 @@ static inline ArrowErrorCode
ArrowArrayAppendDouble(struct ArrowArray* array,
return NANOARROW_OK;
}
+#define NANOARROW_BINARY_VIEW_FIXED_BUFFERS 2
+#define NANOARROW_BINARY_VIEW_INLINE_SIZE 12
+#define NANOARROW_BINARY_VIEW_PREFIX_SIZE 4
+#define NANOARROW_BINARY_VIEW_BLOCK_SIZE (32 << 10) // 32KB
+
+// The Arrow C++ implementation uses anonymous structs as members
+// of the ArrowBinaryView. For Cython support in this library, we define
+// those structs outside of the ArrowBinaryView
+// View element whose bytes are stored directly in the element. The append
+// and read paths in this header use this form when
+// size <= NANOARROW_BINARY_VIEW_INLINE_SIZE.
+struct ArrowBinaryViewInlined {
+ // Length of the value in bytes
+ int32_t size;
+ // The value bytes; on append, bytes past `size` are zero-filled
+ uint8_t data[NANOARROW_BINARY_VIEW_INLINE_SIZE];
+};
+
+// View element whose bytes live in a variadic data buffer. Used for values
+// longer than NANOARROW_BINARY_VIEW_INLINE_SIZE.
+struct ArrowBinaryViewRef {
+ // Length of the value in bytes
+ int32_t size;
+ // Copy of the first NANOARROW_BINARY_VIEW_PREFIX_SIZE bytes of the value
+ uint8_t prefix[NANOARROW_BINARY_VIEW_PREFIX_SIZE];
+ // Index of the variadic buffer holding the value (0 is the first variadic
+ // buffer, i.e. not counting the NANOARROW_BINARY_VIEW_FIXED_BUFFERS)
+ int32_t buffer_index;
+ // Byte offset of the value within that variadic buffer
+ int32_t offset;
+};
+
+// One element of the views buffer. Both members begin with the same int32_t
+// size field, so `inlined.size` can be read to decide which member applies.
+union ArrowBinaryView {
+ struct ArrowBinaryViewInlined inlined;
+ struct ArrowBinaryViewRef ref;
+ // NOTE(review): presumably present only to force 8-byte alignment of the
+ // union; it is never read or written in the code shown here.
+ int64_t alignment_dummy;
+};
+
+// Returns the number of variadic buffers currently recorded in the private
+// data of an array. The array's private_data is read as a
+// struct ArrowArrayPrivateData.
+static inline int32_t ArrowArrayVariadicBufferCount(struct ArrowArray* array) {
+  const struct ArrowArrayPrivateData* state =
+      (const struct ArrowArrayPrivateData*)array->private_data;
+  return state->n_variadic_buffers;
+}
+
+// Grows the set of variadic buffers owned by array by nbuffers.
+//
+// Each new ArrowBuffer is initialized empty and its recorded size is set to
+// zero. Existing buffers and sizes are preserved.
+//
+// Returns NANOARROW_OK on success (including the no-op nbuffers == 0),
+// EINVAL if nbuffers is negative, EOVERFLOW if the new count would not fit in
+// an int32_t, or ENOMEM if an allocation fails. On any failure the previously
+// stored pointers and n_variadic_buffers remain valid, so the array can still
+// be released safely.
+static inline ArrowErrorCode ArrowArrayAddVariadicBuffers(struct ArrowArray* array,
+                                                          int32_t nbuffers) {
+  const int32_t n_current_bufs = ArrowArrayVariadicBufferCount(array);
+
+  struct ArrowArrayPrivateData* private_data =
+      (struct ArrowArrayPrivateData*)array->private_data;
+
+  if (nbuffers < 0) {
+    return EINVAL;
+  }
+  if (nbuffers == 0) {
+    // Nothing to add; also avoids a zero-byte realloc when no buffers exist
+    return NANOARROW_OK;
+  }
+  if (nbuffers > INT32_MAX - n_current_bufs) {
+    return EOVERFLOW;
+  }
+
+  const int32_t n_bufs_needed = n_current_bufs + nbuffers;
+
+  // Reallocate through temporaries: assigning the result directly would lose
+  // the original allocation on failure and leave a NULL pointer paired with a
+  // non-zero n_variadic_buffers.
+  struct ArrowBuffer* new_buffers = (struct ArrowBuffer*)ArrowRealloc(
+      private_data->variadic_buffers,
+      sizeof(struct ArrowBuffer) * (size_t)n_bufs_needed);
+  if (new_buffers == NULL) {
+    return ENOMEM;
+  }
+  private_data->variadic_buffers = new_buffers;
+
+  int64_t* new_sizes = (int64_t*)ArrowRealloc(private_data->variadic_buffer_sizes,
+                                              sizeof(int64_t) * (size_t)n_bufs_needed);
+  if (new_sizes == NULL) {
+    // variadic_buffers was enlarged but the count is unchanged, so the extra
+    // (uninitialized) slots are never visited.
+    return ENOMEM;
+  }
+  private_data->variadic_buffer_sizes = new_sizes;
+
+  for (int32_t i = n_current_bufs; i < n_bufs_needed; i++) {
+    ArrowBufferInit(&private_data->variadic_buffers[i]);
+    private_data->variadic_buffer_sizes[i] = 0;
+  }
+  private_data->n_variadic_buffers = n_bufs_needed;
+
+  return NANOARROW_OK;
+}
+
static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array,
struct ArrowBufferView
value) {
struct ArrowArrayPrivateData* private_data =
(struct ArrowArrayPrivateData*)array->private_data;
- struct ArrowBuffer* offset_buffer = ArrowArrayBuffer(array, 1);
- struct ArrowBuffer* data_buffer = ArrowArrayBuffer(
- array, 1 + (private_data->storage_type !=
NANOARROW_TYPE_FIXED_SIZE_BINARY));
- int32_t offset;
- int64_t large_offset;
- int64_t fixed_size_bytes = private_data->layout.element_size_bits[1] / 8;
+ if (private_data->storage_type == NANOARROW_TYPE_STRING_VIEW ||
+ private_data->storage_type == NANOARROW_TYPE_BINARY_VIEW) {
+ struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1);
+ union ArrowBinaryView bvt;
+ bvt.inlined.size = (int32_t)value.size_bytes;
- switch (private_data->storage_type) {
- case NANOARROW_TYPE_STRING:
- case NANOARROW_TYPE_BINARY:
- offset = ((int32_t*)offset_buffer->data)[array->length];
- if ((((int64_t)offset) + value.size_bytes) > INT32_MAX) {
- return EOVERFLOW;
+ if (value.size_bytes <= NANOARROW_BINARY_VIEW_INLINE_SIZE) {
+ memcpy(bvt.inlined.data, value.data.as_char, value.size_bytes);
+ memset(bvt.inlined.data + bvt.inlined.size, 0,
+ NANOARROW_BINARY_VIEW_INLINE_SIZE - bvt.inlined.size);
+ } else {
+ int32_t current_n_vbufs = ArrowArrayVariadicBufferCount(array);
+ if (current_n_vbufs == 0 ||
+ private_data->variadic_buffers[current_n_vbufs - 1].size_bytes +
+ value.size_bytes >
+ NANOARROW_BINARY_VIEW_BLOCK_SIZE) {
+ const int32_t additional_bufs_needed = 1;
+ NANOARROW_RETURN_NOT_OK(
+ ArrowArrayAddVariadicBuffers(array, additional_bufs_needed));
+ current_n_vbufs += additional_bufs_needed;
}
- offset += (int32_t)value.size_bytes;
- NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(offset_buffer, &offset,
sizeof(int32_t)));
+ const int32_t buf_index = current_n_vbufs - 1;
+ struct ArrowBuffer* variadic_buf =
&private_data->variadic_buffers[buf_index];
+ memcpy(bvt.ref.prefix, value.data.as_char,
NANOARROW_BINARY_VIEW_PREFIX_SIZE);
+ bvt.ref.buffer_index = (int32_t)buf_index;
+ bvt.ref.offset = (int32_t)variadic_buf->size_bytes;
NANOARROW_RETURN_NOT_OK(
- ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes));
- break;
+ ArrowBufferAppend(variadic_buf, value.data.as_char,
value.size_bytes));
+ private_data->variadic_buffer_sizes[buf_index] =
variadic_buf->size_bytes;
+ }
+ NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &bvt, sizeof(bvt)));
+ } else {
+ struct ArrowBuffer* offset_buffer = ArrowArrayBuffer(array, 1);
+ struct ArrowBuffer* data_buffer = ArrowArrayBuffer(
+ array, 1 + (private_data->storage_type !=
NANOARROW_TYPE_FIXED_SIZE_BINARY));
+ int32_t offset;
+ int64_t large_offset;
+ int64_t fixed_size_bytes = private_data->layout.element_size_bits[1] / 8;
+
+ switch (private_data->storage_type) {
+ case NANOARROW_TYPE_STRING:
+ case NANOARROW_TYPE_BINARY:
+ offset = ((int32_t*)offset_buffer->data)[array->length];
+ if ((((int64_t)offset) + value.size_bytes) > INT32_MAX) {
+ return EOVERFLOW;
+ }
- case NANOARROW_TYPE_LARGE_STRING:
- case NANOARROW_TYPE_LARGE_BINARY:
- large_offset = ((int64_t*)offset_buffer->data)[array->length];
- large_offset += value.size_bytes;
- NANOARROW_RETURN_NOT_OK(
- ArrowBufferAppend(offset_buffer, &large_offset, sizeof(int64_t)));
- NANOARROW_RETURN_NOT_OK(
- ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes));
- break;
+ offset += (int32_t)value.size_bytes;
+ NANOARROW_RETURN_NOT_OK(
+ ArrowBufferAppend(offset_buffer, &offset, sizeof(int32_t)));
+ NANOARROW_RETURN_NOT_OK(
+ ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes));
+ break;
- case NANOARROW_TYPE_FIXED_SIZE_BINARY:
- if (value.size_bytes != fixed_size_bytes) {
- return EINVAL;
- }
+ case NANOARROW_TYPE_LARGE_STRING:
+ case NANOARROW_TYPE_LARGE_BINARY:
+ large_offset = ((int64_t*)offset_buffer->data)[array->length];
+ large_offset += value.size_bytes;
+ NANOARROW_RETURN_NOT_OK(
+ ArrowBufferAppend(offset_buffer, &large_offset, sizeof(int64_t)));
+ NANOARROW_RETURN_NOT_OK(
+ ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes));
+ break;
- NANOARROW_RETURN_NOT_OK(
- ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes));
- break;
- default:
- return EINVAL;
+ case NANOARROW_TYPE_FIXED_SIZE_BINARY:
+ if (value.size_bytes != fixed_size_bytes) {
+ return EINVAL;
+ }
+
+ NANOARROW_RETURN_NOT_OK(
+ ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes));
+ break;
+ default:
+ return EINVAL;
+ }
}
if (private_data->bitmap.buffer.data != NULL) {
@@ -535,8 +632,10 @@ static inline ArrowErrorCode ArrowArrayAppendString(struct
ArrowArray* array,
switch (private_data->storage_type) {
case NANOARROW_TYPE_STRING:
case NANOARROW_TYPE_LARGE_STRING:
+ case NANOARROW_TYPE_STRING_VIEW:
case NANOARROW_TYPE_BINARY:
case NANOARROW_TYPE_LARGE_BINARY:
+ case NANOARROW_TYPE_BINARY_VIEW:
return ArrowArrayAppendBytes(array, buffer_view);
default:
return EINVAL;
@@ -810,6 +909,21 @@ static inline int64_t ArrowArrayViewListChildOffset(
}
}
+// Returns a non-owning view of the bytes of element i of a string view or
+// binary view array.
+//
+// Values with size <= NANOARROW_BINARY_VIEW_INLINE_SIZE are read from the
+// view element itself; longer values are read from the variadic buffer named
+// by the element, via array_view->array->buffers (so array_view->array must
+// be set). No bounds, null or type checks are performed, and i is used as
+// given -- NOTE(review): callers appear responsible for applying the array
+// offset first; confirm against the calling accessors.
+//
+// Declared `static inline` like the other accessors in this header so that
+// translation units which never call it do not get an unused-function
+// warning.
+static inline struct ArrowBufferView ArrowArrayViewGetBytesFromViewArrayUnsafe(
+    const struct ArrowArrayView* array_view, int64_t i) {
+  const union ArrowBinaryView* bv = &array_view->buffer_views[1].data.as_binary_view[i];
+  struct ArrowBufferView out = {{NULL}, bv->inlined.size};
+  if (bv->inlined.size <= NANOARROW_BINARY_VIEW_INLINE_SIZE) {
+    out.data.as_uint8 = bv->inlined.data;
+    return out;
+  }
+
+  // Variadic data buffers follow the fixed (validity + views) buffers
+  const int32_t buf_index = bv->ref.buffer_index + NANOARROW_BINARY_VIEW_FIXED_BUFFERS;
+  out.data.data = array_view->array->buffers[buf_index];
+  out.data.as_uint8 += bv->ref.offset;
+  return out;
+}
+
static inline int64_t ArrowArrayViewGetIntUnsafe(const struct ArrowArrayView*
array_view,
int64_t i) {
const struct ArrowBufferView* data_view = &array_view->buffer_views[1];
@@ -938,6 +1052,14 @@ static inline struct ArrowStringView
ArrowArrayViewGetStringUnsafe(
view.size_bytes = array_view->layout.element_size_bits[1] / 8;
view.data = array_view->buffer_views[1].data.as_char + (i *
view.size_bytes);
break;
+ case NANOARROW_TYPE_STRING_VIEW:
+ case NANOARROW_TYPE_BINARY_VIEW: {
+ struct ArrowBufferView buf_view =
+ ArrowArrayViewGetBytesFromViewArrayUnsafe(array_view, i);
+ view.data = buf_view.data.as_char;
+ view.size_bytes = buf_view.size_bytes;
+ break;
+ }
default:
view.data = NULL;
view.size_bytes = 0;
@@ -972,6 +1094,10 @@ static inline struct ArrowBufferView
ArrowArrayViewGetBytesUnsafe(
view.data.as_uint8 =
array_view->buffer_views[1].data.as_uint8 + (i * view.size_bytes);
break;
+ case NANOARROW_TYPE_STRING_VIEW:
+ case NANOARROW_TYPE_BINARY_VIEW:
+ view = ArrowArrayViewGetBytesFromViewArrayUnsafe(array_view, i);
+ break;
default:
view.data.data = NULL;
view.size_bytes = 0;
diff --git a/src/nanoarrow/common/inline_types.h
b/src/nanoarrow/common/inline_types.h
index fae01bb2..6f2d1103 100644
--- a/src/nanoarrow/common/inline_types.h
+++ b/src/nanoarrow/common/inline_types.h
@@ -451,7 +451,9 @@ enum ArrowType {
NANOARROW_TYPE_LARGE_BINARY,
NANOARROW_TYPE_LARGE_LIST,
NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO,
- NANOARROW_TYPE_RUN_END_ENCODED
+ NANOARROW_TYPE_RUN_END_ENCODED,
+ NANOARROW_TYPE_BINARY_VIEW,
+ NANOARROW_TYPE_STRING_VIEW
};
/// \brief Get a string value of an enum ArrowType value
@@ -540,6 +542,10 @@ static inline const char* ArrowTypeString(enum ArrowType
type) {
return "interval_month_day_nano";
case NANOARROW_TYPE_RUN_END_ENCODED:
return "run_end_encoded";
+ case NANOARROW_TYPE_BINARY_VIEW:
+ return "binary_view";
+ case NANOARROW_TYPE_STRING_VIEW:
+ return "string_view";
default:
return NULL;
}
@@ -616,15 +622,12 @@ enum ArrowBufferType {
NANOARROW_BUFFER_TYPE_TYPE_ID,
NANOARROW_BUFFER_TYPE_UNION_OFFSET,
NANOARROW_BUFFER_TYPE_DATA_OFFSET,
- NANOARROW_BUFFER_TYPE_DATA
+ NANOARROW_BUFFER_TYPE_DATA,
+ NANOARROW_BUFFER_TYPE_DATA_VIEW
};
-/// \brief The maximum number of buffers in an ArrowArrayView or ArrowLayout
+/// \brief The maximum number of fixed buffers in an ArrowArrayView or
ArrowLayout
/// \ingroup nanoarrow-array-view
-///
-/// All currently supported types have 3 buffers or fewer; however, future
types
-/// may involve a variable number of buffers (e.g., string view). These buffers
-/// will be represented by separate members of the ArrowArrayView or
ArrowLayout.
#define NANOARROW_MAX_FIXED_BUFFERS 3
/// \brief An non-owning view of a string
@@ -671,6 +674,7 @@ union ArrowBufferViewData {
const double* as_double;
const float* as_float;
const char* as_char;
+ const union ArrowBinaryView* as_binary_view;
};
/// \brief An non-owning view of a buffer
@@ -808,6 +812,12 @@ struct ArrowArrayView {
/// type_id == union_type_id_map[128 + child_index]. This value may be
/// NULL in the case where child_id == type_id.
int8_t* union_type_id_map;
+
+ /// \brief Number of variadic buffers
+ int32_t n_variadic_buffers;
+
+ /// \brief Size of each variadic buffer
+ int64_t* variadic_buffer_sizes;
};
// Used as the private data member for ArrowArrays allocated here and accessed
@@ -822,8 +832,8 @@ struct ArrowArrayPrivateData {
// The array of pointers to buffers. This must be updated after a sequence
// of appends to synchronize its values with the actual buffer addresses
- // (which may have ben reallocated uring that time)
- const void* buffer_data[NANOARROW_MAX_FIXED_BUFFERS];
+ // (which may have been reallocated during that time)
+ const void** buffer_data;
// The storage data type, or NANOARROW_TYPE_UNINITIALIZED if unknown
enum ArrowType storage_type;
@@ -835,6 +845,15 @@ struct ArrowArrayPrivateData {
// In the future this could be replaced with a type id<->child mapping
// to support constructing unions in append mode where type_id != child_index
int8_t union_type_id_is_child_index;
+
+ // Number of variadic buffers for binary view types
+ int32_t n_variadic_buffers;
+
+ // Variadic buffers for binary view types
+ struct ArrowBuffer* variadic_buffers;
+
+ // Size of each variadic buffer in bytes
+ int64_t* variadic_buffer_sizes;
};
/// \brief A representation of an interval.
diff --git a/src/nanoarrow/common/schema.c b/src/nanoarrow/common/schema.c
index 21cdcd95..93e9ec57 100644
--- a/src/nanoarrow/common/schema.c
+++ b/src/nanoarrow/common/schema.c
@@ -101,8 +101,12 @@ static const char* ArrowSchemaFormatTemplate(enum
ArrowType type) {
return "u";
case NANOARROW_TYPE_LARGE_STRING:
return "U";
+ case NANOARROW_TYPE_STRING_VIEW:
+ return "vu";
case NANOARROW_TYPE_BINARY:
return "z";
+ case NANOARROW_TYPE_BINARY_VIEW:
+ return "vz";
case NANOARROW_TYPE_LARGE_BINARY:
return "Z";
@@ -1019,6 +1023,24 @@ static ArrowErrorCode ArrowSchemaViewParse(struct
ArrowSchemaView* schema_view,
return EINVAL;
}
+ // view types
+ case 'v': {
+ switch (format[1]) {
+ case 'u':
+ ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_STRING_VIEW);
+ *format_end_out = format + 2;
+ return NANOARROW_OK;
+ case 'z':
+ ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_BINARY_VIEW);
+ *format_end_out = format + 2;
+ return NANOARROW_OK;
+ default:
+ ArrowErrorSet(error, "Expected 'u', or 'z' following 'v' but found
'%s'",
+ format + 1);
+ return EINVAL;
+ }
+ }
+
default:
ArrowErrorSet(error, "Unknown format: '%s'", format);
return EINVAL;
@@ -1150,6 +1172,8 @@ static ArrowErrorCode ArrowSchemaViewValidate(struct
ArrowSchemaView* schema_vie
case NANOARROW_TYPE_TIME32:
case NANOARROW_TYPE_TIME64:
case NANOARROW_TYPE_DURATION:
+ case NANOARROW_TYPE_BINARY_VIEW:
+ case NANOARROW_TYPE_STRING_VIEW:
return ArrowSchemaViewValidateNChildren(schema_view, 0, error);
case NANOARROW_TYPE_FIXED_SIZE_BINARY:
diff --git a/src/nanoarrow/common/schema_test.cc
b/src/nanoarrow/common/schema_test.cc
index a6a2ea5b..3371632b 100644
--- a/src/nanoarrow/common/schema_test.cc
+++ b/src/nanoarrow/common/schema_test.cc
@@ -108,6 +108,10 @@ TEST(SchemaTest, SchemaInitSimple) {
ExpectSchemaInitOk(NANOARROW_TYPE_INTERVAL_MONTHS, month_interval());
ExpectSchemaInitOk(NANOARROW_TYPE_INTERVAL_DAY_TIME, day_time_interval());
ExpectSchemaInitOk(NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO,
month_day_nano_interval());
+#if defined(ARROW_VERSION_MAJOR) && ARROW_VERSION_MAJOR >= 15
+ ExpectSchemaInitOk(NANOARROW_TYPE_STRING_VIEW, utf8_view());
+ ExpectSchemaInitOk(NANOARROW_TYPE_BINARY_VIEW, binary_view());
+#endif
}
TEST(SchemaTest, SchemaInitSimpleError) {
@@ -908,6 +912,32 @@ TEST(SchemaViewTest, SchemaViewInitBinaryAndString) {
EXPECT_EQ(schema_view.layout.element_size_bits[2], 0);
EXPECT_EQ(ArrowSchemaToStdString(&schema), "large_string");
ArrowSchemaRelease(&schema);
+
+ ARROW_EXPECT_OK(ExportType(*utf8_view(), &schema));
+ EXPECT_EQ(ArrowSchemaViewInit(&schema_view, &schema, &error), NANOARROW_OK);
+ EXPECT_EQ(schema_view.type, NANOARROW_TYPE_STRING_VIEW);
+ EXPECT_EQ(schema_view.storage_type, NANOARROW_TYPE_STRING_VIEW);
+ EXPECT_EQ(schema_view.layout.buffer_type[0], NANOARROW_BUFFER_TYPE_VALIDITY);
+ EXPECT_EQ(schema_view.layout.buffer_type[1],
NANOARROW_BUFFER_TYPE_DATA_VIEW);
+ EXPECT_EQ(schema_view.layout.buffer_data_type[0], NANOARROW_TYPE_BOOL);
+ EXPECT_EQ(schema_view.layout.buffer_data_type[1],
NANOARROW_TYPE_STRING_VIEW);
+ EXPECT_EQ(schema_view.layout.element_size_bits[0], 1);
+ EXPECT_EQ(schema_view.layout.element_size_bits[1], 128);
+ EXPECT_EQ(ArrowSchemaToStdString(&schema), "string_view");
+ ArrowSchemaRelease(&schema);
+
+ ARROW_EXPECT_OK(ExportType(*binary_view(), &schema));
+ EXPECT_EQ(ArrowSchemaViewInit(&schema_view, &schema, &error), NANOARROW_OK);
+ EXPECT_EQ(schema_view.type, NANOARROW_TYPE_BINARY_VIEW);
+ EXPECT_EQ(schema_view.storage_type, NANOARROW_TYPE_BINARY_VIEW);
+ EXPECT_EQ(schema_view.layout.buffer_type[0], NANOARROW_BUFFER_TYPE_VALIDITY);
+ EXPECT_EQ(schema_view.layout.buffer_type[1],
NANOARROW_BUFFER_TYPE_DATA_VIEW);
+ EXPECT_EQ(schema_view.layout.buffer_data_type[0], NANOARROW_TYPE_BOOL);
+ EXPECT_EQ(schema_view.layout.buffer_data_type[1],
NANOARROW_TYPE_BINARY_VIEW);
+ EXPECT_EQ(schema_view.layout.element_size_bits[0], 1);
+ EXPECT_EQ(schema_view.layout.element_size_bits[1], 128);
+ EXPECT_EQ(ArrowSchemaToStdString(&schema), "binary_view");
+ ArrowSchemaRelease(&schema);
}
TEST(SchemaViewTest, SchemaViewInitBinaryAndStringErrors) {
diff --git a/src/nanoarrow/common/utils.c b/src/nanoarrow/common/utils.c
index d8923b85..70d5da77 100644
--- a/src/nanoarrow/common/utils.c
+++ b/src/nanoarrow/common/utils.c
@@ -179,6 +179,16 @@ void ArrowLayoutInit(struct ArrowLayout* layout, enum
ArrowType storage_type) {
layout->buffer_data_type[2] = NANOARROW_TYPE_BINARY;
break;
+ case NANOARROW_TYPE_BINARY_VIEW:
+ layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_VIEW;
+ layout->buffer_data_type[1] = NANOARROW_TYPE_BINARY_VIEW;
+ layout->element_size_bits[1] = 128;
+ break;
+ case NANOARROW_TYPE_STRING_VIEW:
+ layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_VIEW;
+ layout->buffer_data_type[1] = NANOARROW_TYPE_STRING_VIEW;
+ layout->element_size_bits[1] = 128;
+
default:
break;
}
diff --git a/src/nanoarrow/testing/testing.cc b/src/nanoarrow/testing/testing.cc
index 13263f11..ce453b9c 100644
--- a/src/nanoarrow/testing/testing.cc
+++ b/src/nanoarrow/testing/testing.cc
@@ -1932,6 +1932,8 @@ ArrowErrorCode SetArrayColumnBuffers(const json& value,
ArrowArrayView* array_vi
}
break;
}
+ case NANOARROW_BUFFER_TYPE_DATA_VIEW:
+ return ENOTSUP;
case NANOARROW_BUFFER_TYPE_NONE:
break;
}