This is an automated email from the ASF dual-hosted git repository.
bkietz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 94fc124288 GH-36099: [C++] Add Utf8View and BinaryView to the c ABI
(#38443)
94fc124288 is described below
commit 94fc124288a0b925c26ea3800ec380020abe9e42
Author: Benjamin Kietzman <[email protected]>
AuthorDate: Wed Nov 29 12:22:20 2023 -0500
GH-36099: [C++] Add Utf8View and BinaryView to the c ABI (#38443)
### Rationale for this change
Utf8View and BinaryView should be added to the c ABI spec and to the c++
library's importer/exporter.
### Are these changes tested?
Yes, minimally
### Are there any user-facing changes?
View arrays will be importable/exportable through the c ABI in c++
* Closes: #36099
Authored-by: Benjamin Kietzman <[email protected]>
Signed-off-by: Benjamin Kietzman <[email protected]>
---
cpp/src/arrow/c/bridge.cc | 87 +++++++++++++++---
cpp/src/arrow/c/bridge_test.cc | 137 ++++++++++++++++++++++++-----
dev/archery/archery/cli.py | 2 +-
dev/archery/archery/integration/datagen.py | 4 +-
docs/source/format/CDataInterface.rst | 12 +++
5 files changed, 203 insertions(+), 39 deletions(-)
diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc
index eeec75f2f4..238afb0328 100644
--- a/cpp/src/arrow/c/bridge.cc
+++ b/cpp/src/arrow/c/bridge.cc
@@ -20,6 +20,7 @@
#include <algorithm>
#include <cerrno>
#include <cstring>
+#include <memory>
#include <string>
#include <string_view>
#include <utility>
@@ -41,6 +42,7 @@
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
+#include "arrow/util/range.h"
#include "arrow/util/small_vector.h"
#include "arrow/util/string.h"
#include "arrow/util/value_parsing.h"
@@ -260,7 +262,7 @@ struct SchemaExporter {
// Dictionary type: parent struct describes index type,
// child dictionary struct describes value type.
RETURN_NOT_OK(VisitTypeInline(*dict_type.index_type(), this));
- dict_exporter_.reset(new SchemaExporter());
+ dict_exporter_ = std::make_unique<SchemaExporter>();
RETURN_NOT_OK(dict_exporter_->ExportType(*dict_type.value_type()));
} else {
RETURN_NOT_OK(VisitTypeInline(type, this));
@@ -357,10 +359,14 @@ struct SchemaExporter {
Status Visit(const LargeBinaryType& type) { return SetFormat("Z"); }
+ Status Visit(const BinaryViewType& type) { return SetFormat("vz"); }
+
Status Visit(const StringType& type) { return SetFormat("u"); }
Status Visit(const LargeStringType& type) { return SetFormat("U"); }
+ Status Visit(const StringViewType& type) { return SetFormat("vu"); }
+
Status Visit(const Date32Type& type) { return SetFormat("tdD"); }
Status Visit(const Date64Type& type) { return SetFormat("tdm"); }
@@ -521,13 +527,14 @@ namespace {
struct ExportedArrayPrivateData :
PoolAllocationMixin<ExportedArrayPrivateData> {
// The buffers are owned by the ArrayData member
- StaticVector<const void*, 3> buffers_;
+ SmallVector<const void*, 3> buffers_;
struct ArrowArray dictionary_;
SmallVector<struct ArrowArray, 1> children_;
SmallVector<struct ArrowArray*, 4> child_pointers_;
std::shared_ptr<ArrayData> data_;
std::shared_ptr<Device::SyncEvent> sync_;
+ std::vector<int64_t> variadic_buffer_sizes_;
ExportedArrayPrivateData() = default;
ARROW_DEFAULT_MOVE_AND_ASSIGN(ExportedArrayPrivateData);
@@ -570,15 +577,32 @@ struct ArrayExporter {
--n_buffers;
++buffers_begin;
}
+
+ bool need_variadic_buffer_sizes =
+ data->type->id() == Type::BINARY_VIEW || data->type->id() ==
Type::STRING_VIEW;
+ if (need_variadic_buffer_sizes) {
+ ++n_buffers;
+ }
+
export_.buffers_.resize(n_buffers);
std::transform(buffers_begin, data->buffers.end(),
export_.buffers_.begin(),
[](const std::shared_ptr<Buffer>& buffer) -> const void* {
return buffer ? buffer->data() : nullptr;
});
+ if (need_variadic_buffer_sizes) {
+ auto variadic_buffers = util::span(data->buffers).subspan(2);
+ export_.variadic_buffer_sizes_.resize(variadic_buffers.size());
+ size_t i = 0;
+ for (const auto& buf : variadic_buffers) {
+ export_.variadic_buffer_sizes_[i++] = buf->size();
+ }
+ export_.buffers_.back() = export_.variadic_buffer_sizes_.data();
+ }
+
// Export dictionary
if (data->dictionary != nullptr) {
- dict_exporter_.reset(new ArrayExporter());
+ dict_exporter_ = std::make_unique<ArrayExporter>();
RETURN_NOT_OK(dict_exporter_->Export(data->dictionary));
}
@@ -795,7 +819,7 @@ Status InvalidFormatString(std::string_view v) {
class FormatStringParser {
public:
- FormatStringParser() {}
+ FormatStringParser() = default;
explicit FormatStringParser(std::string_view v) : view_(v), index_(0) {}
@@ -941,8 +965,6 @@ Result<DecodedMetadata> DecodeMetadata(const char*
metadata) {
}
struct SchemaImporter {
- SchemaImporter() : c_struct_(nullptr), guard_(nullptr) {}
-
Status Import(struct ArrowSchema* src) {
if (ArrowSchemaIsReleased(src)) {
return Status::Invalid("Cannot import released ArrowSchema");
@@ -1068,6 +1090,8 @@ struct SchemaImporter {
return ProcessPrimitive(binary());
case 'Z':
return ProcessPrimitive(large_binary());
+ case 'v':
+ return ProcessBinaryView();
case 'w':
return ProcessFixedSizeBinary();
case 'd':
@@ -1080,6 +1104,17 @@ struct SchemaImporter {
return f_parser_.Invalid();
}
+ Status ProcessBinaryView() {
+ RETURN_NOT_OK(f_parser_.CheckHasNext());
+ switch (f_parser_.Next()) {
+ case 'z':
+ return ProcessPrimitive(binary_view());
+ case 'u':
+ return ProcessPrimitive(utf8_view());
+ }
+ return f_parser_.Invalid();
+ }
+
Status ProcessTemporal() {
RETURN_NOT_OK(f_parser_.CheckHasNext());
switch (f_parser_.Next()) {
@@ -1360,8 +1395,8 @@ struct SchemaImporter {
return Status::OK();
}
- struct ArrowSchema* c_struct_;
- SchemaExportGuard guard_;
+ struct ArrowSchema* c_struct_{nullptr};
+ SchemaExportGuard guard_{nullptr};
FormatStringParser f_parser_;
int64_t recursion_level_;
std::vector<SchemaImporter> child_importers_;
@@ -1429,7 +1464,7 @@ class ImportedBuffer : public Buffer {
std::shared_ptr<ImportedArrayData> import)
: Buffer(data, size, mm, nullptr, device_type),
import_(std::move(import)) {}
- ~ImportedBuffer() override {}
+ ~ImportedBuffer() override = default;
std::shared_ptr<Device::SyncEvent> device_sync_event() override {
return import_->device_sync_;
@@ -1441,9 +1476,7 @@ class ImportedBuffer : public Buffer {
struct ArrayImporter {
explicit ArrayImporter(const std::shared_ptr<DataType>& type)
- : type_(type),
- zero_size_buffer_(std::make_shared<Buffer>(kZeroSizeArea, 0)),
- device_type_(DeviceAllocationType::kCPU) {}
+ : type_(type), zero_size_buffer_(std::make_shared<Buffer>(kZeroSizeArea,
0)) {}
Status Import(struct ArrowDeviceArray* src, const DeviceMemoryMapper&
mapper) {
ARROW_ASSIGN_OR_RAISE(memory_mgr_, mapper(src->device_type,
src->device_id));
@@ -1591,6 +1624,10 @@ struct ArrayImporter {
Status Visit(const LargeBinaryType& type) { return ImportStringLike(type); }
+ Status Visit(const StringViewType& type) { return ImportBinaryView(type); }
+
+ Status Visit(const BinaryViewType& type) { return ImportBinaryView(type); }
+
Status Visit(const ListType& type) { return ImportListLike(type); }
Status Visit(const LargeListType& type) { return ImportListLike(type); }
@@ -1673,6 +1710,28 @@ struct ArrayImporter {
return Status::OK();
}
+ Status ImportBinaryView(const BinaryViewType&) {
+ RETURN_NOT_OK(CheckNoChildren());
+ if (c_struct_->n_buffers < 3) {
+ return Status::Invalid("Expected at least 3 buffers for imported type ",
+ type_->ToString(), ", ArrowArray struct has ",
+ c_struct_->n_buffers);
+ }
+ RETURN_NOT_OK(AllocateArrayData());
+ RETURN_NOT_OK(ImportNullBitmap());
+ RETURN_NOT_OK(ImportFixedSizeBuffer(1, BinaryViewType::kSize));
+
+ // The last C data buffer stores buffer sizes, and shouldn't be imported
+ auto* buffer_sizes =
+ static_cast<const int64_t*>(c_struct_->buffers[c_struct_->n_buffers -
1]);
+
+ for (int32_t buffer_id = 2; buffer_id < c_struct_->n_buffers - 1;
++buffer_id) {
+ RETURN_NOT_OK(ImportBuffer(buffer_id, buffer_sizes[buffer_id - 2]));
+ }
+ data_->buffers.pop_back();
+ return Status::OK();
+ }
+
template <typename StringType>
Status ImportStringLike(const StringType& type) {
RETURN_NOT_OK(CheckNoChildren());
@@ -1836,7 +1895,7 @@ struct ArrayImporter {
std::shared_ptr<Buffer> zero_size_buffer_;
std::shared_ptr<MemoryManager> memory_mgr_;
- DeviceAllocationType device_type_;
+ DeviceAllocationType device_type_{DeviceAllocationType::kCPU};
};
} // namespace
@@ -2042,7 +2101,7 @@ class ArrayStreamBatchReader : public RecordBatchReader {
DCHECK(!ArrowArrayStreamIsReleased(&stream_));
}
- ~ArrayStreamBatchReader() {
+ ~ArrayStreamBatchReader() override {
if (!ArrowArrayStreamIsReleased(&stream_)) {
ArrowArrayStreamRelease(&stream_);
}
diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc
index 362df83378..326c67f5ec 100644
--- a/cpp/src/arrow/c/bridge_test.cc
+++ b/cpp/src/arrow/c/bridge_test.cc
@@ -38,11 +38,13 @@
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/matchers.h"
#include "arrow/testing/util.h"
+#include "arrow/util/binary_view_util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/endian.h"
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
+#include "arrow/util/range.h"
// TODO(GH-37221): Remove these ifdef checks when compute dependency is removed
#ifdef ARROW_COMPUTE
@@ -58,6 +60,7 @@ using internal::ArrayStreamExportTraits;
using internal::checked_cast;
using internal::SchemaExportGuard;
using internal::SchemaExportTraits;
+using internal::Zip;
template <typename T>
struct ExportTraits {};
@@ -91,7 +94,7 @@ class ReleaseCallback {
public:
using CType = typename Traits::CType;
- explicit ReleaseCallback(CType* c_struct) : called_(false) {
+ explicit ReleaseCallback(CType* c_struct) {
orig_release_ = c_struct->release;
orig_private_data_ = c_struct->private_data;
c_struct->release = StaticRelease;
@@ -123,7 +126,7 @@ class ReleaseCallback {
private:
ARROW_DISALLOW_COPY_AND_ASSIGN(ReleaseCallback);
- bool called_;
+ bool called_{false};
void (*orig_release_)(CType*);
void* orig_private_data_;
};
@@ -238,8 +241,7 @@ struct SchemaExportChecker {
flattened_flags.empty()
? std::vector<int64_t>(flattened_formats_.size(),
kDefaultFlags)
: std::move(flattened_flags)),
- flattened_metadata_(std::move(flattened_metadata)),
- flattened_index_(0) {}
+ flattened_metadata_(std::move(flattened_metadata)) {}
void operator()(struct ArrowSchema* c_export, bool inner = false) {
ASSERT_LT(flattened_index_, flattened_formats_.size());
@@ -288,7 +290,7 @@ struct SchemaExportChecker {
const std::vector<std::string> flattened_names_;
std::vector<int64_t> flattened_flags_;
const std::vector<std::string> flattened_metadata_;
- size_t flattened_index_;
+ size_t flattened_index_{0};
};
class TestSchemaExport : public ::testing::Test {
@@ -354,6 +356,8 @@ TEST_F(TestSchemaExport, Primitive) {
TestPrimitive(large_binary(), "Z");
TestPrimitive(utf8(), "u");
TestPrimitive(large_utf8(), "U");
+ TestPrimitive(binary_view(), "vz");
+ TestPrimitive(utf8_view(), "vu");
TestPrimitive(decimal(16, 4), "d:16,4");
TestPrimitive(decimal256(16, 4), "d:16,4,256");
@@ -565,12 +569,24 @@ struct ArrayExportChecker {
--expected_n_buffers;
++expected_buffers;
}
- ASSERT_EQ(c_export->n_buffers, expected_n_buffers);
+ bool has_variadic_buffer_sizes = expected_data.type->id() ==
Type::STRING_VIEW ||
+ expected_data.type->id() ==
Type::BINARY_VIEW;
+ ASSERT_EQ(c_export->n_buffers, expected_n_buffers +
has_variadic_buffer_sizes);
ASSERT_NE(c_export->buffers, nullptr);
- for (int64_t i = 0; i < c_export->n_buffers; ++i) {
+
+ for (int64_t i = 0; i < expected_n_buffers; ++i) {
auto expected_ptr = expected_buffers[i] ? expected_buffers[i]->data() :
nullptr;
ASSERT_EQ(c_export->buffers[i], expected_ptr);
}
+ if (has_variadic_buffer_sizes) {
+ auto variadic_buffers = util::span(expected_data.buffers).subspan(2);
+ auto variadic_buffer_sizes = util::span(
+ static_cast<const int64_t*>(c_export->buffers[c_export->n_buffers -
1]),
+ variadic_buffers.size());
+ for (auto [buf, size] : Zip(variadic_buffers, variadic_buffer_sizes)) {
+ ASSERT_EQ(buf->size(), size);
+ }
+ }
if (expected_data.dictionary != nullptr) {
// Recurse into dictionary
@@ -883,6 +899,8 @@ TEST_F(TestArrayExport, Primitive) {
TestPrimitive(large_binary(), R"(["foo", "bar", null])");
TestPrimitive(utf8(), R"(["foo", "bar", null])");
TestPrimitive(large_utf8(), R"(["foo", "bar", null])");
+ TestPrimitive(binary_view(), R"(["foo", "bar", null])");
+ TestPrimitive(utf8_view(), R"(["foo", "bar", null])");
TestPrimitive(decimal(16, 4), R"(["1234.5670", null])");
TestPrimitive(decimal256(16, 4), R"(["1234.5670", null])");
@@ -896,6 +914,39 @@ TEST_F(TestArrayExport, PrimitiveSliced) {
TestPrimitive(factory);
}
+constexpr std::string_view binary_view_buffer_content0 = "12345foo bar baz
quux",
+ binary_view_buffer_content1 =
"BinaryViewMultipleBuffers";
+
+static const BinaryViewType::c_type binary_view_buffer1[] = {
+ util::ToBinaryView(binary_view_buffer_content0, 0, 0),
+ util::ToInlineBinaryView("foo"),
+ util::ToBinaryView(binary_view_buffer_content1, 1, 0),
+ util::ToInlineBinaryView("bar"),
+ util::ToBinaryView(binary_view_buffer_content0.substr(5), 0, 5),
+ util::ToInlineBinaryView("baz"),
+ util::ToBinaryView(binary_view_buffer_content1.substr(6, 13), 1, 6),
+ util::ToInlineBinaryView("quux"),
+};
+
+static auto MakeBinaryViewArrayWithMultipleDataBuffers() {
+ static const auto kLength =
static_cast<int64_t>(std::size(binary_view_buffer1));
+ return std::make_shared<BinaryViewArray>(
+ binary_view(), kLength,
+ Buffer::FromVector(std::vector(binary_view_buffer1, binary_view_buffer1
+ kLength)),
+ BufferVector{
+ Buffer::FromString(std::string{binary_view_buffer_content0}),
+ Buffer::FromString(std::string{binary_view_buffer_content1}),
+ });
+}
+
+TEST_F(TestArrayExport, BinaryViewMultipleBuffers) {
+ TestPrimitive(MakeBinaryViewArrayWithMultipleDataBuffers);
+ TestPrimitive([&] {
+ auto arr = MakeBinaryViewArrayWithMultipleDataBuffers();
+ return arr->Slice(1, arr->length() - 2);
+ });
+}
+
TEST_F(TestArrayExport, Null) {
TestPrimitive(null(), "[null, null, null]");
TestPrimitive(null(), "[]");
@@ -1220,13 +1271,16 @@ TEST_F(TestArrayExport, ExportRecordBatch) {
static const char kMyDeviceTypeName[] = "arrowtest::MyDevice";
static const ArrowDeviceType kMyDeviceType = ARROW_DEVICE_EXT_DEV;
-static const void* kMyEventPtr =
reinterpret_cast<void*>(uintptr_t(0xBAADF00D));
+static const void* kMyEventPtr =
+ reinterpret_cast<void*>(static_cast<uintptr_t>(0xBAADF00D));
class MyBuffer final : public MutableBuffer {
public:
using MutableBuffer::MutableBuffer;
- ~MyBuffer() { default_memory_pool()->Free(const_cast<uint8_t*>(data_),
size_); }
+ ~MyBuffer() override {
+ default_memory_pool()->Free(const_cast<uint8_t*>(data_), size_);
+ }
std::shared_ptr<Device::SyncEvent> device_sync_event() override { return
device_sync_; }
@@ -1256,7 +1310,7 @@ class MyDevice : public Device {
explicit MySyncEvent(void* sync_event, release_fn_t release_sync_event)
: Device::SyncEvent(sync_event, release_sync_event) {}
- virtual ~MySyncEvent() = default;
+ ~MySyncEvent() override = default;
Status Wait() override { return Status::OK(); }
Status Record(const Device::Stream&) override { return Status::OK(); }
};
@@ -1966,6 +2020,10 @@ TEST_F(TestSchemaImport, String) {
CheckImport(large_utf8());
FillPrimitive("Z");
CheckImport(large_binary());
+ FillPrimitive("vu");
+ CheckImport(utf8_view());
+ FillPrimitive("vz");
+ CheckImport(binary_view());
FillPrimitive("w:3");
CheckImport(fixed_size_binary(3));
@@ -2419,6 +2477,16 @@ static const void* large_string_buffers_no_nulls1[3] = {
static const void* large_string_buffers_omitted[3] = {
nullptr, large_string_offsets_buffer1, nullptr};
+constexpr int64_t binary_view_buffer_sizes1[] =
{binary_view_buffer_content0.size(),
+
binary_view_buffer_content1.size()};
+static const void* binary_view_buffers_no_nulls1[] = {
+ nullptr,
+ binary_view_buffer1,
+ binary_view_buffer_content0.data(),
+ binary_view_buffer_content1.data(),
+ binary_view_buffer_sizes1,
+};
+
static const int32_t list_offsets_buffer1[] = {0, 2, 2, 5, 6, 8};
static const void* list_buffers_no_nulls1[2] = {nullptr, list_offsets_buffer1};
static const void* list_buffers_nulls1[2] = {bits_buffer1,
list_offsets_buffer1};
@@ -2510,6 +2578,16 @@ class TestArrayImport : public ::testing::Test {
c->buffers = buffers;
}
+ void FillStringViewLike(struct ArrowArray* c, int64_t length, int64_t
null_count,
+ int64_t offset, const void** buffers,
+ int32_t data_buffer_count) {
+ c->length = length;
+ c->null_count = null_count;
+ c->offset = offset;
+ c->n_buffers = 2 + data_buffer_count + 1;
+ c->buffers = buffers;
+ }
+
void FillListLike(struct ArrowArray* c, int64_t length, int64_t null_count,
int64_t offset, const void** buffers) {
c->length = length;
@@ -2583,6 +2661,12 @@ class TestArrayImport : public ::testing::Test {
FillStringLike(&c_struct_, length, null_count, offset, buffers);
}
+ void FillStringViewLike(int64_t length, int64_t null_count, int64_t offset,
+ const void** buffers, int32_t data_buffer_count) {
+ FillStringViewLike(&c_struct_, length, null_count, offset, buffers,
+ data_buffer_count);
+ }
+
void FillListLike(int64_t length, int64_t null_count, int64_t offset,
const void** buffers) {
FillListLike(&c_struct_, length, null_count, offset, buffers);
@@ -2834,6 +2918,10 @@ TEST_F(TestArrayImport, String) {
FillStringLike(4, 0, 0, large_string_buffers_no_nulls1);
CheckImport(ArrayFromJSON(large_binary(), R"(["foo", "", "bar", "quux"])"));
+ auto length = static_cast<int64_t>(std::size(binary_view_buffer1));
+ FillStringViewLike(length, 0, 0, binary_view_buffers_no_nulls1, 2);
+ CheckImport(MakeBinaryViewArrayWithMultipleDataBuffers());
+
// Empty array with null data pointers
FillStringLike(0, 0, 0, string_buffers_omitted);
CheckImport(ArrayFromJSON(utf8(), "[]"));
@@ -3530,15 +3618,16 @@ TEST_F(TestSchemaRoundtrip, Primitive) {
TestWithTypeFactory(boolean);
TestWithTypeFactory(float16);
- TestWithTypeFactory(std::bind(decimal128, 19, 4));
- TestWithTypeFactory(std::bind(decimal256, 19, 4));
- TestWithTypeFactory(std::bind(decimal128, 19, 0));
- TestWithTypeFactory(std::bind(decimal256, 19, 0));
- TestWithTypeFactory(std::bind(decimal128, 19, -5));
- TestWithTypeFactory(std::bind(decimal256, 19, -5));
- TestWithTypeFactory(std::bind(fixed_size_binary, 3));
+ TestWithTypeFactory([] { return decimal128(19, 4); });
+ TestWithTypeFactory([] { return decimal256(19, 4); });
+ TestWithTypeFactory([] { return decimal128(19, 0); });
+ TestWithTypeFactory([] { return decimal256(19, 0); });
+ TestWithTypeFactory([] { return decimal128(19, -5); });
+ TestWithTypeFactory([] { return decimal256(19, -5); });
+ TestWithTypeFactory([] { return fixed_size_binary(3); });
TestWithTypeFactory(binary);
TestWithTypeFactory(large_utf8);
+ TestWithTypeFactory(binary_view);
}
TEST_F(TestSchemaRoundtrip, Temporal) {
@@ -3546,8 +3635,8 @@ TEST_F(TestSchemaRoundtrip, Temporal) {
TestWithTypeFactory(day_time_interval);
TestWithTypeFactory(month_interval);
TestWithTypeFactory(month_day_nano_interval);
- TestWithTypeFactory(std::bind(time64, TimeUnit::NANO));
- TestWithTypeFactory(std::bind(duration, TimeUnit::MICRO));
+ TestWithTypeFactory([] { return time64(TimeUnit::NANO); });
+ TestWithTypeFactory([] { return duration(TimeUnit::MICRO); });
TestWithTypeFactory([]() { return arrow::timestamp(TimeUnit::MICRO,
"Europe/Paris"); });
}
@@ -3803,6 +3892,14 @@ TEST_F(TestArrayRoundtrip, Primitive) {
R"([[4, 5, 6], [1, -600, 5000], null, null])");
}
+TEST_F(TestArrayRoundtrip, BinaryViewMultipleBuffers) {
+ TestWithArrayFactory(MakeBinaryViewArrayWithMultipleDataBuffers);
+ TestWithArrayFactory([&] {
+ auto arr = MakeBinaryViewArrayWithMultipleDataBuffers();
+ return arr->Slice(1, arr->length() - 2);
+ });
+}
+
TEST_F(TestArrayRoundtrip, UnknownNullCount) {
TestWithArrayFactory([]() -> Result<std::shared_ptr<Array>> {
auto arr = ArrayFromJSON(int32(), "[0, 1, 2]");
@@ -4205,8 +4302,6 @@ TEST_F(TestDeviceArrayRoundtrip, Primitive) {
TestWithJSON(mm, int32(), "[4, 5, null]");
}
-// TODO C -> C++ -> C roundtripping tests?
-
////////////////////////////////////////////////////////////////////////////
// Array stream export tests
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 7a3b45f978..7211b6c87a 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -42,7 +42,7 @@ logging.basicConfig(level=logging.INFO)
BOOL = ArrowBool()
[email protected]()
[email protected](context_settings={"help_option_names": ["-h", "--help"]})
@click.option("--debug", type=BOOL, is_flag=True, default=False,
help="Increase logging with debugging output.")
@click.option("--pdb", type=BOOL, is_flag=True, default=False,
diff --git a/dev/archery/archery/integration/datagen.py
b/dev/archery/archery/integration/datagen.py
index ff10c0bb03..80cc1c1e76 100644
--- a/dev/archery/archery/integration/datagen.py
+++ b/dev/archery/archery/integration/datagen.py
@@ -1858,9 +1858,7 @@ def get_generated_json_files(tempdir=None):
.skip_tester('Go')
.skip_tester('Java')
.skip_tester('JS')
- .skip_tester('Rust')
- .skip_format(SKIP_C_SCHEMA, 'C++')
- .skip_format(SKIP_C_ARRAY, 'C++'),
+ .skip_tester('Rust'),
generate_extension_case()
.skip_tester('C#')
diff --git a/docs/source/format/CDataInterface.rst
b/docs/source/format/CDataInterface.rst
index 36952ad99e..c9beddabed 100644
--- a/docs/source/format/CDataInterface.rst
+++ b/docs/source/format/CDataInterface.rst
@@ -140,10 +140,14 @@ strings:
+-----------------+---------------------------------------------------+------------+
| ``Z`` | large binary |
|
+-----------------+---------------------------------------------------+------------+
+| ``vz`` | binary view |
|
++-----------------+---------------------------------------------------+------------+
| ``u`` | utf-8 string |
|
+-----------------+---------------------------------------------------+------------+
| ``U`` | large utf-8 string |
|
+-----------------+---------------------------------------------------+------------+
+| ``vu`` | utf-8 view |
|
++-----------------+---------------------------------------------------+------------+
| ``d:19,10`` | decimal128 [precision 19, scale 10] |
|
+-----------------+---------------------------------------------------+------------+
| ``d:19,10,NNN`` | decimal bitwidth = NNN [precision 19, scale 10] |
|
@@ -548,6 +552,14 @@ parameterized extension types).
The ``ArrowArray`` structure exported from an extension array simply points
to the storage data of the extension array.
+Binary view arrays
+------------------
+
+For binary or utf-8 view arrays, an extra buffer is appended which stores
+the lengths of each variadic data buffer as ``int64_t``. This buffer is
+necessary since these buffer lengths are not trivially extractable from
+other data in an array of binary or utf-8 view type.
+
.. _c-data-interface-semantics:
Semantics