pitrou commented on code in PR #37792:
URL: https://github.com/apache/arrow/pull/37792#discussion_r1352433949


##########
cpp/src/arrow/type_traits.h:
##########
@@ -341,6 +341,15 @@ struct TypeTraits<BinaryType> {
   static inline std::shared_ptr<DataType> type_singleton() { return binary(); }
 };
 
+template <>
+struct TypeTraits<BinaryViewType> {
+  using ArrayType = BinaryViewArray;
+  using BuilderType = BinaryViewBuilder;
+  using ScalarType = BinaryViewScalar;
+  using CType = BinaryViewType::c_type;
+  constexpr static bool is_parameter_free = false;

Review Comment:
   Should be true now?



##########
cpp/src/arrow/util/assume_aligned.h:
##########
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+
+namespace arrow::util {
+
+template <size_t N, typename P>
+[[nodiscard]] constexpr P* AssumeAligned(P* ptr) {
+#if defined(__has_builtin)
+#if __has_builtin(__builtin_assume_aligned)
+#define ARROW_HAS_BUILTIN_ASSUME_ALIGNED
+#endif
+#endif
+
+#if defined(ARROW_HAS_BUILTIN_ASSUME_ALIGNED)
+#undef ARROW_HAS_BUILTIN_ASSUME_ALIGNED
+  return static_cast<P*>(__builtin_assume_aligned(ptr, N));

Review Comment:
   I still don't really understand the point of this. Do you know of any 
situation (not some theoretical argument) where this is making a difference?



##########
cpp/src/arrow/type.h:
##########
@@ -710,6 +717,103 @@ class ARROW_EXPORT BinaryType : public BaseBinaryType {
   explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) 
{}
 };
 
+/// \brief Concrete type class for variable-size binary view data
+class ARROW_EXPORT BinaryViewType : public DataType {
+ public:
+  static constexpr Type::type type_id = Type::BINARY_VIEW;
+  static constexpr bool is_utf8 = false;
+  using PhysicalType = BinaryViewType;
+
+  static constexpr int kSize = 16;
+  static constexpr int kInlineSize = 12;
+  static constexpr int kPrefixSize = 4;
+
+  /// Variable length string or binary with inline optimization for small 
values (12 bytes
+  /// or fewer). This is similar to std::string_view except limited in size to 
INT32_MAX
+  /// and at least the first four bytes of the string are copied inline 
(accessible
+  /// without pointer dereference). This inline prefix allows failing 
comparisons early.
+  /// Furthermore when dealing with short strings the CPU cache working set is 
reduced
+  /// since many can be inline.
+  ///
+  /// This union supports two states:
+  ///
+  /// - Entirely inlined string data
+  ///                |----|--------------|
+  ///                 ^    ^
+  ///                 |    |
+  ///              size    in-line string data, zero padded
+  ///
+  /// - Reference into a buffer
+  ///                |----|----|----|----|
+  ///                 ^    ^    ^    ^
+  ///                 |    |    |    |
+  ///              size    |    |    `------.
+  ///                  prefix   |           |
+  ///                        buffer index   |
+  ///                                  offset in buffer
+  ///
+  /// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+  ///
+  /// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+  ///
+  /// Alignment to 64 bits enables an aligned load of the size and prefix into
+  /// a single 64 bit integer, which is useful to the comparison fast path.

Review Comment:
   What are the consequences of `alignas`? It would only matter for 
stack-allocated variables and struct members, right?
   
   In particular, you cannot trust that buffer contents will be aligned 
appropriately.



##########
cpp/src/arrow/type_traits.h:
##########
@@ -371,6 +380,15 @@ struct TypeTraits<StringType> {
   static inline std::shared_ptr<DataType> type_singleton() { return utf8(); }
 };
 
+template <>
+struct TypeTraits<StringViewType> {
+  using ArrayType = StringViewArray;
+  using BuilderType = StringViewBuilder;
+  using ScalarType = StringViewScalar;
+  using CType = BinaryViewType::c_type;
+  constexpr static bool is_parameter_free = false;

Review Comment:
   Same here.



##########
cpp/src/arrow/util/binary_view_util.h:
##########
@@ -0,0 +1,105 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string_view>
+#include <utility>
+
+#include "arrow/type.h"
+#include "arrow/util/assume_aligned.h"
+#include "arrow/util/span.h"
+
+namespace arrow::util {
+
+inline BinaryViewType::c_type ToInlineBinaryView(const void* data, int32_t 
size) {
+  // Small string: inlined. Bytes beyond size are zeroed
+  BinaryViewType::c_type out;
+  out.inlined = {size, {}};
+  memcpy(&out.inlined.data, data, size);
+  return out;
+}
+
+inline BinaryViewType::c_type ToInlineBinaryView(std::string_view v) {
+  return ToInlineBinaryView(v.data(), static_cast<int32_t>(v.size()));
+}
+
+inline BinaryViewType::c_type ToIndexOffsetBinaryView(const void* data, 
int32_t size,

Review Comment:
   This can also produce inlined BinaryViews, so perhaps `ToBinaryView`?
   



##########
cpp/src/arrow/type.cc:
##########
@@ -243,8 +251,12 @@ struct PhysicalTypeVisitor {
   }
 
   template <typename Type, typename PhysicalType = typename Type::PhysicalType>
-  Status Visit(const Type&) {
-    result = TypeTraits<PhysicalType>::type_singleton();
+  Status Visit(const Type& type) {
+    if constexpr (std::is_base_of_v<BinaryViewType, Type>) {
+      result = binary_view();

Review Comment:
   `binary_view` and `utf8_view` can have a `type_singleton` as well, now they 
are parameter-free.



##########
cpp/src/arrow/integration/json_internal.cc:
##########
@@ -106,6 +107,11 @@ std::string GetTimeUnitName(TimeUnit::type unit) {
   return "UNKNOWN";
 }
 
+std::string_view GetStringView(const rj::Value& str) {

Review Comment:
   Would be better to return a `Result<std::string_view>` as the error below 
can trigger on non-conforming JSON input in `ArrayReader::Visit`.



##########
cpp/src/arrow/integration/json_internal.cc:
##########
@@ -1332,6 +1401,97 @@ class ArrayReader {
     return FinishBuilder(&builder);
   }
 
+  template <typename ViewType>
+  enable_if_binary_view_like<ViewType, Status> Visit(const ViewType& type) {
+    ARROW_ASSIGN_OR_RAISE(const auto json_views, GetDataArray(obj_, "VIEWS"));
+    ARROW_ASSIGN_OR_RAISE(const auto json_variadic_bufs,
+                          GetMemberArray(obj_, "VARIADIC_DATA_BUFFERS"));
+
+    using internal::Zip;
+    using util::span;
+
+    BufferVector buffers;
+    buffers.resize(json_variadic_bufs.Size() + 2);
+    for (auto [json_buf, buf] : Zip(json_variadic_bufs, 
span{buffers}.subspan(2))) {
+      auto hex_string = GetStringView(json_buf);
+      ARROW_ASSIGN_OR_RAISE(
+          buf, AllocateBuffer(static_cast<int64_t>(hex_string.size()) / 2, 
pool_));
+      RETURN_NOT_OK(ParseHexValues(hex_string, buf->mutable_data()));
+    }
+
+    TypedBufferBuilder<bool> validity_builder{pool_};
+    RETURN_NOT_OK(validity_builder.Resize(length_));
+    for (bool is_valid : is_valid_) {
+      validity_builder.UnsafeAppend(is_valid);
+    }
+    ARROW_ASSIGN_OR_RAISE(buffers[0], validity_builder.Finish());
+
+    ARROW_ASSIGN_OR_RAISE(
+        buffers[1], AllocateBuffer(length_ * sizeof(BinaryViewType::c_type), 
pool_));
+
+    span views{buffers[1]->mutable_data_as<BinaryViewType::c_type>(),
+               static_cast<size_t>(length_)};
+
+    int64_t null_count = 0;
+    for (auto [json_view, s, is_valid] : Zip(json_views, views, is_valid_)) {
+      if (!is_valid) {
+        s = {};
+        ++null_count;
+        continue;
+      }
+
+      DCHECK(json_view.IsObject());
+      const auto& json_view_obj = json_view.GetObject();
+
+      auto json_size = json_view_obj.FindMember("SIZE");
+      RETURN_NOT_INT("SIZE", json_size, json_view_obj);
+      DCHECK_GE(json_size->value.GetInt64(), 0);
+      auto size = static_cast<int32_t>(json_size->value.GetInt64());
+
+      if (size <= BinaryViewType::kInlineSize) {
+        auto json_inlined = json_view_obj.FindMember("INLINED");
+        RETURN_NOT_STRING("INLINED", json_inlined, json_view_obj);

Review Comment:
   Does this also raise an error if the "INLINED" member doesn't exist?



##########
cpp/src/arrow/integration/json_internal.cc:
##########
@@ -1332,6 +1401,97 @@ class ArrayReader {
     return FinishBuilder(&builder);
   }
 
+  template <typename ViewType>
+  enable_if_binary_view_like<ViewType, Status> Visit(const ViewType& type) {
+    ARROW_ASSIGN_OR_RAISE(const auto json_views, GetDataArray(obj_, "VIEWS"));
+    ARROW_ASSIGN_OR_RAISE(const auto json_variadic_bufs,
+                          GetMemberArray(obj_, "VARIADIC_DATA_BUFFERS"));
+
+    using internal::Zip;
+    using util::span;
+
+    BufferVector buffers;
+    buffers.resize(json_variadic_bufs.Size() + 2);
+    for (auto [json_buf, buf] : Zip(json_variadic_bufs, 
span{buffers}.subspan(2))) {
+      auto hex_string = GetStringView(json_buf);
+      ARROW_ASSIGN_OR_RAISE(
+          buf, AllocateBuffer(static_cast<int64_t>(hex_string.size()) / 2, 
pool_));
+      RETURN_NOT_OK(ParseHexValues(hex_string, buf->mutable_data()));
+    }
+
+    TypedBufferBuilder<bool> validity_builder{pool_};
+    RETURN_NOT_OK(validity_builder.Resize(length_));
+    for (bool is_valid : is_valid_) {
+      validity_builder.UnsafeAppend(is_valid);
+    }
+    ARROW_ASSIGN_OR_RAISE(buffers[0], validity_builder.Finish());
+
+    ARROW_ASSIGN_OR_RAISE(
+        buffers[1], AllocateBuffer(length_ * sizeof(BinaryViewType::c_type), 
pool_));
+
+    span views{buffers[1]->mutable_data_as<BinaryViewType::c_type>(),
+               static_cast<size_t>(length_)};
+
+    int64_t null_count = 0;
+    for (auto [json_view, s, is_valid] : Zip(json_views, views, is_valid_)) {

Review Comment:
   For clarity, perhaps rename `s`  to `out_view`.



##########
cpp/src/arrow/integration/json_internal.cc:
##########
@@ -1332,6 +1401,97 @@ class ArrayReader {
     return FinishBuilder(&builder);
   }
 
+  template <typename ViewType>
+  enable_if_binary_view_like<ViewType, Status> Visit(const ViewType& type) {
+    ARROW_ASSIGN_OR_RAISE(const auto json_views, GetDataArray(obj_, "VIEWS"));
+    ARROW_ASSIGN_OR_RAISE(const auto json_variadic_bufs,
+                          GetMemberArray(obj_, "VARIADIC_DATA_BUFFERS"));
+
+    using internal::Zip;
+    using util::span;
+
+    BufferVector buffers;
+    buffers.resize(json_variadic_bufs.Size() + 2);
+    for (auto [json_buf, buf] : Zip(json_variadic_bufs, 
span{buffers}.subspan(2))) {
+      auto hex_string = GetStringView(json_buf);
+      ARROW_ASSIGN_OR_RAISE(
+          buf, AllocateBuffer(static_cast<int64_t>(hex_string.size()) / 2, 
pool_));
+      RETURN_NOT_OK(ParseHexValues(hex_string, buf->mutable_data()));
+    }
+
+    TypedBufferBuilder<bool> validity_builder{pool_};
+    RETURN_NOT_OK(validity_builder.Resize(length_));
+    for (bool is_valid : is_valid_) {
+      validity_builder.UnsafeAppend(is_valid);
+    }
+    ARROW_ASSIGN_OR_RAISE(buffers[0], validity_builder.Finish());
+
+    ARROW_ASSIGN_OR_RAISE(
+        buffers[1], AllocateBuffer(length_ * sizeof(BinaryViewType::c_type), 
pool_));
+
+    span views{buffers[1]->mutable_data_as<BinaryViewType::c_type>(),
+               static_cast<size_t>(length_)};
+
+    int64_t null_count = 0;
+    for (auto [json_view, s, is_valid] : Zip(json_views, views, is_valid_)) {
+      if (!is_valid) {
+        s = {};
+        ++null_count;
+        continue;
+      }
+
+      DCHECK(json_view.IsObject());
+      const auto& json_view_obj = json_view.GetObject();
+
+      auto json_size = json_view_obj.FindMember("SIZE");
+      RETURN_NOT_INT("SIZE", json_size, json_view_obj);
+      DCHECK_GE(json_size->value.GetInt64(), 0);
+      auto size = static_cast<int32_t>(json_size->value.GetInt64());
+
+      if (size <= BinaryViewType::kInlineSize) {
+        auto json_inlined = json_view_obj.FindMember("INLINED");
+        RETURN_NOT_STRING("INLINED", json_inlined, json_view_obj);
+        s.inlined = {size, {}};
+
+        if constexpr (ViewType::is_utf8) {
+          DCHECK_LE(json_inlined->value.GetStringLength(), 
BinaryViewType::kInlineSize);

Review Comment:
   Can we return an error for all the `DCHECKS` in this function? It would flag 
flawed JSON input more explicitly.



##########
cpp/src/arrow/array/validate.cc:
##########
@@ -595,6 +614,74 @@ struct ValidateArrayImpl {
     return Status::OK();
   }
 
+  Status ValidateBinaryView(const BinaryViewType& type) {
+    int64_t views_byte_size = data.buffers[1]->size();
+    int64_t required_view_count = data.length + data.offset;
+    if (static_cast<int64_t>(views_byte_size / BinaryViewType::kSize) <
+        required_view_count) {
+      return Status::Invalid("View buffer size (bytes): ", views_byte_size,
+                             " isn't large enough for length: ", data.length,
+                             " and offset: ", data.offset);
+    }
+
+    if (!full_validation) return Status::OK();
+
+    auto CheckPrefix = [&](size_t i,
+                           std::array<uint8_t, BinaryViewType::kPrefixSize> 
prefix,
+                           const uint8_t* data) {
+      if (std::memcmp(data, prefix.data(), BinaryViewType::kPrefixSize) == 0) {
+        return Status::OK();
+      }
+      return Status::Invalid("View at slot ", i, " has inlined prefix 0x",
+                             HexEncode(prefix.data(), 
BinaryViewType::kPrefixSize),
+                             " but the out-of-line data begins with 0x",
+                             HexEncode(data, BinaryViewType::kPrefixSize));
+    };
+
+    util::span views(data.GetValues<BinaryViewType::c_type>(1),
+                     static_cast<size_t>(data.length));
+    util::span data_buffers(data.buffers.data() + 2, data.buffers.size() - 2);
+
+    for (size_t i = 0; i < static_cast<size_t>(data.length); ++i) {
+      if (data.IsNull(i)) continue;
+
+      if (views[i].size() < 0) {
+        return Status::Invalid("View at slot ", i, " has negative size ",
+                               views[i].size());
+      }
+
+      if (views[i].is_inline()) continue;

Review Comment:
   We should probably check that padding bytes are zeroed here?



##########
cpp/src/arrow/array/validate.cc:
##########
@@ -595,6 +614,74 @@ struct ValidateArrayImpl {
     return Status::OK();
   }
 
+  Status ValidateBinaryView(const BinaryViewType& type) {
+    int64_t views_byte_size = data.buffers[1]->size();
+    int64_t required_view_count = data.length + data.offset;
+    if (static_cast<int64_t>(views_byte_size / BinaryViewType::kSize) <
+        required_view_count) {
+      return Status::Invalid("View buffer size (bytes): ", views_byte_size,
+                             " isn't large enough for length: ", data.length,
+                             " and offset: ", data.offset);
+    }
+
+    if (!full_validation) return Status::OK();
+
+    auto CheckPrefix = [&](size_t i,
+                           std::array<uint8_t, BinaryViewType::kPrefixSize> 
prefix,
+                           const uint8_t* data) {
+      if (std::memcmp(data, prefix.data(), BinaryViewType::kPrefixSize) == 0) {
+        return Status::OK();
+      }
+      return Status::Invalid("View at slot ", i, " has inlined prefix 0x",
+                             HexEncode(prefix.data(), 
BinaryViewType::kPrefixSize),
+                             " but the out-of-line data begins with 0x",
+                             HexEncode(data, BinaryViewType::kPrefixSize));
+    };
+
+    util::span views(data.GetValues<BinaryViewType::c_type>(1),
+                     static_cast<size_t>(data.length));
+    util::span data_buffers(data.buffers.data() + 2, data.buffers.size() - 2);
+
+    for (size_t i = 0; i < static_cast<size_t>(data.length); ++i) {
+      if (data.IsNull(i)) continue;
+
+      if (views[i].size() < 0) {
+        return Status::Invalid("View at slot ", i, " has negative size ",
+                               views[i].size());
+      }
+
+      if (views[i].is_inline()) continue;
+
+      auto [size, prefix, buffer_index, offset] = views[i].ref;
+
+      if (buffer_index < 0) {
+        return Status::Invalid("View at slot ", i, " has negative buffer index 
",
+                               buffer_index);
+      }
+
+      if (offset < 0) {
+        return Status::Invalid("View at slot ", i, " has negative offset ", 
offset);
+      }
+
+      if (static_cast<size_t>(buffer_index) >= data_buffers.size()) {

Review Comment:
   Should also check that `buffer_index >= 2`?



##########
cpp/src/arrow/array/concatenate_test.cc:
##########
@@ -92,8 +92,14 @@ class ConcatenateTest : public ::testing::Test {
       for (auto null_probability : this->null_probabilities_) {
         std::shared_ptr<Array> array;
         factory(size, null_probability, &array);
+        ASSERT_OK(array->ValidateFull());
         auto expected = array->Slice(offsets.front(), offsets.back() - 
offsets.front());
+        ASSERT_OK(expected->ValidateFull());
         auto slices = this->Slices(array, offsets);
+        for (auto slice : slices) {
+          ASSERT_OK(slice->ValidateFull());
+        }
+        ASSERT_OK(expected->ValidateFull());
         ASSERT_OK_AND_ASSIGN(auto actual, Concatenate(slices));

Review Comment:
   Do you mean:
   ```suggestion
           ASSERT_OK_AND_ASSIGN(auto actual, Concatenate(slices));
           ASSERT_OK(actual->ValidateFull());
   ```



##########
cpp/src/arrow/array/data.cc:
##########
@@ -531,6 +565,16 @@ std::shared_ptr<ArrayData> ArraySpan::ToArrayData() const {
   return result;
 }
 
+util::span<std::shared_ptr<Buffer> const> ArraySpan::GetVariadicBuffers() 
const {

Review Comment:
   Nit
   ```suggestion
   util::span<const std::shared_ptr<Buffer>> ArraySpan::GetVariadicBuffers() 
const {
   ```



##########
cpp/src/arrow/array/data.cc:
##########
@@ -531,6 +565,16 @@ std::shared_ptr<ArrayData> ArraySpan::ToArrayData() const {
   return result;
 }
 
+util::span<std::shared_ptr<Buffer> const> ArraySpan::GetVariadicBuffers() 
const {
+  DCHECK(HasVariadicBuffers());
+  return {buffers[2].data_as<std::shared_ptr<Buffer>>(),
+          buffers[2].size / sizeof(std::shared_ptr<Buffer>)};
+}
+
+bool ArraySpan::HasVariadicBuffers() const {
+  return type->id() == Type::BINARY_VIEW || type->id() == Type::STRING_VIEW;
+}
+

Review Comment:
   Are all the `ArraySpan` additions tested somewhere? They look sufficiently 
non-trivial to warrant testing.



##########
cpp/src/arrow/array/array_binary_test.cc:
##########
@@ -880,14 +982,24 @@ class TestBaseBinaryDataVisitor : public ::testing::Test {
  public:
   using TypeClass = T;
 
-  void SetUp() override { type_ = TypeTraits<TypeClass>::type_singleton(); }
+  void SetUp() override {
+    if constexpr (is_binary_view_like_type<TypeClass>::value) {
+      type_ = TypeClass::is_utf8 ? utf8_view() : binary_view();
+    } else {
+      type_ = TypeTraits<TypeClass>::type_singleton();

Review Comment:
   This should work with binary views as well.



##########
cpp/src/arrow/type.h:
##########
@@ -710,6 +717,103 @@ class ARROW_EXPORT BinaryType : public BaseBinaryType {
   explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) 
{}
 };
 
+/// \brief Concrete type class for variable-size binary view data
+class ARROW_EXPORT BinaryViewType : public DataType {
+ public:
+  static constexpr Type::type type_id = Type::BINARY_VIEW;
+  static constexpr bool is_utf8 = false;
+  using PhysicalType = BinaryViewType;
+
+  static constexpr int kSize = 16;
+  static constexpr int kInlineSize = 12;
+  static constexpr int kPrefixSize = 4;
+
+  /// Variable length string or binary with inline optimization for small 
values (12 bytes
+  /// or fewer). This is similar to std::string_view except limited in size to 
INT32_MAX
+  /// and at least the first four bytes of the string are copied inline 
(accessible
+  /// without pointer dereference). This inline prefix allows failing 
comparisons early.
+  /// Furthermore when dealing with short strings the CPU cache working set is 
reduced
+  /// since many can be inline.
+  ///
+  /// This union supports two states:
+  ///
+  /// - Entirely inlined string data
+  ///                |----|--------------|
+  ///                 ^    ^
+  ///                 |    |
+  ///              size    in-line string data, zero padded
+  ///
+  /// - Reference into a buffer
+  ///                |----|----|----|----|
+  ///                 ^    ^    ^    ^
+  ///                 |    |    |    |
+  ///              size    |    |    `------.
+  ///                  prefix   |           |
+  ///                        buffer index   |
+  ///                                  offset in buffer
+  ///
+  /// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+  ///
+  /// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+  ///
+  /// Alignment to 64 bits enables an aligned load of the size and prefix into
+  /// a single 64 bit integer, which is useful to the comparison fast path.

Review Comment:
   Besides, a 64-bit unaligned load would probably work just as well.



##########
cpp/src/arrow/array/builder_binary.cc:
##########
@@ -40,6 +40,60 @@ namespace arrow {
 
 using internal::checked_cast;
 
+// ----------------------------------------------------------------------
+// Binary/StringView
+BinaryViewBuilder::BinaryViewBuilder(const std::shared_ptr<DataType>& type,
+                                     MemoryPool* pool)
+    : BinaryViewBuilder(pool) {}
+
+Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t 
offset,
+                                           int64_t length) {
+  auto bitmap = array.GetValues<uint8_t>(0, 0);
+  auto values = array.GetValues<BinaryViewType::c_type>(1) + offset;
+
+  int64_t out_of_line_total = 0;
+  for (int64_t i = 0; i < length; i++) {
+    if (!values[i].is_inline()) {

Review Comment:
   Shouldn't you filter out null values here?



##########
cpp/src/arrow/array/concatenate.cc:
##########
@@ -230,6 +230,35 @@ class ConcatenateImpl {
     return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]);
   }
 
+  Status Visit(const BinaryViewType& type) {
+    out_->buffers.resize(2);
+
+    for (const auto& in_data : in_) {
+      for (const auto& buf : util::span(in_data->buffers).subspan(2)) {
+        out_->buffers.push_back(buf);
+      }
+    }
+
+    ARROW_ASSIGN_OR_RAISE(auto view_buffers, Buffers(1, 
BinaryViewType::kSize));
+    ARROW_ASSIGN_OR_RAISE(auto view_buffer, ConcatenateBuffers(view_buffers, 
pool_));
+
+    auto* s = view_buffer->mutable_data_as<BinaryViewType::c_type>();
+    size_t preceding_buffer_count = 0;
+
+    int64_t i = in_[0]->length;
+    for (size_t in_index = 1; in_index < in_.size(); ++in_index) {
+      preceding_buffer_count += in_[in_index - 1]->buffers.size() - 2;
+
+      for (int64_t end_i = i + in_[in_index]->length; i < end_i; ++i) {

Review Comment:
   Is it ok to not ignore null entries here? I suppose it is if null views have 
sensical values...



##########
cpp/src/arrow/util/binary_view_util.h:
##########
@@ -0,0 +1,105 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string_view>
+#include <utility>
+
+#include "arrow/type.h"
+#include "arrow/util/assume_aligned.h"
+#include "arrow/util/span.h"
+
+namespace arrow::util {
+
+inline BinaryViewType::c_type ToInlineBinaryView(const void* data, int32_t 
size) {
+  // Small string: inlined. Bytes beyond size are zeroed
+  BinaryViewType::c_type out;
+  out.inlined = {size, {}};
+  memcpy(&out.inlined.data, data, size);
+  return out;
+}
+
+inline BinaryViewType::c_type ToInlineBinaryView(std::string_view v) {
+  return ToInlineBinaryView(v.data(), static_cast<int32_t>(v.size()));
+}
+
+inline BinaryViewType::c_type ToIndexOffsetBinaryView(const void* data, 
int32_t size,
+                                                      int32_t buffer_index,
+                                                      int32_t offset) {
+  if (size <= BinaryViewType::kInlineSize) {
+    return ToInlineBinaryView(data, size);
+  }
+
+  // Large string: store index/offset.
+  BinaryViewType::c_type out;
+  out.ref = {size, {}, buffer_index, offset};
+  memcpy(&out.ref.prefix, data, sizeof(out.ref.prefix));
+  return out;
+}
+
+inline BinaryViewType::c_type ToIndexOffsetBinaryView(std::string_view v,
+                                                      int32_t buffer_index,
+                                                      int32_t offset) {
+  return ToIndexOffsetBinaryView(v.data(), static_cast<int32_t>(v.size()), 
buffer_index,
+                                 offset);
+}
+
+template <typename BufferPtr>
+std::string_view FromIndexOffsetBinaryView(const BinaryViewType::c_type& v,

Review Comment:
   This actually accepts any binary view, including inlined ones, so perhaps 
`FromBinaryView`?



##########
cpp/src/arrow/ipc/reader.cc:
##########
@@ -59,6 +59,7 @@
 #include "arrow/util/thread_pool.h"
 #include "arrow/util/ubsan.h"
 #include "arrow/util/vector.h"
+#include "arrow/visit_data_inline.h"

Review Comment:
   I don't think this inclusion is needed, is it?



##########
cpp/src/arrow/util/binary_view_util.h:
##########
@@ -0,0 +1,105 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string_view>
+#include <utility>
+
+#include "arrow/type.h"
+#include "arrow/util/assume_aligned.h"
+#include "arrow/util/span.h"
+
+namespace arrow::util {
+
+inline BinaryViewType::c_type ToInlineBinaryView(const void* data, int32_t 
size) {
+  // Small string: inlined. Bytes beyond size are zeroed
+  BinaryViewType::c_type out;
+  out.inlined = {size, {}};
+  memcpy(&out.inlined.data, data, size);
+  return out;
+}
+
+inline BinaryViewType::c_type ToInlineBinaryView(std::string_view v) {
+  return ToInlineBinaryView(v.data(), static_cast<int32_t>(v.size()));
+}
+
+inline BinaryViewType::c_type ToIndexOffsetBinaryView(const void* data, 
int32_t size,
+                                                      int32_t buffer_index,
+                                                      int32_t offset) {
+  if (size <= BinaryViewType::kInlineSize) {
+    return ToInlineBinaryView(data, size);
+  }
+
+  // Large string: store index/offset.
+  BinaryViewType::c_type out;
+  out.ref = {size, {}, buffer_index, offset};
+  memcpy(&out.ref.prefix, data, sizeof(out.ref.prefix));
+  return out;
+}
+
+inline BinaryViewType::c_type ToIndexOffsetBinaryView(std::string_view v,
+                                                      int32_t buffer_index,
+                                                      int32_t offset) {
+  return ToIndexOffsetBinaryView(v.data(), static_cast<int32_t>(v.size()), 
buffer_index,
+                                 offset);
+}
+
+template <typename BufferPtr>
+std::string_view FromIndexOffsetBinaryView(const BinaryViewType::c_type& v,
+                                           const BufferPtr* data_buffers) {
+  auto* data = v.is_inline() ? v.inlined.data.data()
+                             : data_buffers[v.ref.buffer_index]->data() + 
v.ref.offset;
+  return {reinterpret_cast<const char*>(data), static_cast<size_t>(v.size())};
+}
+template <typename BufferPtr>
+std::string_view FromIndexOffsetBinaryView(BinaryViewType::c_type&&,
+                                           const BufferPtr*) = delete;
+
+template <typename... BufferPtr>
+bool EqualIndexOffsetBinaryView(BinaryViewType::c_type l, 
BinaryViewType::c_type r,
+                                const BufferPtr*... buffers) {
+  int64_t l_size_and_prefix, r_size_and_prefix;
+  memcpy(&l_size_and_prefix, &l, sizeof(l_size_and_prefix));
+  memcpy(&r_size_and_prefix, &r, sizeof(r_size_and_prefix));
+
+  if (l_size_and_prefix != r_size_and_prefix) return false;
+
+  if (l.is_inline()) {
+    // The inline part is zeroed at construction, so we can compare
+    // a word at a time if data extends past 'prefix_'.
+    int64_t l_inlined, r_inlined;
+    memcpy(&l_inlined,
+           AssumeAlignedAs<int64_t>(l.inline_data() + 
BinaryViewType::kPrefixSize),
+           sizeof(l_size_and_prefix));

Review Comment:
   Nit (doesn't change the actual generated code, but conceptually more correct 
:-)). Also same below.
   ```suggestion
              sizeof(l_inlined));
   ```



##########
cpp/src/arrow/util/binary_view_util.h:
##########
@@ -0,0 +1,105 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string_view>
+#include <utility>
+
+#include "arrow/type.h"
+#include "arrow/util/assume_aligned.h"
+#include "arrow/util/span.h"
+
+namespace arrow::util {
+
+inline BinaryViewType::c_type ToInlineBinaryView(const void* data, int32_t 
size) {
+  // Small string: inlined. Bytes beyond size are zeroed
+  BinaryViewType::c_type out;
+  out.inlined = {size, {}};
+  memcpy(&out.inlined.data, data, size);
+  return out;
+}
+
+inline BinaryViewType::c_type ToInlineBinaryView(std::string_view v) {
+  return ToInlineBinaryView(v.data(), static_cast<int32_t>(v.size()));
+}
+
+inline BinaryViewType::c_type ToIndexOffsetBinaryView(const void* data, 
int32_t size,
+                                                      int32_t buffer_index,
+                                                      int32_t offset) {
+  if (size <= BinaryViewType::kInlineSize) {
+    return ToInlineBinaryView(data, size);
+  }
+
+  // Large string: store index/offset.
+  BinaryViewType::c_type out;
+  out.ref = {size, {}, buffer_index, offset};
+  memcpy(&out.ref.prefix, data, sizeof(out.ref.prefix));
+  return out;
+}
+
+inline BinaryViewType::c_type ToIndexOffsetBinaryView(std::string_view v,
+                                                      int32_t buffer_index,
+                                                      int32_t offset) {
+  return ToIndexOffsetBinaryView(v.data(), static_cast<int32_t>(v.size()), 
buffer_index,
+                                 offset);
+}
+
+template <typename BufferPtr>
+std::string_view FromIndexOffsetBinaryView(const BinaryViewType::c_type& v,
+                                           const BufferPtr* data_buffers) {
+  auto* data = v.is_inline() ? v.inlined.data.data()
+                             : data_buffers[v.ref.buffer_index]->data() + 
v.ref.offset;
+  return {reinterpret_cast<const char*>(data), static_cast<size_t>(v.size())};
+}
+template <typename BufferPtr>
+std::string_view FromIndexOffsetBinaryView(BinaryViewType::c_type&&,
+                                           const BufferPtr*) = delete;
+
+template <typename... BufferPtr>
+bool EqualIndexOffsetBinaryView(BinaryViewType::c_type l, 
BinaryViewType::c_type r,
+                                const BufferPtr*... buffers) {

Review Comment:
   It's weird to have a variadic `buffers` here but not above in 
`FromIndexOffsetBinaryView`. Also it's not obvious which are the left-hand 
buffers and which are the right ones.



##########
cpp/src/arrow/ipc/metadata_internal.cc:
##########
@@ -977,7 +996,10 @@ static Status MakeRecordBatch(FBB& fbb, int64_t length, 
int64_t body_length,
   BodyCompressionOffset fb_compression;
   RETURN_NOT_OK(GetBodyCompression(fbb, options, &fb_compression));
 
-  *offset = flatbuf::CreateRecordBatch(fbb, length, fb_nodes, fb_buffers, 
fb_compression);
+  auto fb_variadic_buffer_counts = fbb.CreateVector(variadic_buffer_counts);

Review Comment:
   Does flatbuffers make a difference between an empty vector and an absent 
vector? If yes, then perhaps we can leave it absent when empty...



##########
cpp/src/arrow/util/string.cc:
##########
@@ -73,20 +71,34 @@ std::string HexEncode(std::string_view str) { return 
HexEncode(str.data(), str.s
 
 std::string Escape(std::string_view str) { return Escape(str.data(), 
str.size()); }
 
-Status ParseHexValue(const char* data, uint8_t* out) {
-  char c1 = data[0];
-  char c2 = data[1];
+constexpr uint8_t kInvalidHexDigit = -1;
 
-  const char* kAsciiTableEnd = kAsciiTable + 16;
-  const char* pos1 = std::lower_bound(kAsciiTable, kAsciiTableEnd, c1);
-  const char* pos2 = std::lower_bound(kAsciiTable, kAsciiTableEnd, c2);
+constexpr uint8_t ParseHexDigit(char c) {
+  if (c >= '0' && c <= '9') return c - '0';
+  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+  return kInvalidHexDigit;
+}
+
+Status ParseHexValue(const char* data, uint8_t* out) {
+  uint8_t high = ParseHexDigit(data[0]);
+  uint8_t low = ParseHexDigit(data[1]);
 
   // Error checking
-  if (pos1 == kAsciiTableEnd || pos2 == kAsciiTableEnd || *pos1 != c1 || *pos2 
!= c2) {
+  if (high == kInvalidHexDigit || low == kInvalidHexDigit) {
     return Status::Invalid("Encountered non-hex digit");
   }
 
-  *out = static_cast<uint8_t>((pos1 - kAsciiTable) << 4 | (pos2 - 
kAsciiTable));
+  *out = static_cast<uint8_t>(high << 4 | low);
+  return Status::OK();
+}
+
+Status ParseHexValues(std::string_view hex_string, uint8_t* out) {

Review Comment:
   (I assume we have tests for the hex-encoding/decoding functions btw?)



##########
cpp/src/arrow/testing/random.h:
##########
@@ -367,6 +367,25 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
                                 int64_t alignment = kDefaultBufferAlignment,
                                 MemoryPool* memory_pool = 
default_memory_pool());
 
+  /// \brief Generate a random StringViewArray
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min_length the lower bound of the string length
+  ///            determined by the uniform distribution
+  /// \param[in] max_length the upper bound of the string length
+  ///            determined by the uniform distribution
+  /// \param[in] max_data_buffer_length the data buffer size at which
+  ///            a new chunk will be generated
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] null_probability the probability of a value being null

Review Comment:
   Reorder arguments to match function signature
   ```suggestion
     /// \param[in] null_probability the probability of a value being null
     /// \param[in] max_data_buffer_length (optional) the data buffer size
     ///            at which a new chunk will be generated
     /// \param[in] alignment alignment for memory allocations (in bytes)
   ```



##########
cpp/src/arrow/array/util.h:
##########
@@ -86,5 +86,18 @@ Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
 ARROW_EXPORT
 std::vector<ArrayVector> RechunkArraysConsistently(const 
std::vector<ArrayVector>&);
 
+/// Convert between index/offset and raw pointer views.

Review Comment:
   This function can probably be removed now?



##########
cpp/src/arrow/array/data.cc:
##########
@@ -92,6 +93,11 @@ bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data) 
{
   return ArraySpan(data).MayHaveLogicalNulls();
 }
 
+BufferSpan PackVariadicBuffers(util::span<std::shared_ptr<Buffer> const> 
buffers) {

Review Comment:
   This reads bizarrely
   ```suggestion
   BufferSpan PackVariadicBuffers(util::span<const std::shared_ptr<Buffer>> 
buffers) {
   ```



##########
cpp/src/arrow/array/util.cc:
##########
@@ -642,29 +665,36 @@ class RepeatedArrayFactory {
 
   template <typename T>
   enable_if_base_binary<T, Status> Visit(const T&) {
-    std::shared_ptr<Buffer> value =
-        checked_cast<const typename TypeTraits<T>::ScalarType&>(scalar_).value;
+    const std::shared_ptr<Buffer>& value = scalar<T>().value;
     std::shared_ptr<Buffer> values_buffer, offsets_buffer;
     RETURN_NOT_OK(CreateBufferOf(value->data(), value->size(), 
&values_buffer));
     auto size = static_cast<typename T::offset_type>(value->size());
     RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer));
-    out_ = std::make_shared<typename TypeTraits<T>::ArrayType>(length_, 
offsets_buffer,
-                                                               values_buffer);
+    out_ = std::make_shared<typename TypeTraits<T>::ArrayType>(
+        length_, std::move(offsets_buffer), std::move(values_buffer));
+    return Status::OK();
+  }
+
+  template <typename T>
+  enable_if_binary_view_like<T, Status> Visit(const T& type) {
+    std::string_view value{*scalar<T>().value};
+    auto s = util::ToIndexOffsetBinaryView(value, 0, 0);
+    RETURN_NOT_OK(FinishFixedWidth(&s, sizeof(s)));
+    if (!s.is_inline()) {
+      out_->data()->buffers.push_back(scalar<T>().value);
+    }

Review Comment:
   I don't really understand how this ensures the proper buffer index for 
non-inlined views. Is it tested somewhere?



##########
cpp/src/arrow/array/concatenate_test.cc:
##########
@@ -155,6 +161,13 @@ TEST_F(ConcatenateTest, StringType) {
   });
 }
 
+TEST_F(ConcatenateTest, StringViewType) {
+  Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* 
out) {
+    *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/15, 
null_probability);

Review Comment:
   Perhaps increase max length a bit to maximize the probability of non-inlined 
views?
   ```suggestion
       *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/40, 
null_probability);
   ```



##########
cpp/src/arrow/array/concatenate.cc:
##########
@@ -230,6 +230,35 @@ class ConcatenateImpl {
     return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]);
   }
 
+  Status Visit(const BinaryViewType& type) {
+    out_->buffers.resize(2);
+
+    for (const auto& in_data : in_) {
+      for (const auto& buf : util::span(in_data->buffers).subspan(2)) {
+        out_->buffers.push_back(buf);
+      }
+    }
+
+    ARROW_ASSIGN_OR_RAISE(auto view_buffers, Buffers(1, 
BinaryViewType::kSize));
+    ARROW_ASSIGN_OR_RAISE(auto view_buffer, ConcatenateBuffers(view_buffers, 
pool_));
+
+    auto* s = view_buffer->mutable_data_as<BinaryViewType::c_type>();

Review Comment:
   Call this `views` or `out_views`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to