pitrou commented on code in PR #37792:
URL: https://github.com/apache/arrow/pull/37792#discussion_r1371617003


##########
cpp/src/arrow/integration/json_internal.cc:
##########
@@ -642,6 +650,50 @@ class ArrayWriter {
     writer_->EndArray();
   }
 
+  template <typename ArrayType>
+  void WriteBinaryViewField(const ArrayType& array) {

Review Comment:
   Speaking of which, can you document this additional format in the 
integration docs? Can be another PR to avoid delaying this one any further.
   
https://arrow.apache.org/docs/dev/format/Integration.html#json-test-data-format



##########
cpp/src/arrow/type.cc:
##########
@@ -3294,7 +3318,7 @@ void InitStaticData() {
   // * Time32
   // * Time64
   // * Timestamp
-  g_primitive_types = {null(), boolean(), date32(), date64()};
+  g_primitive_types = {null(), boolean(), date32(), date64(), binary_view(), 
utf8_view()};

Review Comment:
   Hmm, these shouldn't be part of primitive types. Put them in base binary 
types perhaps, or in a new enumeration?



##########
cpp/src/arrow/integration/json_internal.cc:
##########
@@ -106,6 +107,13 @@ std::string GetTimeUnitName(TimeUnit::type unit) {
   return "UNKNOWN";
 }
 
+Result<std::string_view> GetStringView(const rj::Value& str) {
+  if (!str.IsString()) {
+    return Status::Invalid("field was not a string line ", __LINE__);

Review Comment:
   `__LINE__` will just give you the current line inside this function, which 
doesn't seem very useful (it might be if this were a macro). 



##########
cpp/src/arrow/util/binary_view_util.h:
##########
@@ -0,0 +1,95 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string_view>
+#include <utility>
+
+#include "arrow/type.h"
+#include "arrow/util/span.h"
+
+namespace arrow::util {
+
+inline BinaryViewType::c_type ToInlineBinaryView(const void* data, int32_t 
size) {
+  // Small string: inlined. Bytes beyond size are zeroed
+  BinaryViewType::c_type out;
+  out.inlined = {size, {}};
+  memcpy(&out.inlined.data, data, size);
+  return out;
+}
+
+inline BinaryViewType::c_type ToInlineBinaryView(std::string_view v) {
+  return ToInlineBinaryView(v.data(), static_cast<int32_t>(v.size()));
+}
+
+inline BinaryViewType::c_type ToBinaryView(const void* data, int32_t size,
+                                           int32_t buffer_index, int32_t 
offset) {
+  if (size <= BinaryViewType::kInlineSize) {
+    return ToInlineBinaryView(data, size);
+  }
+
+  // Large string: store index/offset.
+  BinaryViewType::c_type out;
+  out.ref = {size, {}, buffer_index, offset};
+  memcpy(&out.ref.prefix, data, sizeof(out.ref.prefix));
+  return out;
+}
+
+inline BinaryViewType::c_type ToBinaryView(std::string_view v, int32_t 
buffer_index,
+                                           int32_t offset) {
+  return ToBinaryView(v.data(), static_cast<int32_t>(v.size()), buffer_index, 
offset);
+}
+
+template <typename BufferPtr>
+std::string_view FromBinaryView(const BinaryViewType::c_type& v,
+                                const BufferPtr* data_buffers) {
+  auto* data = v.is_inline() ? v.inlined.data.data()
+                             : data_buffers[v.ref.buffer_index]->data() + 
v.ref.offset;
+  return {reinterpret_cast<const char*>(data), static_cast<size_t>(v.size())};
+}
+template <typename BufferPtr>
+std::string_view FromBinaryView(BinaryViewType::c_type&&, const BufferPtr*) = 
delete;
+
+template <typename BufferPtr>
+bool EqualBinaryView(BinaryViewType::c_type l, BinaryViewType::c_type r,
+                     const BufferPtr* l_buffers, const BufferPtr* r_buffers) {
+  int64_t l_size_and_prefix, r_size_and_prefix;
+  memcpy(&l_size_and_prefix, &l, sizeof(l_size_and_prefix));
+  memcpy(&r_size_and_prefix, &r, sizeof(r_size_and_prefix));
+
+  if (l_size_and_prefix != r_size_and_prefix) return false;
+
+  if (l.is_inline()) {
+    // The inline part is zeroed at construction, so we can compare
+    // a word at a time if data extends past 'prefix_'.

Review Comment:
   ```suggestion
       // The columnar spec mandates that the inlined part be zero-padded,
       // so we can compare a word at a time regardless of the exact size.
   ```



##########
cpp/src/arrow/array/builder_binary.h:
##########
@@ -463,6 +464,248 @@ class ARROW_EXPORT LargeStringBuilder : public 
LargeBinaryBuilder {
   std::shared_ptr<DataType> type() const override { return large_utf8(); }
 };
 
+// ----------------------------------------------------------------------
+// BinaryViewBuilder, StringViewBuilder
+//
+// These builders do not support building raw pointer view arrays.
+
+namespace internal {
+
+// We allocate medium-sized memory chunks and accumulate data in those, which
+// may result in some waste if there are many large-ish strings. If a string
+// comes along that does not fit into a block, we allocate a new block and
+// write into that.
+//
+// Later we can implement optimizations to continuing filling underfull blocks
+// after encountering a large string that required allocating a new block.
+class ARROW_EXPORT StringHeapBuilder {
+ public:
+  static constexpr int64_t kDefaultBlocksize = 32 << 10;  // 32KB
+
+  StringHeapBuilder(MemoryPool* pool, int64_t alignment)
+      : pool_(pool), alignment_(alignment) {}
+
+  void SetBlockSize(int64_t blocksize) { blocksize_ = blocksize; }
+
+  using c_type = BinaryViewType::c_type;
+
+  template <bool Safe>
+  std::conditional_t<Safe, Result<c_type>, c_type> Append(const uint8_t* value,
+                                                          int64_t length) {
+    if (length <= BinaryViewType::kInlineSize) {
+      return util::ToInlineBinaryView(value, static_cast<int32_t>(length));
+    }
+
+    if constexpr (Safe) {
+      ARROW_RETURN_NOT_OK(Reserve(length));
+    }
+
+    auto v =
+        util::ToBinaryView(value, static_cast<int32_t>(length),
+                           static_cast<int32_t>(blocks_.size() - 1), 
current_offset_);
+
+    memcpy(current_out_buffer_, value, static_cast<size_t>(length));
+    current_out_buffer_ += length;
+    current_remaining_bytes_ -= length;
+    current_offset_ += static_cast<int32_t>(length);
+    return v;
+  }
+
+  static constexpr int64_t ValueSizeLimit() {
+    return std::numeric_limits<int32_t>::max();
+  }
+
+  /// \brief Ensure that the indicated number of bytes can be appended via
+  /// UnsafeAppend operations without the need to allocate more memory
+  Status Reserve(int64_t num_bytes) {
+    if (ARROW_PREDICT_FALSE(num_bytes > ValueSizeLimit())) {
+      return Status::CapacityError(
+          "BinaryView or StringView elements cannot reference "
+          "strings larger than 2GB");
+    }
+    if (num_bytes > current_remaining_bytes_) {
+      // Ensure the buffer is fully overwritten to avoid leaking uninitialized
+      // bytes from the allocator
+      if (current_remaining_bytes_ > 0) {
+        std::memset(current_out_buffer_, 0, current_remaining_bytes_);
+        blocks_.back() = SliceBuffer(blocks_.back(), 0,
+                                     blocks_.back()->size() - 
current_remaining_bytes_);
+      }
+      current_remaining_bytes_ = num_bytes > blocksize_ ? num_bytes : 
blocksize_;
+      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> new_block,
+                            AllocateBuffer(current_remaining_bytes_, 
alignment_, pool_));
+      current_offset_ = 0;
+      current_out_buffer_ = new_block->mutable_data();
+      blocks_.emplace_back(std::move(new_block));
+    }
+    return Status::OK();
+  }
+
+  void Reset() {
+    current_offset_ = 0;
+    current_out_buffer_ = NULLPTR;
+    current_remaining_bytes_ = 0;
+    blocks_.clear();
+  }
+
+  int64_t current_remaining_bytes() const { return current_remaining_bytes_; }
+
+  std::vector<std::shared_ptr<Buffer>> Finish() {
+    current_offset_ = 0;
+    current_out_buffer_ = NULLPTR;
+    current_remaining_bytes_ = 0;
+    return std::move(blocks_);
+  }
+
+ private:
+  MemoryPool* pool_;
+  int64_t alignment_;
+  int64_t blocksize_ = kDefaultBlocksize;
+  std::vector<std::shared_ptr<Buffer>> blocks_;
+
+  int32_t current_offset_ = 0;
+  uint8_t* current_out_buffer_ = NULLPTR;
+  int64_t current_remaining_bytes_ = 0;
+};
+
+}  // namespace internal
+
+class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
+ public:
+  using TypeClass = BinaryViewType;
+
+  // this constructor provided for MakeBuilder compatibility
+  BinaryViewBuilder(const std::shared_ptr<DataType>&, MemoryPool* pool);
+
+  explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(),
+                             int64_t alignment = kDefaultBufferAlignment)
+      : ArrayBuilder(pool, alignment),
+        data_builder_(pool, alignment),
+        data_heap_builder_(pool, alignment) {}
+
+  void SetBlockSize(int64_t blocksize) { 
data_heap_builder_.SetBlockSize(blocksize); }
+
+  int64_t current_block_bytes_remaining() const {

Review Comment:
   Can you add docstrings to these two methods?



##########
cpp/src/arrow/array/concatenate_test.cc:
##########
@@ -155,6 +161,13 @@ TEST_F(ConcatenateTest, StringType) {
   });
 }
 
+TEST_F(ConcatenateTest, StringViewType) {
+  Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* 
out) {
+    *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/40, 
null_probability);

Review Comment:
   Perhaps pass a small `max_data_buffer_length` here? (such as 200)



##########
cpp/src/arrow/type_traits.h:
##########
@@ -639,10 +681,9 @@ template <typename T, typename R = void>
 using enable_if_fixed_width_type = enable_if_t<is_fixed_width_type<T>::value, 
R>;
 
 template <typename T>
-using is_binary_like_type =
-    std::integral_constant<bool, (is_base_binary_type<T>::value &&
-                                  !is_string_like_type<T>::value) ||
-                                     is_fixed_size_binary_type<T>::value>;
+using is_binary_like_type = std::integral_constant<
+    bool, (is_base_binary_type<T>::value && !is_string_like_type<T>::value) ||
+              is_binary_view_type<T>::value || 
is_fixed_size_binary_type<T>::value>;

Review Comment:
   The question is whether these are used by third-party code. These are not 
internal APIs, and there is a risk that user code will then try to use binary 
view data in non-compatible routines, and get UB or data corruption.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to