[GitHub] [arrow] pitrou commented on a diff in pull request #37792: GH-37710: [C++][Integration] Add C++ Utf8View implementation

via GitHub Mon, 25 Sep 2023 09:04:43 -0700


pitrou commented on code in PR #37792:
URL: https://github.com/apache/arrow/pull/37792#discussion_r1336010665



##########
cpp/src/arrow/type.h:
##########
@@ -710,6 +717,120 @@ class ARROW_EXPORT BinaryType : public BaseBinaryType {
   explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) 
{}
 };
 
+/// \brief Concrete type class for variable-size binary view data
+class ARROW_EXPORT BinaryViewType : public DataType {
+ public:
+  static constexpr Type::type type_id = Type::BINARY_VIEW;
+  static constexpr bool is_utf8 = false;
+  using PhysicalType = BinaryViewType;
+
+  static constexpr int kSize = 16;
+  static constexpr int kInlineSize = 12;
+  static constexpr int kPrefixSize = 4;
+
+  /// Variable length string or binary with inline optimization for small 
values (12 bytes
+  /// or fewer). This is similar to std::string_view except limited in size to 
INT32_MAX
+  /// and at least the first four bytes of the string are copied inline 
(accessible
+  /// without pointer dereference). This inline prefix allows failing 
comparisons early
+  /// and can reduce the CPU cache working set when dealing with short strings.
+  ///
+  /// This union supports three states:
+  ///
+  /// Short string   |----|----|--------|
+  ///                 ^    ^      ^
+  ///                 |    |      |
+  ///                 size prefix remaining in-line portion, zero padded
+  ///
+  /// Long string    |----|----|--------|
+  ///                 ^    ^      ^
+  ///                 |    |      |
+  ///                 size prefix raw pointer to out-of-line portion

Review Comment:
   I honestly don't understand what this is doing in this PR.
   Arrow C++ implements the Arrow format as specified in the Arrow columnar 
format. We should not introduce confusion with other formats.



##########
cpp/src/arrow/util/binary_view_util.h:
##########
@@ -0,0 +1,163 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string_view>
+#include <utility>
+
+#include "arrow/type.h"
+#include "arrow/util/assume_aligned.h"
+#include "arrow/util/span.h"
+
+namespace arrow::util {
+
+inline BinaryViewType::c_type ToInlineBinaryView(const void* data, int32_t 
size) {
+  // Small string: inlined. Bytes beyond size are zeroed
+  BinaryViewType::c_type out;
+  out.inlined = {size, {}};
+  memcpy(&out.inlined.data, data, size);
+  return out;
+}
+
+inline BinaryViewType::c_type ToInlineBinaryView(std::string_view v) {
+  return ToInlineBinaryView(v.data(), static_cast<int32_t>(v.size()));
+}
+
+inline BinaryViewType::c_type ToRawPointerBinaryView(const void* data, int32_t 
size) {
+  if (size <= BinaryViewType::kInlineSize) {
+    return ToInlineBinaryView(data, size);
+  }
+
+  // Large string: store pointer.
+  BinaryViewType::c_type out;
+  out.raw = {size, {}, static_cast<const uint8_t*>(data)};
+  memcpy(&out.raw.prefix, data, sizeof(out.raw.prefix));
+  return out;
+}
+
+inline BinaryViewType::c_type ToRawPointerBinaryView(std::string_view v) {
+  return ToRawPointerBinaryView(v.data(), static_cast<int32_t>(v.size()));
+}
+
+inline BinaryViewType::c_type ToIndexOffsetBinaryView(const void* data, 
int32_t size,
+                                                      int32_t buffer_index,
+                                                      int32_t offset) {
+  if (size <= BinaryViewType::kInlineSize) {
+    return ToInlineBinaryView(data, size);
+  }
+
+  // Large string: store index/offset.
+  BinaryViewType::c_type out;
+  out.io = {size, {}, buffer_index, offset};
+  memcpy(&out.raw.prefix, data, sizeof(out.raw.prefix));
+  return out;
+}
+
+inline BinaryViewType::c_type ToIndexOffsetBinaryView(std::string_view v,
+                                                      int32_t buffer_index,
+                                                      int32_t offset) {
+  return ToIndexOffsetBinaryView(v.data(), static_cast<int32_t>(v.size()), 
buffer_index,
+                                 offset);
+}
+
+inline std::string_view FromRawPointerBinaryView(const BinaryViewType::c_type& 
v) {
+  auto* data = v.is_inline() ? v.inlined.data.data() : v.raw.data;
+  return {reinterpret_cast<const char*>(data), static_cast<size_t>(v.size())};
+}
+inline std::string_view FromRawPointerBinaryView(BinaryViewType::c_type&&) = 
delete;
+
+template <typename BufferPtr>
+std::string_view FromIndexOffsetBinaryView(const BinaryViewType::c_type& v,
+                                           const BufferPtr* data_buffers) {
+  auto* data = v.is_inline() ? v.inlined.data.data()
+                             : data_buffers[v.io.buffer_index]->data() + 
v.io.offset;
+  return {reinterpret_cast<const char*>(data), static_cast<size_t>(v.size())};
+}
+template <typename BufferPtr>
+std::string_view FromIndexOffsetBinaryView(BinaryViewType::c_type&&,
+                                           const BufferPtr*) = delete;
+
+inline int MemcmpRawPointerBinaryViewSkipPrefix(int32_t size, 
BinaryViewType::c_type l,
+                                                BinaryViewType::c_type r) {
+  return memcmp(l.raw.data + BinaryViewType::kPrefixSize,  //
+                r.raw.data + BinaryViewType::kPrefixSize,  //
+                size - BinaryViewType::kPrefixSize);
+}
+
+template <typename BufferPtr>
+inline int MemcmpIndexOffsetBinaryViewSkipPrefix(int32_t size, 
BinaryViewType::c_type l,
+                                                 BinaryViewType::c_type r,
+                                                 const BufferPtr* l_buffers,
+                                                 const BufferPtr* r_buffers) {
+  return memcmp(
+      l_buffers[l.io.buffer_index]->data() + l.io.offset + 
BinaryViewType::kPrefixSize,
+      r_buffers[r.io.buffer_index]->data() + r.io.offset + 
BinaryViewType::kPrefixSize,
+      size - BinaryViewType::kPrefixSize);
+}
+
+template <typename... BufferPtr>
+bool EqualBinaryViewImpl(BinaryViewType::c_type l, BinaryViewType::c_type r,
+                         const BufferPtr*... buffers) {
+  int64_t l_size_and_prefix, r_size_and_prefix;
+  memcpy(&l_size_and_prefix, &l, sizeof(l_size_and_prefix));
+  memcpy(&r_size_and_prefix, &r, sizeof(r_size_and_prefix));
+
+  if (l_size_and_prefix != r_size_and_prefix) return false;
+
+  if (l.is_inline()) {
+    // The inline part is zeroed at construction, so we can compare
+    // a word at a time if data extends past 'prefix_'.
+    int64_t l_inlined, r_inlined;
+    memcpy(&l_inlined,
+           AssumeAlignedAs<int64_t>(l.inline_data() + 
BinaryViewType::kPrefixSize),

Review Comment:
   Hmm, what is the point of `AssumeAlignedAs` here? `memcpy` should work 
perfectly well without it.



##########
cpp/src/arrow/testing/random.h:
##########
@@ -367,6 +367,22 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
                                 int64_t alignment = kDefaultBufferAlignment,
                                 MemoryPool* memory_pool = 
default_memory_pool());
 
+  /// \brief Generate a random StringViewArray
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min_length the lower bound of the string length
+  ///            determined by the uniform distribution
+  /// \param[in] max_length the upper bound of the string length
+  ///            determined by the uniform distribution
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] null_probability the probability of a value being null
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> StringView(int64_t size, int32_t min_length, int32_t 
max_length,
+                                    double null_probability = 0,
+                                    int64_t alignment = 
kDefaultBufferAlignment,

Review Comment:
   Perhaps let the user choose whether this will generate several buffers?
   For example `int32_t num_data_buffers = 1`, or `std::optional<int64_t> 
max_data_buffer_length = {}`?



##########
cpp/src/arrow/testing/random.cc:
##########
@@ -430,6 +429,15 @@ std::shared_ptr<Array> 
RandomArrayGenerator::BinaryWithRepeats(
   return *strings->View(binary());
 }
 
+std::shared_ptr<Array> RandomArrayGenerator::StringView(int64_t size, int32_t 
min_length,
+                                                        int32_t max_length,
+                                                        double 
null_probability,
+                                                        int64_t alignment,
+                                                        MemoryPool* 
memory_pool) {
+  return GenerateBinaryArray<StringViewType, uint32_t>(

Review Comment:
   `uint32_t` doesn't sound quite right? 
   Also, ideally this would generate an array with several variadic buffers, 
since this is used for testing.
   



##########
cpp/src/arrow/ipc/metadata_internal.cc:
##########
@@ -820,7 +846,7 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, 
FieldPosition field_pos,
     dictionary_id = encoding->id();
   }
 
-  // 4. Is it an extension type?
+  // 4. Is it an extension or view type?

Review Comment:
   Why this addition?



##########
cpp/src/arrow/util/string.cc:
##########
@@ -90,6 +90,16 @@ Status ParseHexValue(const char* data, uint8_t* out) {
   return Status::OK();
 }
 
+Status ParseHexValues(std::string_view hex_string, uint8_t* out) {
+  if (hex_string.size() % 2 != 0) {
+    return Status::Invalid("Expected base16 hex string");
+  }
+  for (size_t j = 0; j < hex_string.size() / 2; ++j) {
+    RETURN_NOT_OK(ParseHexValue(hex_string.data() + j * 2, out + j));

Review Comment:
   Uh, `ParseHexValue` is probably quite slow since it uses `std::lower_bound` 
which is far from an optimal implementation of hex number parsing. Now we're 
calling `ParseHexValue` on potentially large data. Perhaps we can do better?
   



##########
cpp/src/arrow/util/binary_view_util.h:
##########
@@ -0,0 +1,163 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string_view>
+#include <utility>
+
+#include "arrow/type.h"
+#include "arrow/util/assume_aligned.h"
+#include "arrow/util/span.h"
+
+namespace arrow::util {
+
+inline BinaryViewType::c_type ToInlineBinaryView(const void* data, int32_t 
size) {
+  // Small string: inlined. Bytes beyond size are zeroed
+  BinaryViewType::c_type out;
+  out.inlined = {size, {}};
+  memcpy(&out.inlined.data, data, size);
+  return out;
+}
+
+inline BinaryViewType::c_type ToInlineBinaryView(std::string_view v) {
+  return ToInlineBinaryView(v.data(), static_cast<int32_t>(v.size()));
+}
+
+inline BinaryViewType::c_type ToRawPointerBinaryView(const void* data, int32_t 
size) {
+  if (size <= BinaryViewType::kInlineSize) {
+    return ToInlineBinaryView(data, size);
+  }
+
+  // Large string: store pointer.
+  BinaryViewType::c_type out;
+  out.raw = {size, {}, static_cast<const uint8_t*>(data)};
+  memcpy(&out.raw.prefix, data, sizeof(out.raw.prefix));
+  return out;
+}
+
+inline BinaryViewType::c_type ToRawPointerBinaryView(std::string_view v) {
+  return ToRawPointerBinaryView(v.data(), static_cast<int32_t>(v.size()));
+}
+
+inline BinaryViewType::c_type ToIndexOffsetBinaryView(const void* data, 
int32_t size,
+                                                      int32_t buffer_index,
+                                                      int32_t offset) {
+  if (size <= BinaryViewType::kInlineSize) {
+    return ToInlineBinaryView(data, size);
+  }
+
+  // Large string: store index/offset.
+  BinaryViewType::c_type out;
+  out.io = {size, {}, buffer_index, offset};
+  memcpy(&out.raw.prefix, data, sizeof(out.raw.prefix));
+  return out;
+}
+
+inline BinaryViewType::c_type ToIndexOffsetBinaryView(std::string_view v,
+                                                      int32_t buffer_index,
+                                                      int32_t offset) {
+  return ToIndexOffsetBinaryView(v.data(), static_cast<int32_t>(v.size()), 
buffer_index,
+                                 offset);
+}
+
+inline std::string_view FromRawPointerBinaryView(const BinaryViewType::c_type& 
v) {
+  auto* data = v.is_inline() ? v.inlined.data.data() : v.raw.data;
+  return {reinterpret_cast<const char*>(data), static_cast<size_t>(v.size())};
+}
+inline std::string_view FromRawPointerBinaryView(BinaryViewType::c_type&&) = 
delete;
+
+template <typename BufferPtr>
+std::string_view FromIndexOffsetBinaryView(const BinaryViewType::c_type& v,
+                                           const BufferPtr* data_buffers) {
+  auto* data = v.is_inline() ? v.inlined.data.data()
+                             : data_buffers[v.io.buffer_index]->data() + 
v.io.offset;
+  return {reinterpret_cast<const char*>(data), static_cast<size_t>(v.size())};
+}
+template <typename BufferPtr>
+std::string_view FromIndexOffsetBinaryView(BinaryViewType::c_type&&,
+                                           const BufferPtr*) = delete;
+
+inline int MemcmpRawPointerBinaryViewSkipPrefix(int32_t size, 
BinaryViewType::c_type l,
+                                                BinaryViewType::c_type r) {
+  return memcmp(l.raw.data + BinaryViewType::kPrefixSize,  //
+                r.raw.data + BinaryViewType::kPrefixSize,  //
+                size - BinaryViewType::kPrefixSize);
+}
+
+template <typename BufferPtr>
+inline int MemcmpIndexOffsetBinaryViewSkipPrefix(int32_t size, 
BinaryViewType::c_type l,
+                                                 BinaryViewType::c_type r,
+                                                 const BufferPtr* l_buffers,
+                                                 const BufferPtr* r_buffers) {
+  return memcmp(
+      l_buffers[l.io.buffer_index]->data() + l.io.offset + 
BinaryViewType::kPrefixSize,
+      r_buffers[r.io.buffer_index]->data() + r.io.offset + 
BinaryViewType::kPrefixSize,
+      size - BinaryViewType::kPrefixSize);
+}
+
+template <typename... BufferPtr>
+bool EqualBinaryViewImpl(BinaryViewType::c_type l, BinaryViewType::c_type r,
+                         const BufferPtr*... buffers) {
+  int64_t l_size_and_prefix, r_size_and_prefix;
+  memcpy(&l_size_and_prefix, &l, sizeof(l_size_and_prefix));
+  memcpy(&r_size_and_prefix, &r, sizeof(r_size_and_prefix));
+
+  if (l_size_and_prefix != r_size_and_prefix) return false;
+
+  if (l.is_inline()) {
+    // The inline part is zeroed at construction, so we can compare
+    // a word at a time if data extends past 'prefix_'.
+    int64_t l_inlined, r_inlined;
+    memcpy(&l_inlined,
+           AssumeAlignedAs<int64_t>(l.inline_data() + 
BinaryViewType::kPrefixSize),

Review Comment:
   Also, you could simply use `memcmp` here...



##########
cpp/src/arrow/type.h:
##########
@@ -710,6 +717,120 @@ class ARROW_EXPORT BinaryType : public BaseBinaryType {
   explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) 
{}
 };
 
+/// \brief Concrete type class for variable-size binary view data
+class ARROW_EXPORT BinaryViewType : public DataType {
+ public:
+  static constexpr Type::type type_id = Type::BINARY_VIEW;
+  static constexpr bool is_utf8 = false;
+  using PhysicalType = BinaryViewType;
+
+  static constexpr int kSize = 16;
+  static constexpr int kInlineSize = 12;
+  static constexpr int kPrefixSize = 4;
+
+  /// Variable length string or binary with inline optimization for small 
values (12 bytes
+  /// or fewer). This is similar to std::string_view except limited in size to 
INT32_MAX
+  /// and at least the first four bytes of the string are copied inline 
(accessible
+  /// without pointer dereference). This inline prefix allows failing 
comparisons early
+  /// and can reduce the CPU cache working set when dealing with short strings.
+  ///
+  /// This union supports three states:
+  ///
+  /// Short string   |----|----|--------|
+  ///                 ^    ^      ^
+  ///                 |    |      |
+  ///                 size prefix remaining in-line portion, zero padded
+  ///
+  /// Long string    |----|----|--------|
+  ///                 ^    ^      ^
+  ///                 |    |      |
+  ///                 size prefix raw pointer to out-of-line portion
+  ///
+  /// IO Long string |----|----|----|----|

Review Comment:
   It does not make sense to call this "IO". It's a perfectly reasonable 
representation even outside of IO considerations.



##########
cpp/src/arrow/type.h:
##########
@@ -710,6 +717,120 @@ class ARROW_EXPORT BinaryType : public BaseBinaryType {
   explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) 
{}
 };
 
+/// \brief Concrete type class for variable-size binary view data
+class ARROW_EXPORT BinaryViewType : public DataType {
+ public:
+  static constexpr Type::type type_id = Type::BINARY_VIEW;
+  static constexpr bool is_utf8 = false;
+  using PhysicalType = BinaryViewType;
+
+  static constexpr int kSize = 16;
+  static constexpr int kInlineSize = 12;
+  static constexpr int kPrefixSize = 4;
+
+  /// Variable length string or binary with inline optimization for small 
values (12 bytes
+  /// or fewer). This is similar to std::string_view except limited in size to 
INT32_MAX
+  /// and at least the first four bytes of the string are copied inline 
(accessible
+  /// without pointer dereference). This inline prefix allows failing 
comparisons early
+  /// and can reduce the CPU cache working set when dealing with short strings.
+  ///
+  /// This union supports three states:
+  ///
+  /// Short string   |----|----|--------|
+  ///                 ^    ^      ^
+  ///                 |    |      |
+  ///                 size prefix remaining in-line portion, zero padded
+  ///
+  /// Long string    |----|----|--------|
+  ///                 ^    ^      ^
+  ///                 |    |      |
+  ///                 size prefix raw pointer to out-of-line portion
+  ///
+  /// IO Long string |----|----|----|----|
+  ///                 ^    ^      ^     ^
+  ///                 |    |      |     `----------.
+  ///                 size prefix buffer index and offset to out-of-line 
portion
+  ///
+  /// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+  ///
+  /// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+  ///
+  /// There is no way to determine from a non-inline view whether it refers
+  /// to its out-of-line portion with a raw pointer or with index/offset. This
+  /// information is stored at the column level; so a buffer will contain only
+  /// inline and index/offset views OR only inline and raw pointer views.
+  ///
+  /// Alignment to 64 bits enables loading the size and prefix into a single
+  /// 64 bit integer, which is useful to the comparison fast path.

Review Comment:
   Alignment to 64 bits is not needed to "enable" this, so you might rephrase.



##########
cpp/src/arrow/testing/gtest_util.h:
##########
@@ -176,10 +176,17 @@ using DecimalArrowTypes = 
::testing::Types<Decimal128Type, Decimal256Type>;
 using BaseBinaryArrowTypes =
     ::testing::Types<BinaryType, LargeBinaryType, StringType, LargeStringType>;
 
+using BaseBinaryOrBinaryViewLikeArrowTypes =

Review Comment:
   Just a nit, but I think the name might be a bit simpler?
   ```suggestion
   using BaseBinaryOrBinaryViewArrowTypes =
   ```



##########
cpp/src/arrow/json/converter.cc:
##########
@@ -304,6 +304,8 @@ Status MakeConverter(const std::shared_ptr<DataType>& 
out_type, MemoryPool* pool
     CONVERTER_CASE(Type::STRING, BinaryConverter<StringType>);
     CONVERTER_CASE(Type::LARGE_BINARY, BinaryConverter<LargeBinaryType>);
     CONVERTER_CASE(Type::LARGE_STRING, BinaryConverter<LargeStringType>);
+    CONVERTER_CASE(Type::BINARY_VIEW, BinaryConverter<BinaryViewType>);
+    CONVERTER_CASE(Type::STRING_VIEW, BinaryConverter<StringViewType>);

Review Comment:
   Is this tested implicitly or do you need to add tests for this?



##########
cpp/src/arrow/integration/json_internal.cc:
##########
@@ -642,6 +648,50 @@ class ArrayWriter {
     writer_->EndArray();
   }
 
+  template <typename ArrayType>
+  void WriteBinaryViewField(const ArrayType& array) {
+    writer_->Key("VIEWS");
+    writer_->StartArray();
+    for (int64_t i = 0; i < array.length(); ++i) {
+      auto s = array.raw_values()[i];
+      writer_->StartObject();
+      writer_->Key("SIZE");
+      writer_->Int64(s.size());
+      if (s.is_inline()) {
+        writer_->Key("INLINED");
+        if constexpr (ArrayType::TypeClass::is_utf8) {
+          writer_->String(reinterpret_cast<const char*>(s.inline_data()), 
s.size());
+        } else {
+          writer_->String(HexEncode(s.inline_data(), s.size()));
+        }
+      } else {
+        // Prefix is always 4 bytes so it may not be utf-8 even if the whole
+        // string view is
+        writer_->Key("PREFIX_HEX");
+        writer_->String(HexEncode(s.inline_data(), 
BinaryViewType::kPrefixSize));
+        writer_->Key("BUFFER_INDEX");
+        writer_->Int64(s.io.buffer_index);
+        writer_->Key("OFFSET");
+        writer_->Int64(s.io.offset);
+      }
+      writer_->EndObject();
+    }

Review Comment:
   So, this is an array-of-structures encoding, e.g.:
   ```json
   {"VIEWS": [
       {"SIZE": 3, "INLINED": "foo"},
       {"SIZE": 3, "INLINED": "bar"},
       {"SIZE": 20, "PREFIX_HEX": "41424344", "BUFFER_INDEX": 1, "OFFSET": 0},
       {"SIZE": 13, "PREFIX_HEX": "45464748", "BUFFER_INDEX": 1, "OFFSET": 20}
   ]}
   ```
   
   Instead, it could be a structure-of-arrays encoding, e.g.:
   ```json
   {"VIEWS": {
       "SIZE": [3, 3, 20, 13],
       "INLINED": ["foo", "bar", null, null],
       "PREFIX_HEX": [null, null, "41424344", "45464748"],
       "BUFFER_INDEX": [null, null, 1, 1],
       "OFFSET": [null, null, 0, 20]
   }}
   ```
   
   Probably not a big deal, but the second form is probably more compact and 
more efficient to parse/decode.
   



##########
cpp/src/arrow/array/builder_binary.h:
##########
@@ -463,6 +464,238 @@ class ARROW_EXPORT LargeStringBuilder : public 
LargeBinaryBuilder {
   std::shared_ptr<DataType> type() const override { return large_utf8(); }
 };
 
+// ----------------------------------------------------------------------
+// BinaryViewBuilder, StringViewBuilder
+//
+// These builders do not support building raw pointer view arrays.
+
+namespace internal {
+
+// We allocate medium-sized memory chunks and accumulate data in those, which
+// may result in some waste if there are many large-ish strings. If a string
+// comes along that does not fit into a block, we allocate a new block and
+// write into that.
+//
+// Later we can implement optimizations to continuing filling underfull blocks
+// after encountering a large string that required allocating a new block.
+class ARROW_EXPORT StringHeapBuilder {
+ public:
+  static constexpr int64_t kDefaultBlocksize = 32 << 10;  // 32KB

Review Comment:
   Two things:
   
   1. I'm not sure what the rationale is for 32 KB, which seems a rather small 
value. When you're sending such small data buffers over the wire (e.g. Flight), 
compression efficiency is reduced compared to larger buffers. 
   
   2. In any case, I'm not sure `BinaryViewBuilder` should do any chunking 
implicitly. I understand that this can avoid reallocs when the total data size 
is not known upfront, but conversely this will break prefetching when reading 
from the array, since each new data buffer is an unpredictable data pointer. 
Whether the former or the latter is a more important concern depends on the use 
case, but intuitively many applications are write-once read-many.
   
   @lidavidm 
   



##########
cpp/src/arrow/array/util.cc:
##########
@@ -271,6 +274,17 @@ class ArrayDataEndianSwapper {
     return Status::OK();
   }
 
+  Status Visit(const BinaryViewType& type) {
+    if (type.has_raw_pointers()) {
+      return Status::Invalid("Swapping endianness of ", type);
+    }
+
+    // XXX: This requires knowledge of whether the array is being swapped to 
native endian
+    // or from it so that we know what size to trust when deciding whether 
something is an
+    // inline view.

Review Comment:
   Can you create an issue for this?



##########
cpp/src/arrow/ipc/writer.h:
##########
@@ -57,6 +57,7 @@ struct IpcPayload {
   MessageType type = MessageType::NONE;
   std::shared_ptr<Buffer> metadata;
   std::vector<std::shared_ptr<Buffer>> body_buffers;
+  std::vector<int64_t> variadic_counts;

Review Comment:
   `variadic_buffer_counts` perhaps?



##########
cpp/src/arrow/array/concatenate.cc:
##########
@@ -606,6 +641,10 @@ Result<std::shared_ptr<Array>> Concatenate(const 
ArrayVector& arrays, MemoryPool
     return Status::Invalid("Must pass at least one array");
   }
 
+  if (arrays.size() == 1) {
+    return arrays[0];
+  }

Review Comment:
   As already said on another PR, we routinely recommend `Concatenate` as the 
idiomatic way to compact a sliced array, so we can't make this change.
   
   (we should even add a test to ensure that the idiom still works)



##########
cpp/src/arrow/integration/json_internal.cc:
##########
@@ -1332,6 +1405,97 @@ class ArrayReader {
     return FinishBuilder(&builder);
   }
 
+  template <typename ViewType>
+  enable_if_binary_view_like<ViewType, Status> Visit(const ViewType& type) {
+    ARROW_ASSIGN_OR_RAISE(const auto json_views, GetDataArray(obj_, "VIEWS"));
+    ARROW_ASSIGN_OR_RAISE(const auto json_variadic_bufs,
+                          GetMemberArray(obj_, "VARIADIC_DATA_BUFFERS"));
+
+    using internal::Zip;
+    using util::span;
+
+    BufferVector buffers;
+    buffers.resize(json_variadic_bufs.Size() + 2);
+    for (auto [json_buf, buf] : Zip(json_variadic_bufs, 
span{buffers}.subspan(2))) {
+      auto hex_string = GetStringView(json_buf);
+      ARROW_ASSIGN_OR_RAISE(
+          buf, AllocateBuffer(static_cast<int64_t>(hex_string.size()) / 2, 
pool_));
+      RETURN_NOT_OK(ParseHexValues(hex_string, buf->mutable_data()));
+    }
+
+    TypedBufferBuilder<bool> validity_builder{pool_};
+    RETURN_NOT_OK(validity_builder.Resize(length_));
+    for (bool is_valid : is_valid_) {
+      validity_builder.UnsafeAppend(is_valid);
+    }
+    ARROW_ASSIGN_OR_RAISE(buffers[0], validity_builder.Finish());
+
+    ARROW_ASSIGN_OR_RAISE(
+        buffers[1], AllocateBuffer(length_ * sizeof(BinaryViewType::c_type), 
pool_));
+
+    span views{buffers[1]->mutable_data_as<BinaryViewType::c_type>(),
+               static_cast<size_t>(length_)};

Review Comment:
   Why not add `Buffer::span_as` and `Buffer:mutable_span_as`?



##########
cpp/src/arrow/array/builder_dict.h:
##########
@@ -115,6 +121,10 @@ class ARROW_EXPORT DictionaryMemoTable {
   Status GetOrInsert(const BinaryType*, std::string_view value, int32_t* out);
   Status GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* 
out);
 
+  // TODO: Consider working BinaryViewType::c_type throughout the hashing 
machinery to
+  // benefit from faster comparisons, reduced need to allocate memory

Review Comment:
   I don't really understand what this TODO alludes to, do you mean computing 
the hash value from the prefix only?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] pitrou commented on a diff in pull request #37792: GH-37710: [C++][Integration] Add C++ Utf8View implementation

Reply via email to