bkietz commented on code in PR #37792:
URL: https://github.com/apache/arrow/pull/37792#discussion_r1336235024


##########
cpp/src/arrow/type.h:
##########
@@ -710,6 +717,120 @@ class ARROW_EXPORT BinaryType : public BaseBinaryType {
   explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {}
 };
 
+/// \brief Concrete type class for variable-size binary view data
+class ARROW_EXPORT BinaryViewType : public DataType {
+ public:
+  static constexpr Type::type type_id = Type::BINARY_VIEW;
+  static constexpr bool is_utf8 = false;
+  using PhysicalType = BinaryViewType;
+
+  static constexpr int kSize = 16;
+  static constexpr int kInlineSize = 12;
+  static constexpr int kPrefixSize = 4;
+
+  /// Variable length string or binary with inline optimization for small values (12 bytes
+  /// or fewer). This is similar to std::string_view except limited in size to INT32_MAX
+  /// and at least the first four bytes of the string are copied inline (accessible
+  /// without pointer dereference). This inline prefix allows failing comparisons early
+  /// and can reduce the CPU cache working set when dealing with short strings.
+  ///
+  /// This union supports three states:
+  ///
+  /// Short string   |----|----|--------|
+  ///                 ^    ^      ^
+  ///                 |    |      |
+  ///                 size prefix remaining in-line portion, zero padded
+  ///
+  /// Long string    |----|----|--------|
+  ///                 ^    ^      ^
+  ///                 |    |      |
+  ///                 size prefix raw pointer to out-of-line portion
+  ///
+  /// IO Long string |----|----|----|----|
+  ///                 ^    ^      ^     ^
+  ///                 |    |      |     `----------.
+  ///                 size prefix buffer index and offset to out-of-line portion
+  ///
+  /// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+  ///
+  /// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+  ///
+  /// There is no way to determine from a non-inline view whether it refers
+  /// to its out-of-line portion with a raw pointer or with index/offset. This
+  /// information is stored at the column level; so a buffer will contain only
+  /// inline and index/offset views OR only inline and raw pointer views.
+  ///
+  /// Alignment to 64 bits enables loading the size and prefix into a single
+  /// 64 bit integer, which is useful to the comparison fast path.

Review Comment:
   Sorry, I meant an aligned load. I'll update the comment.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to