felipecrv commented on code in PR #37792:
URL: https://github.com/apache/arrow/pull/37792#discussion_r1333576730


##########
cpp/src/arrow/array/builder_binary.cc:
##########
@@ -40,6 +40,72 @@ namespace arrow {
 
 using internal::checked_cast;
 
+// ----------------------------------------------------------------------
+// Binary/StringView
+BinaryViewBuilder::BinaryViewBuilder(const std::shared_ptr<DataType>& type,
+                                     MemoryPool* pool)
+    : BinaryViewBuilder(pool) {
+  ARROW_CHECK(!checked_cast<const BinaryViewType&>(*type).has_raw_pointers());
+}
+
+Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t 
offset,
+                                           int64_t length) {
+  auto bitmap = array.GetValues<uint8_t>(0, 0);
+  auto values = array.GetValues<BinaryViewType::c_type>(1) + offset;
+
+  int64_t out_of_line_total = 0;
+  for (int64_t i = 0; i < length; i++) {
+    if (!values[i].is_inline()) {
+      out_of_line_total += static_cast<int64_t>(values[i].size());
+    }
+  }
+  RETURN_NOT_OK(Reserve(length));
+  RETURN_NOT_OK(ReserveData(out_of_line_total));
+
+  for (int64_t i = 0; i < length; i++) {
+    if (bitmap && !bit_util::GetBit(bitmap, array.offset + offset + i)) {
+      UnsafeAppendNull();
+      continue;
+    }
+
+    if (checked_cast<const BinaryViewType*>(array.type)->has_raw_pointers()) {
+      UnsafeAppend(util::FromRawPointerBinaryView(values[i]));
+    } else {
+      UnsafeAppend(util::FromIndexOffsetBinaryView(
+          values[i], array.buffers[2].data_as<std::shared_ptr<Buffer>>()));
+    }
+  }
+  return Status::OK();
+}
+
+Status BinaryViewBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  ARROW_ASSIGN_OR_RAISE(auto null_bitmap, 
null_bitmap_builder_.FinishWithLength(length_));
+  ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_));
+  BufferVector buffers = {null_bitmap, data};
+  for (auto&& buffer : data_heap_builder_.Finish()) {
+    buffers.push_back(std::move(buffer));
+  }
+  *out = ArrayData::Make(type(), length_, std::move(buffers), null_count_);
+  capacity_ = length_ = null_count_ = 0;

Review Comment:
   Shouldn't `Reset()` take care of this?



##########
cpp/src/arrow/type_traits.h:
##########
@@ -614,9 +637,28 @@ using is_string_type =
 template <typename T, typename R = void>
 using enable_if_string = enable_if_t<is_string_type<T>::value, R>;
 
+template <typename T>
+using is_binary_view_like_type = std::is_base_of<BinaryViewType, T>;

Review Comment:
   Is this really necessary used anywhere?



##########
cpp/src/arrow/array/validate.cc:
##########
@@ -453,7 +462,14 @@ struct ValidateArrayImpl {
       return Status::Invalid("Array length is negative");
     }
 
-    if (data.buffers.size() != layout.buffers.size()) {
+    if (layout.variadic_spec) {
+      if (data.buffers.size() < layout.buffers.size()) {
+        return Status::Invalid("Expected at least ", layout.buffers.size(),
+                               " buffers in array "
+                               "of type ",

Review Comment:
   these can fit in one line



##########
cpp/src/arrow/builder.cc:
##########
@@ -190,7 +197,12 @@ struct DictionaryBuilderCase {
 
 struct MakeBuilderImpl {
   template <typename T>
-  enable_if_not_nested<T, Status> Visit(const T&) {
+  enable_if_not_nested<T, Status> Visit(const T& t) {
+    if constexpr (is_binary_view_like_type<T>::value) {
+      if (t.has_raw_pointers()) {
+        return NotImplemented();
+      }

Review Comment:
   I think this answers my question above :)



##########
cpp/src/arrow/array/data.cc:
##########
@@ -351,6 +362,22 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
     }
     this->buffers[2].data = const_cast<uint8_t*>(data_buffer);
     this->buffers[2].size = data_size;
+  } else if (type_id == Type::BINARY_VIEW || type_id == Type::STRING_VIEW) {
+    const auto& scalar = checked_cast<const BaseBinaryScalar&>(value);
+
+    this->buffers[1].size = BinaryViewType::kSize;
+    this->buffers[1].data = scalar.scratch_space_;
+    auto* view = new (&scalar.scratch_space_) BinaryViewType::c_type;

Review Comment:
   It would be nice to have a static_assert here ensuring the scratch_space_ 
can fit the `c_type`.



##########
cpp/src/arrow/array/validate.cc:
##########
@@ -74,10 +76,7 @@ struct BoundsChecker {
   int64_t min_value;
   int64_t max_value;
 
-  Status Visit(const DataType&) {
-    // Default, should be unreachable
-    return Status::NotImplemented("");
-  }
+  Status Visit(const DataType&) { Unreachable("bounds checking of non integer 
type"); }

Review Comment:
   This seems like a dangerous assumption about all future data types.



##########
cpp/src/arrow/array/builder_binary.cc:
##########
@@ -40,6 +40,72 @@ namespace arrow {
 
 using internal::checked_cast;
 
+// ----------------------------------------------------------------------
+// Binary/StringView
+BinaryViewBuilder::BinaryViewBuilder(const std::shared_ptr<DataType>& type,
+                                     MemoryPool* pool)
+    : BinaryViewBuilder(pool) {
+  ARROW_CHECK(!checked_cast<const BinaryViewType&>(*type).has_raw_pointers());

Review Comment:
   Why the builder never accepts types with raw pointers?
   



##########
cpp/src/arrow/type_traits.h:
##########
@@ -639,10 +681,9 @@ template <typename T, typename R = void>
 using enable_if_fixed_width_type = enable_if_t<is_fixed_width_type<T>::value, 
R>;
 
 template <typename T>
-using is_binary_like_type =
-    std::integral_constant<bool, (is_base_binary_type<T>::value &&
-                                  !is_string_like_type<T>::value) ||
-                                     is_fixed_size_binary_type<T>::value>;
+using is_binary_like_type = std::integral_constant<
+    bool, (is_base_binary_type<T>::value && !is_string_like_type<T>::value) ||
+              is_binary_view_type<T>::value || 
is_fixed_size_binary_type<T>::value>;

Review Comment:
   This changes the meaning of `is_binary_like_type`. For list-views I added 
new predicates and kept the existing predicates' semantics.
   
   Illustrate in this graph 
https://gist.github.com/felipecrv/3c02f3784221d946dec1b031c6d400db



##########
cpp/src/arrow/type_traits.h:
##########
@@ -801,8 +842,10 @@ using enable_if_has_c_type = 
enable_if_t<has_c_type<T>::value, R>;
 template <typename T>
 using has_string_view =
     std::integral_constant<bool, std::is_same<BinaryType, T>::value ||
+                                     std::is_same<BinaryViewType, T>::value ||
                                      std::is_same<LargeBinaryType, T>::value ||
                                      std::is_same<StringType, T>::value ||
+                                     std::is_same<StringViewType, T>::value ||
                                      std::is_same<LargeStringType, T>::value ||
                                      std::is_same<FixedSizeBinaryType, 
T>::value>;

Review Comment:
   Can we really anticipate all the consequences of expanding the list of types 
that pass this predicate?



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -3373,6 +3373,8 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptional) {
 
   for (const auto& inner_type : types) {
     if (inner_type->id() == ::arrow::Type::NA) continue;
+    if (inner_type->id() == ::arrow::Type::BINARY_VIEW) continue;
+    if (inner_type->id() == ::arrow::Type::STRING_VIEW) continue;

Review Comment:
   Should a TODO be placed here?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to