pitrou commented on code in PR #45622:
URL: https://github.com/apache/arrow/pull/45622#discussion_r1973315327


##########
cpp/src/arrow/array/builder_binary.h:
##########
@@ -303,6 +304,77 @@ class BaseBinaryBuilder
     return Status::OK();
   }
 
+  Status AppendBinaryWithLengths(std::string_view binary, const int32_t* 
value_lengths,
+                                 int64_t length) {

Review Comment:
   Why not call this `AppendValuesWithLengths`?
   Also, it should probably take the right offset type?
   And you need to add a docstring...
   ```suggestion
     /// XXX docstring
     Status AppendValuesWithLengths(std::string_view binary, util::span<const 
offset_type> lengths) {
   ```



##########
cpp/src/arrow/array/builder_binary.h:
##########
@@ -303,6 +304,77 @@ class BaseBinaryBuilder
     return Status::OK();
   }
 
+  Status AppendBinaryWithLengths(std::string_view binary, const int32_t* 
value_lengths,
+                                 int64_t length) {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    UnsafeAppendToBitmap(/*valid_bytes=*/NULLPTR, length);
+    // All values is valid
+    int64_t accum_length = 0;
+    for (int64_t i = 0; i < length; ++i) {
+      accum_length += value_lengths[i];
+    }
+    if (ARROW_PREDICT_FALSE(binary.size() < 
static_cast<size_t>(accum_length))) {
+      return Status::Invalid("Binary data is too short");
+    }

Review Comment:
   Why not check equality? It's trivial to call `substr` on a 
`std::string_view`.
   ```suggestion
       if (ARROW_PREDICT_FALSE(binary.size() != 
static_cast<size_t>(accum_length))) {
         return Status::Invalid("Binary size does not match lengths array");
       }
   ```



##########
cpp/src/arrow/array/builder_binary.h:
##########
@@ -303,6 +304,77 @@ class BaseBinaryBuilder
     return Status::OK();
   }
 
+  Status AppendBinaryWithLengths(std::string_view binary, const int32_t* 
value_lengths,
+                                 int64_t length) {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    UnsafeAppendToBitmap(/*valid_bytes=*/NULLPTR, length);
+    // All values is valid
+    int64_t accum_length = 0;
+    for (int64_t i = 0; i < length; ++i) {
+      accum_length += value_lengths[i];
+    }
+    if (ARROW_PREDICT_FALSE(binary.size() < 
static_cast<size_t>(accum_length))) {
+      return Status::Invalid("Binary data is too short");
+    }
+    if (ARROW_PREDICT_FALSE(binary.size() + value_data_builder_.length() >
+                            std::numeric_limits<int32_t>::max())) {
+      return Status::Invalid("Append binary data too long");
+    }
+    std::string_view sub_data = binary.substr(0, accum_length);
+    ARROW_RETURN_NOT_OK(value_data_builder_.Append(
+        reinterpret_cast<const uint8_t*>(sub_data.data()), sub_data.size()));
+    accum_length = 0;
+    const int64_t initialize_offset = value_data_builder_.length();
+    for (int64_t i = 0; i < length; ++i) {
+      offsets_builder_.UnsafeAppend(
+          static_cast<int32_t>(initialize_offset + accum_length));
+      accum_length += value_lengths[i];
+    }
+    return Status::OK();
+  }
+
+  Status AppendBinaryWithLengths(std::string_view binary, const int32_t* 
value_lengths,
+                                 int64_t length, int64_t null_count,
+                                 const uint8_t* valid_bits, int64_t 
valid_bits_offset) {
+    if (valid_bits == NULLPTR || null_count == 0) {
+      return AppendBinaryWithLengths(binary, value_lengths, length);
+    }
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    int64_t accum_length = 0;
+    for (int64_t i = 0; i < length; ++i) {
+      accum_length += value_lengths[i];
+    }
+    if (ARROW_PREDICT_FALSE(binary.size() < 
static_cast<size_t>(accum_length))) {
+      return Status::Invalid("Binary data is too short");
+    }
+    const int64_t original_offset = value_data_builder_.length();
+    if (ARROW_PREDICT_FALSE(original_offset + accum_length) >
+        std::numeric_limits<int32_t>::max()) {
+      return Status::Invalid("Append binary data too long");
+    }
+
+    std::string_view sub_data = binary.substr(0, accum_length);
+    ARROW_RETURN_NOT_OK(value_data_builder_.Append(
+        reinterpret_cast<const uint8_t*>(sub_data.data()), sub_data.size()));
+    int64_t length_idx = 0;
+    accum_length = 0;
+    RETURN_NOT_OK(VisitNullBitmapInline(
+        valid_bits, valid_bits_offset, length, null_count,
+        [&]() {
+          offsets_builder_.UnsafeAppend(
+              static_cast<int32_t>(original_offset + accum_length));
+          accum_length += value_lengths[length_idx];
+          ++length_idx;
+          return Status::OK();
+        },
+        [&]() {
+          offsets_builder_.UnsafeAppend(
+              static_cast<int32_t>(original_offset + accum_length));
+          return Status::OK();

Review Comment:
   Hmm, we're not incrementing `length_idx` here? This is not usual in Builder 
APIs, I don't think this is a good idea.



##########
cpp/src/arrow/array/builder_binary.h:
##########
@@ -303,6 +304,77 @@ class BaseBinaryBuilder
     return Status::OK();
   }
 
+  Status AppendBinaryWithLengths(std::string_view binary, const int32_t* 
value_lengths,
+                                 int64_t length) {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    UnsafeAppendToBitmap(/*valid_bytes=*/NULLPTR, length);
+    // All values is valid
+    int64_t accum_length = 0;
+    for (int64_t i = 0; i < length; ++i) {
+      accum_length += value_lengths[i];
+    }
+    if (ARROW_PREDICT_FALSE(binary.size() < 
static_cast<size_t>(accum_length))) {
+      return Status::Invalid("Binary data is too short");
+    }
+    if (ARROW_PREDICT_FALSE(binary.size() + value_data_builder_.length() >
+                            std::numeric_limits<int32_t>::max())) {
+      return Status::Invalid("Append binary data too long");
+    }
+    std::string_view sub_data = binary.substr(0, accum_length);
+    ARROW_RETURN_NOT_OK(value_data_builder_.Append(
+        reinterpret_cast<const uint8_t*>(sub_data.data()), sub_data.size()));
+    accum_length = 0;
+    const int64_t initialize_offset = value_data_builder_.length();
+    for (int64_t i = 0; i < length; ++i) {
+      offsets_builder_.UnsafeAppend(
+          static_cast<int32_t>(initialize_offset + accum_length));
+      accum_length += value_lengths[i];
+    }
+    return Status::OK();
+  }
+
+  Status AppendBinaryWithLengths(std::string_view binary, const int32_t* 
value_lengths,
+                                 int64_t length, int64_t null_count,
+                                 const uint8_t* valid_bits, int64_t 
valid_bits_offset) {
+    if (valid_bits == NULLPTR || null_count == 0) {
+      return AppendBinaryWithLengths(binary, value_lengths, length);
+    }
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    int64_t accum_length = 0;
+    for (int64_t i = 0; i < length; ++i) {
+      accum_length += value_lengths[i];
+    }
+    if (ARROW_PREDICT_FALSE(binary.size() < 
static_cast<size_t>(accum_length))) {
+      return Status::Invalid("Binary data is too short");
+    }
+    const int64_t original_offset = value_data_builder_.length();

Review Comment:
   Can you use the same naming as above? e.g. `initial_offset`



##########
cpp/src/arrow/array/builder_binary.h:
##########
@@ -303,6 +304,77 @@ class BaseBinaryBuilder
     return Status::OK();
   }
 
+  Status AppendBinaryWithLengths(std::string_view binary, const int32_t* 
value_lengths,
+                                 int64_t length) {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    UnsafeAppendToBitmap(/*valid_bytes=*/NULLPTR, length);
+    // All values is valid
+    int64_t accum_length = 0;
+    for (int64_t i = 0; i < length; ++i) {
+      accum_length += value_lengths[i];
+    }
+    if (ARROW_PREDICT_FALSE(binary.size() < 
static_cast<size_t>(accum_length))) {
+      return Status::Invalid("Binary data is too short");
+    }
+    if (ARROW_PREDICT_FALSE(binary.size() + value_data_builder_.length() >
+                            std::numeric_limits<int32_t>::max())) {
+      return Status::Invalid("Append binary data too long");
+    }

Review Comment:
   Let's call `ValidateOverflow` instead?



##########
cpp/src/arrow/array/builder_binary.h:
##########
@@ -303,6 +304,77 @@ class BaseBinaryBuilder
     return Status::OK();
   }
 
+  Status AppendBinaryWithLengths(std::string_view binary, const int32_t* 
value_lengths,
+                                 int64_t length) {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    UnsafeAppendToBitmap(/*valid_bytes=*/NULLPTR, length);

Review Comment:
   Can you call these _after_ the error checks below?



##########
cpp/src/arrow/array/builder_binary.h:
##########
@@ -303,6 +304,77 @@ class BaseBinaryBuilder
     return Status::OK();
   }
 
+  Status AppendBinaryWithLengths(std::string_view binary, const int32_t* 
value_lengths,
+                                 int64_t length) {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    UnsafeAppendToBitmap(/*valid_bytes=*/NULLPTR, length);
+    // All values is valid
+    int64_t accum_length = 0;
+    for (int64_t i = 0; i < length; ++i) {
+      accum_length += value_lengths[i];
+    }
+    if (ARROW_PREDICT_FALSE(binary.size() < 
static_cast<size_t>(accum_length))) {
+      return Status::Invalid("Binary data is too short");
+    }
+    if (ARROW_PREDICT_FALSE(binary.size() + value_data_builder_.length() >
+                            std::numeric_limits<int32_t>::max())) {
+      return Status::Invalid("Append binary data too long");
+    }
+    std::string_view sub_data = binary.substr(0, accum_length);
+    ARROW_RETURN_NOT_OK(value_data_builder_.Append(
+        reinterpret_cast<const uint8_t*>(sub_data.data()), sub_data.size()));
+    accum_length = 0;
+    const int64_t initialize_offset = value_data_builder_.length();

Review Comment:
   ```suggestion
       const int64_t initial_offset = value_data_builder_.length();
   ```



##########
cpp/src/arrow/array/builder_binary.h:
##########
@@ -303,6 +304,77 @@ class BaseBinaryBuilder
     return Status::OK();
   }
 
+  Status AppendBinaryWithLengths(std::string_view binary, const int32_t* 
value_lengths,
+                                 int64_t length) {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    UnsafeAppendToBitmap(/*valid_bytes=*/NULLPTR, length);
+    // All values is valid
+    int64_t accum_length = 0;
+    for (int64_t i = 0; i < length; ++i) {
+      accum_length += value_lengths[i];
+    }
+    if (ARROW_PREDICT_FALSE(binary.size() < 
static_cast<size_t>(accum_length))) {
+      return Status::Invalid("Binary data is too short");
+    }
+    if (ARROW_PREDICT_FALSE(binary.size() + value_data_builder_.length() >
+                            std::numeric_limits<int32_t>::max())) {
+      return Status::Invalid("Append binary data too long");
+    }
+    std::string_view sub_data = binary.substr(0, accum_length);
+    ARROW_RETURN_NOT_OK(value_data_builder_.Append(
+        reinterpret_cast<const uint8_t*>(sub_data.data()), sub_data.size()));
+    accum_length = 0;
+    const int64_t initialize_offset = value_data_builder_.length();
+    for (int64_t i = 0; i < length; ++i) {
+      offsets_builder_.UnsafeAppend(
+          static_cast<int32_t>(initialize_offset + accum_length));
+      accum_length += value_lengths[i];
+    }
+    return Status::OK();
+  }
+
+  Status AppendBinaryWithLengths(std::string_view binary, const int32_t* 
value_lengths,

Review Comment:
   Same comments below...



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to