pitrou commented on code in PR #45622:
URL: https://github.com/apache/arrow/pull/45622#discussion_r1973315327
##########
cpp/src/arrow/array/builder_binary.h:
##########
@@ -303,6 +304,77 @@ class BaseBinaryBuilder
return Status::OK();
}
+ Status AppendBinaryWithLengths(std::string_view binary, const int32_t*
value_lengths,
+ int64_t length) {
Review Comment:
Why not call this `AppendValuesWithLengths`?
Also, it should probably take the right offset type?
And you need to add a docstring...
```suggestion
/// XXX docstring
Status AppendValuesWithLengths(std::string_view binary, util::span<const
offset_type> lengths) {
```
##########
cpp/src/arrow/array/builder_binary.h:
##########
@@ -303,6 +304,77 @@ class BaseBinaryBuilder
return Status::OK();
}
+ Status AppendBinaryWithLengths(std::string_view binary, const int32_t*
value_lengths,
+ int64_t length) {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(/*valid_bytes=*/NULLPTR, length);
+ // All values is valid
+ int64_t accum_length = 0;
+ for (int64_t i = 0; i < length; ++i) {
+ accum_length += value_lengths[i];
+ }
+ if (ARROW_PREDICT_FALSE(binary.size() <
static_cast<size_t>(accum_length))) {
+ return Status::Invalid("Binary data is too short");
+ }
Review Comment:
Why not check equality? It's trivial to call `substr` on a
`std::string_view`.
```suggestion
if (ARROW_PREDICT_FALSE(binary.size() !=
static_cast<size_t>(accum_length))) {
return Status::Invalid("Binary size does not match lengths array");
}
```
##########
cpp/src/arrow/array/builder_binary.h:
##########
@@ -303,6 +304,77 @@ class BaseBinaryBuilder
return Status::OK();
}
+ Status AppendBinaryWithLengths(std::string_view binary, const int32_t*
value_lengths,
+ int64_t length) {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(/*valid_bytes=*/NULLPTR, length);
+ // All values is valid
+ int64_t accum_length = 0;
+ for (int64_t i = 0; i < length; ++i) {
+ accum_length += value_lengths[i];
+ }
+ if (ARROW_PREDICT_FALSE(binary.size() <
static_cast<size_t>(accum_length))) {
+ return Status::Invalid("Binary data is too short");
+ }
+ if (ARROW_PREDICT_FALSE(binary.size() + value_data_builder_.length() >
+ std::numeric_limits<int32_t>::max())) {
+ return Status::Invalid("Append binary data too long");
+ }
+ std::string_view sub_data = binary.substr(0, accum_length);
+ ARROW_RETURN_NOT_OK(value_data_builder_.Append(
+ reinterpret_cast<const uint8_t*>(sub_data.data()), sub_data.size()));
+ accum_length = 0;
+ const int64_t initialize_offset = value_data_builder_.length();
+ for (int64_t i = 0; i < length; ++i) {
+ offsets_builder_.UnsafeAppend(
+ static_cast<int32_t>(initialize_offset + accum_length));
+ accum_length += value_lengths[i];
+ }
+ return Status::OK();
+ }
+
+ Status AppendBinaryWithLengths(std::string_view binary, const int32_t*
value_lengths,
+ int64_t length, int64_t null_count,
+ const uint8_t* valid_bits, int64_t
valid_bits_offset) {
+ if (valid_bits == NULLPTR || null_count == 0) {
+ return AppendBinaryWithLengths(binary, value_lengths, length);
+ }
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ int64_t accum_length = 0;
+ for (int64_t i = 0; i < length; ++i) {
+ accum_length += value_lengths[i];
+ }
+ if (ARROW_PREDICT_FALSE(binary.size() <
static_cast<size_t>(accum_length))) {
+ return Status::Invalid("Binary data is too short");
+ }
+ const int64_t original_offset = value_data_builder_.length();
+ if (ARROW_PREDICT_FALSE(original_offset + accum_length) >
+ std::numeric_limits<int32_t>::max()) {
+ return Status::Invalid("Append binary data too long");
+ }
+
+ std::string_view sub_data = binary.substr(0, accum_length);
+ ARROW_RETURN_NOT_OK(value_data_builder_.Append(
+ reinterpret_cast<const uint8_t*>(sub_data.data()), sub_data.size()));
+ int64_t length_idx = 0;
+ accum_length = 0;
+ RETURN_NOT_OK(VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, length, null_count,
+ [&]() {
+ offsets_builder_.UnsafeAppend(
+ static_cast<int32_t>(original_offset + accum_length));
+ accum_length += value_lengths[length_idx];
+ ++length_idx;
+ return Status::OK();
+ },
+ [&]() {
+ offsets_builder_.UnsafeAppend(
+ static_cast<int32_t>(original_offset + accum_length));
+ return Status::OK();
Review Comment:
Hmm, we're not incrementing `length_idx` here? This is not usual in Builder
APIs, I don't think this is a good idea.
##########
cpp/src/arrow/array/builder_binary.h:
##########
@@ -303,6 +304,77 @@ class BaseBinaryBuilder
return Status::OK();
}
+ Status AppendBinaryWithLengths(std::string_view binary, const int32_t*
value_lengths,
+ int64_t length) {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(/*valid_bytes=*/NULLPTR, length);
+ // All values is valid
+ int64_t accum_length = 0;
+ for (int64_t i = 0; i < length; ++i) {
+ accum_length += value_lengths[i];
+ }
+ if (ARROW_PREDICT_FALSE(binary.size() <
static_cast<size_t>(accum_length))) {
+ return Status::Invalid("Binary data is too short");
+ }
+ if (ARROW_PREDICT_FALSE(binary.size() + value_data_builder_.length() >
+ std::numeric_limits<int32_t>::max())) {
+ return Status::Invalid("Append binary data too long");
+ }
+ std::string_view sub_data = binary.substr(0, accum_length);
+ ARROW_RETURN_NOT_OK(value_data_builder_.Append(
+ reinterpret_cast<const uint8_t*>(sub_data.data()), sub_data.size()));
+ accum_length = 0;
+ const int64_t initialize_offset = value_data_builder_.length();
+ for (int64_t i = 0; i < length; ++i) {
+ offsets_builder_.UnsafeAppend(
+ static_cast<int32_t>(initialize_offset + accum_length));
+ accum_length += value_lengths[i];
+ }
+ return Status::OK();
+ }
+
+ Status AppendBinaryWithLengths(std::string_view binary, const int32_t*
value_lengths,
+ int64_t length, int64_t null_count,
+ const uint8_t* valid_bits, int64_t
valid_bits_offset) {
+ if (valid_bits == NULLPTR || null_count == 0) {
+ return AppendBinaryWithLengths(binary, value_lengths, length);
+ }
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ int64_t accum_length = 0;
+ for (int64_t i = 0; i < length; ++i) {
+ accum_length += value_lengths[i];
+ }
+ if (ARROW_PREDICT_FALSE(binary.size() <
static_cast<size_t>(accum_length))) {
+ return Status::Invalid("Binary data is too short");
+ }
+ const int64_t original_offset = value_data_builder_.length();
Review Comment:
Can you use the same naming as above? e.g. `initial_offset`
##########
cpp/src/arrow/array/builder_binary.h:
##########
@@ -303,6 +304,77 @@ class BaseBinaryBuilder
return Status::OK();
}
+ Status AppendBinaryWithLengths(std::string_view binary, const int32_t*
value_lengths,
+ int64_t length) {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(/*valid_bytes=*/NULLPTR, length);
+ // All values is valid
+ int64_t accum_length = 0;
+ for (int64_t i = 0; i < length; ++i) {
+ accum_length += value_lengths[i];
+ }
+ if (ARROW_PREDICT_FALSE(binary.size() <
static_cast<size_t>(accum_length))) {
+ return Status::Invalid("Binary data is too short");
+ }
+ if (ARROW_PREDICT_FALSE(binary.size() + value_data_builder_.length() >
+ std::numeric_limits<int32_t>::max())) {
+ return Status::Invalid("Append binary data too long");
+ }
Review Comment:
Let's call `ValidateOverflow` instead?
##########
cpp/src/arrow/array/builder_binary.h:
##########
@@ -303,6 +304,77 @@ class BaseBinaryBuilder
return Status::OK();
}
+ Status AppendBinaryWithLengths(std::string_view binary, const int32_t*
value_lengths,
+ int64_t length) {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(/*valid_bytes=*/NULLPTR, length);
Review Comment:
Can you call these _after_ the error checks below?
##########
cpp/src/arrow/array/builder_binary.h:
##########
@@ -303,6 +304,77 @@ class BaseBinaryBuilder
return Status::OK();
}
+ Status AppendBinaryWithLengths(std::string_view binary, const int32_t*
value_lengths,
+ int64_t length) {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(/*valid_bytes=*/NULLPTR, length);
+ // All values is valid
+ int64_t accum_length = 0;
+ for (int64_t i = 0; i < length; ++i) {
+ accum_length += value_lengths[i];
+ }
+ if (ARROW_PREDICT_FALSE(binary.size() <
static_cast<size_t>(accum_length))) {
+ return Status::Invalid("Binary data is too short");
+ }
+ if (ARROW_PREDICT_FALSE(binary.size() + value_data_builder_.length() >
+ std::numeric_limits<int32_t>::max())) {
+ return Status::Invalid("Append binary data too long");
+ }
+ std::string_view sub_data = binary.substr(0, accum_length);
+ ARROW_RETURN_NOT_OK(value_data_builder_.Append(
+ reinterpret_cast<const uint8_t*>(sub_data.data()), sub_data.size()));
+ accum_length = 0;
+ const int64_t initialize_offset = value_data_builder_.length();
Review Comment:
```suggestion
const int64_t initial_offset = value_data_builder_.length();
```
##########
cpp/src/arrow/array/builder_binary.h:
##########
@@ -303,6 +304,77 @@ class BaseBinaryBuilder
return Status::OK();
}
+ Status AppendBinaryWithLengths(std::string_view binary, const int32_t*
value_lengths,
+ int64_t length) {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(/*valid_bytes=*/NULLPTR, length);
+ // All values is valid
+ int64_t accum_length = 0;
+ for (int64_t i = 0; i < length; ++i) {
+ accum_length += value_lengths[i];
+ }
+ if (ARROW_PREDICT_FALSE(binary.size() <
static_cast<size_t>(accum_length))) {
+ return Status::Invalid("Binary data is too short");
+ }
+ if (ARROW_PREDICT_FALSE(binary.size() + value_data_builder_.length() >
+ std::numeric_limits<int32_t>::max())) {
+ return Status::Invalid("Append binary data too long");
+ }
+ std::string_view sub_data = binary.substr(0, accum_length);
+ ARROW_RETURN_NOT_OK(value_data_builder_.Append(
+ reinterpret_cast<const uint8_t*>(sub_data.data()), sub_data.size()));
+ accum_length = 0;
+ const int64_t initialize_offset = value_data_builder_.length();
+ for (int64_t i = 0; i < length; ++i) {
+ offsets_builder_.UnsafeAppend(
+ static_cast<int32_t>(initialize_offset + accum_length));
+ accum_length += value_lengths[i];
+ }
+ return Status::OK();
+ }
+
+ Status AppendBinaryWithLengths(std::string_view binary, const int32_t*
value_lengths,
Review Comment:
Same comments below...
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]