Copilot commented on code in PR #49712:
URL: https://github.com/apache/arrow/pull/49712#discussion_r3316862066
##########
r/man/acero.Rd:
##########
@@ -72,7 +72,7 @@ can assume that the function works in Acero just as it does
in R.
Functions can be called either as \code{pkg::fun()} or just \code{fun()}, i.e.
both
\code{str_sub()} and \code{stringr::str_sub()} work.
-In addition to these functions, you can call any of Arrow's 281 compute
+In addition to these functions, you can call any of Arrow's 253 compute
Review Comment:
This generated Rd file now disagrees with its roxygen source
(`r/R/dplyr-funcs-doc.R` still documents 281 compute functions). Regenerate the
documentation from the source with the repository's configured build, or drop
this unrelated generated change, so the installed docs don't report a different
compute-function count than the source.
##########
r/src/r_to_arrow.cpp:
##########
@@ -910,6 +910,49 @@ class RPrimitiveConverter<T, enable_if_string_like<T>>
}
};
+template <typename T>
+class RPrimitiveConverter<T, enable_if_string_view<T>>
+ : public PrimitiveConverter<T, RConverter> {
+ public:
+ Status Extend(SEXP x, int64_t size, int64_t offset = 0) override {
+ RVectorType rtype = GetVectorType(x);
+ if (rtype != STRING) {
+ return Status::Invalid("Expecting a character vector");
+ }
+ return UnsafeAppendUtf8Strings(arrow::r::utf8_strings(x), size, offset);
+ }
+
+ void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override {
+ auto task = [this, values, size]() { return this->Extend(values, size); };
+ tasks.Append(false, std::move(task));
+ }
+
+ private:
+ Status UnsafeAppendUtf8Strings(const cpp11::strings& s, int64_t size,
int64_t offset) {
+ RETURN_NOT_OK(this->primitive_builder_->Reserve(size - offset));
+ const SEXP* p_strings = reinterpret_cast<const SEXP*>(DATAPTR_RO(s)) +
offset;
+
+ int64_t total_length = 0;
+ for (R_xlen_t i = offset; i < size; i++, ++p_strings) {
+ SEXP si = *p_strings;
+ total_length += si == NA_STRING ? 0 : LENGTH(si);
+ }
+ RETURN_NOT_OK(this->primitive_builder_->ReserveData(total_length));
+
+ p_strings = reinterpret_cast<const SEXP*>(DATAPTR_RO(s)) + offset;
+ for (R_xlen_t i = offset; i < size; i++, ++p_strings) {
+ SEXP si = *p_strings;
+ if (si == NA_STRING) {
+ this->primitive_builder_->UnsafeAppendNull();
+ } else {
Review Comment:
Pre-reserving `total_length` here treats the sum of all string bytes as a
single StringView allocation. `BinaryViewBuilder::ReserveData()` rejects
requests over 2GB even though StringView can store multiple out-of-line
buffers, so a column whose combined string data exceeds 2GB but whose
individual values are valid will fail to build. Reserve/append per value (or
otherwise chunk the pre-reserve) so only individual values are checked against
the 2GB limit.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]