pitrou commented on code in PR #49712:
URL: https://github.com/apache/arrow/pull/49712#discussion_r3450833209
##########
cpp/src/arrow/util/converter.h:
##########
@@ -241,6 +241,7 @@ struct MakeConverterImpl {
DICTIONARY_CASE(LargeBinaryType);
DICTIONARY_CASE(StringType);
DICTIONARY_CASE(LargeStringType);
+ DICTIONARY_CASE(StringViewType);
Review Comment:
Why not also `BinaryViewType`?
##########
r/src/array_to_vector.cpp:
##########
@@ -290,26 +290,32 @@ struct Converter_String : public Converter {
Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>&
array,
R_xlen_t start, R_xlen_t n, size_t chunk_index)
const {
- auto p_offset = array->data()->GetValues<int32_t>(1);
- if (!p_offset) {
- return Status::Invalid("Invalid offset buffer");
- }
- auto p_strings = array->data()->GetValues<char>(2, *p_offset);
- if (!p_strings) {
- // There is an offset buffer, but the data buffer is null
- // There is at least one value in the array and not all the values are
null
- // That means all values are either empty strings or nulls so there is
nothing to do
-
- if (array->null_count()) {
- arrow::internal::BitmapReader null_reader(array->null_bitmap_data(),
- array->offset(), n);
- for (int i = 0; i < n; i++, null_reader.Next()) {
- if (null_reader.IsNotSet()) {
- SET_STRING_ELT(data, start + i, NA_STRING);
+ // StringViewArray uses a different memory layout (views + data buffers)
rather
+ // than offsets, so skip the offset-based fast path and fall through to the
+ // GetView()-based element loop below.
+ if (array->type_id() != Type::STRING_VIEW) {
Review Comment:
Can use `is_binary_view_like(array->type_id())` to also match BinaryView.
##########
r/R/type.R:
##########
@@ -203,6 +203,13 @@ Utf8 <- R6Class(
code = function(namespace = FALSE) call2("utf8", .ns = if (namespace)
"arrow")
)
)
+StringView <- R6Class(
+ "StringView",
Review Comment:
Is there a reason for calling it `StringView` instead of `Utf8View` (like
the existing `Utf8` and `LargeUtf8`)?
##########
r/src/array_to_vector.cpp:
##########
@@ -1263,6 +1270,10 @@ std::shared_ptr<Converter> Converter::Make(
return
std::make_shared<arrow::r::Converter_String<arrow::LargeStringArray>>(
chunked_array);
+ case Type::STRING_VIEW:
Review Comment:
Why not handle BINARY_VIEW above as well?
##########
r/src/array_to_vector.cpp:
##########
@@ -290,26 +290,32 @@ struct Converter_String : public Converter {
Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>&
array,
R_xlen_t start, R_xlen_t n, size_t chunk_index)
const {
- auto p_offset = array->data()->GetValues<int32_t>(1);
- if (!p_offset) {
- return Status::Invalid("Invalid offset buffer");
- }
- auto p_strings = array->data()->GetValues<char>(2, *p_offset);
- if (!p_strings) {
- // There is an offset buffer, but the data buffer is null
- // There is at least one value in the array and not all the values are
null
- // That means all values are either empty strings or nulls so there is
nothing to do
-
- if (array->null_count()) {
- arrow::internal::BitmapReader null_reader(array->null_bitmap_data(),
- array->offset(), n);
- for (int i = 0; i < n; i++, null_reader.Next()) {
- if (null_reader.IsNotSet()) {
- SET_STRING_ELT(data, start + i, NA_STRING);
+ // StringViewArray uses a different memory layout (views + data buffers)
rather
+ // than offsets, so skip the offset-based fast path and fall through to the
+ // GetView()-based element loop below.
+ if (array->type_id() != Type::STRING_VIEW) {
+ auto p_offset = array->data()->GetValues<int32_t>(1);
Review Comment:
So this doesn't work for LargeBinary and LargeString, which have 64-bit
offsets, right?
##########
r/src/datatype.cpp:
##########
@@ -57,6 +57,8 @@ const char* r6_class_name<arrow::DataType>::get(
case Type::STRING:
return "Utf8";
+ case Type::STRING_VIEW:
Review Comment:
Why not `BINARY_VIEW` below as well?
##########
r/tests/testthat/test-Table.R:
##########
@@ -371,6 +371,28 @@ test_that("Can create table with specific dictionary
types", {
expect_equal_data_frame(tab_large, fact)
})
+test_that("Table converts dictionary arrays with wider index types back to R",
{
Review Comment:
This test does not seem related, is it just adding a test that was
overlooked before?
##########
r/src/r_to_arrow.cpp:
##########
@@ -910,6 +910,43 @@ class RPrimitiveConverter<T, enable_if_string_like<T>>
}
};
+template <typename T>
+class RPrimitiveConverter<T, enable_if_string_view<T>>
+ : public PrimitiveConverter<T, RConverter> {
Review Comment:
It seems there's a lot in common between this and the regular `String`
converter (only `UnsafeAppendUtf8Strings` differs AFAICT), perhaps it's worth
factoring things out in a common base class?
##########
r/R/type.R:
##########
Review Comment:
Why not also add `BinaryView` here?
##########
r/tests/testthat/test-Array.R:
##########
@@ -203,6 +203,8 @@ test_that("Array supports character vectors (ARROW-3339)", {
# with NA
expect_array_roundtrip(c("itsy", NA, "spider"), utf8())
expect_array_roundtrip(c("itsy", NA, "spider"), large_utf8(), as =
large_utf8())
+
+ expect_array_roundtrip(c("itsy", NA, "", "spider"), string_view(), as =
string_view())
Review Comment:
String view arrays have a different representation for "short" and "long"
strings (i.e. > 12 characters), I suggest exercising that:
```suggestion
expect_array_roundtrip(c("itsy", NA, "", "spider"), string_view(), as =
string_view())
expect_array_roundtrip(c("itsy", NA, "", "a long non-inlined string",
"another long string"), string_view(), as = string_view())
```
##########
r/tests/testthat/test-Table.R:
##########
@@ -371,6 +371,28 @@ test_that("Can create table with specific dictionary
types", {
expect_equal_data_frame(tab_large, fact)
})
+test_that("Table converts dictionary arrays with wider index types back to R",
{
+ fact <- example_data[, "fct"]
+ for (i in list(uint32(), int64(), uint64())) {
+ sch <- schema(fct = dictionary(i, utf8()))
+ tab <- Table$create(fact, schema = sch)
+ expect_equal(tab$schema, sch)
+ expect_equal_data_frame(tab, fact)
+ }
+})
+
+test_that("Table converts dictionary arrays with string_view values", {
+ expected <- data.frame(foo = factor(c("x", NA, "x")))
Review Comment:
Would also suggest testing with longer strings here.
##########
r/src/r_to_arrow.cpp:
##########
@@ -1062,7 +1099,13 @@ struct RConverterTrait<
};
template <typename T>
-struct RConverterTrait<T, enable_if_binary_view_like<T>> {
+struct RConverterTrait<T, enable_if_string_view<T>> {
+ using type = RPrimitiveConverter<T>;
+};
+
+template <typename T>
+struct RConverterTrait<T, enable_if_t<is_binary_view_like_type<T>::value &&
+ !is_string_view_type<T>::value>> {
// not implemented
Review Comment:
Why not implement it too? It should be reasonably easy given you already the
implementations for Binary and StringView.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]