pitrou commented on code in PR #48391:
URL: https://github.com/apache/arrow/pull/48391#discussion_r2671591869
##########
python/pyarrow/src/arrow/python/numpy_convert.cc:
##########
@@ -37,6 +37,10 @@
namespace arrow {
namespace py {
+#ifndef NPY_VSTRING
+# define NPY_VSTRING 2056
+#endif
Review Comment:
Do we actually need this? Can `IsStringDType` just return false if this
constant is not defined (this is presumably when compiling with NumPy < 2)?
##########
python/pyarrow/src/arrow/python/numpy_convert.cc:
##########
@@ -122,6 +126,10 @@ Result<std::shared_ptr<DataType>>
NumPyScalarToArrowDataType(PyObject* scalar) {
return NumPyDtypeToArrow(descr);
}
+bool IsStringDType(PyArray_Descr* descr) {
+ return descr != nullptr && descr->type_num == NPY_VSTRING;
Review Comment:
The nullptr check seems superfluous, why would one call this function with a
null pointer?
##########
python/pyarrow/src/arrow/python/numpy_to_arrow.cc:
##########
@@ -74,6 +77,27 @@ using internal::NumPyTypeSize;
namespace {
+#if NPY_ABI_VERSION >= 0x02000000
Review Comment:
I'm not sure why we're guarding with `NPY_ABI_VERSION` rather than either
`NPY_VERSION` or `NPY_FEATURE_VERSION`. Can a NumPy maintainer explain how
these 3 macros differ? @ngoldbaum @seberg
(also, would be nice if the NumPy docs were a bit [more
talkative](https://numpy.org/doc/stable/search.html?q=NPY_ABI_VERSION) about
this)
##########
python/pyarrow/src/arrow/python/numpy_to_arrow.cc:
##########
@@ -815,6 +856,110 @@ Status NumPyConverter::Visit(const StringViewType& type) {
return Status::OK();
}
+#if NPY_ABI_VERSION >= 0x02000000
+template <typename Builder>
+Status NumPyConverter::AppendStringDTypeValues(Builder* builder) {
+ auto* descr = reinterpret_cast<PyArray_StringDTypeObject*>(dtype_);
+
+ npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr);
+ if (allocator == nullptr) {
+ return Status::Invalid("Failed to acquire NumPy StringDType allocator");
+ }
+
+ struct AllocatorGuard {
+ npy_string_allocator* ptr;
+ explicit AllocatorGuard(npy_string_allocator* p) : ptr(p) {}
+ ~AllocatorGuard() {
+ if (ptr != nullptr) {
+ ArrowNpyString_release_allocator(ptr);
+ }
+ }
+ } guard(allocator);
Review Comment:
I *think* this can be written more concisely using `std::unique_ptr`:
```suggestion
std::unique_ptr<npy_string_allocator, void(*)(npy_string_allocator*)>
allocator_guard(ptr, &ArrowNpyString_release_allocator);
```
##########
python/pyarrow/src/arrow/python/numpy_to_arrow.cc:
##########
@@ -815,6 +856,110 @@ Status NumPyConverter::Visit(const StringViewType& type) {
return Status::OK();
}
+#if NPY_ABI_VERSION >= 0x02000000
+template <typename Builder>
+Status NumPyConverter::AppendStringDTypeValues(Builder* builder) {
+ auto* descr = reinterpret_cast<PyArray_StringDTypeObject*>(dtype_);
+
+ npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr);
+ if (allocator == nullptr) {
+ return Status::Invalid("Failed to acquire NumPy StringDType allocator");
+ }
+
+ struct AllocatorGuard {
+ npy_string_allocator* ptr;
+ explicit AllocatorGuard(npy_string_allocator* p) : ptr(p) {}
+ ~AllocatorGuard() {
+ if (ptr != nullptr) {
+ ArrowNpyString_release_allocator(ptr);
+ }
+ }
+ } guard(allocator);
+
+ npy_static_string value = {0, nullptr};
+ char* data = PyArray_BYTES(arr_);
+
+ if (mask_ != nullptr) {
+ Ndarray1DIndexer<uint8_t> mask_values(mask_);
+ for (int64_t i = 0; i < length_; ++i) {
+ if (mask_values[i]) {
+ RETURN_NOT_OK(builder->AppendNull());
+ continue;
+ }
+
+ const auto* packed =
+ reinterpret_cast<const npy_packed_static_string*>(data + i *
stride_);
Review Comment:
I'm just curious, is the StringDType layout documented somewhere? I couldn't
find any reference easily in the NumPy docs. @ngoldbaum @seberg
##########
python/pyarrow/src/arrow/python/numpy_to_arrow.cc:
##########
@@ -338,6 +369,16 @@ Status NumPyConverter::Convert() {
return Status::OK();
}
+ if (IsStringDType(dtype_)) {
+#if NPY_ABI_VERSION >= 0x02000000
+ RETURN_NOT_OK(ConvertStringDType());
Review Comment:
It's weird and confusing to do this outside of the visitor machinery. Is it
possible to straighten that out?
##########
python/pyarrow/tests/test_array.py:
##########
@@ -2758,6 +2758,119 @@ def test_array_from_numpy_unicode(string_type):
assert arrow_arr.equals(expected)
[email protected]
+def test_array_from_numpy_string_dtype():
+ dtypes_mod = getattr(np, "dtypes", None)
Review Comment:
Can use something like `dtypes = pytest.importorskip("numpy.dtypes")`
##########
python/pyarrow/tests/test_array.py:
##########
@@ -2758,6 +2758,119 @@ def test_array_from_numpy_unicode(string_type):
assert arrow_arr.equals(expected)
[email protected]
+def test_array_from_numpy_string_dtype():
+ dtypes_mod = getattr(np, "dtypes", None)
+ if dtypes_mod is None:
+ pytest.skip("NumPy dtypes module not available")
+
+ StringDType = getattr(dtypes_mod, "StringDType", None)
+ if StringDType is None:
+ pytest.skip("NumPy StringDType not available")
+
+ dtype = StringDType()
+
+ arr = np.array(["some", "strings"], dtype=dtype)
+
+ arrow_arr = pa.array(arr)
+
+ assert arrow_arr.type == pa.utf8()
+ assert arrow_arr.to_pylist() == ["some", "strings"]
+
+ arrow_arr = pa.array(arr, type=pa.string())
Review Comment:
Note that `pa.string()` and `pa.utf8()` are the same thing, it's a bit
confusing to use both spellings here.
##########
python/pyarrow/src/arrow/python/numpy_to_arrow.cc:
##########
@@ -815,6 +856,110 @@ Status NumPyConverter::Visit(const StringViewType& type) {
return Status::OK();
}
+#if NPY_ABI_VERSION >= 0x02000000
+template <typename Builder>
+Status NumPyConverter::AppendStringDTypeValues(Builder* builder) {
+ auto* descr = reinterpret_cast<PyArray_StringDTypeObject*>(dtype_);
+
+ npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr);
+ if (allocator == nullptr) {
+ return Status::Invalid("Failed to acquire NumPy StringDType allocator");
+ }
+
+ struct AllocatorGuard {
+ npy_string_allocator* ptr;
+ explicit AllocatorGuard(npy_string_allocator* p) : ptr(p) {}
+ ~AllocatorGuard() {
+ if (ptr != nullptr) {
+ ArrowNpyString_release_allocator(ptr);
+ }
+ }
+ } guard(allocator);
+
+ npy_static_string value = {0, nullptr};
+ char* data = PyArray_BYTES(arr_);
+
+ if (mask_ != nullptr) {
+ Ndarray1DIndexer<uint8_t> mask_values(mask_);
+ for (int64_t i = 0; i < length_; ++i) {
+ if (mask_values[i]) {
+ RETURN_NOT_OK(builder->AppendNull());
+ continue;
+ }
+
+ const auto* packed =
+ reinterpret_cast<const npy_packed_static_string*>(data + i *
stride_);
+ const int is_null = ArrowNpyString_load(allocator, packed, &value);
+ if (is_null == -1) {
+ RETURN_IF_PYERROR();
+ return Status::Invalid("Failed to unpack NumPy StringDType value");
+ }
+ if (is_null) {
+ RETURN_NOT_OK(builder->AppendNull());
+ } else {
+ RETURN_NOT_OK(builder->Append(std::string_view{value.buf,
value.size}));
+ }
+ }
+ return Status::OK();
+ }
+
+ for (int64_t i = 0; i < length_; ++i) {
+ const auto* packed = reinterpret_cast<const
npy_packed_static_string*>(data);
+ const int is_null = ArrowNpyString_load(allocator, packed, &value);
+ if (is_null == -1) {
+ RETURN_IF_PYERROR();
+ return Status::Invalid("Failed to unpack NumPy StringDType value");
+ }
+ if (is_null) {
+ RETURN_NOT_OK(builder->AppendNull());
+ } else {
+ RETURN_NOT_OK(builder->Append(std::string_view{value.buf, value.size}));
+ }
Review Comment:
You could easily have this snippet factored out in a lambda to avoid code
repetition with the loop above.
##########
python/pyarrow/src/arrow/python/numpy_to_arrow.cc:
##########
@@ -74,6 +77,27 @@ using internal::NumPyTypeSize;
namespace {
+#if NPY_ABI_VERSION >= 0x02000000
+inline npy_string_allocator* ArrowNpyString_acquire_allocator(
+ const PyArray_StringDTypeObject* descr) {
+ using Func = npy_string_allocator* (*)(const PyArray_StringDTypeObject*);
+ return reinterpret_cast<Func>(PyArray_API[316])(descr);
Review Comment:
Why not call `NpyString_acquire_allocator` directly? AFAICT it is a macro
that does roughly what your code is doing:
```c
#if NPY_FEATURE_VERSION >= NPY_2_0_API_VERSION
#define NpyString_acquire_allocator \
(*(npy_string_allocator * (*)(const PyArray_StringDTypeObject *)) \
PyArray_API[316])
#endif
```
##########
python/pyarrow/src/arrow/python/numpy_to_arrow.cc:
##########
@@ -815,6 +856,110 @@ Status NumPyConverter::Visit(const StringViewType& type) {
return Status::OK();
}
+#if NPY_ABI_VERSION >= 0x02000000
+template <typename Builder>
+Status NumPyConverter::AppendStringDTypeValues(Builder* builder) {
+ auto* descr = reinterpret_cast<PyArray_StringDTypeObject*>(dtype_);
+
+ npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr);
+ if (allocator == nullptr) {
+ return Status::Invalid("Failed to acquire NumPy StringDType allocator");
+ }
+
+ struct AllocatorGuard {
+ npy_string_allocator* ptr;
+ explicit AllocatorGuard(npy_string_allocator* p) : ptr(p) {}
+ ~AllocatorGuard() {
+ if (ptr != nullptr) {
+ ArrowNpyString_release_allocator(ptr);
+ }
+ }
+ } guard(allocator);
+
+ npy_static_string value = {0, nullptr};
+ char* data = PyArray_BYTES(arr_);
Review Comment:
Can we make this `const char*`?
##########
python/pyarrow/src/arrow/python/numpy_to_arrow.cc:
##########
@@ -338,6 +369,16 @@ Status NumPyConverter::Convert() {
return Status::OK();
}
+ if (IsStringDType(dtype_)) {
+#if NPY_ABI_VERSION >= 0x02000000
+ RETURN_NOT_OK(ConvertStringDType());
+ return Status::OK();
+#else
+ return Status::NotImplemented(
+ "NumPy StringDType requires building PyArrow with NumPy >= 2.0");
Review Comment:
Is this at all possible to happen? I got the impression that one cannot use
NumPy 2 if PyArrow was compiled for NumPy < 2, am I mistaken @ngoldbaum @seberg
?
##########
python/pyarrow/src/arrow/python/numpy_to_arrow.cc:
##########
@@ -815,6 +856,110 @@ Status NumPyConverter::Visit(const StringViewType& type) {
return Status::OK();
}
+#if NPY_ABI_VERSION >= 0x02000000
+template <typename Builder>
+Status NumPyConverter::AppendStringDTypeValues(Builder* builder) {
+ auto* descr = reinterpret_cast<PyArray_StringDTypeObject*>(dtype_);
+
+ npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr);
+ if (allocator == nullptr) {
+ return Status::Invalid("Failed to acquire NumPy StringDType allocator");
+ }
+
+ struct AllocatorGuard {
+ npy_string_allocator* ptr;
+ explicit AllocatorGuard(npy_string_allocator* p) : ptr(p) {}
+ ~AllocatorGuard() {
+ if (ptr != nullptr) {
+ ArrowNpyString_release_allocator(ptr);
+ }
+ }
+ } guard(allocator);
+
+ npy_static_string value = {0, nullptr};
+ char* data = PyArray_BYTES(arr_);
+
+ if (mask_ != nullptr) {
+ Ndarray1DIndexer<uint8_t> mask_values(mask_);
+ for (int64_t i = 0; i < length_; ++i) {
+ if (mask_values[i]) {
+ RETURN_NOT_OK(builder->AppendNull());
+ continue;
+ }
+
+ const auto* packed =
+ reinterpret_cast<const npy_packed_static_string*>(data + i *
stride_);
+ const int is_null = ArrowNpyString_load(allocator, packed, &value);
Review Comment:
I'm curious, why does this NumPy API need an allocator is all it does is
return a view into the array contents? Especially as no deallocation seems
involved afterwards... @ngoldbaum @seberg
##########
python/pyarrow/tests/test_array.py:
##########
@@ -2758,6 +2758,119 @@ def test_array_from_numpy_unicode(string_type):
assert arrow_arr.equals(expected)
[email protected]
+def test_array_from_numpy_string_dtype():
+ dtypes_mod = getattr(np, "dtypes", None)
+ if dtypes_mod is None:
+ pytest.skip("NumPy dtypes module not available")
+
+ StringDType = getattr(dtypes_mod, "StringDType", None)
+ if StringDType is None:
+ pytest.skip("NumPy StringDType not available")
+
+ dtype = StringDType()
+
+ arr = np.array(["some", "strings"], dtype=dtype)
+
+ arrow_arr = pa.array(arr)
+
+ assert arrow_arr.type == pa.utf8()
+ assert arrow_arr.to_pylist() == ["some", "strings"]
+
+ arrow_arr = pa.array(arr, type=pa.string())
+ assert arrow_arr.type == pa.string()
Review Comment:
Can you also call `arrow_arr.validate(full=True)`? (also in other places)
##########
python/pyarrow/src/arrow/python/numpy_to_arrow.cc:
##########
@@ -815,6 +856,110 @@ Status NumPyConverter::Visit(const StringViewType& type) {
return Status::OK();
}
+#if NPY_ABI_VERSION >= 0x02000000
+template <typename Builder>
+Status NumPyConverter::AppendStringDTypeValues(Builder* builder) {
+ auto* descr = reinterpret_cast<PyArray_StringDTypeObject*>(dtype_);
+
+ npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr);
+ if (allocator == nullptr) {
+ return Status::Invalid("Failed to acquire NumPy StringDType allocator");
+ }
+
+ struct AllocatorGuard {
+ npy_string_allocator* ptr;
+ explicit AllocatorGuard(npy_string_allocator* p) : ptr(p) {}
+ ~AllocatorGuard() {
+ if (ptr != nullptr) {
+ ArrowNpyString_release_allocator(ptr);
+ }
+ }
+ } guard(allocator);
+
+ npy_static_string value = {0, nullptr};
+ char* data = PyArray_BYTES(arr_);
+
+ if (mask_ != nullptr) {
+ Ndarray1DIndexer<uint8_t> mask_values(mask_);
+ for (int64_t i = 0; i < length_; ++i) {
+ if (mask_values[i]) {
+ RETURN_NOT_OK(builder->AppendNull());
+ continue;
+ }
+
+ const auto* packed =
+ reinterpret_cast<const npy_packed_static_string*>(data + i *
stride_);
+ const int is_null = ArrowNpyString_load(allocator, packed, &value);
+ if (is_null == -1) {
+ RETURN_IF_PYERROR();
+ return Status::Invalid("Failed to unpack NumPy StringDType value");
+ }
+ if (is_null) {
+ RETURN_NOT_OK(builder->AppendNull());
+ } else {
+ RETURN_NOT_OK(builder->Append(std::string_view{value.buf,
value.size}));
+ }
+ }
+ return Status::OK();
+ }
+
+ for (int64_t i = 0; i < length_; ++i) {
+ const auto* packed = reinterpret_cast<const
npy_packed_static_string*>(data);
+ const int is_null = ArrowNpyString_load(allocator, packed, &value);
+ if (is_null == -1) {
+ RETURN_IF_PYERROR();
+ return Status::Invalid("Failed to unpack NumPy StringDType value");
+ }
+ if (is_null) {
+ RETURN_NOT_OK(builder->AppendNull());
+ } else {
+ RETURN_NOT_OK(builder->Append(std::string_view{value.buf, value.size}));
+ }
+ data += stride_;
+ }
+
+ return Status::OK();
+}
+
+Status NumPyConverter::ConvertStringDType() {
+ util::InitializeUTF8();
Review Comment:
Note this is only useful if we validate UTF8 values, which this doesn't seem
to be doing? (presumably because NumPy already advertises the data as valid
UTF8?)
##########
python/pyarrow/tests/test_array.py:
##########
@@ -2758,6 +2758,119 @@ def test_array_from_numpy_unicode(string_type):
assert arrow_arr.equals(expected)
[email protected]
+def test_array_from_numpy_string_dtype():
+ dtypes_mod = getattr(np, "dtypes", None)
Review Comment:
Or, even, better, you can use a pytest fixture:
```python
@pytest.fixture
def string_dtype():
dtypes = pytest.importorskip("numpy.dtypes")
dtype_class = getattr(dtypes, "StringDType", None)
if dtype_class is None:
pytest.skip("NumPy StringDType not available (NumPy > 2 needed)")
return dtype_class
```
and then simply:
```python
@pytest.mark.numpy
def test_array_from_numpy_string_dtype(string_dtype):
arr = np.array(["some", "strings"], dtype=string_dtype())
# etc.
```
##########
python/pyarrow/src/arrow/python/numpy_to_arrow.cc:
##########
@@ -338,6 +369,16 @@ Status NumPyConverter::Convert() {
return Status::OK();
}
+ if (IsStringDType(dtype_)) {
+#if NPY_ABI_VERSION >= 0x02000000
+ RETURN_NOT_OK(ConvertStringDType());
Review Comment:
(also `NumPyConverter::VisitString` has a special case for all-NaN Pandas
arrays, I'm not sure that can apply here @jorisvandenbossche )
##########
python/pyarrow/tests/test_array.py:
##########
@@ -2758,6 +2758,119 @@ def test_array_from_numpy_unicode(string_type):
assert arrow_arr.equals(expected)
[email protected]
+def test_array_from_numpy_string_dtype():
+ dtypes_mod = getattr(np, "dtypes", None)
+ if dtypes_mod is None:
+ pytest.skip("NumPy dtypes module not available")
+
+ StringDType = getattr(dtypes_mod, "StringDType", None)
+ if StringDType is None:
+ pytest.skip("NumPy StringDType not available")
+
+ dtype = StringDType()
+
+ arr = np.array(["some", "strings"], dtype=dtype)
+
+ arrow_arr = pa.array(arr)
+
+ assert arrow_arr.type == pa.utf8()
+ assert arrow_arr.to_pylist() == ["some", "strings"]
+
+ arrow_arr = pa.array(arr, type=pa.string())
+ assert arrow_arr.type == pa.string()
+ assert arrow_arr.to_pylist() == ["some", "strings"]
Review Comment:
Can also be written `assert arrow_arr.to_pylist() == arr.tolist()`
##########
python/pyarrow/tests/test_array.py:
##########
@@ -2758,6 +2758,119 @@ def test_array_from_numpy_unicode(string_type):
assert arrow_arr.equals(expected)
[email protected]
+def test_array_from_numpy_string_dtype():
+ dtypes_mod = getattr(np, "dtypes", None)
+ if dtypes_mod is None:
+ pytest.skip("NumPy dtypes module not available")
+
+ StringDType = getattr(dtypes_mod, "StringDType", None)
+ if StringDType is None:
+ pytest.skip("NumPy StringDType not available")
+
+ dtype = StringDType()
+
+ arr = np.array(["some", "strings"], dtype=dtype)
+
+ arrow_arr = pa.array(arr)
+
+ assert arrow_arr.type == pa.utf8()
+ assert arrow_arr.to_pylist() == ["some", "strings"]
+
+ arrow_arr = pa.array(arr, type=pa.string())
+ assert arrow_arr.type == pa.string()
+ assert arrow_arr.to_pylist() == ["some", "strings"]
+
+ arrow_arr = pa.array(arr, type=pa.large_string())
+ assert arrow_arr.type == pa.large_string()
+ assert arrow_arr.to_pylist() == ["some", "strings"]
+
+ arrow_arr = pa.array(arr, type=pa.string_view())
+ assert arrow_arr.type == pa.string_view()
+ assert arrow_arr.to_pylist() == ["some", "strings"]
+
+ arr_full = np.array(["a", "b", "c", "d", "e"], dtype=dtype)
+ arr = arr_full[::2]
+ arrow_arr = pa.array(arr)
+ assert arrow_arr.type == pa.utf8()
+ assert arrow_arr.to_pylist() == ["a", "c", "e"]
+
+
[email protected]
+def test_numpy_stringdtype_thresholds_and_unicode():
+ dtypes_mod = getattr(np, "dtypes", None)
+ if dtypes_mod is None:
+ pytest.skip("NumPy dtypes module not available")
+
+ StringDType = getattr(dtypes_mod, "StringDType", None)
+ if StringDType is None:
+ pytest.skip("NumPy StringDType not available")
+
+ dtype = StringDType()
+
+ short = "hello"
+ medium = "a" * 100
+ long_ = "b" * 300
+ unicode_ = "árvíztűrő tükörfúrógép 🥐 你好"
+ long_unicode = "🥐" * 200
+
+ arr = np.array([short, medium, long_, unicode_, long_unicode], dtype=dtype)
+ assert pa.array(arr).to_pylist() == [short, medium, long_, unicode_,
long_unicode]
+
+
[email protected]
+def test_array_from_numpy_string_dtype_nulls_and_mask():
+ dtypes_mod = getattr(np, "dtypes", None)
+ if dtypes_mod is None:
+ pytest.skip("NumPy dtypes module not available")
+
+ StringDType = getattr(dtypes_mod, "StringDType", None)
+ if StringDType is None:
+ pytest.skip("NumPy StringDType not available")
+
+ # Real StringDType, use its NA sentinel
+ dtype = StringDType(na_object=None)
+ arr = np.array(["this array has", None, "as an entry"], dtype=dtype)
+
+ arrow_arr = pa.array(arr)
+ assert arrow_arr.type == pa.utf8()
+ assert arrow_arr.to_pylist() == ["this array has", None, "as an entry"]
+
+ # Test interplay of NA sentinel and an explicit mask:
+ # - index 1 is null because of na_object / Python None
+ # - index 2 is forced null by the mask
+ mask = np.array([False, False, True], dtype=bool)
+ arrow_arr = pa.array(arr, mask=mask)
+ assert arrow_arr.type == pa.utf8()
+ assert arrow_arr.null_count == 2
Review Comment:
Just validate `arrow_arr` and it will check `null_count` consistency.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]