ngoldbaum commented on code in PR #48391:
URL: https://github.com/apache/arrow/pull/48391#discussion_r2599864090


##########
python/pyarrow/src/arrow/python/numpy_to_arrow.cc:
##########
@@ -74,6 +81,37 @@ using internal::NumPyTypeSize;
 
 namespace {
 
+#if NPY_ABI_VERSION >= 0x02000000
+
+// NumPy exposes StringDType helpers in the C-API table from version 2.0 
onward,
+// but the corresponding macros are only available when compiling against a
+// 2.0+ feature level. Arrow still targets an older feature level, so provide
+// local wrappers that call the C-API entries directly.
+
+inline npy_string_allocator* ArrowNpyString_acquire_allocator(
+    const PyArray_StringDTypeObject* descr) {
+  using Func = npy_string_allocator* (*)(const PyArray_StringDTypeObject*);
+  auto func = reinterpret_cast<Func>(PyArray_API[316]);
+  return func(descr);
+}
+
+inline void ArrowNpyString_release_allocator(npy_string_allocator* allocator) {
+  using Func = void (*)(npy_string_allocator*);
+  auto func = reinterpret_cast<Func>(PyArray_API[318]);
+  func(allocator);
+}
+
+inline int ArrowNpyString_load(npy_string_allocator* allocator,
+                               const npy_packed_static_string* packed,
+                               npy_static_string* out) {
+  using Func =
+      int (*)(npy_string_allocator*, const npy_packed_static_string*, 
npy_static_string*);
+  auto func = reinterpret_cast<Func>(PyArray_API[313]);
+  return func(allocator, packed, out);
+}

Review Comment:
   Wow I hate this. Please don't do this.
   
   Can't this code just be conditionally defined based on `NPY_TARGET_API`?



##########
python/pyarrow/src/arrow/python/numpy_to_arrow.cc:
##########
@@ -815,6 +879,105 @@ Status NumPyConverter::Visit(const StringViewType& type) {
   return Status::OK();
 }
 
+#if NPY_ABI_VERSION >= 0x02000000
+
+template <typename Builder>
+Status NumPyConverter::AppendStringDTypeValues(Builder* builder) {
+  auto* descr = reinterpret_cast<PyArray_StringDTypeObject*>(dtype_);
+
+  npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr);

Review Comment:
   FYI for other reviewers: this locks a mutex internally in NumPy.



##########
python/pyarrow/tests/test_array.py:
##########
@@ -2758,6 +2758,70 @@ def test_array_from_numpy_unicode(string_type):
     assert arrow_arr.equals(expected)
 
 
[email protected]
+def test_array_from_numpy_string_dtype():
+    dtypes_mod = getattr(np, "dtypes", None)
+    if dtypes_mod is None:
+        pytest.skip("NumPy dtypes module not available")
+
+    StringDType = getattr(dtypes_mod, "StringDType", None)
+    if StringDType is None:
+        pytest.skip("NumPy StringDType not available")
+
+    dtype = StringDType()
+
+    arr = np.array(["some", "strings"], dtype=dtype)

Review Comment:
   I would probably include some strings that cross NumPy's internal "medium 
string" and "long string" size limits. See NEP 55 for more details on the 
specifics there. I'd also probably include some unicode chracters too.



##########
python/pyarrow/src/arrow/python/numpy_to_arrow.cc:
##########
@@ -59,6 +60,12 @@
 #include "arrow/python/type_traits.h"
 #include "arrow/python/vendored/pythoncapi_compat.h"
 
+#if NPY_ABI_VERSION >= 0x02000000

Review Comment:
   I'm not sure this is correct. Isn't what's important that 
`NPY_TARGET_VERSION > NPY_2_0_API_VERSION`? Ditto for similar macro usage below.



##########
python/pyarrow/src/arrow/python/numpy_to_arrow.cc:
##########
@@ -338,6 +383,25 @@ Status NumPyConverter::Convert() {
     return Status::OK();
   }
 
+  if (IsStringDType(dtype_)) {
+#if NPY_ABI_VERSION >= 0x02000000
+    RETURN_NOT_OK(ConvertStringDType());
+    return Status::OK();
+#else
+    // Fall back to the generic Python sequence conversion path when the 
StringDType
+    // C API is unavailable.
+    PyConversionOptions py_options;
+    py_options.type = type_;
+    py_options.from_pandas = from_pandas_;
+    ARROW_ASSIGN_OR_RAISE(
+        auto chunked_array,
+        ConvertPySequence(reinterpret_cast<PyObject*>(arr_),
+                          reinterpret_cast<PyObject*>(mask_), py_options, 
pool_));
+    out_arrays_ = chunked_array->chunks();
+    return Status::OK();

Review Comment:
   This is going to be very slow. Also the only way to get a StringDType array 
from NumPy is if at runtime you're using a NumPy newer than 2.0. So the only 
way you enter this code is if you're building PyArrow using NumPy 1.x but then 
want to use it at runtime with NumPy 2.0. Seems kinda silly? Why not just build 
with NumPy 2.0. You don't need other build NumPy with the oldest supported 
NumPy anymore.



##########
python/pyarrow/src/arrow/python/numpy_to_arrow.cc:
##########
@@ -815,6 +879,105 @@ Status NumPyConverter::Visit(const StringViewType& type) {
   return Status::OK();
 }
 
+#if NPY_ABI_VERSION >= 0x02000000
+
+template <typename Builder>
+Status NumPyConverter::AppendStringDTypeValues(Builder* builder) {
+  auto* descr = reinterpret_cast<PyArray_StringDTypeObject*>(dtype_);
+
+  npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr);
+  if (allocator == nullptr) {
+    return Status::Invalid("Failed to acquire NumPy StringDType allocator");
+  }
+
+  struct AllocatorGuard {
+    npy_string_allocator* ptr;
+    explicit AllocatorGuard(npy_string_allocator* p) : ptr(p) {}
+    ~AllocatorGuard() {
+      if (ptr != nullptr) {
+        ArrowNpyString_release_allocator(ptr);
+      }
+    }
+  } guard(allocator);
+
+  npy_static_string value = {0, nullptr};
+
+  auto append_value = [&](const npy_packed_static_string* packed) -> Status {

Review Comment:
   personally I find this closure a little hard to follow but I'm not a big C++ 
guy. I'd just make it a helper function that's defined outside this function.



##########
python/pyarrow/src/arrow/python/numpy_convert.cc:
##########
@@ -122,6 +122,15 @@ Result<std::shared_ptr<DataType>> 
NumPyScalarToArrowDataType(PyObject* scalar) {
   return NumPyDtypeToArrow(descr);
 }
 
+#if NPY_ABI_VERSION >= 0x02000000
+bool IsStringDType(PyArray_Descr* descr) {
+  // NumPy's variable-width StringDType exposes a dedicated dtype number.

Review Comment:
   total nit: this comment doesn't really add anything - all dtypes shipped by 
NumPy expose a character code.



##########
python/pyarrow/src/arrow/python/numpy_to_arrow.cc:
##########
@@ -815,6 +879,105 @@ Status NumPyConverter::Visit(const StringViewType& type) {
   return Status::OK();
 }
 
+#if NPY_ABI_VERSION >= 0x02000000
+
+template <typename Builder>
+Status NumPyConverter::AppendStringDTypeValues(Builder* builder) {
+  auto* descr = reinterpret_cast<PyArray_StringDTypeObject*>(dtype_);
+
+  npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr);
+  if (allocator == nullptr) {
+    return Status::Invalid("Failed to acquire NumPy StringDType allocator");
+  }
+
+  struct AllocatorGuard {
+    npy_string_allocator* ptr;
+    explicit AllocatorGuard(npy_string_allocator* p) : ptr(p) {}
+    ~AllocatorGuard() {
+      if (ptr != nullptr) {
+        ArrowNpyString_release_allocator(ptr);
+      }
+    }
+  } guard(allocator);
+
+  npy_static_string value = {0, nullptr};
+
+  auto append_value = [&](const npy_packed_static_string* packed) -> Status {
+    int rc = ArrowNpyString_load(allocator, packed, &value);

Review Comment:
   I like referring to the return value of `NpyString_load` as `is_null`, since 
it indicates whether or not you've loaded a missing string if it's true.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to