rok commented on code in PR #48727:
URL: https://github.com/apache/arrow/pull/48727#discussion_r2878967399
##########
python/pyarrow/src/arrow/python/helpers.cc:
##########
@@ -296,6 +296,71 @@ bool PyFloat_IsNaN(PyObject* obj) {
namespace {
+// UUID module static data - lazily initialized on first use
+// Uses a conditional initialization strategy: std::once_flag when the GIL is
+// disabled, or a simple boolean flag when the GIL is enabled.
+// See the Pandas static data section below and ARROW-10519 for more details.
+#ifdef Py_GIL_DISABLED
+static std::once_flag uuid_static_initialized;
+#else
+static bool uuid_static_initialized = false;
+#endif
+static PyObject* uuid_UUID = nullptr;
+
+void GetUuidStaticSymbols() {
+ OwnedRef uuid_module;
+
+ // Import uuid module
+ Status s = ImportModule("uuid", &uuid_module);
+ if (!s.ok()) {
+ return;
+ }
+
+#ifndef Py_GIL_DISABLED
+ // Since ImportModule can release the GIL, another thread could have
+ // already initialized the static data.
+ if (uuid_static_initialized) {
+ return;
+ }
+#endif
+
+ OwnedRef ref;
+ if (ImportFromModule(uuid_module.obj(), "UUID", &ref).ok()) {
+ uuid_UUID = ref.obj();
+ }
+}
+
+#ifdef Py_GIL_DISABLED
+void InitUuidStaticData() {
+ std::call_once(uuid_static_initialized, GetUuidStaticSymbols);
+}
+#else
+void InitUuidStaticData() {
+ // NOTE: This is called with the GIL held. We needn't (and shouldn't,
+ // to avoid deadlocks) use an additional C++ lock (ARROW-10519).
+ if (uuid_static_initialized) {
+ return;
+ }
+ GetUuidStaticSymbols();
+ uuid_static_initialized = true;
+}
+#endif
Review Comment:
I think this is now in place, albeit slightly modified. Looks good to me.
##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -1399,6 +1399,91 @@ def test_uuid_extension():
assert isinstance(array[0], pa.UuidScalar)
+def test_uuid_scalar_from_python():
+ import uuid
+
+ # Test with explicit type
+ py_uuid = uuid.uuid4()
+ scalar = pa.scalar(py_uuid, type=pa.uuid())
+ assert isinstance(scalar, pa.UuidScalar)
+ assert scalar.type == pa.uuid()
+ assert scalar.as_py() == py_uuid
+
+ # Test with specific UUID value
+ specific_uuid = UUID("12345678-1234-5678-1234-567812345678")
+ scalar = pa.scalar(specific_uuid, type=pa.uuid())
+ assert scalar.as_py() == specific_uuid
+ assert scalar.value.as_py() == specific_uuid.bytes
+
+ scalar = pa.scalar(None, type=pa.uuid())
+ assert scalar.is_valid is False
+ assert scalar.as_py() is None
+
+ # Test type inference from uuid.UUID
+ py_uuid = uuid.uuid4()
+ scalar = pa.scalar(py_uuid)
+ assert isinstance(scalar, pa.UuidScalar)
+ assert scalar.type == pa.uuid()
+ assert scalar.as_py() == py_uuid
+
+
+def test_uuid_array_from_python():
+ import uuid
+
+ # Test array with explicit type
+ uuids = [uuid.uuid4() for _ in range(3)]
+ uuids.append(None)
+
+ arr = pa.array(uuids, type=pa.uuid())
+ assert arr.type == pa.uuid()
+ assert len(arr) == 4
+ assert arr.null_count == 1
+ for i, u in enumerate(uuids):
+ assert arr[i].as_py() == u
+
+ # Test type inference for arrays
+ arr = pa.array(uuids)
+ assert arr.type == pa.uuid()
+ for i, u in enumerate(uuids):
+ assert arr[i].as_py() == u
+
+
+@pytest.mark.parametrize("bytes_value,exc_type,match", [
+ (b"0123456789abcde", pa.ArrowInvalid, "expected to be length 16 was 15"),
+ (
+ "0123456789abcdef", TypeError,
+ "Expected uuid.UUID.bytes to return bytes, got 'str'"
+ ),
+ (None, TypeError, "Expected uuid.UUID.bytes to return bytes, got 'NoneType'"),
+])
+def test_uuid_bytes_property_not_bytes(bytes_value, exc_type, match):
+ import uuid
+
+ class BadUuid(uuid.UUID):
+ @property
+ def bytes(self):
+ return bytes_value
+
+ bad = BadUuid(uuid.uuid4().hex)
+ with pytest.raises(exc_type, match=match):
+ pa.array([bad], type=pa.uuid())
+ with pytest.raises(exc_type, match=match):
+ pa.scalar(bad, type=pa.uuid())
+
+
+def test_uuid_bytes_property_raises():
+ import uuid
+
+ class BadUuid(uuid.UUID):
+ @property
+ def bytes(self):
+ raise RuntimeError("broken")
+
+ bad = BadUuid(uuid.uuid4().hex)
+ with pytest.raises(RuntimeError, match="broken"):
+ pa.array([bad], type=pa.uuid())
Review Comment:
Similar to the above:
```suggestion
with pytest.raises(RuntimeError, match="broken"):
pa.array([bad], type=pa.uuid())
with pytest.raises(RuntimeError, match="broken"):
pa.scalar(bad, type=pa.uuid())
with pytest.raises(RuntimeError, match="broken"):
pa.array([bad])
with pytest.raises(RuntimeError, match="broken"):
pa.scalar(bad)
```
##########
python/pyarrow/src/arrow/python/python_to_arrow.cc:
##########
@@ -1268,16 +1274,24 @@ Result<std::shared_ptr<ChunkedArray>>
ConvertPySequence(PyObject* obj, PyObject*
// In some cases, type inference may be "loose", like strings. If the user
// passed pa.string(), then we will error if we encounter any non-UTF8
// value. If not, then we will allow the result to be a BinaryArray
+ std::shared_ptr<DataType> extension_type;
if (options.type == nullptr) {
ARROW_ASSIGN_OR_RAISE(options.type, InferArrowType(seq, mask,
options.from_pandas));
options.strict = false;
+ // If type inference returned an extension type, convert using
+ // the storage type and then wrap the result as an extension array
+ if (options.type->id() == Type::EXTENSION) {
+ extension_type = options.type;
+ options.type = checked_cast<const ExtensionType&>(*options.type).storage_type();
+ }
} else {
options.strict = true;
}
ARROW_DCHECK_GE(size, 0);
ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter<PyConverter,
PyConverterTrait>(
options.type, options, pool)));
Review Comment:
I think Python will always unwrap to the storage type before sending to
`MakeConverter`, except perhaps if it was an extension of an extension?
##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -1399,6 +1399,91 @@ def test_uuid_extension():
assert isinstance(array[0], pa.UuidScalar)
+def test_uuid_scalar_from_python():
+ import uuid
+
+ # Test with explicit type
+ py_uuid = uuid.uuid4()
+ scalar = pa.scalar(py_uuid, type=pa.uuid())
+ assert isinstance(scalar, pa.UuidScalar)
+ assert scalar.type == pa.uuid()
+ assert scalar.as_py() == py_uuid
+
+ # Test with specific UUID value
+ specific_uuid = UUID("12345678-1234-5678-1234-567812345678")
+ scalar = pa.scalar(specific_uuid, type=pa.uuid())
+ assert scalar.as_py() == specific_uuid
+ assert scalar.value.as_py() == specific_uuid.bytes
+
+ scalar = pa.scalar(None, type=pa.uuid())
+ assert scalar.is_valid is False
+ assert scalar.as_py() is None
+
+ # Test type inference from uuid.UUID
+ py_uuid = uuid.uuid4()
+ scalar = pa.scalar(py_uuid)
+ assert isinstance(scalar, pa.UuidScalar)
+ assert scalar.type == pa.uuid()
+ assert scalar.as_py() == py_uuid
+
+
+def test_uuid_array_from_python():
+ import uuid
+
+ # Test array with explicit type
+ uuids = [uuid.uuid4() for _ in range(3)]
+ uuids.append(None)
+
+ arr = pa.array(uuids, type=pa.uuid())
+ assert arr.type == pa.uuid()
+ assert len(arr) == 4
+ assert arr.null_count == 1
+ for i, u in enumerate(uuids):
+ assert arr[i].as_py() == u
+
+ # Test type inference for arrays
+ arr = pa.array(uuids)
+ assert arr.type == pa.uuid()
+ for i, u in enumerate(uuids):
+ assert arr[i].as_py() == u
+
+
+@pytest.mark.parametrize("bytes_value,exc_type,match", [
+ (b"0123456789abcde", pa.ArrowInvalid, "expected to be length 16 was 15"),
+ (
+ "0123456789abcdef", TypeError,
+ "Expected uuid.UUID.bytes to return bytes, got 'str'"
+ ),
+ (None, TypeError, "Expected uuid.UUID.bytes to return bytes, got 'NoneType'"),
+])
+def test_uuid_bytes_property_not_bytes(bytes_value, exc_type, match):
+ import uuid
+
+ class BadUuid(uuid.UUID):
+ @property
+ def bytes(self):
+ return bytes_value
+
+ bad = BadUuid(uuid.uuid4().hex)
+ with pytest.raises(exc_type, match=match):
+ pa.array([bad], type=pa.uuid())
+ with pytest.raises(exc_type, match=match):
+ pa.scalar(bad, type=pa.uuid())
Review Comment:
Perhaps overly careful:
```suggestion
with pytest.raises(exc_type, match=match):
pa.array([bad], type=pa.uuid())
with pytest.raises(exc_type, match=match):
pa.scalar(bad, type=pa.uuid())
with pytest.raises(exc_type, match=match):
pa.array([bad])
with pytest.raises(exc_type, match=match):
pa.scalar(bad)
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]