This is an automated email from the ASF dual-hosted git repository.
rok pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new ad5f15576e GH-48625: [Python] Add temporal unit checking in
NumPyDtypeUnifier (#48626)
ad5f15576e is described below
commit ad5f15576e08a095394ebd2a1812dac4180ffe4d
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Thu Jan 8 04:32:12 2026 +0900
GH-48625: [Python] Add temporal unit checking in NumPyDtypeUnifier (#48626)
### Rationale for this change
This change addresses a TODO in the code:
https://github.com/apache/arrow/blob/de6eb89dbdcf210802c3aad5d3f1a3d4c64c3582/python/pyarrow/src/arrow/python/inference.cc#L258
Previously, when users mixed `numpy.datetime64` values with different units
(e.g., `datetime64[s]` and `datetime64[ms]`) in a single array, PyArrow
produced a confusing error message.
### What changes are included in this PR?
- Added datetime64 unit validation in
`NumPyDtypeUnifier::Observe_DATETIME()`
- Added `InvalidDatetimeUnitMix()` method
- Updated `NumPyDtypeUnifier::Observe()` to check units for same-type
comparisons
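- Updated existing tests
`test_array_from_different_numpy_datetime_units_raises` and
`test_array_from_timestamp_with_generic_unit`, and added
`test_array_from_unsupported_numpy_datetime_unit_names`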
- Removed the TODO comment (now implemented)
### Are these changes tested?
Manually tested, and unit tests were added.
### Are there any user-facing changes?
Yes. Mixing datetime64 units now raises `ArrowInvalid` (previously
`ArrowNotImplementedError`) with a clearer error message. For example:
```python
import pyarrow as pa
import numpy as np
pa.array([np.datetime64('2020-01-01', 's'), np.datetime64('2020-01-02',
'ms')])
```
Before:
```
pyarrow.lib.ArrowNotImplementedError: Expected np.datetime64 but got:
timestamp[ms]
```
After:
```
pyarrow.lib.ArrowInvalid: Cannot mix NumPy datetime64 units s and ms
```
* GitHub Issue: #48625
Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Rok Mihevc <[email protected]>
---
python/pyarrow/src/arrow/python/inference.cc | 65 +++++++++++++++++++++++++++-
python/pyarrow/tests/test_array.py | 30 +++++++++++--
2 files changed, 91 insertions(+), 4 deletions(-)
diff --git a/python/pyarrow/src/arrow/python/inference.cc
b/python/pyarrow/src/arrow/python/inference.cc
index 1aa7915ba1..e5714862e4 100644
--- a/python/pyarrow/src/arrow/python/inference.cc
+++ b/python/pyarrow/src/arrow/python/inference.cc
@@ -108,6 +108,17 @@ class NumPyDtypeUnifier {
GetNumPyTypeName(new_dtype));
}
+ Status InvalidDatetimeUnitMix(PyArray_Descr* new_descr) {
+ auto new_meta = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(
+ PyDataType_C_METADATA(new_descr));
+ auto current_meta = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(
+ PyDataType_C_METADATA(current_dtype_));
+
+ return Status::Invalid("Cannot mix NumPy datetime64 units ",
+ DatetimeUnitName(current_meta->meta.base), " and ",
+ DatetimeUnitName(new_meta->meta.base));
+ }
+
int Observe_BOOL(PyArray_Descr* descr, int dtype) { return INVALID; }
int Observe_INT8(PyArray_Descr* descr, int dtype) {
@@ -255,7 +266,17 @@ class NumPyDtypeUnifier {
}
int Observe_DATETIME(PyArray_Descr* dtype_obj) {
- // TODO: check that units are all the same
+ // Check that datetime units are consistent across all values
+ auto datetime_meta = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(
+ PyDataType_C_METADATA(dtype_obj));
+ auto current_meta = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(
+ PyDataType_C_METADATA(current_dtype_));
+
+ if (datetime_meta->meta.base != current_meta->meta.base) {
+ // Units don't match - this is invalid
+ return INVALID;
+ }
+
return OK;
}
@@ -267,6 +288,13 @@ class NumPyDtypeUnifier {
current_type_num_ = dtype;
return Status::OK();
} else if (current_type_num_ == dtype) {
+ // Same type, but for datetime we still need to check units match
+ if (dtype == NPY_DATETIME) {
+ int action = Observe_DATETIME(descr);
+ if (action == INVALID) {
+ return InvalidDatetimeUnitMix(descr);
+ }
+ }
return Status::OK();
}
@@ -309,6 +337,41 @@ class NumPyDtypeUnifier {
int current_type_num() const { return current_type_num_; }
private:
+ static std::string DatetimeUnitName(NPY_DATETIMEUNIT unit) {
+ switch (unit) {
+ case NPY_FR_Y:
+ return "Y";
+ case NPY_FR_M:
+ return "M";
+ case NPY_FR_W:
+ return "W";
+ case NPY_FR_D:
+ return "D";
+ case NPY_FR_h:
+ return "h";
+ case NPY_FR_m:
+ return "m";
+ case NPY_FR_s:
+ return "s";
+ case NPY_FR_ms:
+ return "ms";
+ case NPY_FR_us:
+ return "us";
+ case NPY_FR_ns:
+ return "ns";
+ case NPY_FR_ps:
+ return "ps";
+ case NPY_FR_fs:
+ return "fs";
+ case NPY_FR_as:
+ return "as";
+ case NPY_FR_GENERIC:
+ return "generic";
+ default:
+ return "unknown (" + std::to_string(static_cast<int>(unit)) + ")";
+ }
+ }
+
int current_type_num_;
PyArray_Descr* current_dtype_;
};
diff --git a/python/pyarrow/tests/test_array.py
b/python/pyarrow/tests/test_array.py
index cefa2de161..d09d9f45c7 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -2541,7 +2541,31 @@ def
test_array_from_different_numpy_datetime_units_raises():
ms = np.array(data, dtype='datetime64[ms]')
data = list(s[:2]) + list(ms[2:])
- with pytest.raises(pa.ArrowNotImplementedError):
+ with pytest.raises(pa.ArrowInvalid,
+ match="Cannot mix NumPy datetime64 units s and ms"):
+ pa.array(data)
+
+
[email protected]
[email protected]('unit', [
+ 'Y', # year
+ 'M', # month
+ 'W', # week
+ 'h', # hour
+ 'm', # minute
+ 'ps', # picosecond
+ 'fs', # femtosecond
+ 'as', # attosecond
+])
+def test_array_from_unsupported_numpy_datetime_unit_names(unit):
+ s_data = [np.datetime64('2020-01-01', 's')]
+ unsupported_data = [np.datetime64('2020', unit)]
+
+ # Mix supported unit (s) with unsupported unit
+ data = s_data + unsupported_data
+
+ with pytest.raises(pa.ArrowInvalid,
+ match=f"Cannot mix NumPy datetime64 units s and
{unit}"):
pa.array(data)
@@ -2566,8 +2590,8 @@ def test_array_from_timestamp_with_generic_unit():
x = np.datetime64('2017-01-01 01:01:01.111111111')
y = np.datetime64('2018-11-22 12:24:48.111111111')
- with pytest.raises(pa.ArrowNotImplementedError,
- match='Unbound or generic datetime64 time unit'):
+ with pytest.raises(pa.ArrowInvalid,
+ match='Cannot mix NumPy datetime64 units'):
pa.array([n, x, y])