This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 4679c7c ARROW-3080: [Python] Unify Arrow to Python object conversion paths
4679c7c is described below
commit 4679c7c0e4f0c9428c36e577f8e299b7c37eb04c
Author: Krisztián Szűcs <[email protected]>
AuthorDate: Sat Oct 10 10:47:32 2020 +0200
ARROW-3080: [Python] Unify Arrow to Python object conversion paths
This issue is more about the testing since we recently had a refactor
targeting the arrow to python conversion paths:
https://issues.apache.org/jira/browse/ARROW-9017
Closes #8349 from kszucs/ARROW-3080-hypo
Authored-by: Krisztián Szűcs <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
ci/scripts/python_test.sh | 2 +-
cpp/src/arrow/python/datetime.cc | 1 -
cpp/src/arrow/python/datetime.h | 55 +++----
cpp/src/arrow/python/helpers.cc | 50 ++++--
cpp/src/arrow/python/helpers.h | 11 ++
cpp/src/arrow/python/python_to_arrow.cc | 61 +++----
dev/tasks/tasks.yml | 12 ++
python/pyarrow/tests/conftest.py | 2 +-
python/pyarrow/tests/strategies.py | 235 ++++++++++++++++++++-------
python/pyarrow/tests/test_convert_builtin.py | 105 +++++++++++-
python/pyarrow/tests/test_pandas.py | 12 ++
python/pyarrow/tests/test_strategies.py | 5 +
12 files changed, 408 insertions(+), 143 deletions(-)
diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh
index 6f961d2..80a9cde 100755
--- a/ci/scripts/python_test.sh
+++ b/ci/scripts/python_test.sh
@@ -29,4 +29,4 @@ export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH}
# Enable some checks inside Python itself
export PYTHONDEVMODE=1
-pytest -r s --pyargs pyarrow
+pytest -r s ${PYTEST_ARGS} --pyargs pyarrow
diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc
index 07df5e7..8df2012 100644
--- a/cpp/src/arrow/python/datetime.cc
+++ b/cpp/src/arrow/python/datetime.cc
@@ -19,7 +19,6 @@
#include <algorithm>
#include <chrono>
#include <iomanip>
-#include <iostream>
#include "arrow/python/common.h"
#include "arrow/python/helpers.h"
diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h
index 4f3adb4..0072cdd 100644
--- a/cpp/src/arrow/python/datetime.h
+++ b/cpp/src/arrow/python/datetime.h
@@ -44,9 +44,9 @@ void InitDatetime();
ARROW_PYTHON_EXPORT
inline int64_t PyTime_to_us(PyObject* pytime) {
-  return (static_cast<int64_t>(PyDateTime_TIME_GET_HOUR(pytime)) * 3600000000LL +
-          static_cast<int64_t>(PyDateTime_TIME_GET_MINUTE(pytime)) * 60000000LL +
-          static_cast<int64_t>(PyDateTime_TIME_GET_SECOND(pytime)) * 1000000LL +
+ return (PyDateTime_TIME_GET_HOUR(pytime) * 3600000000LL +
+ PyDateTime_TIME_GET_MINUTE(pytime) * 60000000LL +
+ PyDateTime_TIME_GET_SECOND(pytime) * 1000000LL +
PyDateTime_TIME_GET_MICROSECOND(pytime));
}
@@ -77,38 +77,38 @@ ARROW_PYTHON_EXPORT
int64_t PyDate_to_days(PyDateTime_Date* pydate);
ARROW_PYTHON_EXPORT
+inline int64_t PyDate_to_s(PyDateTime_Date* pydate) {
+ return PyDate_to_days(pydate) * 86400LL;
+}
+
+ARROW_PYTHON_EXPORT
inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) {
- return PyDate_to_days(pydate) * 24 * 3600 * 1000;
+ return PyDate_to_days(pydate) * 86400000LL;
}
ARROW_PYTHON_EXPORT
inline int64_t PyDateTime_to_s(PyDateTime_DateTime* pydatetime) {
- int64_t total_seconds = 0;
- total_seconds += PyDateTime_DATE_GET_SECOND(pydatetime);
- total_seconds += PyDateTime_DATE_GET_MINUTE(pydatetime) * 60;
- total_seconds += PyDateTime_DATE_GET_HOUR(pydatetime) * 3600;
-
-  return total_seconds +
-         (PyDate_to_ms(reinterpret_cast<PyDateTime_Date*>(pydatetime)) / 1000LL);
+ return (PyDate_to_s(reinterpret_cast<PyDateTime_Date*>(pydatetime)) +
+ PyDateTime_DATE_GET_HOUR(pydatetime) * 3600LL +
+ PyDateTime_DATE_GET_MINUTE(pydatetime) * 60LL +
+ PyDateTime_DATE_GET_SECOND(pydatetime));
}
ARROW_PYTHON_EXPORT
inline int64_t PyDateTime_to_ms(PyDateTime_DateTime* pydatetime) {
- int64_t date_ms = PyDateTime_to_s(pydatetime) * 1000;
- int ms = PyDateTime_DATE_GET_MICROSECOND(pydatetime) / 1000;
- return date_ms + ms;
+ return (PyDateTime_to_s(pydatetime) * 1000LL +
+ PyDateTime_DATE_GET_MICROSECOND(pydatetime) / 1000);
}
ARROW_PYTHON_EXPORT
inline int64_t PyDateTime_to_us(PyDateTime_DateTime* pydatetime) {
- int64_t ms = PyDateTime_to_s(pydatetime) * 1000;
- int us = PyDateTime_DATE_GET_MICROSECOND(pydatetime);
- return ms * 1000 + us;
+ return (PyDateTime_to_s(pydatetime) * 1000000LL +
+ PyDateTime_DATE_GET_MICROSECOND(pydatetime));
}
ARROW_PYTHON_EXPORT
inline int64_t PyDateTime_to_ns(PyDateTime_DateTime* pydatetime) {
- return PyDateTime_to_us(pydatetime) * 1000;
+ return PyDateTime_to_us(pydatetime) * 1000LL;
}
ARROW_PYTHON_EXPORT
@@ -131,30 +131,25 @@ inline TimePoint TimePoint_from_ns(int64_t val) {
ARROW_PYTHON_EXPORT
inline int64_t PyDelta_to_s(PyDateTime_Delta* pytimedelta) {
- int64_t total_seconds = 0;
- total_seconds += PyDateTime_DELTA_GET_SECONDS(pytimedelta);
- total_seconds += PyDateTime_DELTA_GET_DAYS(pytimedelta) * 24 * 3600;
- return total_seconds;
+ return (PyDateTime_DELTA_GET_DAYS(pytimedelta) * 86400LL +
+ PyDateTime_DELTA_GET_SECONDS(pytimedelta));
}
ARROW_PYTHON_EXPORT
inline int64_t PyDelta_to_ms(PyDateTime_Delta* pytimedelta) {
- int64_t total_ms = PyDelta_to_s(pytimedelta) * 1000;
- total_ms += PyDateTime_DELTA_GET_MICROSECONDS(pytimedelta) / 1000;
- return total_ms;
+ return (PyDelta_to_s(pytimedelta) * 1000LL +
+ PyDateTime_DELTA_GET_MICROSECONDS(pytimedelta) / 1000);
}
ARROW_PYTHON_EXPORT
inline int64_t PyDelta_to_us(PyDateTime_Delta* pytimedelta) {
- int64_t total_us = 0;
- total_us += PyDelta_to_s(pytimedelta) * 1000 * 1000;
- total_us += PyDateTime_DELTA_GET_MICROSECONDS(pytimedelta);
- return total_us;
+ return (PyDelta_to_s(pytimedelta) * 1000000LL +
+ PyDateTime_DELTA_GET_MICROSECONDS(pytimedelta));
}
ARROW_PYTHON_EXPORT
inline int64_t PyDelta_to_ns(PyDateTime_Delta* pytimedelta) {
- return PyDelta_to_us(pytimedelta) * 1000;
+ return PyDelta_to_us(pytimedelta) * 1000LL;
}
ARROW_PYTHON_EXPORT
diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc
index 1845aa1..f20cdf0 100644
--- a/cpp/src/arrow/python/helpers.cc
+++ b/cpp/src/arrow/python/helpers.cc
@@ -128,6 +128,14 @@ Status PyObject_StdStringStr(PyObject* obj, std::string* out) {
return PyUnicode_AsStdString(string_ref.obj(), out);
}
+Result<bool> IsModuleImported(const std::string& module_name) {
+ // PyImport_GetModuleDict returns with a borrowed reference
+ OwnedRef key(PyUnicode_FromString(module_name.c_str()));
+ auto is_imported = PyDict_Contains(PyImport_GetModuleDict(), key.obj());
+ RETURN_IF_PYERROR();
+ return is_imported;
+}
+
Status ImportModule(const std::string& module_name, OwnedRef* ref) {
PyObject* module = PyImport_ImportModule(module_name.c_str());
RETURN_IF_PYERROR();
@@ -258,30 +266,44 @@ bool PyFloat_IsNaN(PyObject* obj) {
namespace {
static std::once_flag pandas_static_initialized;
-static PyTypeObject* pandas_NaTType = nullptr;
+
static PyObject* pandas_NA = nullptr;
+static PyObject* pandas_NaT = nullptr;
+static PyObject* pandas_Timedelta = nullptr;
+static PyObject* pandas_Timestamp = nullptr;
+static PyTypeObject* pandas_NaTType = nullptr;
void GetPandasStaticSymbols() {
OwnedRef pandas;
+
+ // import pandas
Status s = ImportModule("pandas", &pandas);
if (!s.ok()) {
return;
}
OwnedRef ref;
- s = ImportFromModule(pandas.obj(), "NaT", &ref);
- if (!s.ok()) {
- return;
+
+ // set NaT sentinel and its type
+ if (ImportFromModule(pandas.obj(), "NaT", &ref).ok()) {
+ pandas_NaT = ref.obj();
+ // PyObject_Type returns a new reference but we trust that pandas.NaT will
+ // outlive our use of this PyObject*
+ pandas_NaTType = Py_TYPE(ref.obj());
+ }
+
+ // retain a reference to Timedelta
+ if (ImportFromModule(pandas.obj(), "Timedelta", &ref).ok()) {
+ pandas_Timedelta = ref.obj();
}
- PyObject* nat_type = PyObject_Type(ref.obj());
- pandas_NaTType = reinterpret_cast<PyTypeObject*>(nat_type);
- // PyObject_Type returns a new reference but we trust that pandas.NaT will
- // outlive our use of this PyObject*
- Py_DECREF(nat_type);
+ // retain a reference to Timestamp
+ if (ImportFromModule(pandas.obj(), "Timestamp", &ref).ok()) {
+ pandas_Timestamp = ref.obj();
+ }
+ // if pandas.NA exists, retain a reference to it
if (ImportFromModule(pandas.obj(), "NA", &ref).ok()) {
- // If pandas.NA exists, retain a reference to it
pandas_NA = ref.obj();
}
}
@@ -307,6 +329,14 @@ bool PandasObjectIsNull(PyObject* obj) {
return false;
}
+bool IsPandasTimedelta(PyObject* obj) {
+ return pandas_Timedelta && PyObject_IsInstance(obj, pandas_Timedelta);
+}
+
+bool IsPandasTimestamp(PyObject* obj) {
+ return pandas_Timestamp && PyObject_IsInstance(obj, pandas_Timestamp);
+}
+
Status InvalidValue(PyObject* obj, const std::string& why) {
std::string obj_as_str;
RETURN_NOT_OK(internal::PyObject_StdStringStr(obj, &obj_as_str));
diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h
index 4d27c2c..1928875 100644
--- a/cpp/src/arrow/python/helpers.h
+++ b/cpp/src/arrow/python/helpers.h
@@ -51,6 +51,10 @@ ARROW_PYTHON_EXPORT Status PyFloat_AsHalf(PyObject* obj,
npy_half* out);
namespace internal {
+// \brief Check that a Python module has been already imported
+// \param[in] module_name The name of the module
+Result<bool> IsModuleImported(const std::string& module_name);
+
// \brief Import a Python module
// \param[in] module_name The name of the module
// \param[out] ref The OwnedRef containing the module PyObject*
@@ -76,6 +80,13 @@ void InitPandasStaticData();
ARROW_PYTHON_EXPORT
bool PandasObjectIsNull(PyObject* obj);
+// \brief Check that obj is a pandas.Timedelta instance
+ARROW_PYTHON_EXPORT
+bool IsPandasTimedelta(PyObject* obj);
+
+// \brief Check that obj is a pandas.Timestamp instance
+bool IsPandasTimestamp(PyObject* obj);
+
// \brief Check whether obj is a floating-point NaN
ARROW_PYTHON_EXPORT
bool PyFloat_IsNaN(PyObject* obj);
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 6d98211..d3b9d6b 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -249,18 +249,26 @@ class PyValue {
value = internal::PyDateTime_to_s(dt) - offset;
break;
case TimeUnit::MILLI:
- value = internal::PyDateTime_to_ms(dt) - offset * 1000;
+ value = internal::PyDateTime_to_ms(dt) - offset * 1000LL;
break;
case TimeUnit::MICRO:
- value = internal::PyDateTime_to_us(dt) - offset * 1000 * 1000;
+ value = internal::PyDateTime_to_us(dt) - offset * 1000000LL;
break;
case TimeUnit::NANO:
-        // Conversion to nanoseconds can overflow -> check multiply of microseconds
-        value = internal::PyDateTime_to_us(dt);
-        if (arrow::internal::MultiplyWithOverflow(value, 1000, &value)) {
-          return internal::InvalidValue(obj, "out of bounds for nanosecond resolution");
+ if (internal::IsPandasTimestamp(obj)) {
+ OwnedRef nanos(PyObject_GetAttrString(obj, "value"));
+ RETURN_IF_PYERROR();
+ RETURN_NOT_OK(internal::CIntFromPython(nanos.obj(), &value));
+ } else {
+        // Conversion to nanoseconds can overflow -> check multiply of microseconds
+        value = internal::PyDateTime_to_us(dt);
+        if (arrow::internal::MultiplyWithOverflow(value, 1000LL, &value)) {
+          return internal::InvalidValue(obj,
+                                        "out of bounds for nanosecond resolution");
+ }
}
-      if (arrow::internal::SubtractWithOverflow(value, offset * 1000 * 1000 * 1000,
+      // Adjust with offset and check for overflow
+      if (arrow::internal::SubtractWithOverflow(value, offset * 1000000000LL,
                                                 &value)) {
         return internal::InvalidValue(obj, "out of bounds for nanosecond resolution");
}
@@ -298,7 +306,13 @@ class PyValue {
value = internal::PyDelta_to_us(dt);
break;
case TimeUnit::NANO:
- value = internal::PyDelta_to_ns(dt);
+ if (internal::IsPandasTimedelta(obj)) {
+ OwnedRef nanos(PyObject_GetAttrString(obj, "value"));
+ RETURN_IF_PYERROR();
+ RETURN_NOT_OK(internal::CIntFromPython(nanos.obj(), &value));
+ } else {
+ value = internal::PyDelta_to_ns(dt);
+ }
break;
default:
return Status::UnknownError("Invalid time unit");
@@ -664,12 +678,7 @@ class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> {
} \
return AppendNdarrayTyped<TYPE, NUMPY_TYPE>(ndarray); \
}
-// Use internal::VisitSequence, fast for NPY_OBJECT but slower otherwise
-#define LIST_SLOW_CASE(TYPE_ID) \
- case Type::TYPE_ID: { \
- return Extend(this->value_converter_.get(), value, size); \
- }
- LIST_SLOW_CASE(NA)
+ LIST_FAST_CASE(BOOL, BooleanType, NPY_BOOL)
LIST_FAST_CASE(UINT8, UInt8Type, NPY_UINT8)
LIST_FAST_CASE(INT8, Int8Type, NPY_INT8)
LIST_FAST_CASE(UINT16, UInt16Type, NPY_UINT16)
@@ -683,24 +692,9 @@ class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> {
LIST_FAST_CASE(DOUBLE, DoubleType, NPY_DOUBLE)
LIST_FAST_CASE(TIMESTAMP, TimestampType, NPY_DATETIME)
LIST_FAST_CASE(DURATION, DurationType, NPY_TIMEDELTA)
- LIST_SLOW_CASE(DATE32)
- LIST_SLOW_CASE(DATE64)
- LIST_SLOW_CASE(TIME32)
- LIST_SLOW_CASE(TIME64)
- LIST_SLOW_CASE(BINARY)
- LIST_SLOW_CASE(FIXED_SIZE_BINARY)
- LIST_SLOW_CASE(STRING)
#undef LIST_FAST_CASE
-#undef LIST_SLOW_CASE
- case Type::LIST: {
- if (PyArray_DESCR(ndarray)->type_num != NPY_OBJECT) {
- return Status::Invalid(
- "Can only convert list types from NumPy object array input");
- }
- return Extend(this->value_converter_.get(), value, /*reserved=*/0);
- }
default: {
-      return Status::TypeError("Unknown list item type: ", value_type->ToString());
+ return Extend(this->value_converter_.get(), value, size);
}
}
}
@@ -1004,6 +998,13 @@ Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject*
PyObject* seq;
OwnedRef tmp_seq_nanny;
+  ARROW_ASSIGN_OR_RAISE(auto is_pandas_imported, internal::IsModuleImported("pandas"));
+  if (is_pandas_imported) {
+    // If pandas has been already imported initialize the static pandas objects to
+    // support converting from pd.Timedelta and pd.Timestamp objects
+    internal::InitPandasStaticData();
+  }
+
int64_t size = options.size;
RETURN_NOT_OK(ConvertToSequenceAndInferSize(obj, &seq, &size));
tmp_seq_nanny.reset(seq);
diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml
index e4d3d0a..8e5a74f 100644
--- a/dev/tasks/tasks.yml
+++ b/dev/tasks/tasks.yml
@@ -1662,6 +1662,18 @@ tasks:
PYTHON: 3.8
run: conda-python
+ test-conda-python-3.8-hypothesis:
+ ci: github
+ template: docker-tests/github.linux.yml
+ params:
+ env:
+ HYPOTHESIS_PROFILE: ci
+ PYARROW_TEST_HYPOTHESIS: ON
+ PYTHON: 3.8
+ # limit to execute hypothesis tests only
+ PYTEST_ARGS: "-m hypothesis"
+ run: conda-python-pandas
+
test-debian-10-python-3:
ci: azure
template: docker-tests/azure.linux.yml
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index 43a6496..18cd6e9 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -28,7 +28,7 @@ from pyarrow.util import find_free_port
# setup hypothesis profiles
h.settings.register_profile('ci', max_examples=1000)
-h.settings.register_profile('dev', max_examples=10)
+h.settings.register_profile('dev', max_examples=50)
h.settings.register_profile('debug', max_examples=10,
verbosity=h.Verbosity.verbose)
diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py
index 97e972d..cb9d943 100644
--- a/python/pyarrow/tests/strategies.py
+++ b/python/pyarrow/tests/strategies.py
@@ -42,6 +42,17 @@ binary_type = st.just(pa.binary())
string_type = st.just(pa.string())
large_binary_type = st.just(pa.large_binary())
large_string_type = st.just(pa.large_string())
+fixed_size_binary_type = st.builds(
+ pa.binary,
+ st.integers(min_value=0, max_value=16)
+)
+binary_like_types = st.one_of(
+ binary_type,
+ string_type,
+ large_binary_type,
+ large_string_type,
+ fixed_size_binary_type
+)
signed_integer_types = st.sampled_from([
pa.int8(),
@@ -98,12 +109,9 @@ temporal_types = st.one_of(
primitive_types = st.one_of(
null_type,
bool_type,
- binary_type,
- string_type,
- large_binary_type,
- large_string_type,
numeric_types,
- temporal_types
+ temporal_types,
+ binary_like_types
)
metadata = st.dictionaries(st.text(), st.text())
@@ -124,28 +132,49 @@ def fields(draw, type_strategy=primitive_types):
def list_types(item_strategy=primitive_types):
return (
st.builds(pa.list_, item_strategy) |
- st.builds(pa.large_list, item_strategy)
+ st.builds(pa.large_list, item_strategy) |
+ st.builds(
+ pa.list_,
+ item_strategy,
+ st.integers(min_value=0, max_value=16)
+ )
)
-def struct_types(item_strategy=primitive_types):
- return st.builds(pa.struct, st.lists(fields(item_strategy)))
-
-
-def complex_types(inner_strategy=primitive_types):
- return list_types(inner_strategy) | struct_types(inner_strategy)
-
-
-def nested_list_types(item_strategy=primitive_types, max_leaves=3):
- return st.recursive(item_strategy, list_types, max_leaves=max_leaves)
[email protected]
+def struct_types(draw, item_strategy=primitive_types):
+ fields_strategy = st.lists(fields(item_strategy))
+ fields_rendered = draw(fields_strategy)
+ field_names = [field.name for field in fields_rendered]
+ # check that field names are unique, see ARROW-9997
+ h.assume(len(set(field_names)) == len(field_names))
+ return pa.struct(fields_rendered)
+
+
+def dictionary_types(key_strategy=None, value_strategy=None):
+ key_strategy = key_strategy or signed_integer_types
+ value_strategy = value_strategy or st.one_of(
+ bool_type,
+ integer_types,
+ st.sampled_from([pa.float32(), pa.float64()]),
+ binary_type,
+ string_type,
+ fixed_size_binary_type,
+ )
+ return st.builds(pa.dictionary, key_strategy, value_strategy)
-def nested_struct_types(item_strategy=primitive_types, max_leaves=3):
- return st.recursive(item_strategy, struct_types, max_leaves=max_leaves)
[email protected]
+def map_types(draw, key_strategy=primitive_types,
+ item_strategy=primitive_types):
+ key_type = draw(key_strategy)
+ h.assume(not pa.types.is_null(key_type))
+ value_type = draw(item_strategy)
+ return pa.map_(key_type, value_type)
-def nested_complex_types(inner_strategy=primitive_types, max_leaves=3):
- return st.recursive(inner_strategy, complex_types, max_leaves=max_leaves)
+# union type
+# extension type
def schemas(type_strategy=primitive_types, max_fields=None):
@@ -153,10 +182,17 @@ def schemas(type_strategy=primitive_types, max_fields=None):
return st.builds(pa.schema, children)
-complex_schemas = schemas(complex_types())
-
-
-all_types = st.one_of(primitive_types, complex_types(), nested_complex_types())
+all_types = st.deferred(
+ lambda: (
+ primitive_types |
+ list_types() |
+ struct_types() |
+ dictionary_types() |
+ map_types() |
+ list_types(all_types) |
+ struct_types(all_types)
+ )
+)
all_fields = fields(all_types)
all_schemas = schemas(all_types)
@@ -165,7 +201,21 @@ _default_array_sizes = st.integers(min_value=0, max_value=20)
@st.composite
-def arrays(draw, type, size=None):
+def _pylist(draw, value_type, size, nullable=True):
+ arr = draw(arrays(value_type, size=size, nullable=False))
+ return arr.to_pylist()
+
+
[email protected]
+def _pymap(draw, key_type, value_type, size, nullable=True):
+ length = draw(size)
+ keys = draw(_pylist(key_type, size=length, nullable=False))
+ values = draw(_pylist(value_type, size=length, nullable=nullable))
+ return list(zip(keys, values))
+
+
[email protected]
+def arrays(draw, type, size=None, nullable=True):
if isinstance(type, st.SearchStrategy):
ty = draw(type)
elif isinstance(type, pa.DataType):
@@ -180,38 +230,24 @@ def arrays(draw, type, size=None):
elif not isinstance(size, int):
raise TypeError('Size must be an integer')
- shape = (size,)
-
- if pa.types.is_list(ty) or pa.types.is_large_list(ty):
- offsets = draw(npst.arrays(np.uint8(), shape=shape)).cumsum() // 20
- offsets = np.insert(offsets, 0, 0, axis=0) # prepend with zero
- values = draw(arrays(ty.value_type, size=int(offsets.sum())))
- if pa.types.is_large_list(ty):
- array_type = pa.LargeListArray
- else:
- array_type = pa.ListArray
- return array_type.from_arrays(offsets, values)
-
- if pa.types.is_struct(ty):
- h.assume(len(ty) > 0)
- fields, child_arrays = [], []
- for field in ty:
- fields.append(field)
- child_arrays.append(draw(arrays(field.type, size=size)))
- return pa.StructArray.from_arrays(child_arrays, fields=fields)
-
- if (pa.types.is_boolean(ty) or pa.types.is_integer(ty) or
- pa.types.is_floating(ty)):
- values = npst.arrays(ty.to_pandas_dtype(), shape=(size,))
- np_arr = draw(values)
- if pa.types.is_floating(ty):
- # Workaround ARROW-4952: no easy way to assert array equality
- # in a NaN-tolerant way.
- np_arr[np.isnan(np_arr)] = -42.0
- return pa.array(np_arr, type=ty)
-
if pa.types.is_null(ty):
+ h.assume(nullable)
value = st.none()
+ elif pa.types.is_boolean(ty):
+ value = st.booleans()
+ elif pa.types.is_integer(ty):
+ values = draw(npst.arrays(ty.to_pandas_dtype(), shape=(size,)))
+ return pa.array(values, type=ty)
+ elif pa.types.is_floating(ty):
+ values = draw(npst.arrays(ty.to_pandas_dtype(), shape=(size,)))
+ # Workaround ARROW-4952: no easy way to assert array equality
+ # in a NaN-tolerant way.
+ values[np.isnan(values)] = -42.0
+ return pa.array(values, type=ty)
+ elif pa.types.is_decimal(ty):
+ # TODO(kszucs): properly limit the precision
+ # value = st.decimals(places=type.scale, allow_infinity=False)
+ h.reject()
elif pa.types.is_time(ty):
value = st.times()
elif pa.types.is_date(ty):
@@ -219,8 +255,8 @@ def arrays(draw, type, size=None):
elif pa.types.is_timestamp(ty):
min_int64 = -(2**63)
max_int64 = 2**63 - 1
- min_datetime = datetime.datetime.fromtimestamp(min_int64 / 10**9)
- max_datetime = datetime.datetime.fromtimestamp(max_int64 / 10**9)
+ min_datetime = datetime.datetime.fromtimestamp(min_int64 // 10**9)
+ max_datetime = datetime.datetime.fromtimestamp(max_int64 // 10**9)
try:
offset_hours = int(ty.tz)
tz = pytz.FixedOffset(offset_hours * 60)
@@ -234,14 +270,34 @@ def arrays(draw, type, size=None):
value = st.binary()
elif pa.types.is_string(ty) or pa.types.is_large_string(ty):
value = st.text()
- elif pa.types.is_decimal(ty):
- # TODO(kszucs): properly limit the precision
- # value = st.decimals(places=type.scale, allow_infinity=False)
- h.reject()
+ elif pa.types.is_fixed_size_binary(ty):
+ value = st.binary(min_size=ty.byte_width, max_size=ty.byte_width)
+ elif pa.types.is_list(ty):
+ value = _pylist(ty.value_type, size=size, nullable=nullable)
+ elif pa.types.is_large_list(ty):
+ value = _pylist(ty.value_type, size=size, nullable=nullable)
+ elif pa.types.is_fixed_size_list(ty):
+ value = _pylist(ty.value_type, size=ty.list_size, nullable=nullable)
+ elif pa.types.is_dictionary(ty):
+ values = _pylist(ty.value_type, size=size, nullable=nullable)
+ return pa.array(draw(values), type=ty)
+ elif pa.types.is_map(ty):
+ value = _pymap(ty.key_type, ty.item_type, size=_default_array_sizes,
+ nullable=nullable)
+ elif pa.types.is_struct(ty):
+ h.assume(len(ty) > 0)
+ fields, child_arrays = [], []
+ for field in ty:
+ fields.append(field)
+ child_arrays.append(draw(arrays(field.type, size=size)))
+ return pa.StructArray.from_arrays(child_arrays, fields=fields)
else:
raise NotImplementedError(ty)
+ if nullable:
+ value = st.one_of(st.none(), value)
values = st.lists(value, min_size=size, max_size=size)
+
return pa.array(draw(values), type=ty)
@@ -293,3 +349,60 @@ all_arrays = arrays(all_types)
all_chunked_arrays = chunked_arrays(all_types)
all_record_batches = record_batches(all_types)
all_tables = tables(all_types)
+
+
+# Define the same rules as above for pandas tests by excluding certain types
+# from the generation because of known issues.
+
+pandas_compatible_primitive_types = st.one_of(
+ null_type,
+ bool_type,
+ integer_types,
+ st.sampled_from([pa.float32(), pa.float64()]),
+ decimal_type,
+ date_types,
+ time_types,
+ # Need to exclude timestamp and duration types otherwise hypothesis
+ # discovers ARROW-10210
+ # timestamp_types,
+ # duration_types
+ binary_type,
+ string_type,
+ large_binary_type,
+ large_string_type,
+)
+
+# Need to exclude floating point types otherwise hypothesis discovers
+# ARROW-10211
+pandas_compatible_dictionary_value_types = st.one_of(
+ bool_type,
+ integer_types,
+ binary_type,
+ string_type,
+ fixed_size_binary_type,
+)
+
+
+def pandas_compatible_list_types(
+ item_strategy=pandas_compatible_primitive_types
+):
+ # Need to exclude fixed size list type otherwise hypothesis discovers
+ # ARROW-10194
+ return (
+ st.builds(pa.list_, item_strategy) |
+ st.builds(pa.large_list, item_strategy)
+ )
+
+
+pandas_compatible_types = st.deferred(
+ lambda: st.one_of(
+ pandas_compatible_primitive_types,
+ pandas_compatible_list_types(pandas_compatible_primitive_types),
+ struct_types(pandas_compatible_primitive_types),
+ dictionary_types(
+ value_strategy=pandas_compatible_dictionary_value_types
+ ),
+ pandas_compatible_list_types(pandas_compatible_types),
+ struct_types(pandas_compatible_types)
+ )
+)
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index c4580db..91624d7 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -387,7 +387,6 @@ def test_broken_integers(seq):
def test_numpy_scalars_mixed_type():
-
# ARROW-4324
data = [np.int32(10), np.float32(0.5)]
arr = pa.array(data)
@@ -627,6 +626,50 @@ def test_multidimensional_ndarray_as_nested_list():
assert result.equals(expected)
[email protected](('data', 'value_type'), [
+ ([True, False], pa.bool_()),
+ ([None, None], pa.null()),
+ ([1, 2, None], pa.int8()),
+ ([1, 2., 3., None], pa.float32()),
+ ([datetime.date.today(), None], pa.date32()),
+ ([None, datetime.date.today()], pa.date64()),
+ ([datetime.time(1, 1, 1), None], pa.time32('s')),
+ ([None, datetime.time(2, 2, 2)], pa.time64('us')),
+ ([datetime.datetime.now(), None], pa.timestamp('us')),
+ ([datetime.timedelta(seconds=10)], pa.duration('s')),
+ ([b"a", b"b"], pa.binary()),
+ ([b"aaa", b"bbb", b"ccc"], pa.binary(3)),
+ ([b"a", b"b", b"c"], pa.large_binary()),
+ (["a", "b", "c"], pa.string()),
+ (["a", "b", "c"], pa.large_string()),
+ (
+ [{"a": 1, "b": 2}, None, {"a": 5, "b": None}],
+ pa.struct([('a', pa.int8()), ('b', pa.int16())])
+ )
+])
+def test_list_array_from_object_ndarray(data, value_type):
+ ty = pa.list_(value_type)
+ ndarray = np.array(data, dtype=object)
+ arr = pa.array([ndarray], type=ty)
+ assert arr.type.equals(ty)
+ assert arr.to_pylist() == [data]
+
+
[email protected](('data', 'value_type'), [
+ ([[1, 2], [3]], pa.list_(pa.int64())),
+ ([[1, 2], [3, 4]], pa.list_(pa.int64(), 2)),
+ ([[1], [2, 3]], pa.large_list(pa.int64()))
+])
+def test_nested_list_array_from_object_ndarray(data, value_type):
+ ndarray = np.empty(len(data), dtype=object)
+ ndarray[:] = [np.array(item, dtype=object) for item in data]
+
+ ty = pa.list_(value_type)
+ arr = pa.array([ndarray], type=ty)
+ assert arr.type.equals(ty)
+ assert arr.to_pylist() == [data]
+
+
def test_array_ignore_nan_from_pandas():
# See ARROW-4324, this reverts logic that was introduced in
# ARROW-2240
@@ -1903,18 +1946,62 @@ def test_dictionary_from_strings():
assert a.dictionary.equals(expected_dictionary)
-def _has_unique_field_names(ty):
- if isinstance(ty, pa.StructType):
- field_names = [field.name for field in ty]
- return len(set(field_names)) == len(field_names)
- else:
- return True
[email protected](('unit', 'expected'), [
+ ('s', datetime.timedelta(seconds=-2147483000)),
+ ('ms', datetime.timedelta(milliseconds=-2147483000)),
+ ('us', datetime.timedelta(microseconds=-2147483000)),
+ ('ns', datetime.timedelta(microseconds=-2147483))
+])
+def test_duration_array_roundtrip_corner_cases(unit, expected):
+ # Corner case discovered by hypothesis: there were implicit conversions to
+ # unsigned values resulting wrong values with wrong signs.
+ ty = pa.duration(unit)
+ arr = pa.array([-2147483000], type=ty)
+ restored = pa.array(arr.to_pylist(), type=ty)
+ assert arr.equals(restored)
+
+ expected_list = [expected]
+ if unit == 'ns':
+ # if pandas is available then a pandas Timedelta is returned
+ try:
+ import pandas as pd
+ except ImportError:
+ pass
+ else:
+ expected_list = [pd.Timedelta(-2147483000, unit='ns')]
+
+ assert restored.to_pylist() == expected_list
+
+
[email protected]
+def test_roundtrip_nanosecond_resolution_pandas_temporal_objects():
+ # corner case discovered by hypothesis: preserving the nanoseconds on
+ # conversion from a list of Timedelta and Timestamp objects
+ import pandas as pd
+
+ ty = pa.duration('ns')
+ arr = pa.array([9223371273709551616], type=ty)
+ data = arr.to_pylist()
+ assert isinstance(data[0], pd.Timedelta)
+ restored = pa.array(data, type=ty)
+ assert arr.equals(restored)
+ assert restored.to_pylist() == [
+ pd.Timedelta(9223371273709551616, unit='ns')
+ ]
+
+ ty = pa.timestamp('ns')
+ arr = pa.array([9223371273709551616], type=ty)
+ data = arr.to_pylist()
+ assert isinstance(data[0], pd.Timestamp)
+ restored = pa.array(data, type=ty)
+ assert arr.equals(restored)
+ assert restored.to_pylist() == [
+ pd.Timestamp(9223371273709551616, unit='ns')
+ ]
@h.given(past.all_arrays)
def test_array_to_pylist_roundtrip(arr):
- # TODO(kszucs): ARROW-9997
- h.assume(_has_unique_field_names(arr.type))
seq = arr.to_pylist()
restored = pa.array(seq, type=arr.type)
assert restored.equals(arr)
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 756b4ff..54f4574 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -35,6 +35,7 @@ import pytz
from pyarrow.pandas_compat import get_logical_type, _pandas_api
from pyarrow.tests.util import random_ascii, rands
+import pyarrow.tests.strategies as past
import pyarrow as pa
try:
@@ -2842,6 +2843,17 @@ def test_convert_unsupported_type_error_message():
# ----------------------------------------------------------------------
+# Hypothesis tests
+
+
[email protected](past.arrays(past.pandas_compatible_types))
+def test_array_to_pandas_roundtrip(arr):
+ s = arr.to_pandas()
+ restored = pa.array(s, type=arr.type, from_pandas=True)
+ assert restored.equals(arr)
+
+
+# ----------------------------------------------------------------------
# Test object deduplication in to_pandas
diff --git a/python/pyarrow/tests/test_strategies.py b/python/pyarrow/tests/test_strategies.py
index f4249df..14fc949 100644
--- a/python/pyarrow/tests/test_strategies.py
+++ b/python/pyarrow/tests/test_strategies.py
@@ -41,6 +41,11 @@ def test_arrays(array):
assert isinstance(array, pa.lib.Array)
[email protected](past.arrays(past.primitive_types, nullable=False))
+def test_array_nullability(array):
+ assert array.null_count == 0
+
+
@h.given(past.all_chunked_arrays)
def test_chunked_arrays(chunked_array):
assert isinstance(chunked_array, pa.lib.ChunkedArray)