This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 4679c7c ARROW-3080: [Python] Unify Arrow to Python object conversion paths
4679c7c is described below
commit 4679c7c0e4f0c9428c36e577f8e299b7c37eb04c
Author: Krisztián Szűcs <[email protected]>
AuthorDate: Sat Oct 10 10:47:32 2020 +0200
ARROW-3080: [Python] Unify Arrow to Python object conversion paths
This issue is more about the testing since we recently had a refactor
targeting the arrow to python conversion paths:
https://issues.apache.org/jira/browse/ARROW-9017
Closes #8349 from kszucs/ARROW-3080-hypo
Authored-by: Krisztián Szűcs <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
ci/scripts/python_test.sh | 2 +-
cpp/src/arrow/python/datetime.cc | 1 -
cpp/src/arrow/python/datetime.h | 55 +++----
cpp/src/arrow/python/helpers.cc | 50 ++++--
cpp/src/arrow/python/helpers.h | 11 ++
cpp/src/arrow/python/python_to_arrow.cc | 61 +++----
dev/tasks/tasks.yml | 12 ++
python/pyarrow/tests/conftest.py | 2 +-
python/pyarrow/tests/strategies.py | 235 ++++++++++++++++++++-------
python/pyarrow/tests/test_convert_builtin.py | 105 +++++++++++-
python/pyarrow/tests/test_pandas.py | 12 ++
python/pyarrow/tests/test_strategies.py | 5 +
12 files changed, 408 insertions(+), 143 deletions(-)
diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh
index 6f961d2..80a9cde 100755
--- a/ci/scripts/python_test.sh
+++ b/ci/scripts/python_test.sh
@@ -29,4 +29,4 @@ export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH}
# Enable some checks inside Python itself
export PYTHONDEVMODE=1
-pytest -r s --pyargs pyarrow
+pytest -r s ${PYTEST_ARGS} --pyargs pyarrow
diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc
index 07df5e7..8df2012 100644
--- a/cpp/src/arrow/python/datetime.cc
+++ b/cpp/src/arrow/python/datetime.cc
@@ -19,7 +19,6 @@
#include <algorithm>
#include <chrono>
#include <iomanip>
-#include <iostream>
#include "arrow/python/common.h"
#include "arrow/python/helpers.h"
diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h
index 4f3adb4..0072cdd 100644
--- a/cpp/src/arrow/python/datetime.h
+++ b/cpp/src/arrow/python/datetime.h
@@ -44,9 +44,9 @@ void InitDatetime();
ARROW_PYTHON_EXPORT
inline int64_t PyTime_to_us(PyObject* pytime) {
-  return (static_cast<int64_t>(PyDateTime_TIME_GET_HOUR(pytime)) * 3600000000LL +
-          static_cast<int64_t>(PyDateTime_TIME_GET_MINUTE(pytime)) * 60000000LL +
-          static_cast<int64_t>(PyDateTime_TIME_GET_SECOND(pytime)) * 1000000LL +
+ return (PyDateTime_TIME_GET_HOUR(pytime) * 3600000000LL +
+ PyDateTime_TIME_GET_MINUTE(pytime) * 60000000LL +
+ PyDateTime_TIME_GET_SECOND(pytime) * 1000000LL +
PyDateTime_TIME_GET_MICROSECOND(pytime));
}
@@ -77,38 +77,38 @@ ARROW_PYTHON_EXPORT
int64_t PyDate_to_days(PyDateTime_Date* pydate);
ARROW_PYTHON_EXPORT
+inline int64_t PyDate_to_s(PyDateTime_Date* pydate) {
+ return PyDate_to_days(pydate) * 86400LL;
+}
+
+ARROW_PYTHON_EXPORT
inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) {
- return PyDate_to_days(pydate) * 24 * 3600 * 1000;
+ return PyDate_to_days(pydate) * 86400000LL;
}
ARROW_PYTHON_EXPORT
inline int64_t PyDateTime_to_s(PyDateTime_DateTime* pydatetime) {
- int64_t total_seconds = 0;
- total_seconds += PyDateTime_DATE_GET_SECOND(pydatetime);
- total_seconds += PyDateTime_DATE_GET_MINUTE(pydatetime) * 60;
- total_seconds += PyDateTime_DATE_GET_HOUR(pydatetime) * 3600;
-
-  return total_seconds +
-         (PyDate_to_ms(reinterpret_cast<PyDateTime_Date*>(pydatetime)) / 1000LL);
+ return (PyDate_to_s(reinterpret_cast<PyDateTime_Date*>(pydatetime)) +
+ PyDateTime_DATE_GET_HOUR(pydatetime) * 3600LL +
+ PyDateTime_DATE_GET_MINUTE(pydatetime) * 60LL +
+ PyDateTime_DATE_GET_SECOND(pydatetime));
}
ARROW_PYTHON_EXPORT
inline int64_t PyDateTime_to_ms(PyDateTime_DateTime* pydatetime) {
- int64_t date_ms = PyDateTime_to_s(pydatetime) * 1000;
- int ms = PyDateTime_DATE_GET_MICROSECOND(pydatetime) / 1000;
- return date_ms + ms;
+ return (PyDateTime_to_s(pydatetime) * 1000LL +
+ PyDateTime_DATE_GET_MICROSECOND(pydatetime) / 1000);
}
ARROW_PYTHON_EXPORT
inline int64_t PyDateTime_to_us(PyDateTime_DateTime* pydatetime) {
- int64_t ms = PyDateTime_to_s(pydatetime) * 1000;
- int us = PyDateTime_DATE_GET_MICROSECOND(pydatetime);
- return ms * 1000 + us;
+ return (PyDateTime_to_s(pydatetime) * 1000000LL +
+ PyDateTime_DATE_GET_MICROSECOND(pydatetime));
}
ARROW_PYTHON_EXPORT
inline int64_t PyDateTime_to_ns(PyDateTime_DateTime* pydatetime) {
- return PyDateTime_to_us(pydatetime) * 1000;
+ return PyDateTime_to_us(pydatetime) * 1000LL;
}
ARROW_PYTHON_EXPORT
@@ -131,30 +131,25 @@ inline TimePoint TimePoint_from_ns(int64_t val) {
ARROW_PYTHON_EXPORT
inline int64_t PyDelta_to_s(PyDateTime_Delta* pytimedelta) {
- int64_t total_seconds = 0;
- total_seconds += PyDateTime_DELTA_GET_SECONDS(pytimedelta);
- total_seconds += PyDateTime_DELTA_GET_DAYS(pytimedelta) * 24 * 3600;
- return total_seconds;
+ return (PyDateTime_DELTA_GET_DAYS(pytimedelta) * 86400LL +
+ PyDateTime_DELTA_GET_SECONDS(pytimedelta));
}
ARROW_PYTHON_EXPORT
inline int64_t PyDelta_to_ms(PyDateTime_Delta* pytimedelta) {
- int64_t total_ms = PyDelta_to_s(pytimedelta) * 1000;
- total_ms += PyDateTime_DELTA_GET_MICROSECONDS(pytimedelta) / 1000;
- return total_ms;
+ return (PyDelta_to_s(pytimedelta) * 1000LL +
+ PyDateTime_DELTA_GET_MICROSECONDS(pytimedelta) / 1000);
}
ARROW_PYTHON_EXPORT
inline int64_t PyDelta_to_us(PyDateTime_Delta* pytimedelta) {
- int64_t total_us = 0;
- total_us += PyDelta_to_s(pytimedelta) * 1000 * 1000;
- total_us += PyDateTime_DELTA_GET_MICROSECONDS(pytimedelta);
- return total_us;
+ return (PyDelta_to_s(pytimedelta) * 1000000LL +
+ PyDateTime_DELTA_GET_MICROSECONDS(pytimedelta));
}
ARROW_PYTHON_EXPORT
inline int64_t PyDelta_to_ns(PyDateTime_Delta* pytimedelta) {
- return PyDelta_to_us(pytimedelta) * 1000;
+ return PyDelta_to_us(pytimedelta) * 1000LL;
}
ARROW_PYTHON_EXPORT
diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc
index 1845aa1..f20cdf0 100644
--- a/cpp/src/arrow/python/helpers.cc
+++ b/cpp/src/arrow/python/helpers.cc
@@ -128,6 +128,14 @@ Status PyObject_StdStringStr(PyObject* obj, std::string* out) {
return PyUnicode_AsStdString(string_ref.obj(), out);
}
+Result<bool> IsModuleImported(const std::string& module_name) {
+ // PyImport_GetModuleDict returns with a borrowed reference
+ OwnedRef key(PyUnicode_FromString(module_name.c_str()));
+ auto is_imported = PyDict_Contains(PyImport_GetModuleDict(), key.obj());
+ RETURN_IF_PYERROR();
+ return is_imported;
+}
+
Status ImportModule(const std::string& module_name, OwnedRef* ref) {
PyObject* module = PyImport_ImportModule(module_name.c_str());
RETURN_IF_PYERROR();
@@ -258,30 +266,44 @@ bool PyFloat_IsNaN(PyObject* obj) {
namespace {
static std::once_flag pandas_static_initialized;
-static PyTypeObject* pandas_NaTType = nullptr;
+
static PyObject* pandas_NA = nullptr;
+static PyObject* pandas_NaT = nullptr;
+static PyObject* pandas_Timedelta = nullptr;
+static PyObject* pandas_Timestamp = nullptr;
+static PyTypeObject* pandas_NaTType = nullptr;
void GetPandasStaticSymbols() {
OwnedRef pandas;
+
+ // import pandas
Status s = ImportModule("pandas", &pandas);
if (!s.ok()) {
return;
}
OwnedRef ref;
- s = ImportFromModule(pandas.obj(), "NaT", &ref);
- if (!s.ok()) {
- return;
+
+ // set NaT sentinel and its type
+ if (ImportFromModule(pandas.obj(), "NaT", &ref).ok()) {
+ pandas_NaT = ref.obj();
+ // PyObject_Type returns a new reference but we trust that pandas.NaT will
+ // outlive our use of this PyObject*
+ pandas_NaTType = Py_TYPE(ref.obj());
+ }
+
+ // retain a reference to Timedelta
+ if (ImportFromModule(pandas.obj(), "Timedelta", &ref).ok()) {
+ pandas_Timedelta = ref.obj();
}
- PyObject* nat_type = PyObject_Type(ref.obj());
- pandas_NaTType = reinterpret_cast<PyTypeObject*>(nat_type);
- // PyObject_Type returns a new reference but we trust that pandas.NaT will
- // outlive our use of this PyObject*
- Py_DECREF(nat_type);
+ // retain a reference to Timestamp
+ if (ImportFromModule(pandas.obj(), "Timestamp", &ref).ok()) {
+ pandas_Timestamp = ref.obj();
+ }
+ // if pandas.NA exists, retain a reference to it
if (ImportFromModule(pandas.obj(), "NA", &ref).ok()) {
- // If pandas.NA exists, retain a reference to it
pandas_NA = ref.obj();
}
}
@@ -307,6 +329,14 @@ bool PandasObjectIsNull(PyObject* obj) {
return false;
}
+bool IsPandasTimedelta(PyObject* obj) {
+ return pandas_Timedelta && PyObject_IsInstance(obj, pandas_Timedelta);
+}
+
+bool IsPandasTimestamp(PyObject* obj) {
+ return pandas_Timestamp && PyObject_IsInstance(obj, pandas_Timestamp);
+}
+
Status InvalidValue(PyObject* obj, const std::string& why) {
std::string obj_as_str;
RETURN_NOT_OK(internal::PyObject_StdStringStr(obj, &obj_as_str));
diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h
index 4d27c2c..1928875 100644
--- a/cpp/src/arrow/python/helpers.h
+++ b/cpp/src/arrow/python/helpers.h
@@ -51,6 +51,10 @@ ARROW_PYTHON_EXPORT Status PyFloat_AsHalf(PyObject* obj,
npy_half* out);
namespace internal {
+// \brief Check that a Python module has been already imported
+// \param[in] module_name The name of the module
+Result<bool> IsModuleImported(const std::string& module_name);
+
// \brief Import a Python module
// \param[in] module_name The name of the module
// \param[out] ref The OwnedRef containing the module PyObject*
@@ -76,6 +80,13 @@ void InitPandasStaticData();
ARROW_PYTHON_EXPORT
bool PandasObjectIsNull(PyObject* obj);
+// \brief Check that obj is a pandas.Timedelta instance
+ARROW_PYTHON_EXPORT
+bool IsPandasTimedelta(PyObject* obj);
+
+// \brief Check that obj is a pandas.Timestamp instance
+bool IsPandasTimestamp(PyObject* obj);
+
// \brief Check whether obj is a floating-point NaN
ARROW_PYTHON_EXPORT
bool PyFloat_IsNaN(PyObject* obj);
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 6d98211..d3b9d6b 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -249,18 +249,26 @@ class PyValue {
value = internal::PyDateTime_to_s(dt) - offset;
break;
case TimeUnit::MILLI:
- value = internal::PyDateTime_to_ms(dt) - offset * 1000;
+ value = internal::PyDateTime_to_ms(dt) - offset * 1000LL;
break;
case TimeUnit::MICRO:
- value = internal::PyDateTime_to_us(dt) - offset * 1000 * 1000;
+ value = internal::PyDateTime_to_us(dt) - offset * 1000000LL;
break;
case TimeUnit::NANO:
-        // Conversion to nanoseconds can overflow -> check multiply of microseconds
-        value = internal::PyDateTime_to_us(dt);
-        if (arrow::internal::MultiplyWithOverflow(value, 1000, &value)) {
-          return internal::InvalidValue(obj, "out of bounds for nanosecond resolution");
+ if (internal::IsPandasTimestamp(obj)) {
+ OwnedRef nanos(PyObject_GetAttrString(obj, "value"));
+ RETURN_IF_PYERROR();
+ RETURN_NOT_OK(internal::CIntFromPython(nanos.obj(), &value));
+ } else {
+        // Conversion to nanoseconds can overflow -> check multiply of microseconds
+        value = internal::PyDateTime_to_us(dt);
+        if (arrow::internal::MultiplyWithOverflow(value, 1000LL, &value)) {
+          return internal::InvalidValue(obj,
+                                        "out of bounds for nanosecond resolution");
+ }
}
-      if (arrow::internal::SubtractWithOverflow(value, offset * 1000 * 1000 * 1000,
+      // Adjust with offset and check for overflow
+      if (arrow::internal::SubtractWithOverflow(value, offset * 1000000000LL,
                                                 &value)) {
         return internal::InvalidValue(obj, "out of bounds for nanosecond resolution");
}
@@ -298,7 +306,13 @@ class PyValue {
value = internal::PyDelta_to_us(dt);
break;
case TimeUnit::NANO:
- value = internal::PyDelta_to_ns(dt);
+ if (internal::IsPandasTimedelta(obj)) {
+ OwnedRef nanos(PyObject_GetAttrString(obj, "value"));
+ RETURN_IF_PYERROR();
+ RETURN_NOT_OK(internal::CIntFromPython(nanos.obj(), &value));
+ } else {
+ value = internal::PyDelta_to_ns(dt);
+ }
break;
default:
return Status::UnknownError("Invalid time unit");
@@ -664,12 +678,7 @@ class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> {
} \
return AppendNdarrayTyped<TYPE, NUMPY_TYPE>(ndarray); \
}
-// Use internal::VisitSequence, fast for NPY_OBJECT but slower otherwise
-#define LIST_SLOW_CASE(TYPE_ID) \
- case Type::TYPE_ID: { \
- return Extend(this->value_converter_.get(), value, size); \
- }
- LIST_SLOW_CASE(NA)
+ LIST_FAST_CASE(BOOL, BooleanType, NPY_BOOL)
LIST_FAST_CASE(UINT8, UInt8Type, NPY_UINT8)
LIST_FAST_CASE(INT8, Int8Type, NPY_INT8)
LIST_FAST_CASE(UINT16, UInt16Type, NPY_UINT16)
@@ -683,24 +692,9 @@ class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> {
LIST_FAST_CASE(DOUBLE, DoubleType, NPY_DOUBLE)
LIST_FAST_CASE(TIMESTAMP, TimestampType, NPY_DATETIME)
LIST_FAST_CASE(DURATION, DurationType, NPY_TIMEDELTA)
- LIST_SLOW_CASE(DATE32)
- LIST_SLOW_CASE(DATE64)
- LIST_SLOW_CASE(TIME32)
- LIST_SLOW_CASE(TIME64)
- LIST_SLOW_CASE(BINARY)
- LIST_SLOW_CASE(FIXED_SIZE_BINARY)
- LIST_SLOW_CASE(STRING)
#undef LIST_FAST_CASE
-#undef LIST_SLOW_CASE
- case Type::LIST: {
- if (PyArray_DESCR(ndarray)->type_num != NPY_OBJECT) {
- return Status::Invalid(
- "Can only convert list types from NumPy object array input");
- }
- return Extend(this->value_converter_.get(), value, /*reserved=*/0);
- }
default: {
-      return Status::TypeError("Unknown list item type: ", value_type->ToString());
+ return Extend(this->value_converter_.get(), value, size);
}
}
}
@@ -1004,6 +998,13 @@ Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject*
PyObject* seq;
OwnedRef tmp_seq_nanny;
+  ARROW_ASSIGN_OR_RAISE(auto is_pandas_imported, internal::IsModuleImported("pandas"));
+  if (is_pandas_imported) {
+    // If pandas has been already imported initialize the static pandas objects to
+    // support converting from pd.Timedelta and pd.Timestamp objects
+    internal::InitPandasStaticData();
+  }
+
int64_t size = options.size;
RETURN_NOT_OK(ConvertToSequenceAndInferSize(obj, &seq, &size));
tmp_seq_nanny.reset(seq);
diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml
index e4d3d0a..8e5a74f 100644
--- a/dev/tasks/tasks.yml
+++ b/dev/tasks/tasks.yml
@@ -1662,6 +1662,18 @@ tasks:
PYTHON: 3.8
run: conda-python
+ test-conda-python-3.8-hypothesis:
+ ci: github
+ template: docker-tests/github.linux.yml
+ params:
+ env:
+ HYPOTHESIS_PROFILE: ci
+ PYARROW_TEST_HYPOTHESIS: ON
+ PYTHON: 3.8
+ # limit to execute hypothesis tests only
+ PYTEST_ARGS: "-m hypothesis"
+ run: conda-python-pandas
+
test-debian-10-python-3:
ci: azure
template: docker-tests/azure.linux.yml
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index 43a6496..18cd6e9 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -28,7 +28,7 @@ from pyarrow.util import find_free_port
# setup hypothesis profiles
h.settings.register_profile('ci', max_examples=1000)
-h.settings.register_profile('dev', max_examples=10)
+h.settings.register_profile('dev', max_examples=50)
h.settings.register_profile('debug', max_examples=10,
verbosity=h.Verbosity.verbose)
diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py
index 97e972d..cb9d943 100644
--- a/python/pyarrow/tests/strategies.py
+++ b/python/pyarrow/tests/strategies.py
@@ -42,6 +42,17 @@ binary_type = st.just(pa.binary())
string_type = st.just(pa.string())
large_binary_type = st.just(pa.large_binary())
large_string_type = st.just(pa.large_string())
+fixed_size_binary_type = st.builds(
+ pa.binary,
+ st.integers(min_value=0, max_value=16)
+)
+binary_like_types = st.one_of(
+ binary_type,
+ string_type,
+ large_binary_type,
+ large_string_type,
+ fixed_size_binary_type
+)
signed_integer_types = st.sampled_from([
pa.int8(),
@@ -98,12 +109,9 @@ temporal_types = st.one_of(
primitive_types = st.one_of(
null_type,
bool_type,
- binary_type,
- string_type,
- large_binary_type,
- large_string_type,
numeric_types,
- temporal_types
+ temporal_types,
+ binary_like_types
)
metadata = st.dictionaries(st.text(), st.text())
@@ -124,28 +132,49 @@ def fields(draw, type_strategy=primitive_types):
def list_types(item_strategy=primitive_types):
return (
st.builds(pa.list_, item_strategy) |
- st.builds(pa.large_list, item_strategy)
+ st.builds(pa.large_list, item_strategy) |
+ st.builds(
+ pa.list_,
+ item_strategy,
+ st.integers(min_value=0, max_value=16)
+ )
)
-def struct_types(item_strategy=primitive_types):
- return st.builds(pa.struct, st.lists(fields(item_strategy)))
-
-
-def complex_types(inner_strategy=primitive_types):
- return list_types(inner_strategy) | struct_types(inner_strategy)
-
-
-def nested_list_types(item_strategy=primitive_types, max_leaves=3):
- return st.recursive(item_strategy, list_types, max_leaves=max_leaves)
[email protected]
+def struct_types(draw, item_strategy=primitive_types):
+ fields_strategy = st.lists(fields(item_strategy))
+ fields_rendered = draw(fields_strategy)
+ field_names = [field.name for field in fields_rendered]
+ # check that field names are unique, see ARROW-9997
+ h.assume(len(set(field_names)) == len(field_names))
+ return pa.struct(fields_rendered)
+
+
+def dictionary_types(key_strategy=None, value_strategy=None):
+ key_strategy = key_strategy or signed_integer_types
+ value_strategy = value_strategy or st.one_of(
+ bool_type,
+ integer_types,
+ st.sampled_from([pa.float32(), pa.float64()]),
+ binary_type,
+ string_type,
+ fixed_size_binary_type,
+ )
+ return st.builds(pa.dictionary, key_strategy, value_strategy)
-def nested_struct_types(item_strategy=primitive_types, max_leaves=3):
- return st.recursive(item_strategy, struct_types, max_leaves=max_leaves)
[email protected]
+def map_types(draw, key_strategy=primitive_types,
+ item_strategy=primitive_types):
+ key_type = draw(key_strategy)
+ h.assume(not pa.types.is_null(key_type))
+ value_type = draw(item_strategy)
+ return pa.map_(key_type, value_type)
-def nested_complex_types(inner_strategy=primitive_types, max_leaves=3):
- return st.recursive(inner_strategy, complex_types, max_leaves=max_leaves)
+# union type
+# extension type
def schemas(type_strategy=primitive_types, max_fields=None):
@@ -153,10 +182,17 @@ def schemas(type_strategy=primitive_types, max_fields=None):
return st.builds(pa.schema, children)
-complex_schemas = schemas(complex_types())
-
-
-all_types = st.one_of(primitive_types, complex_types(), nested_complex_types())
+all_types = st.deferred(
+ lambda: (
+ primitive_types |
+ list_types() |
+ struct_types() |
+ dictionary_types() |
+ map_types() |
+ list_types(all_types) |
+ struct_types(all_types)
+ )
+)
all_fields = fields(all_types)
all_schemas = schemas(all_types)
@@ -165,7 +201,21 @@ _default_array_sizes = st.integers(min_value=0, max_value=20)
@st.composite
-def arrays(draw, type, size=None):
+def _pylist(draw, value_type, size, nullable=True):
+ arr = draw(arrays(value_type, size=size, nullable=False))
+ return arr.to_pylist()
+
+
[email protected]
+def _pymap(draw, key_type, value_type, size, nullable=True):
+ length = draw(size)
+ keys = draw(_pylist(key_type, size=length, nullable=False))
+ values = draw(_pylist(value_type, size=length, nullable=nullable))
+ return list(zip(keys, values))
+
+
[email protected]
+def arrays(draw, type, size=None, nullable=True):
if isinstance(type, st.SearchStrategy):
ty = draw(type)
elif isinstance(type, pa.DataType):
@@ -180,38 +230,24 @@ def arrays(draw, type, size=None):
elif not isinstance(size, int):
raise TypeError('Size must be an integer')
- shape = (size,)
-
- if pa.types.is_list(ty) or pa.types.is_large_list(ty):
- offsets = draw(npst.arrays(np.uint8(), shape=shape)).cumsum() // 20
- offsets = np.insert(offsets, 0, 0, axis=0) # prepend with zero
- values = draw(arrays(ty.value_type, size=int(offsets.sum())))
- if pa.types.is_large_list(ty):
- array_type = pa.LargeListArray
- else:
- array_type = pa.ListArray
- return array_type.from_arrays(offsets, values)
-
- if pa.types.is_struct(ty):
- h.assume(len(ty) > 0)
- fields, child_arrays = [], []
- for field in ty:
- fields.append(field)
- child_arrays.append(draw(arrays(field.type, size=size)))
- return pa.StructArray.from_arrays(child_arrays, fields=fields)
-
- if (pa.types.is_boolean(ty) or pa.types.is_integer(ty) or
- pa.types.is_floating(ty)):
- values = npst.arrays(ty.to_pandas_dtype(), shape=(size,))
- np_arr = draw(values)
- if pa.types.is_floating(ty):
- # Workaround ARROW-4952: no easy way to assert array equality
- # in a NaN-tolerant way.
- np_arr[np.isnan(np_arr)] = -42.0
- return pa.array(np_arr, type=ty)
-
if pa.types.is_null(ty):
+ h.assume(nullable)
value = st.none()
+ elif pa.types.is_boolean(ty):
+ value = st.booleans()
+ elif pa.types.is_integer(ty):
+ values = draw(npst.arrays(ty.to_pandas_dtype(), shape=(size,)))
+ return pa.array(values, type=ty)
+ elif pa.types.is_floating(ty):
+ values = draw(npst.arrays(ty.to_pandas_dtype(), shape=(size,)))
+ # Workaround ARROW-4952: no easy way to assert array equality
+ # in a NaN-tolerant way.
+ values[np.isnan(values)] = -42.0
+ return pa.array(values, type=ty)
+ elif pa.types.is_decimal(ty):
+ # TODO(kszucs): properly limit the precision
+ # value = st.decimals(places=type.scale, allow_infinity=False)
+ h.reject()
elif pa.types.is_time(ty):
value = st.times()
elif pa.types.is_date(ty):
@@ -219,8 +255,8 @@ def arrays(draw, type, size=None):
elif pa.types.is_timestamp(ty):
min_int64 = -(2**63)
max_int64 = 2**63 - 1
- min_datetime = datetime.datetime.fromtimestamp(min_int64 / 10**9)
- max_datetime = datetime.datetime.fromtimestamp(max_int64 / 10**9)
+ min_datetime = datetime.datetime.fromtimestamp(min_int64 // 10**9)
+ max_datetime = datetime.datetime.fromtimestamp(max_int64 // 10**9)
try:
offset_hours = int(ty.tz)
tz = pytz.FixedOffset(offset_hours * 60)
@@ -234,14 +270,34 @@ def arrays(draw, type, size=None):
value = st.binary()
elif pa.types.is_string(ty) or pa.types.is_large_string(ty):
value = st.text()
- elif pa.types.is_decimal(ty):
- # TODO(kszucs): properly limit the precision
- # value = st.decimals(places=type.scale, allow_infinity=False)
- h.reject()
+ elif pa.types.is_fixed_size_binary(ty):
+ value = st.binary(min_size=ty.byte_width, max_size=ty.byte_width)
+ elif pa.types.is_list(ty):
+ value = _pylist(ty.value_type, size=size, nullable=nullable)
+ elif pa.types.is_large_list(ty):
+ value = _pylist(ty.value_type, size=size, nullable=nullable)
+ elif pa.types.is_fixed_size_list(ty):
+ value = _pylist(ty.value_type, size=ty.list_size, nullable=nullable)
+ elif pa.types.is_dictionary(ty):
+ values = _pylist(ty.value_type, size=size, nullable=nullable)
+ return pa.array(draw(values), type=ty)
+ elif pa.types.is_map(ty):
+ value = _pymap(ty.key_type, ty.item_type, size=_default_array_sizes,
+ nullable=nullable)
+ elif pa.types.is_struct(ty):
+ h.assume(len(ty) > 0)
+ fields, child_arrays = [], []
+ for field in ty:
+ fields.append(field)
+ child_arrays.append(draw(arrays(field.type, size=size)))
+ return pa.StructArray.from_arrays(child_arrays, fields=fields)
else:
raise NotImplementedError(ty)
+ if nullable:
+ value = st.one_of(st.none(), value)
values = st.lists(value, min_size=size, max_size=size)
+
return pa.array(draw(values), type=ty)
@@ -293,3 +349,60 @@ all_arrays = arrays(all_types)
all_chunked_arrays = chunked_arrays(all_types)
all_record_batches = record_batches(all_types)
all_tables = tables(all_types)
+
+
+# Define the same rules as above for pandas tests by excluding certain types
+# from the generation because of known issues.
+
+pandas_compatible_primitive_types = st.one_of(
+ null_type,
+ bool_type,
+ integer_types,
+ st.sampled_from([pa.float32(), pa.float64()]),
+ decimal_type,
+ date_types,
+ time_types,
+ # Need to exclude timestamp and duration types otherwise hypothesis
+ # discovers ARROW-10210
+ # timestamp_types,
+ # duration_types
+ binary_type,
+ string_type,
+ large_binary_type,
+ large_string_type,
+)
+
+# Need to exclude floating point types otherwise hypothesis discovers
+# ARROW-10211
+pandas_compatible_dictionary_value_types = st.one_of(
+ bool_type,
+ integer_types,
+ binary_type,
+ string_type,
+ fixed_size_binary_type,
+)
+
+
+def pandas_compatible_list_types(
+ item_strategy=pandas_compatible_primitive_types
+):
+ # Need to exclude fixed size list type otherwise hypothesis discovers
+ # ARROW-10194
+ return (
+ st.builds(pa.list_, item_strategy) |
+ st.builds(pa.large_list, item_strategy)
+ )
+
+
+pandas_compatible_types = st.deferred(
+ lambda: st.one_of(
+ pandas_compatible_primitive_types,
+ pandas_compatible_list_types(pandas_compatible_primitive_types),
+ struct_types(pandas_compatible_primitive_types),
+ dictionary_types(
+ value_strategy=pandas_compatible_dictionary_value_types
+ ),
+ pandas_compatible_list_types(pandas_compatible_types),
+ struct_types(pandas_compatible_types)
+ )
+)
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index c4580db..91624d7 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -387,7 +387,6 @@ def test_broken_integers(seq):
def test_numpy_scalars_mixed_type():
-
# ARROW-4324
data = [np.int32(10), np.float32(0.5)]
arr = pa.array(data)
@@ -627,6 +626,50 @@ def test_multidimensional_ndarray_as_nested_list():
assert result.equals(expected)
[email protected](('data', 'value_type'), [
+ ([True, False], pa.bool_()),
+ ([None, None], pa.null()),
+ ([1, 2, None], pa.int8()),
+ ([1, 2., 3., None], pa.float32()),
+ ([datetime.date.today(), None], pa.date32()),
+ ([None, datetime.date.today()], pa.date64()),
+ ([datetime.time(1, 1, 1), None], pa.time32('s')),
+ ([None, datetime.time(2, 2, 2)], pa.time64('us')),
+ ([datetime.datetime.now(), None], pa.timestamp('us')),
+ ([datetime.timedelta(seconds=10)], pa.duration('s')),
+ ([b"a", b"b"], pa.binary()),
+ ([b"aaa", b"bbb", b"ccc"], pa.binary(3)),
+ ([b"a", b"b", b"c"], pa.large_binary()),
+ (["a", "b", "c"], pa.string()),
+ (["a", "b", "c"], pa.large_string()),
+ (
+ [{"a": 1, "b": 2}, None, {"a": 5, "b": None}],
+ pa.struct([('a', pa.int8()), ('b', pa.int16())])
+ )
+])
+def test_list_array_from_object_ndarray(data, value_type):
+ ty = pa.list_(value_type)
+ ndarray = np.array(data, dtype=object)
+ arr = pa.array([ndarray], type=ty)
+ assert arr.type.equals(ty)
+ assert arr.to_pylist() == [data]
+
+
[email protected](('data', 'value_type'), [
+ ([[1, 2], [3]], pa.list_(pa.int64())),
+ ([[1, 2], [3, 4]], pa.list_(pa.int64(), 2)),
+ ([[1], [2, 3]], pa.large_list(pa.int64()))
+])
+def test_nested_list_array_from_object_ndarray(data, value_type):
+ ndarray = np.empty(len(data), dtype=object)
+ ndarray[:] = [np.array(item, dtype=object) for item in data]
+
+ ty = pa.list_(value_type)
+ arr = pa.array([ndarray], type=ty)
+ assert arr.type.equals(ty)
+ assert arr.to_pylist() == [data]
+
+
def test_array_ignore_nan_from_pandas():
# See ARROW-4324, this reverts logic that was introduced in
# ARROW-2240
@@ -1903,18 +1946,62 @@ def test_dictionary_from_strings():
assert a.dictionary.equals(expected_dictionary)
-def _has_unique_field_names(ty):
- if isinstance(ty, pa.StructType):
- field_names = [field.name for field in ty]
- return len(set(field_names)) == len(field_names)
- else:
- return True
[email protected](('unit', 'expected'), [
+ ('s', datetime.timedelta(seconds=-2147483000)),
+ ('ms', datetime.timedelta(milliseconds=-2147483000)),
+ ('us', datetime.timedelta(microseconds=-2147483000)),
+ ('ns', datetime.timedelta(microseconds=-2147483))
+])
+def test_duration_array_roundtrip_corner_cases(unit, expected):
+ # Corner case discovered by hypothesis: there were implicit conversions to
+ # unsigned values resulting wrong values with wrong signs.
+ ty = pa.duration(unit)
+ arr = pa.array([-2147483000], type=ty)
+ restored = pa.array(arr.to_pylist(), type=ty)
+ assert arr.equals(restored)
+
+ expected_list = [expected]
+ if unit == 'ns':
+ # if pandas is available then a pandas Timedelta is returned
+ try:
+ import pandas as pd
+ except ImportError:
+ pass
+ else:
+ expected_list = [pd.Timedelta(-2147483000, unit='ns')]
+
+ assert restored.to_pylist() == expected_list
+
+
[email protected]
+def test_roundtrip_nanosecond_resolution_pandas_temporal_objects():
+ # corner case discovered by hypothesis: preserving the nanoseconds on
+ # conversion from a list of Timedelta and Timestamp objects
+ import pandas as pd
+
+ ty = pa.duration('ns')
+ arr = pa.array([9223371273709551616], type=ty)
+ data = arr.to_pylist()
+ assert isinstance(data[0], pd.Timedelta)
+ restored = pa.array(data, type=ty)
+ assert arr.equals(restored)
+ assert restored.to_pylist() == [
+ pd.Timedelta(9223371273709551616, unit='ns')
+ ]
+
+ ty = pa.timestamp('ns')
+ arr = pa.array([9223371273709551616], type=ty)
+ data = arr.to_pylist()
+ assert isinstance(data[0], pd.Timestamp)
+ restored = pa.array(data, type=ty)
+ assert arr.equals(restored)
+ assert restored.to_pylist() == [
+ pd.Timestamp(9223371273709551616, unit='ns')
+ ]
@h.given(past.all_arrays)
def test_array_to_pylist_roundtrip(arr):
- # TODO(kszucs): ARROW-9997
- h.assume(_has_unique_field_names(arr.type))
seq = arr.to_pylist()
restored = pa.array(seq, type=arr.type)
assert restored.equals(arr)
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 756b4ff..54f4574 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -35,6 +35,7 @@ import pytz
from pyarrow.pandas_compat import get_logical_type, _pandas_api
from pyarrow.tests.util import random_ascii, rands
+import pyarrow.tests.strategies as past
import pyarrow as pa
try:
@@ -2842,6 +2843,17 @@ def test_convert_unsupported_type_error_message():
# ----------------------------------------------------------------------
+# Hypothesis tests
+
+
[email protected](past.arrays(past.pandas_compatible_types))
+def test_array_to_pandas_roundtrip(arr):
+ s = arr.to_pandas()
+ restored = pa.array(s, type=arr.type, from_pandas=True)
+ assert restored.equals(arr)
+
+
+# ----------------------------------------------------------------------
# Test object deduplication in to_pandas
diff --git a/python/pyarrow/tests/test_strategies.py b/python/pyarrow/tests/test_strategies.py
index f4249df..14fc949 100644
--- a/python/pyarrow/tests/test_strategies.py
+++ b/python/pyarrow/tests/test_strategies.py
@@ -41,6 +41,11 @@ def test_arrays(array):
assert isinstance(array, pa.lib.Array)
[email protected](past.arrays(past.primitive_types, nullable=False))
+def test_array_nullability(array):
+ assert array.null_count == 0
+
+
@h.given(past.all_chunked_arrays)
def test_chunked_arrays(chunked_array):
assert isinstance(chunked_array, pa.lib.ChunkedArray)