Repository: arrow
Updated Branches:
  refs/heads/master e8945325e -> 7870804e0


ARROW-1074: Support lists and arrays in pandas DataFrames without explicit schema

This introduces automatic type inference for columns of a pandas DataFrame
that contain lists or NumPy arrays.

Partial implementation for: https://issues.apache.org/jira/browse/ARROW-575

Author: fjetter <florian.jet...@blue-yonder.com>

Closes #825 from fjetter/feature/pandas_converter_lists and squashes the following commits:

8bde4e7 [fjetter] Use unicode instead of str in tests
6d262e9 [fjetter] Use OwnedRef reset
037cc77 [fjetter] apply clang-format
331f8a7 [fjetter] Fix bus error
506666f [fjetter] Support numpy array in sequential visitor
b54c1f5 [fjetter] Factor out InferArrowType
4a61585 [fjetter] Add vscode config files to gitignore
6dee516 [fjetter] infer lists in pandas converter


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/7870804e
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/7870804e
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/7870804e

Branch: refs/heads/master
Commit: 7870804e0ea370b0e56811769e5252e6aba69e34
Parents: e894532
Author: fjetter <florian.jet...@blue-yonder.com>
Authored: Mon Jul 10 16:22:05 2017 +0200
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Mon Jul 10 16:22:05 2017 +0200

----------------------------------------------------------------------
 .gitignore                                  |  3 +-
 cpp/src/arrow/python/builtin_convert.cc     | 42 ++++++++++++++++--------
 cpp/src/arrow/python/builtin_convert.h      |  2 ++
 cpp/src/arrow/python/pandas_convert.cc      | 13 ++++++--
 python/pyarrow/tests/test_convert_pandas.py | 30 +++++++++++++++++
 5 files changed, 72 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/7870804e/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index 5e28b36..dd69b6c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,4 +26,5 @@
 MANIFEST
 
 cpp/.idea/
-python/.eggs/
\ No newline at end of file
+python/.eggs/
+.vscode
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/arrow/blob/7870804e/cpp/src/arrow/python/builtin_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc
index 11114b0..f10dac7 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -155,14 +155,20 @@ class SeqVisitor {
   // co-recursive with VisitElem
   Status Visit(PyObject* obj, int level = 0) {
     if (level > max_nesting_level_) { max_nesting_level_ = level; }
-
     // Loop through either a sequence or an iterator.
     if (PySequence_Check(obj)) {
       Py_ssize_t size = PySequence_Size(obj);
       for (int64_t i = 0; i < size; ++i) {
-        // TODO(wesm): Specialize for PyList_GET_ITEM?
-        OwnedRef ref = OwnedRef(PySequence_GetItem(obj, i));
-        RETURN_NOT_OK(VisitElem(ref, level));
+        OwnedRef ref;
+        if (PyArray_Check(obj)) {
+          auto array = reinterpret_cast<PyArrayObject*>(obj);
+          auto ptr = reinterpret_cast<const char*>(PyArray_GETPTR1(array, i));
+          ref.reset(PyArray_GETITEM(array, ptr));
+          RETURN_NOT_OK(VisitElem(ref, level));
+        } else {
+          ref.reset(PySequence_GetItem(obj, i));
+          RETURN_NOT_OK(VisitElem(ref, level));
+        }
       }
     } else if (PyObject_HasAttrString(obj, "__iter__")) {
       OwnedRef iter = OwnedRef(PyObject_GetIter(obj));
@@ -280,25 +286,32 @@ Status InferArrowSize(PyObject* obj, int64_t* size) {
 }
 
 // Non-exhaustive type inference
-Status InferArrowTypeAndSize(
-    PyObject* obj, int64_t* size, std::shared_ptr<DataType>* out_type) {
-  RETURN_NOT_OK(InferArrowSize(obj, size));
-
-  // For 0-length sequences, refuse to guess
-  if (*size == 0) { *out_type = null(); }
-
+Status InferArrowType(PyObject* obj, std::shared_ptr<DataType>* out_type) {
   PyDateTime_IMPORT;
   SeqVisitor seq_visitor;
   RETURN_NOT_OK(seq_visitor.Visit(obj));
   RETURN_NOT_OK(seq_visitor.Validate());
 
   *out_type = seq_visitor.GetType();
-
   if (*out_type == nullptr) { return Status::TypeError("Unable to determine data type"); }
 
   return Status::OK();
 }
 
+Status InferArrowTypeAndSize(
+    PyObject* obj, int64_t* size, std::shared_ptr<DataType>* out_type) {
+  RETURN_NOT_OK(InferArrowSize(obj, size));
+
+  // For 0-length sequences, refuse to guess
+  if (*size == 0) {
+    *out_type = null();
+    return Status::OK();
+  }
+  RETURN_NOT_OK(InferArrowType(obj, out_type));
+
+  return Status::OK();
+}
+
 // Marshal Python sequence (list, tuple, etc.) to Arrow array
 class SeqConverter {
  public:
@@ -464,8 +477,9 @@ class FixedWidthBytesConverter
   inline Status AppendItem(const OwnedRef& item) {
     PyObject* bytes_obj;
     OwnedRef tmp;
-    Py_ssize_t expected_length = std::dynamic_pointer_cast<FixedSizeBinaryType>(
-        typed_builder_->type())->byte_width();
+    Py_ssize_t expected_length =
+        std::dynamic_pointer_cast<FixedSizeBinaryType>(typed_builder_->type())
+            ->byte_width();
     if (item.obj() == Py_None) {
       RETURN_NOT_OK(typed_builder_->AppendNull());
       return Status::OK();

http://git-wip-us.apache.org/repos/asf/arrow/blob/7870804e/cpp/src/arrow/python/builtin_convert.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/builtin_convert.h b/cpp/src/arrow/python/builtin_convert.h
index 7a84cbe..dd878b2 100644
--- a/cpp/src/arrow/python/builtin_convert.h
+++ b/cpp/src/arrow/python/builtin_convert.h
@@ -38,6 +38,8 @@ class Status;
 
 namespace py {
 
+ARROW_EXPORT arrow::Status InferArrowType(
+    PyObject* obj, std::shared_ptr<arrow::DataType>* out_type);
 ARROW_EXPORT arrow::Status InferArrowTypeAndSize(
     PyObject* obj, int64_t* size, std::shared_ptr<arrow::DataType>* out_type);
 ARROW_EXPORT arrow::Status InferArrowSize(PyObject* obj, int64_t* size);

http://git-wip-us.apache.org/repos/asf/arrow/blob/7870804e/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index f75a2ba..2364f13 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -893,9 +893,13 @@ Status PandasConverter::ConvertObjects() {
         return ConvertDates<Date32Type>();
       } else if (PyObject_IsInstance(const_cast<PyObject*>(objects[i]), Decimal.obj())) {
         return ConvertDecimals();
+      } else if (PyList_Check(objects[i]) || PyArray_Check(objects[i])) {
+        std::shared_ptr<DataType> inferred_type;
+        RETURN_NOT_OK(InferArrowType(objects[i], &inferred_type));
+        return ConvertLists(inferred_type);
       } else {
-        return InvalidConversion(
-            const_cast<PyObject*>(objects[i]), "string, bool, float, int, date, decimal");
+        return InvalidConversion(const_cast<PyObject*>(objects[i]),
+            "string, bool, float, int, date, decimal, list, array");
       }
     }
   }
@@ -1038,7 +1042,10 @@ Status PandasConverter::ConvertLists(const std::shared_ptr<DataType>& type) {
     LIST_CASE(DOUBLE, NPY_DOUBLE, DoubleType)
     LIST_CASE(STRING, NPY_OBJECT, StringType)
     default:
-      return Status::TypeError("Unknown list item type");
+      std::stringstream ss;
+      ss << "Unknown list item type: ";
+      ss << type->ToString();
+      return Status::TypeError(ss.str());
   }
 
   return Status::TypeError("Unknown list type");

http://git-wip-us.apache.org/repos/asf/arrow/blob/7870804e/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index b952d4a..fb69cac 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -691,3 +691,33 @@ class TestPandasConversion(unittest.TestCase):
 
         series = pd.Series(arr.to_pandas())
         tm.assert_series_equal(series, expected)
+
+    def test_infer_lists(self):
+        data = OrderedDict([
+            ('nan_ints', [[None, 1], [2, 3]]),
+            ('ints', [[0, 1], [2, 3]]),
+            ('strs', [[None, u'b'], [u'c', u'd']])
+        ])
+        df = pd.DataFrame(data)
+
+        expected_schema = pa.schema([
+            pa.field('nan_ints', pa.list_(pa.int64())),
+            pa.field('ints', pa.list_(pa.int64())),
+            pa.field('strs', pa.list_(pa.string()))
+        ])
+
+        self._check_pandas_roundtrip(df, expected_schema=expected_schema)
+
+    def test_infer_numpy_array(self):
+        data = OrderedDict([
+            ('ints', [
+                np.array([0, 1], dtype=np.int64),
+                np.array([2, 3], dtype=np.int64)
+            ])
+        ])
+        df = pd.DataFrame(data)
+        expected_schema = pa.schema([
+            pa.field('ints', pa.list_(pa.int64()))
+        ])
+
+        self._check_pandas_roundtrip(df, expected_schema=expected_schema)

Reply via email to