[jira] [Commented] (ARROW-2140) [Python] Conversion from Numpy float16 array unimplemented

ASF GitHub Bot (JIRA) Thu, 29 Mar 2018 16:39:35 -0700

    [ 
https://issues.apache.org/jira/browse/ARROW-2140?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16419938#comment-16419938
 ]


ASF GitHub Bot commented on ARROW-2140:
---------------------------------------

wesm closed pull request #1744: ARROW-2140: [Python] Improve float16 support
URL: https://github.com/apache/arrow/pull/1744
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/ci/msvc-build.bat b/ci/msvc-build.bat
index a29ef0bad..cec14297e 100644
--- a/ci/msvc-build.bat
+++ b/ci/msvc-build.bat
@@ -68,6 +68,7 @@ if "%JOB%" == "Build_Debug" (
   exit /B 0
 )
 
+@rem Note: avoid Cython 0.28.0 due to https://github.com/cython/cython/issues/2148
 conda create -n arrow -q -y python=%PYTHON% ^
       six pytest setuptools numpy pandas ^
       cython=0.27.3 ^
diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh
index 247d10278..aa3c3154c 100755
--- a/ci/travis_script_python.sh
+++ b/ci/travis_script_python.sh
@@ -36,6 +36,7 @@ source activate $CONDA_ENV_DIR
 python --version
 which python
 
+# Note: avoid Cython 0.28.0 due to https://github.com/cython/cython/issues/2148
 conda install -y -q pip \
       nomkl \
       cloudpickle \
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index 5749e4f40..92461fc16 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -197,6 +197,7 @@ class PandasBlock {
     INT32,
     UINT64,
     INT64,
+    HALF_FLOAT,
     FLOAT,
     DOUBLE,
     BOOL,
@@ -815,6 +816,31 @@ using Int32Block = IntBlock<Type::INT32, int32_t>;
 using UInt64Block = IntBlock<Type::UINT64, uint64_t>;
 using Int64Block = IntBlock<Type::INT64, int64_t>;
 
+class Float16Block : public PandasBlock {
+ public:
+  using PandasBlock::PandasBlock;
+  Status Allocate() override { return AllocateNDArray(NPY_FLOAT16); }
+
+  Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement,
+               int64_t rel_placement) override {
+    Type::type type = col->type()->id();
+
+    if (type != Type::HALF_FLOAT) {
+      std::stringstream ss;
+      ss << "Cannot write Arrow data of type " << col->type()->ToString();
+      ss << " to a Pandas float16 block.";
+      return Status::NotImplemented(ss.str());
+    }
+
+    npy_half* out_buffer =
+        reinterpret_cast<npy_half*>(block_data_) + rel_placement * num_rows_;
+
+    ConvertNumericNullable<npy_half>(*col->data().get(), NPY_HALF_NAN, out_buffer);
+    placement_data_[rel_placement] = abs_placement;
+    return Status::OK();
+  }
+};
+
 class Float32Block : public PandasBlock {
  public:
   using PandasBlock::PandasBlock;
@@ -1225,6 +1251,7 @@ Status MakeBlock(PandasOptions options, PandasBlock::type type, int64_t num_rows
     BLOCK_CASE(INT32, Int32Block);
     BLOCK_CASE(UINT64, UInt64Block);
     BLOCK_CASE(INT64, Int64Block);
+    BLOCK_CASE(HALF_FLOAT, Float16Block);
     BLOCK_CASE(FLOAT, Float32Block);
     BLOCK_CASE(DOUBLE, Float64Block);
     BLOCK_CASE(BOOL, BoolBlock);
@@ -1269,6 +1296,9 @@ static Status GetPandasBlockType(const Column& col, const PandasOptions& options
       INTEGER_CASE(UINT64);
     case Type::INT64:
       INTEGER_CASE(INT64);
+    case Type::HALF_FLOAT:
+      *output_type = PandasBlock::HALF_FLOAT;
+      break;
     case Type::FLOAT:
       *output_type = PandasBlock::FLOAT;
       break;
diff --git a/cpp/src/arrow/python/arrow_to_python.cc b/cpp/src/arrow/python/arrow_to_python.cc
index 5515d24bd..3fdc5f144 100644
--- a/cpp/src/arrow/python/arrow_to_python.cc
+++ b/cpp/src/arrow/python/arrow_to_python.cc
@@ -127,11 +127,8 @@ Status GetValue(PyObject* context, const UnionArray& parent, const Array& arr,
       return CheckPyError();
     }
     case Type::HALF_FLOAT: {
-      *result = PyArrayScalar_New(Half);
+      *result = PyHalf_FromHalf(static_cast<const HalfFloatArray&>(arr).Value(index));
       RETURN_IF_PYERROR();
-
-      npy_half halffloat = static_cast<const HalfFloatArray&>(arr).Value(index);
-      PyArrayScalar_ASSIGN(*result, Half, halffloat);
       return Status::OK();
     }
     case Type::FLOAT:
diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc
index 595499de7..5e99992f1 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -655,6 +655,17 @@ class TimestampConverter
   TimeUnit::type unit_;
 };
 
+class Float16Converter
+    : public TypedConverterVisitor<HalfFloatBuilder, Float16Converter> {
+ public:
+  // Append a non-missing item
+  Status AppendItem(PyObject* obj) {
+    npy_half val;
+    RETURN_NOT_OK(PyFloat_AsHalf(obj, &val));
+    return typed_builder_->Append(val);
+  }
+};
+
 class Float32Converter : public TypedConverterVisitor<FloatBuilder, Float32Converter> {
  public:
   // Append a non-missing item
@@ -887,6 +898,8 @@ std::unique_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type
     case Type::TIMESTAMP:
       return std::unique_ptr<SeqConverter>(
           new TimestampConverter(static_cast<const TimestampType&>(*type).unit()));
+    case Type::HALF_FLOAT:
+      return std::unique_ptr<SeqConverter>(new Float16Converter);
     case Type::FLOAT:
       return std::unique_ptr<SeqConverter>(new Float32Converter);
     case Type::DOUBLE:
diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc
index 5719af6f3..bd31beccb 100644
--- a/cpp/src/arrow/python/helpers.cc
+++ b/cpp/src/arrow/python/helpers.cc
@@ -58,6 +58,24 @@ std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
   }
 }
 
+PyObject* PyHalf_FromHalf(npy_half value) {
+  PyObject* result = PyArrayScalar_New(Half);
+  if (result != NULL) {
+    PyArrayScalar_ASSIGN(result, Half, value);
+  }
+  return result;
+}
+
+Status PyFloat_AsHalf(PyObject* obj, npy_half* out) {
+  if (PyArray_IsScalar(obj, Half)) {
+    *out = PyArrayScalar_VAL(obj, Half);
+    return Status::OK();
+  } else {
+    // XXX: cannot use npy_double_to_half() without linking with Numpy
+    return Status::TypeError("Expected np.float16 instance");
+  }
+}
+
 namespace internal {
 
 Status ImportModule(const std::string& module_name, OwnedRef* ref) {
diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h
index b9f505a16..e2f3b1829 100644
--- a/cpp/src/arrow/python/helpers.h
+++ b/cpp/src/arrow/python/helpers.h
@@ -24,6 +24,8 @@
 #include <string>
 #include <utility>
 
+#include <numpy/halffloat.h>
+
 #include "arrow/type.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/visibility.h"
@@ -41,6 +43,12 @@ class OwnedRef;
 // \return A shared pointer to DataType
 ARROW_EXPORT std::shared_ptr<DataType> GetPrimitiveType(Type::type type);
 
+// \brief Construct a np.float16 object from a npy_half value.
+ARROW_EXPORT PyObject* PyHalf_FromHalf(npy_half value);
+
+// \brief Convert a Python object to a npy_half value.
+ARROW_EXPORT Status PyFloat_AsHalf(PyObject* obj, npy_half* out);
+
 namespace internal {
 
 // \brief Import a Python module
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index bfd7d4db9..09f907c8a 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -78,7 +78,7 @@ def parse_version(root):
                          BooleanValue,
                          Int8Value, Int16Value, Int32Value, Int64Value,
                          UInt8Value, UInt16Value, UInt32Value, UInt64Value,
-                         FloatValue, DoubleValue, ListValue,
+                         HalfFloatValue, FloatValue, DoubleValue, ListValue,
                          BinaryValue, StringValue, FixedSizeBinaryValue,
                          DecimalValue,
                          Date32Value, Date64Value, TimestampValue)
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 8dac57d18..a093cd5f1 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -689,6 +689,10 @@ cdef class Time64Array(NumericArray):
     pass
 
 
+cdef class HalfFloatArray(FloatingPointArray):
+    pass
+
+
 cdef class FloatArray(FloatingPointArray):
     pass
 
@@ -1008,6 +1012,7 @@ cdef dict _array_classes = {
     _Type_TIMESTAMP: TimestampArray,
     _Type_TIME32: Time32Array,
     _Type_TIME64: Time64Array,
+    _Type_HALF_FLOAT: HalfFloatArray,
     _Type_FLOAT: FloatArray,
     _Type_DOUBLE: DoubleArray,
     _Type_LIST: ListArray,
diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd
index 5afa07537..16eae92e3 100644
--- a/python/pyarrow/includes/common.pxd
+++ b/python/pyarrow/includes/common.pxd
@@ -35,6 +35,9 @@ cdef extern from "<Python.h>":
     void Py_XDECREF(PyObject* o)
     Py_ssize_t Py_REFCNT(PyObject* o)
 
+cdef extern from "numpy/halffloat.h":
+    ctypedef uint16_t npy_half
+
 cdef extern from "arrow/api.h" namespace "arrow" nogil:
     # We can later add more of the common status factory methods as needed
     cdef CStatus CStatus_OK "Status::OK"()
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 01a641896..b5f5b3e3f 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -327,6 +327,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
     cdef cppclass CTimestampArray" arrow::TimestampArray"(CArray):
         int64_t Value(int i)
 
+    cdef cppclass CHalfFloatArray" arrow::HalfFloatArray"(CArray):
+        uint16_t Value(int i)
+
     cdef cppclass CFloatArray" arrow::FloatArray"(CArray):
         float Value(int i)
 
@@ -869,6 +872,9 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
 cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
     shared_ptr[CDataType] GetPrimitiveType(Type type)
     shared_ptr[CDataType] GetTimestampType(TimeUnit unit)
+
+    object PyHalf_FromHalf(npy_half value)
+
     CStatus ConvertPySequence(object obj, CMemoryPool* pool,
                               shared_ptr[CArray]* out)
     CStatus ConvertPySequence(object obj, const shared_ptr[CDataType]& type,
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index e4d574f18..8a950867d 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -234,6 +234,10 @@ cdef class UInt64Array(IntegerArray):
     pass
 
 
+cdef class HalfFloatArray(FloatingPointArray):
+    pass
+
+
 cdef class FloatArray(FloatingPointArray):
     pass
 
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index a801acd69..f23414b06 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -249,6 +249,13 @@ cdef class TimestampValue(ArrayValue):
         return converter(value, tzinfo=tzinfo)
 
 
+cdef class HalfFloatValue(ArrayValue):
+
+    def as_py(self):
+        cdef CHalfFloatArray* ap = <CHalfFloatArray*> self.sp_array.get()
+        return PyHalf_FromHalf(ap.Value(self.index))
+
+
 cdef class FloatValue(ArrayValue):
 
     def as_py(self):
@@ -388,6 +395,7 @@ cdef dict _scalar_classes = {
     _Type_TIME32: Time32Value,
     _Type_TIME64: Time64Value,
     _Type_TIMESTAMP: TimestampValue,
+    _Type_HALF_FLOAT: HalfFloatValue,
     _Type_FLOAT: FloatValue,
     _Type_DOUBLE: DoubleValue,
     _Type_LIST: ListValue,
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index d448de08b..45ec66dab 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -462,7 +462,9 @@ class TestConvertPrimitiveTypes(object):
     def test_float_no_nulls(self):
         data = {}
         fields = []
-        dtypes = [('f4', pa.float32()), ('f8', pa.float64())]
+        dtypes = [('f2', pa.float16()),
+                  ('f4', pa.float32()),
+                  ('f8', pa.float64())]
         num_values = 100
 
         for numpy_dtype, arrow_dtype in dtypes:
@@ -478,8 +480,10 @@ def test_float_nulls(self):
         num_values = 100
 
         null_mask = np.random.randint(0, 10, size=num_values) < 3
-        dtypes = [('f4', pa.float32()), ('f8', pa.float64())]
-        names = ['f4', 'f8']
+        dtypes = [('f2', pa.float16()),
+                  ('f4', pa.float32()),
+                  ('f8', pa.float64())]
+        names = ['f2', 'f4', 'f8']
         expected_cols = []
 
         arrays = []
@@ -653,6 +657,21 @@ def _check_type(t):
         _check_type(pa.int32())
         _check_type(pa.float64())
 
+    def test_half_floats_from_numpy(self):
+        arr = np.array([1.5, np.nan], dtype=np.float16)
+        a = pa.array(arr, type=pa.float16())
+        x, y = a.to_pylist()
+        assert isinstance(x, np.float16)
+        assert x == 1.5
+        assert isinstance(y, np.float16)
+        assert np.isnan(y)
+
+        a = pa.array(arr, type=pa.float16(), from_pandas=True)
+        x, y = a.to_pylist()
+        assert isinstance(x, np.float16)
+        assert x == 1.5
+        assert y is None
+
 
 @pytest.mark.parametrize('dtype',
                          ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'])
@@ -1668,7 +1687,7 @@ class TestConvertMisc(object):
         (np.uint16, pa.uint16()),
         (np.uint32, pa.uint32()),
         (np.uint64, pa.uint64()),
-        # (np.float16, pa.float16()),  # XXX unsupported
+        (np.float16, pa.float16()),
         (np.float32, pa.float32()),
         (np.float64, pa.float64()),
         # XXX unsupported
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index 7061a0d3a..b82322f57 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -18,6 +18,7 @@
 
 import pytest
 
+import numpy as np
 import pandas as pd
 
 from pyarrow.compat import unittest, u, unicode_type
@@ -76,6 +77,16 @@ def test_double(self):
         v = arr[2]
         assert v.as_py() == 3.0
 
+    def test_half_float(self):
+        arr = pa.array([np.float16(1.5), None], type=pa.float16())
+        v = arr[0]
+        assert isinstance(v, pa.HalfFloatValue)
+        assert repr(v) == "1.5"
+        assert v.as_py() == 1.5
+        assert v == 1.5
+
+        assert arr[1] is pa.NA
+
     def test_string_unicode(self):
         arr = pa.array([u'foo', None, u'mañana'])
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [Python] Conversion from Numpy float16 array unimplemented
> ----------------------------------------------------------
>
>                 Key: ARROW-2140
>                 URL: https://issues.apache.org/jira/browse/ARROW-2140
>             Project: Apache Arrow
>          Issue Type: Improvement
>          Components: Python
>    Affects Versions: 0.8.0
>            Reporter: Antoine Pitrou
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.10.0
>
>
> {code}
> >>> arr = np.array([1.5], dtype=np.float16)
> >>> pa.array(arr, type=pa.float16())
> Traceback (most recent call last):
>   File "<ipython-input-6-e432e6663efb>", line 1, in <module>
>     pa.array(arr)
>   File "array.pxi", line 177, in pyarrow.lib.array
>   File "array.pxi", line 84, in pyarrow.lib._ndarray_to_array
>   File "public-api.pxi", line 158, in pyarrow.lib.pyarrow_wrap_array
> KeyError: 10
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

[jira] [Commented] (ARROW-2140) [Python] Conversion from Numpy float16 array unimplemented

Reply via email to