This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 45f5da2  ARROW-1982: [Python] Coerce Parquet statistics as bytes to 
more useful Python scalar types
45f5da2 is described below

commit 45f5da2f4b841c5562b14727361da421f5fb64a2
Author: Wes McKinney <[email protected]>
AuthorDate: Mon Mar 5 14:28:35 2018 -0500

    ARROW-1982: [Python] Coerce Parquet statistics as bytes to more useful 
Python scalar types
    
    I also changed the BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY statistics to return bytes, since decoding from binary to UTF-8 unicode didn't seem correct to me as the default behavior.
    
    Author: Wes McKinney <[email protected]>
    
    Closes #1698 from wesm/ARROW-1982 and squashes the following commits:
    
    0c6b23a7 <Wes McKinney> Update comments per code review
    d0ae6f40 <Wes McKinney> Coerce Parquet statistics as bytes to more useful 
Python scalar types
---
 python/pyarrow/_parquet.pyx          | 29 +++++++++++++++++++++++++++--
 python/pyarrow/tests/test_parquet.py | 24 ++++++++++++------------
 2 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index e513e1d..101fcd1 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -70,6 +70,31 @@ cdef class RowGroupStatistics:
                                self.num_values,
                                self.physical_type)
 
+    cdef inline _cast_statistic(self, object value):
+        # Input value is bytes
+        cdef ParquetType physical_type = self.statistics.get().physical_type()
+        if physical_type == ParquetType_BOOLEAN:
+            return bool(int(value))
+        elif physical_type == ParquetType_INT32:
+            return int(value)
+        elif physical_type == ParquetType_INT64:
+            return int(value)
+        elif physical_type == ParquetType_INT96:
+            # Leave as PyBytes
+            return value
+        elif physical_type == ParquetType_FLOAT:
+            return float(value)
+        elif physical_type == ParquetType_DOUBLE:
+            return float(value)
+        elif physical_type == ParquetType_BYTE_ARRAY:
+            # Leave as PyBytes
+            return value
+        elif physical_type == ParquetType_FIXED_LEN_BYTE_ARRAY:
+            # Leave as PyBytes
+            return value
+        else:
+            raise ValueError('Unknown physical ParquetType')
+
     property has_min_max:
 
         def __get__(self):
@@ -82,7 +107,7 @@ cdef class RowGroupStatistics:
             encode_min = self.statistics.get().EncodeMin()
 
             min_value = FormatStatValue(raw_physical_type, encode_min.c_str())
-            return frombytes(min_value)
+            return self._cast_statistic(min_value)
 
     property max:
 
@@ -91,7 +116,7 @@ cdef class RowGroupStatistics:
             encode_max = self.statistics.get().EncodeMax()
 
             max_value = FormatStatValue(raw_physical_type, encode_max.c_str())
-            return frombytes(max_value)
+            return self._cast_statistic(max_value)
 
     property null_count:
 
diff --git a/python/pyarrow/tests/test_parquet.py 
b/python/pyarrow/tests/test_parquet.py
index cec01c8..a3da05f 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -26,7 +26,7 @@ import sys
 
 import pytest
 
-from pyarrow.compat import guid, u, BytesIO, unichar, frombytes
+from pyarrow.compat import guid, u, BytesIO, unichar
 from pyarrow.tests import util
 from pyarrow.filesystem import LocalFileSystem
 import pyarrow as pa
@@ -524,20 +524,20 @@ def test_parquet_metadata_api():
 @pytest.mark.parametrize(
     'data, dtype, min_value, max_value, null_count, num_values',
     [
-        ([1, 2, 2, None, 4], np.uint8, u'1', u'4', 1, 4),
-        ([1, 2, 2, None, 4], np.uint16, u'1', u'4', 1, 4),
-        ([1, 2, 2, None, 4], np.uint32, u'1', u'4', 1, 4),
-        ([1, 2, 2, None, 4], np.uint64, u'1', u'4', 1, 4),
-        ([-1, 2, 2, None, 4], np.int16, u'-1', u'4', 1, 4),
-        ([-1, 2, 2, None, 4], np.int32, u'-1', u'4', 1, 4),
-        ([-1, 2, 2, None, 4], np.int64, u'-1', u'4', 1, 4),
-        ([-1.1, 2.2, 2.3, None, 4.4], np.float32, u'-1.1', u'4.4', 1, 4),
-        ([-1.1, 2.2, 2.3, None, 4.4], np.float64, u'-1.1', u'4.4', 1, 4),
+        ([1, 2, 2, None, 4], np.uint8, 1, 4, 1, 4),
+        ([1, 2, 2, None, 4], np.uint16, 1, 4, 1, 4),
+        ([1, 2, 2, None, 4], np.uint32, 1, 4, 1, 4),
+        ([1, 2, 2, None, 4], np.uint64, 1, 4, 1, 4),
+        ([-1, 2, 2, None, 4], np.int16, -1, 4, 1, 4),
+        ([-1, 2, 2, None, 4], np.int32, -1, 4, 1, 4),
+        ([-1, 2, 2, None, 4], np.int64, -1, 4, 1, 4),
+        ([-1.1, 2.2, 2.3, None, 4.4], np.float32, -1.1, 4.4, 1, 4),
+        ([-1.1, 2.2, 2.3, None, 4.4], np.float64, -1.1, 4.4, 1, 4),
         (
             [u'', u'b', unichar(1000), None, u'aaa'],
-            str, u' ', frombytes((unichar(1000) + u' ').encode('utf-8')), 1, 4
+            str, b' ', (unichar(1000) + u' ').encode('utf-8'), 1, 4
         ),
-        ([True, False, False, True, True], np.bool, u'0', u'1', 0, 5),
+        ([True, False, False, True, True], np.bool, False, True, 0, 5),
     ]
 )
 def test_parquet_column_statistics_api(

-- 
To stop receiving notification emails like this one, please contact
[email protected].

Reply via email to