[ https://issues.apache.org/jira/browse/ARROW-1982?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386608#comment-16386608 ]

ASF GitHub Bot commented on ARROW-1982:
---------------------------------------

wesm closed pull request #1698: ARROW-1982: [Python] Coerce Parquet statistics as bytes to more useful Python scalar types
URL: https://github.com/apache/arrow/pull/1698
 
 
   

This is a PR merged from a forked repository. As GitHub hides the original
diff on merge, it is displayed below for the sake of provenance:

diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index e513e1d92..101fcd165 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -70,6 +70,31 @@ cdef class RowGroupStatistics:
                                self.num_values,
                                self.physical_type)
 
+    cdef inline _cast_statistic(self, object value):
+        # Input value is bytes
+        cdef ParquetType physical_type = self.statistics.get().physical_type()
+        if physical_type == ParquetType_BOOLEAN:
+            return bool(int(value))
+        elif physical_type == ParquetType_INT32:
+            return int(value)
+        elif physical_type == ParquetType_INT64:
+            return int(value)
+        elif physical_type == ParquetType_INT96:
+            # Leave as PyBytes
+            return value
+        elif physical_type == ParquetType_FLOAT:
+            return float(value)
+        elif physical_type == ParquetType_DOUBLE:
+            return float(value)
+        elif physical_type == ParquetType_BYTE_ARRAY:
+            # Leave as PyBytes
+            return value
+        elif physical_type == ParquetType_FIXED_LEN_BYTE_ARRAY:
+            # Leave as PyBytes
+            return value
+        else:
+            raise ValueError('Unknown physical ParquetType')
+
     property has_min_max:
 
         def __get__(self):
@@ -82,7 +107,7 @@ cdef class RowGroupStatistics:
             encode_min = self.statistics.get().EncodeMin()
 
             min_value = FormatStatValue(raw_physical_type, encode_min.c_str())
-            return frombytes(min_value)
+            return self._cast_statistic(min_value)
 
     property max:
 
@@ -91,7 +116,7 @@ cdef class RowGroupStatistics:
             encode_max = self.statistics.get().EncodeMax()
 
             max_value = FormatStatValue(raw_physical_type, encode_max.c_str())
-            return frombytes(max_value)
+            return self._cast_statistic(max_value)
 
     property null_count:
 
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index cec01c859..a3da05fe3 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -26,7 +26,7 @@
 
 import pytest
 
-from pyarrow.compat import guid, u, BytesIO, unichar, frombytes
+from pyarrow.compat import guid, u, BytesIO, unichar
 from pyarrow.tests import util
 from pyarrow.filesystem import LocalFileSystem
 import pyarrow as pa
@@ -524,20 +524,20 @@ def test_parquet_metadata_api():
 @pytest.mark.parametrize(
     'data, dtype, min_value, max_value, null_count, num_values',
     [
-        ([1, 2, 2, None, 4], np.uint8, u'1', u'4', 1, 4),
-        ([1, 2, 2, None, 4], np.uint16, u'1', u'4', 1, 4),
-        ([1, 2, 2, None, 4], np.uint32, u'1', u'4', 1, 4),
-        ([1, 2, 2, None, 4], np.uint64, u'1', u'4', 1, 4),
-        ([-1, 2, 2, None, 4], np.int16, u'-1', u'4', 1, 4),
-        ([-1, 2, 2, None, 4], np.int32, u'-1', u'4', 1, 4),
-        ([-1, 2, 2, None, 4], np.int64, u'-1', u'4', 1, 4),
-        ([-1.1, 2.2, 2.3, None, 4.4], np.float32, u'-1.1', u'4.4', 1, 4),
-        ([-1.1, 2.2, 2.3, None, 4.4], np.float64, u'-1.1', u'4.4', 1, 4),
+        ([1, 2, 2, None, 4], np.uint8, 1, 4, 1, 4),
+        ([1, 2, 2, None, 4], np.uint16, 1, 4, 1, 4),
+        ([1, 2, 2, None, 4], np.uint32, 1, 4, 1, 4),
+        ([1, 2, 2, None, 4], np.uint64, 1, 4, 1, 4),
+        ([-1, 2, 2, None, 4], np.int16, -1, 4, 1, 4),
+        ([-1, 2, 2, None, 4], np.int32, -1, 4, 1, 4),
+        ([-1, 2, 2, None, 4], np.int64, -1, 4, 1, 4),
+        ([-1.1, 2.2, 2.3, None, 4.4], np.float32, -1.1, 4.4, 1, 4),
+        ([-1.1, 2.2, 2.3, None, 4.4], np.float64, -1.1, 4.4, 1, 4),
         (
             [u'', u'b', unichar(1000), None, u'aaa'],
-            str, u' ', frombytes((unichar(1000) + u' ').encode('utf-8')), 1, 4
+            str, b' ', (unichar(1000) + u' ').encode('utf-8'), 1, 4
         ),
-        ([True, False, False, True, True], np.bool, u'0', u'1', 0, 5),
+        ([True, False, False, True, True], np.bool, False, True, 0, 5),
     ]
 )
 def test_parquet_column_statistics_api(
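
With the patch applied, min and max surface as typed Python scalars rather
than formatted strings. A minimal usage sketch of the new behavior (the file
name and data here are illustrative, not taken from the patch):

{code}
# Sketch of the post-patch behavior; file and column are illustrative.
import pandas as pd
import pyarrow.parquet as pq

df = pd.DataFrame({'a': [1, 2, 3]})
df.to_parquet('temp.parquet', engine='pyarrow')

stats = pq.ParquetFile('temp.parquet').metadata.row_group(0).column(0).statistics

# min/max now come back as Python scalars matching the physical type
# (INT64 -> int) rather than formatted strings like '1'.
assert stats.min == 1
assert stats.max == 3
{code}

As the _cast_statistic branches above show, INT96, BYTE_ARRAY and
FIXED_LEN_BYTE_ARRAY statistics are deliberately left as bytes.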


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> [Python] Return parquet statistics min/max as values instead of strings
> -----------------------------------------------------------------------
>
>                 Key: ARROW-1982
>                 URL: https://issues.apache.org/jira/browse/ARROW-1982
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Python
>            Reporter: Jim Crist
>            Assignee: Wes McKinney
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> Currently `min` and `max` column statistics are returned as formatted strings
> of the _physical type_. This makes using them in Python a bit tricky, as the
> strings need to be parsed as the proper _logical type_. Observe:
> {code}
> In [20]: import pandas as pd
> In [21]: df = pd.DataFrame({'a': [1, 2, 3],
>     ...:                    'b': ['a', 'b', 'c'],
>     ...:                    'c': [pd.Timestamp('1991-01-01')]*3})
>     ...:
> In [22]: df.to_parquet('temp.parquet', engine='pyarrow')
> In [23]: from pyarrow import parquet as pq
> In [24]: f = pq.ParquetFile('temp.parquet')
> In [25]: rg = f.metadata.row_group(0)
> In [26]: rg.column(0).statistics.min  # string instead of integer
> Out[26]: '1'
> In [27]: rg.column(1).statistics.min  # trailing space added by formatter
> Out[27]: 'a '
> In [28]: rg.column(2).statistics.min  # physical type (int), not logical (datetime)
> Out[28]: '662688000000'
> {code}
> Since the type information is known, it should be possible to convert these 
> to arrow values instead of strings.
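
The patch above does exactly this per physical type, in Cython
(RowGroupStatistics._cast_statistic). For illustration, a rough pure-Python
equivalent of that coercion; the string type names below stand in for the
ParquetType enum values used in the actual code:

{code}
# Pure-Python sketch of the coercion done by _cast_statistic; physical_type
# is assumed to be the physical type's name as a string, standing in for
# the ParquetType enum in the Cython implementation.
def cast_statistic(value, physical_type):
    # value is the bytes/str emitted by FormatStatValue
    if physical_type == 'BOOLEAN':
        return bool(int(value))
    elif physical_type in ('INT32', 'INT64'):
        return int(value)
    elif physical_type in ('FLOAT', 'DOUBLE'):
        return float(value)
    elif physical_type in ('INT96', 'BYTE_ARRAY', 'FIXED_LEN_BYTE_ARRAY'):
        return value  # left as bytes
    else:
        raise ValueError('Unknown physical ParquetType')
{code}

Note that this coerces by physical type only: an INT64-encoded timestamp
such as the one in the example still comes back as an int, not a datetime.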



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
