This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 631fa0a6b7 GH-45457: [Python] Add `pyarrow.ArrayStatistics` (#45550)
631fa0a6b7 is described below

commit 631fa0a6b780109194e7f1c318bd685ec7efd6e8
Author: Sutou Kouhei <[email protected]>
AuthorDate: Tue Feb 25 22:25:52 2025 +0900

    GH-45457: [Python] Add `pyarrow.ArrayStatistics` (#45550)
    
    ### Rationale for this change
    
    Apache Arrow C++ can attach statistics read from Apache Parquet data to
    `arrow::Array`. Exposing this feature through Python bindings lets Python
    users access the attached statistics as well.
    
    ### What changes are included in this PR?
    
    * Add `pyarrow.ArrayStatistics`
    * Add the `pyarrow.Array.statistics` property.
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    Yes.
    * GitHub Issue: #45457
    
    Lead-authored-by: Sutou Kouhei <[email protected]>
    Co-authored-by: Sutou Kouhei <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 python/pyarrow/array.pxi                          | 112 ++++++++++++++++++++++
 python/pyarrow/includes/libarrow.pxd              |  22 +++++
 python/pyarrow/lib.pxd                            |   8 ++
 python/pyarrow/tests/parquet/test_parquet_file.py |  19 ++++
 4 files changed, 161 insertions(+)

diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 91770a5219..b738dc04b0 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -21,6 +21,9 @@ import os
 import warnings
 from cython import sizeof
 
+cdef extern from "<variant>" namespace "std":
+    c_bool holds_alternative[T](...)
+    T get[T](...)
 
 cdef _sequence_to_array(object sequence, object mask, object size,
                         DataType type, CMemoryPool* pool, c_bool from_pandas):
@@ -704,6 +707,101 @@ def _restore_array(data):
     return pyarrow_wrap_array(MakeArray(ad))
 
 
+cdef class ArrayStatistics(_Weakrefable):
+    """
+    The class for statistics of an array.
+    """
+
+    def __init__(self):
+        raise TypeError(f"Do not call {self.__class__.__name__}'s constructor "
+                        "directly")
+
+    cdef void init(self, const shared_ptr[CArrayStatistics]& sp_statistics):
+        self.sp_statistics = sp_statistics
+
+    def __repr__(self):
+        return (f"arrow.ArrayStatistics<null_count={self.null_count}, "
+                f"distinct_count={self.distinct_count}, min={self.min}, "
+                f"is_min_exact={self.is_min_exact}, max={self.max}, "
+                f"is_max_exact={self.is_max_exact}>")
+
+    @property
+    def null_count(self):
+        """
+        The number of nulls.
+        """
+        null_count = self.sp_statistics.get().null_count
+        # We'll be able to simplify this after
+        # https://github.com/cython/cython/issues/6692 is solved.
+        if null_count.has_value():
+            return null_count.value()
+        else:
+            return None
+
+    @property
+    def distinct_count(self):
+        """
+        The number of distinct values.
+        """
+        distinct_count = self.sp_statistics.get().distinct_count
+        # We'll be able to simplify this after
+        # https://github.com/cython/cython/issues/6692 is solved.
+        if distinct_count.has_value():
+            return distinct_count.value()
+        else:
+            return None
+
+    @property
+    def min(self):
+        """
+        The minimum value.
+        """
+        return self._get_value(self.sp_statistics.get().min)
+
+    @property
+    def is_min_exact(self):
+        """
+        Whether the minimum value is an exact value or not.
+        """
+        return self.sp_statistics.get().is_min_exact
+
+    @property
+    def max(self):
+        """
+        The maximum value.
+        """
+        return self._get_value(self.sp_statistics.get().max)
+
+    @property
+    def is_max_exact(self):
+        """
+        Whether the maximum value is an exact value or not.
+        """
+        return self.sp_statistics.get().is_max_exact
+
+    cdef _get_value(self, const optional[CArrayStatisticsValueType]& 
optional_value):
+        """
+        Get a raw value from
+        std::optional<arrow::ArrayStatistics::ValueType> data.
+
+        arrow::ArrayStatistics::ValueType is
+        std::variant<bool, int64_t, uint64_t, double, std::string>.
+        """
+        if not optional_value.has_value():
+            return None
+        value = optional_value.value()
+        if holds_alternative[c_bool](value):
+            return get[c_bool](value)
+        elif holds_alternative[int64_t](value):
+            return get[int64_t](value)
+        elif holds_alternative[uint64_t](value):
+            return get[uint64_t](value)
+        elif holds_alternative[double](value):
+            return get[double](value)
+        else:
+            return get[c_string](value)
+
+
 cdef class _PandasConvertible(_Weakrefable):
 
     def to_pandas(
@@ -2099,6 +2197,20 @@ cdef class Array(_PandasConvertible):
         if self.sp_array.get().device_type() != CDeviceAllocationType_kCPU:
             raise NotImplementedError("Implemented only for data on CPU 
device")
 
+    @property
+    def statistics(self):
+        """
+        Statistics of the array.
+        """
+        cdef ArrayStatistics stat
+        sp_stat = self.sp_array.get().statistics()
+        if sp_stat.get() == nullptr:
+            return None
+        else:
+            stat = ArrayStatistics.__new__(ArrayStatistics)
+            stat.init(sp_stat)
+            return stat
+
 
 cdef _array_like_to_pandas(obj, options, types_mapper):
     cdef:
diff --git a/python/pyarrow/includes/libarrow.pxd 
b/python/pyarrow/includes/libarrow.pxd
index d4e34e0a84..556696e344 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -101,6 +101,16 @@ cdef extern from "arrow/util/future.h" namespace "arrow" 
nogil:
         CStatus status()
 
 
+cdef extern from "<variant>" namespace "std" nogil:
+    cdef cppclass CArrayStatisticsValueType" std::variant<bool, int64_t, 
uint64_t, double, std::string>":
+        CArrayStatisticsValueType()
+        CArrayStatisticsValueType(c_bool)
+        CArrayStatisticsValueType(int64_t)
+        CArrayStatisticsValueType(uint64_t)
+        CArrayStatisticsValueType(double)
+        CArrayStatisticsValueType(c_string)
+
+
 cdef extern from "arrow/api.h" namespace "arrow" nogil:
     cdef enum Type" arrow::Type::type":
         _Type_NA" arrow::Type::NA"
@@ -188,6 +198,16 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
     c_bool is_primitive(Type type)
     c_bool is_numeric(Type type)
 
+    cdef cppclass CArrayStatistics" arrow::ArrayStatistics":
+        optional[int64_t] null_count
+        optional[int64_t] distinct_count
+        optional[CArrayStatisticsValueType] min
+        c_bool is_min_exact
+        optional[CArrayStatisticsValueType] max
+        c_bool is_max_exact
+
+        c_bool Equals(const CArrayStatistics& statistics) const
+
     cdef cppclass CArrayData" arrow::ArrayData":
         shared_ptr[CDataType] type
         int64_t length
@@ -251,6 +271,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         CDeviceAllocationType device_type()
         CResult[shared_ptr[CArray]] CopyTo(const shared_ptr[CMemoryManager]& 
to) const
 
+        const shared_ptr[CArrayStatistics]& statistics() const
+
     shared_ptr[CArray] MakeArray(const shared_ptr[CArrayData]& data)
     CResult[shared_ptr[CArray]] MakeArrayOfNull(
         const shared_ptr[CDataType]& type, int64_t length, CMemoryPool* pool)
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 892c974ab1..0b2dedad50 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -261,6 +261,14 @@ cdef class Scalar(_Weakrefable):
     cdef inline shared_ptr[CScalar] unwrap(self) nogil
 
 
+cdef class ArrayStatistics(_Weakrefable):
+    cdef:
+        shared_ptr[CArrayStatistics] sp_statistics
+
+    cdef void init(self, const shared_ptr[CArrayStatistics]& sp_statistics) 
except *
+    cdef _get_value(self, const optional[CArrayStatisticsValueType]& 
optional_value)
+
+
 cdef class _PandasConvertible(_Weakrefable):
     pass
 
diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py 
b/python/pyarrow/tests/parquet/test_parquet_file.py
index 93097a1afa..ae8a16e874 100644
--- a/python/pyarrow/tests/parquet/test_parquet_file.py
+++ b/python/pyarrow/tests/parquet/test_parquet_file.py
@@ -334,3 +334,22 @@ def test_parquet_file_with_filesystem(s3_example_fs, 
use_uri):
         assert f.read() == table
         assert not f.closed
     assert f.closed
+
+
+def test_read_statistics():
+    table = pa.table({"value": pa.array([-1, None, 3])})
+    buf = io.BytesIO()
+    _write_table(table, buf)
+    buf.seek(0)
+
+    statistics = pq.ParquetFile(buf).read().columns[0].chunks[0].statistics
+    assert statistics.null_count == 1
+    assert statistics.distinct_count is None
+    assert statistics.min == -1
+    assert statistics.is_min_exact
+    assert statistics.max == 3
+    assert statistics.is_max_exact
+    assert repr(statistics) == ("arrow.ArrayStatistics<"
+                                "null_count=1, distinct_count=None, "
+                                "min=-1, is_min_exact=True, "
+                                "max=3, is_max_exact=True>")

Reply via email to