This is an automated email from the ASF dual-hosted git repository.
amolina pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new e5072bdacb ARROW-15429: [Python] Address docstrings for ChunkedArray
class, methods, attributes and constructor
e5072bdacb is described below
commit e5072bdacbe715f64d6d16f5deb0bb4a7f22c62f
Author: Alenka Frim <[email protected]>
AuthorDate: Thu Apr 7 14:03:15 2022 +0200
ARROW-15429: [Python] Address docstrings for ChunkedArray class, methods,
attributes and constructor
This PR adds docstring examples to:
- `pyarrow.ChunkedArray` class methods and attributes
- `pyarrow.chunked_array`
- `to_pandas` for `_PandasConvertible`
Closes #12754 from AlenkaF/ARROW-15429
Lead-authored-by: Alenka Frim <[email protected]>
Co-authored-by: Alenka Frim <[email protected]>
Signed-off-by: Alessandro Molina <[email protected]>
---
python/pyarrow/array.pxi | 16 ++
python/pyarrow/table.pxi | 682 ++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 694 insertions(+), 4 deletions(-)
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 17a55e6205..832a16c6fe 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -768,6 +768,22 @@ cdef class _PandasConvertible(_Weakrefable):
3 100 Centipede
>>> isinstance(table.to_pandas(), pd.DataFrame)
True
+
+ Convert a ChunkedArray to a pandas Series:
+
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ >>> n_legs.to_pandas()
+ 0 2
+ 1 2
+ 2 4
+ 3 4
+ 4 5
+ 5 100
+ dtype: int64
+ >>> import pandas as pd
+ >>> isinstance(n_legs.to_pandas(), pd.Series)
+ True
"""
options = dict(
pool=memory_pool,
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index a145ecb297..94e99c50ce 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -25,6 +25,34 @@ cdef class ChunkedArray(_PandasConvertible):
Warnings
--------
Do not call this class's constructor directly.
+
+ Examples
+ --------
+ To construct a ChunkedArray object use :func:`pyarrow.chunked_array`:
+
+ >>> import pyarrow as pa
+ >>> pa.chunked_array([], type=pa.int8())
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ ...
+ ]
+
+ >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ 2,
+ 4
+ ],
+ [
+ 4,
+ 5,
+ 100
+ ]
+ ]
+ >>> isinstance(pa.chunked_array([[2, 2, 4], [4, 5, 100]]), pa.ChunkedArray)
+ True
"""
def __cinit__(self):
@@ -51,9 +79,27 @@ cdef class ChunkedArray(_PandasConvertible):
@property
def type(self):
+ """
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ >>> n_legs.type
+ DataType(int64)
+ """
return pyarrow_wrap_data_type(self.sp_chunked_array.get().type())
def length(self):
+ """
+ Return length of a ChunkedArray.
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ >>> n_legs.length()
+ 6
+ """
return self.chunked_array.length()
def __len__(self):
@@ -85,6 +131,13 @@ cdef class ChunkedArray(_PandasConvertible):
skip_new_lines : bool
If the array should be rendered as a single line of text
or if each element should be on its own line.
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ >>> n_legs.to_string(skip_new_lines=True)
+ '[[2,2,4],[4,5,100]]'
"""
cdef:
c_string result
@@ -144,6 +197,13 @@ cdef class ChunkedArray(_PandasConvertible):
Returns
-------
int
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]])
+ >>> n_legs.null_count
+ 1
"""
return self.chunked_array.null_count()
@@ -162,6 +222,13 @@ cdef class ChunkedArray(_PandasConvertible):
The dictionary of dictionary arrays will always be counted in their
entirety even if the array only references a portion of the dictionary.
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]])
+ >>> n_legs.nbytes
+ 49
"""
cdef:
CResult[int64_t] c_res_buffer
@@ -180,6 +247,13 @@ cdef class ChunkedArray(_PandasConvertible):
If a buffer is referenced multiple times then it will
only be counted once.
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]])
+ >>> n_legs.get_total_buffer_size()
+ 49
"""
cdef:
int64_t total_buffer_size
@@ -235,6 +309,25 @@ cdef class ChunkedArray(_PandasConvertible):
Returns
-------
array : boolean Array or ChunkedArray
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]])
+ >>> n_legs.is_null()
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ false,
+ false,
+ false
+ ],
+ [
+ false,
+ true,
+ false
+ ]
+ ]
"""
options = _pc().NullOptions(nan_is_null=nan_is_null)
return _pc().call_function('is_null', [self], options)
@@ -242,6 +335,25 @@ cdef class ChunkedArray(_PandasConvertible):
def is_valid(self):
"""
Return boolean array indicating the non-null values.
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]])
+ >>> n_legs.is_valid()
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ true,
+ true,
+ true
+ ],
+ [
+ true,
+ false,
+ true
+ ]
+ ]
"""
return _pc().is_valid(self)
@@ -253,7 +365,9 @@ cdef class ChunkedArray(_PandasConvertible):
def fill_null(self, fill_value):
"""
- See :func:`pyarrow.compute.fill_null` for usage.
+ Replace each null element in values with fill_value.
+
+ See :func:`pyarrow.compute.fill_null` for full usage.
Parameters
----------
@@ -264,6 +378,24 @@ cdef class ChunkedArray(_PandasConvertible):
-------
result : Array or ChunkedArray
A new array with nulls replaced by the given value.
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> fill_value = pa.scalar(5, type=pa.int8())
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]])
+ >>> n_legs.fill_null(fill_value)
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ 2,
+ 4,
+ 4,
+ 5,
+ 100
+ ]
+ ]
"""
return _pc().fill_null(self, fill_value)
@@ -279,6 +411,19 @@ cdef class ChunkedArray(_PandasConvertible):
Returns
-------
are_equal : bool
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ >>> animals = pa.chunked_array((
+ ... ["Flamingo", "Parrot", "Dog"],
+ ... ["Horse", "Brittle stars", "Centipede"]
+ ... ))
+ >>> n_legs.equals(n_legs)
+ True
+ >>> n_legs.equals(animals)
+ False
"""
if other is None:
return False
@@ -303,6 +448,13 @@ cdef class ChunkedArray(_PandasConvertible):
Returns
-------
array : numpy.ndarray
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ >>> n_legs.to_numpy()
+ array([ 2, 2, 4, 4, 5, 100])
"""
cdef:
PyObject* out
@@ -357,6 +509,19 @@ cdef class ChunkedArray(_PandasConvertible):
Returns
-------
cast : Array or ChunkedArray
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ >>> n_legs.type
+ DataType(int64)
+
+ Change the data type of an array:
+
+ >>> n_legs_seconds = n_legs.cast(pa.duration('s'))
+ >>> n_legs_seconds.type
+ DurationType(duration[s])
"""
return _pc().cast(self, target_type, safe=safe)
@@ -375,6 +540,50 @@ cdef class ChunkedArray(_PandasConvertible):
-------
encoded : ChunkedArray
A dictionary-encoded version of this array.
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> animals = pa.chunked_array((
+ ... ["Flamingo", "Parrot", "Dog"],
+ ... ["Horse", "Brittle stars", "Centipede"]
+ ... ))
+ >>> animals.dictionary_encode()
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ ...
+ -- dictionary:
+ [
+ "Flamingo",
+ "Parrot",
+ "Dog",
+ "Horse",
+ "Brittle stars",
+ "Centipede"
+ ]
+ -- indices:
+ [
+ 0,
+ 1,
+ 2
+ ],
+ ...
+ -- dictionary:
+ [
+ "Flamingo",
+ "Parrot",
+ "Dog",
+ "Horse",
+ "Brittle stars",
+ "Centipede"
+ ]
+ -- indices:
+ [
+ 3,
+ 4,
+ 5
+ ]
+ ]
"""
options = _pc().DictionaryEncodeOptions(null_encoding)
return _pc().call_function('dictionary_encode', [self], options)
@@ -392,6 +601,53 @@ cdef class ChunkedArray(_PandasConvertible):
Returns
-------
result : list of ChunkedArray
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ >>> c_arr = pa.chunked_array(n_legs.value_counts())
+ >>> c_arr
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ -- is_valid: all not null
+ -- child 0 type: int64
+ [
+ 2,
+ 4,
+ 5,
+ 100
+ ]
+ -- child 1 type: int64
+ [
+ 2,
+ 2,
+ 1,
+ 1
+ ]
+ ]
+ >>> c_arr.flatten()
+ [<pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ 4,
+ 5,
+ 100
+ ]
+ ], <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ 2,
+ 1,
+ 1
+ ]
+ ]]
+ >>> c_arr.type
+ StructType(struct<values: int64, counts: int64>)
+ >>> n_legs.type
+ DataType(int64)
"""
cdef:
vector[shared_ptr[CChunkedArray]] flattened
@@ -414,6 +670,35 @@ cdef class ChunkedArray(_PandasConvertible):
Returns
-------
result : Array
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ >>> n_legs
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ 2,
+ 4
+ ],
+ [
+ 4,
+ 5,
+ 100
+ ]
+ ]
+ >>> n_legs.combine_chunks()
+ <pyarrow.lib.Int64Array object at ...>
+ [
+ 2,
+ 2,
+ 4,
+ 4,
+ 5,
+ 100
+ ]
"""
return concat_arrays(self.chunks)
@@ -424,6 +709,33 @@ cdef class ChunkedArray(_PandasConvertible):
Returns
-------
pyarrow.Array
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ >>> n_legs
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ 2,
+ 4
+ ],
+ [
+ 4,
+ 5,
+ 100
+ ]
+ ]
+ >>> n_legs.unique()
+ <pyarrow.lib.Int64Array object at ...>
+ [
+ 2,
+ 4,
+ 5,
+ 100
+ ]
"""
return _pc().call_function('unique', [self])
@@ -434,6 +746,42 @@ cdef class ChunkedArray(_PandasConvertible):
Returns
-------
An array of <input type "Values", int64_t "Counts"> structs
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ >>> n_legs
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ 2,
+ 4
+ ],
+ [
+ 4,
+ 5,
+ 100
+ ]
+ ]
+ >>> n_legs.value_counts()
+ <pyarrow.lib.StructArray object at ...>
+ -- is_valid: all not null
+ -- child 0 type: int64
+ [
+ 2,
+ 4,
+ 5,
+ 100
+ ]
+ -- child 1 type: int64
+ [
+ 2,
+ 2,
+ 1,
+ 1
+ ]
"""
return _pc().call_function('value_counts', [self])
@@ -452,6 +800,35 @@ cdef class ChunkedArray(_PandasConvertible):
Returns
-------
sliced : ChunkedArray
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ >>> n_legs
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ 2,
+ 4
+ ],
+ [
+ 4,
+ 5,
+ 100
+ ]
+ ]
+ >>> n_legs.slice(2,2)
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 4
+ ],
+ [
+ 4
+ ]
+ ]
"""
cdef shared_ptr[CChunkedArray] result
@@ -484,6 +861,49 @@ cdef class ChunkedArray(_PandasConvertible):
filtered : Array or ChunkedArray
An array of the same type, with only the elements selected by
the boolean mask.
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ >>> n_legs
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ 2,
+ 4
+ ],
+ [
+ 4,
+ 5,
+ 100
+ ]
+ ]
+ >>> mask = pa.array([True, False, None, True, False, True])
+ >>> n_legs.filter(mask)
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2
+ ],
+ [
+ 4,
+ 100
+ ]
+ ]
+ >>> n_legs.filter(mask, null_selection_behavior="emit_null")
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ null
+ ],
+ [
+ 4,
+ 100
+ ]
+ ]
"""
return _pc().filter(self, mask, null_selection_behavior)
@@ -508,6 +928,29 @@ cdef class ChunkedArray(_PandasConvertible):
-------
index : Int64Scalar
The index of the value in the array (-1 if not found).
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ >>> n_legs
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ 2,
+ 4
+ ],
+ [
+ 4,
+ 5,
+ 100
+ ]
+ ]
+ >>> n_legs.index(4)
+ <pyarrow.Int64Scalar: 2>
+ >>> n_legs.index(4, start=3)
+ <pyarrow.Int64Scalar: 3>
"""
return _pc().index(self, value, start, end, memory_pool=memory_pool)
@@ -526,6 +969,34 @@ cdef class ChunkedArray(_PandasConvertible):
-------
taken : Array or ChunkedArray
An array with the same datatype, containing the taken values.
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ >>> n_legs
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ 2,
+ 4
+ ],
+ [
+ 4,
+ 5,
+ 100
+ ]
+ ]
+ >>> n_legs.take([1,4,5])
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ 5,
+ 100
+ ]
+ ]
"""
return _pc().take(self, indices)
@@ -533,6 +1004,38 @@ cdef class ChunkedArray(_PandasConvertible):
"""
Remove missing values from a chunked array.
See :func:`pyarrow.compute.drop_null` for full description.
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]])
+ >>> n_legs
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ 2,
+ null
+ ],
+ [
+ 4,
+ 5,
+ 100
+ ]
+ ]
+ >>> n_legs.drop_null()
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ 2
+ ],
+ [
+ 4,
+ 5,
+ 100
+ ]
+ ]
"""
return _pc().drop_null(self)
@@ -555,6 +1058,79 @@ cdef class ChunkedArray(_PandasConvertible):
Returns
-------
result : ChunkedArray
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode()
+ >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode()
+ >>> c_arr = pa.chunked_array([arr_1, arr_2])
+ >>> c_arr
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ ...
+ -- dictionary:
+ [
+ "Flamingo",
+ "Parrot",
+ "Dog"
+ ]
+ -- indices:
+ [
+ 0,
+ 1,
+ 2
+ ],
+ ...
+ -- dictionary:
+ [
+ "Horse",
+ "Brittle stars",
+ "Centipede"
+ ]
+ -- indices:
+ [
+ 0,
+ 1,
+ 2
+ ]
+ ]
+ >>> c_arr.unify_dictionaries()
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ ...
+ -- dictionary:
+ [
+ "Flamingo",
+ "Parrot",
+ "Dog",
+ "Horse",
+ "Brittle stars",
+ "Centipede"
+ ]
+ -- indices:
+ [
+ 0,
+ 1,
+ 2
+ ],
+ ...
+ -- dictionary:
+ [
+ "Flamingo",
+ "Parrot",
+ "Dog",
+ "Horse",
+ "Brittle stars",
+ "Centipede"
+ ]
+ -- indices:
+ [
+ 3,
+ 4,
+ 5
+ ]
+ ]
"""
cdef:
CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
@@ -569,17 +1145,24 @@ cdef class ChunkedArray(_PandasConvertible):
@property
def num_chunks(self):
"""
- Number of underlying chunks
+ Number of underlying chunks.
Returns
-------
int
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]])
+ >>> n_legs.num_chunks
+ 2
"""
return self.chunked_array.num_chunks()
def chunk(self, i):
"""
- Select a chunk by its index
+ Select a chunk by its index.
Parameters
----------
@@ -588,6 +1171,18 @@ cdef class ChunkedArray(_PandasConvertible):
Returns
-------
pyarrow.Array
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]])
+ >>> n_legs.chunk(1)
+ <pyarrow.lib.Int64Array object at ...>
+ [
+ 4,
+ 5,
+ 100
+ ]
"""
if i >= self.num_chunks or i < 0:
raise IndexError('Chunk index out of range.')
@@ -596,15 +1191,70 @@ cdef class ChunkedArray(_PandasConvertible):
@property
def chunks(self):
+ """
+ Convert to a list of the underlying chunks (one array per chunk).
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]])
+ >>> n_legs
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ 2,
+ null
+ ],
+ [
+ 4,
+ 5,
+ 100
+ ]
+ ]
+ >>> n_legs.chunks
+ [<pyarrow.lib.Int64Array object at ...>
+ [
+ 2,
+ 2,
+ null
+ ], <pyarrow.lib.Int64Array object at ...>
+ [
+ 4,
+ 5,
+ 100
+ ]]
+ """
return list(self.iterchunks())
def iterchunks(self):
+ """
+ Convert to an iterator over the underlying chunks (pyarrow.Array).
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]])
+ >>> for i in n_legs.iterchunks():
+ ... print(i.null_count)
+ ...
+ 0
+ 1
+
+ """
for i in range(self.num_chunks):
yield self.chunk(i)
def to_pylist(self):
"""
Convert to a list of native Python objects.
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]])
+ >>> n_legs.to_pylist()
+ [2, 2, 4, 4, None, 100]
"""
result = []
for i in range(self.num_chunks):
@@ -625,6 +1275,30 @@ def chunked_array(arrays, type=None):
Returns
-------
ChunkedArray
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> pa.chunked_array([], type=pa.int8())
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ ...
+ ]
+
+ >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ 2,
+ 4
+ ],
+ [
+ 4,
+ 5,
+ 100
+ ]
+ ]
"""
cdef:
Array arr
@@ -820,7 +1494,7 @@ cdef class RecordBatch(_PandasConvertible):
int: int64
str: string
>>> pa.RecordBatch.from_pylist(pylist)[0]
- <pyarrow.lib.Int64Array object at 0x1256b08e0>
+ <pyarrow.lib.Int64Array object at ...>
[
1,
2