This is an automated email from the ASF dual-hosted git repository.
wjones127 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 14ec80f182 ARROW-15006: [Python][Doc] Add five more numpydoc checks to
CI (#15214)
14ec80f182 is described below
commit 14ec80f182532b960cd4b5d1e72bcad04ba651da
Author: Bryce Mecum <[email protected]>
AuthorDate: Fri Jan 6 14:21:27 2023 -0900
ARROW-15006: [Python][Doc] Add five more numpydoc checks to CI (#15214)
This adds the numpydoc checks GL10, PR04, PR05, RT03, and YD01 to CI and
fixes the associated issues in docstrings.
These checks are:
- GL10: reST directives {directives} must be followed by two colons
- PR04: Parameter "{param_name}" has no type
- PR05: Parameter "{param_name}" type should not finish with "."
- RT03: Return value has no description
- YD01: No Yields section found
https://numpydoc.readthedocs.io/en/latest/validation.html
Lead-authored-by: Bryce Mecum <[email protected]>
Co-authored-by: Will Jones <[email protected]>
Signed-off-by: Will Jones <[email protected]>
---
docker-compose.yml | 2 +-
python/pyarrow/_dataset.pyx | 18 +++++++++++++----
python/pyarrow/_dataset_parquet.pyx | 17 +++++++++-------
python/pyarrow/array.pxi | 6 +++---
python/pyarrow/compute.py | 9 ++++++++-
python/pyarrow/dataset.py | 2 ++
python/pyarrow/feather.py | 2 ++
python/pyarrow/fs.py | 2 +-
python/pyarrow/io.pxi | 4 ++--
python/pyarrow/ipc.pxi | 39 +++++++++++++++++++++++--------------
python/pyarrow/ipc.py | 7 +++++++
python/pyarrow/parquet/core.py | 21 +++++++++++++-------
python/pyarrow/plasma.py | 9 +++++----
python/pyarrow/table.pxi | 10 +++++-----
14 files changed, 98 insertions(+), 50 deletions(-)
diff --git a/docker-compose.yml b/docker-compose.yml
index df497a2de1..3774d55868 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1107,7 +1107,7 @@ services:
["/arrow/ci/scripts/cpp_build.sh /arrow /build &&
/arrow/ci/scripts/python_build.sh /arrow /build &&
pip install -e /arrow/dev/archery[numpydoc] &&
- archery numpydoc --allow-rule PR01,PR03,PR10 &&
+ archery numpydoc --allow-rule GL10,PR01,PR03,PR04,PR05,PR10,RT03,YD01
&&
/arrow/ci/scripts/python_test.sh /arrow"]
conda-python-dask:
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 42781ff2aa..5f1610c384 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -532,12 +532,12 @@ cdef class InMemoryDataset(Dataset):
Parameters
----------
- source : The data for this dataset.
- Can be a RecordBatch, Table, list of
- RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader.
+ source : RecordBatch, Table, list, tuple
+ The data for this dataset. Can be a RecordBatch, Table, list of
+ RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader
If an iterable is provided, the schema must also be provided.
schema : Schema, optional
- Only required if passing an iterable as the source.
+ Only required if passing an iterable as the source
"""
cdef:
@@ -2647,6 +2647,16 @@ cdef class Scanner(_Weakrefable):
memory_pool : MemoryPool, default None
For memory allocations, if required. If not specified, uses the
default pool.
+ use_threads : bool, default True
+ If enabled, then maximum parallelism will be used, determined by
+ the number of available CPU cores.
+ use_async : bool, default True
+ This flag is deprecated and is being kept for this release for
+ backwards compatibility. It will be removed in the next
+ release.
+ memory_pool : MemoryPool, default None
+ For memory allocations, if required. If not specified, uses the
+ default pool.
"""
cdef:
shared_ptr[CScanOptions] options = make_shared[CScanOptions]()
diff --git a/python/pyarrow/_dataset_parquet.pyx
b/python/pyarrow/_dataset_parquet.pyx
index 744bfac6bf..01a3b30da5 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -71,7 +71,7 @@ cdef class ParquetFileFormat(FileFormat):
default_fragment_scan_options : ParquetFragmentScanOptions
Scan Options for the file.
**kwargs : dict
- Additional options for read option or scan option.
+ Additional options for read option or scan option
"""
cdef:
@@ -236,9 +236,12 @@ class RowGroupInfo:
Parameters
----------
- id : the group id.
- metadata : the rowgroup metadata.
- schema : schema of the rows.
+ id : integer
+ The group ID.
+ metadata : FileMetaData
+ The rowgroup metadata.
+ schema : Schema
+ Schema of the rows.
"""
def __init__(self, id, metadata, schema):
@@ -449,12 +452,12 @@ cdef class ParquetReadOptions(_Weakrefable):
----------
dictionary_columns : list of string, default None
Names of columns which should be dictionary encoded as
- they are read.
- coerce_int96_timestamp_unit : str, default None.
+ they are read
+ coerce_int96_timestamp_unit : str, default None
Cast timestamps that are stored in INT96 format to a particular
resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
and therefore INT96 timestamps will be inferred as timestamps
- in nanoseconds.
+ in nanoseconds
"""
cdef public:
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 5772592ead..b2dff65677 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -984,7 +984,7 @@ cdef class Array(_PandasConvertible):
Parameters
----------
- null_encoding
+ null_encoding : str, default "mask"
How to handle null entries.
Returns
@@ -1265,7 +1265,7 @@ cdef class Array(_PandasConvertible):
Parameters
----------
- fill_value
+ fill_value : any
The replacement value for null entries.
Returns
@@ -1363,7 +1363,7 @@ cdef class Array(_PandasConvertible):
----------
mask : Array or array-like
The boolean mask to filter the array with.
- null_selection_behavior
+ null_selection_behavior : str, default "drop"
How nulls in the mask should be handled.
Returns
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index 265d75f6f6..1ee6c40f42 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -374,6 +374,7 @@ def cast(arr, target_type=None, safe=None, options=None):
Returns
-------
casted : Array
+ The cast result as a new Array
"""
safe_vars_passed = (safe is not None) or (target_type is not None)
@@ -452,6 +453,7 @@ def take(data, indices, *, boundscheck=True,
memory_pool=None):
Returns
-------
result : depends on inputs
+ Selected values for the given indices
Examples
--------
@@ -490,6 +492,7 @@ def fill_null(values, fill_value):
Returns
-------
result : depends on inputs
+ Values with all null elements replaced
Examples
--------
@@ -534,7 +537,8 @@ def top_k_unstable(values, k, sort_keys=None, *,
memory_pool=None):
Returns
-------
- result : Array of indices
+ result : Array
+ Indices of the top-k ordered elements
Examples
--------
@@ -581,6 +585,7 @@ def bottom_k_unstable(values, k, sort_keys=None, *,
memory_pool=None):
Returns
-------
result : Array of indices
+ Indices of the bottom-k ordered elements
Examples
--------
@@ -650,6 +655,7 @@ def field(*name_or_index):
Returns
-------
field_expr : Expression
+ Reference to the given field
Examples
--------
@@ -691,5 +697,6 @@ def scalar(value):
Returns
-------
scalar_expr : Expression
+ An Expression representing the scalar value
"""
return Expression._scalar(value)
diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py
index adbf064a73..de9469de44 100644
--- a/python/pyarrow/dataset.py
+++ b/python/pyarrow/dataset.py
@@ -151,6 +151,7 @@ def partitioning(schema=None, field_names=None, flavor=None,
Returns
-------
Partitioning or PartitioningFactory
+ The partitioning scheme
Examples
--------
@@ -524,6 +525,7 @@ def parquet_dataset(metadata_path, schema=None,
filesystem=None, format=None,
Returns
-------
FileSystemDataset
+ The dataset corresponding to the given metadata
"""
from pyarrow.fs import LocalFileSystem, _ensure_filesystem
diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index 54a16a2f89..fbd0602597 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -221,6 +221,7 @@ def read_feather(source, columns=None, use_threads=True,
Returns
-------
df : pandas.DataFrame
+ The contents of the Feather file as a pandas.DataFrame
"""
return (read_table(
source, columns=columns, memory_map=memory_map,
@@ -246,6 +247,7 @@ def read_table(source, columns=None, memory_map=False,
use_threads=True):
Returns
-------
table : pyarrow.Table
+ The contents of the Feather file as a pyarrow.Table
"""
reader = _feather.FeatherReader(
source, use_memory_map=memory_map, use_threads=use_threads)
diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py
index ab151bc5d8..21db243528 100644
--- a/python/pyarrow/fs.py
+++ b/python/pyarrow/fs.py
@@ -281,7 +281,7 @@ class FSSpecHandler(FileSystemHandler):
Parameters
----------
- fs : FSSpec-compliant filesystem instance.
+ fs : FSSpec-compliant filesystem instance
Examples
--------
diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi
index f1a1b315f6..21c17b4d36 100644
--- a/python/pyarrow/io.pxi
+++ b/python/pyarrow/io.pxi
@@ -2207,7 +2207,7 @@ def input_stream(source, compression='detect',
buffer_size=None):
Parameters
----------
- source : str, Path, buffer, file-like object, ...
+ source : str, Path, buffer, or file-like object
The source to open for reading.
compression : str optional, default 'detect'
The compression algorithm to use for on-the-fly decompression.
@@ -2259,7 +2259,7 @@ def output_stream(source, compression='detect',
buffer_size=None):
Parameters
----------
- source : str, Path, buffer, file-like object, ...
+ source : str, Path, buffer, file-like object
The source to open for writing.
compression : str optional, default 'detect'
The compression algorithm to use for on-the-fly compression.
diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi
index 35a07d8737..6e60b8b9a0 100644
--- a/python/pyarrow/ipc.pxi
+++ b/python/pyarrow/ipc.pxi
@@ -57,11 +57,16 @@ class WriteStats(_WriteStats):
Parameters
----------
- num_messages : number of messages.
- num_record_batches : number of record batches.
- num_dictionary_batches : number of dictionary batches.
- num_dictionary_deltas : delta of dictionaries.
- num_replaced_dictionaries : number of replaced dictionaries.
+ num_messages : int
+ Number of messages.
+ num_record_batches : int
+ Number of record batches.
+ num_dictionary_batches : int
+ Number of dictionary batches.
+ num_dictionary_deltas : int
+ Delta of dictionaries.
+ num_replaced_dictionaries : int
+ Number of replaced dictionaries.
"""
__slots__ = ()
@@ -84,11 +89,16 @@ class ReadStats(_ReadStats):
Parameters
----------
- num_messages : number of messages.
- num_record_batches : number of record batches.
- num_dictionary_batches : number of dictionary batches.
- num_dictionary_deltas : delta of dictionaries.
- num_replaced_dictionaries : number of replaced dictionaries.
+ num_messages : int
+ Number of messages.
+ num_record_batches : int
+ Number of record batches.
+ num_dictionary_batches : int
+ Number of dictionary batches.
+ num_dictionary_deltas : int
+ Delta of dictionaries.
+ num_replaced_dictionaries : int
+ Number of replaced dictionaries.
"""
__slots__ = ()
@@ -106,16 +116,15 @@ cdef class IpcReadOptions(_Weakrefable):
Parameters
----------
- ensure_native_endian : bool
+ ensure_native_endian : bool, default True
Whether to convert incoming data to platform-native endianness.
- Default is true.
use_threads : bool
Whether to use the global CPU thread pool to parallelize any
- computational tasks like decompression.
+ computational tasks like decompression
included_fields : list
If empty (the default), return all deserialized fields.
If non-empty, the values are the indices of fields to read on
- the top-level schema.
+ the top-level schema
"""
__slots__ = ()
@@ -411,7 +420,7 @@ cdef class MessageReader(_Weakrefable):
Parameters
----------
- source
+ source : bytes/buffer-like, pyarrow.NativeFile, or file-like Python
object
A readable source, like an InputStream
"""
cdef:
diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py
index fc724109d9..523196e1e3 100644
--- a/python/pyarrow/ipc.py
+++ b/python/pyarrow/ipc.py
@@ -164,6 +164,7 @@ Create an Arrow columnar IPC stream writer instance
Returns
-------
writer : RecordBatchStreamWriter
+ A writer for the given sink
""".format(_ipc_writer_class_doc)
@@ -180,9 +181,11 @@ def open_stream(source, *, options=None, memory_pool=None):
If None, default values will be used.
memory_pool : MemoryPool, default None
If None, default memory pool is used.
+
Returns
-------
reader : RecordBatchStreamReader
+ A reader for the given source
"""
return RecordBatchStreamReader(source, options=options,
memory_pool=memory_pool)
@@ -202,6 +205,7 @@ Create an Arrow columnar IPC file writer instance
Returns
-------
writer : RecordBatchFileWriter
+ A writer for the given sink
""".format(_ipc_writer_class_doc)
@@ -221,9 +225,11 @@ def open_file(source, footer_offset=None, *, options=None,
memory_pool=None):
If None, default values will be used.
memory_pool : MemoryPool, default None
If None, default memory pool is used.
+
Returns
-------
reader : RecordBatchFileReader
+ A reader for the given source
"""
return RecordBatchFileReader(
source, footer_offset=footer_offset,
@@ -271,6 +277,7 @@ def deserialize_pandas(buf, *, use_threads=True):
Returns
-------
df : pandas.DataFrame
+ The buffer deserialized as pandas DataFrame
"""
buffer_reader = pa.BufferReader(buf)
with pa.RecordBatchStreamReader(buffer_reader) as reader:
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index e6148be0f8..88e3cf2a67 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -162,6 +162,7 @@ def filters_to_expression(filters):
Returns
-------
pyarrow.compute.Expression
+ An Expression representing the filters
"""
import pyarrow.dataset as ds
@@ -242,7 +243,7 @@ class ParquetFile:
Coalesce and issue file reads in parallel to improve performance on
high-latency filesystems (e.g. S3). If True, Arrow will use a
background I/O thread pool.
- coerce_int96_timestamp_unit : str, default None.
+ coerce_int96_timestamp_unit : str, default None
Cast timestamps that are stored in INT96 format to a particular
resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
and therefore INT96 timestamps will be inferred as timestamps
@@ -541,9 +542,9 @@ class ParquetFile:
If True and file has custom pandas schema metadata, ensure that
index columns are also loaded.
- Returns
+ Yields
-------
- iterator of pyarrow.RecordBatch
+ pyarrow.RecordBatch
Contents of each batch as a record batch
Examples
@@ -645,7 +646,8 @@ class ParquetFile:
Returns
-------
- num_rows : number of rows in file
+ num_rows : int
+ Number of rows in file
Examples
--------
@@ -1186,6 +1188,7 @@ class ParquetDatasetPiece:
Returns
-------
metadata : FileMetaData
+ The file's metadata
"""
with self.open() as parquet:
return parquet.metadata
@@ -1222,6 +1225,7 @@ class ParquetDatasetPiece:
Returns
-------
table : pyarrow.Table
+ The piece as a pyarrow.Table.
"""
if self.open_file_func is not None:
reader = self.open()
@@ -1309,7 +1313,8 @@ class PartitionSet:
Parameters
----------
- key : The value for which we want to known the index.
+ key : str or int
+ The value for which we want to know the index.
"""
if key in self.key_indices:
return self.key_indices[key]
@@ -1713,7 +1718,7 @@ pre_buffer : bool, default True
use_legacy_dataset=False. If using a filesystem layer that itself
performs readahead (e.g. fsspec's S3FS), disable readahead for best
results.
-coerce_int96_timestamp_unit : str, default None.
+coerce_int96_timestamp_unit : str, default None
Cast timestamps that are stored in INT96 format to a particular resolution
(e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96
timestamps will be inferred as timestamps in nanoseconds.
@@ -2783,7 +2788,7 @@ pre_buffer : bool, default True
use_legacy_dataset=False. If using a filesystem layer that itself
performs readahead (e.g. fsspec's S3FS), disable readahead for best
results.
-coerce_int96_timestamp_unit : str, default None.
+coerce_int96_timestamp_unit : str, default None
Cast timestamps that are stored in INT96 format to a particular
resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
and therefore INT96 timestamps will be inferred as timestamps
@@ -3560,6 +3565,7 @@ def read_metadata(where, memory_map=False,
decryption_properties=None,
Returns
-------
metadata : FileMetaData
+ The metadata of the Parquet file
Examples
--------
@@ -3609,6 +3615,7 @@ def read_schema(where, memory_map=False,
decryption_properties=None,
Returns
-------
schema : pyarrow.Schema
+ The schema of the Parquet file
Examples
--------
diff --git a/python/pyarrow/plasma.py b/python/pyarrow/plasma.py
index 5c2c654341..0034276555 100644
--- a/python/pyarrow/plasma.py
+++ b/python/pyarrow/plasma.py
@@ -108,11 +108,12 @@ def start_plasma_store(plasma_store_memory,
external_store : str
External store to use for evicted objects.
- Returns
+ Yields
-------
- result : (str, subprocess.Popen)
- A tuple of the name of the plasma store socket and the process ID of
- the plasma store process.
+ plasma_store_name : str
+ Name of the plasma store socket
+ proc : subprocess.Popen
+ The plasma store process object
"""
warnings.warn(
"Plasma is deprecated since Arrow 10.0.0. It will be removed in "
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 53e8412282..bcc428a4cb 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -366,7 +366,7 @@ cdef class ChunkedArray(_PandasConvertible):
Parameters
----------
- fill_value
+ fill_value : any
The replacement value for null entries.
Returns
@@ -530,7 +530,7 @@ cdef class ChunkedArray(_PandasConvertible):
Parameters
----------
- null_encoding
+ null_encoding : str, default "mask"
How to handle null entries.
Returns
@@ -853,7 +853,7 @@ cdef class ChunkedArray(_PandasConvertible):
----------
mask : Array or array-like
The boolean mask to filter the chunked array with.
- null_selection_behavior
+ null_selection_behavior : str, default "drop"
How nulls in the mask should be handled.
Returns
@@ -2103,7 +2103,7 @@ cdef class RecordBatch(_PandasConvertible):
----------
mask : Array or array-like
The boolean mask to filter the record batch with.
- null_selection_behavior
+ null_selection_behavior : str, default "drop"
How nulls in the mask should be handled.
Returns
@@ -2938,7 +2938,7 @@ cdef class Table(_PandasConvertible):
----------
mask : Array or array-like or .Expression
The boolean mask or the :class:`.Expression` to filter the table
with.
- null_selection_behavior
+ null_selection_behavior : str, default "drop"
How nulls in the mask should be handled, does nothing if
an :class:`.Expression` is used.