pitrou commented on a change in pull request #11245:
URL: https://github.com/apache/arrow/pull/11245#discussion_r719253766
##########
File path: python/pyarrow/_compute.pyx
##########
@@ -670,13 +699,29 @@ class CastOptions(_CastOptions):
@staticmethod
def safe(target_type=None):
+     """
+ Cast operation options.
+
+ Parameters
+ ----------
+ target_type : optional
+ Target type for the safe cast.
Review comment:
"Target cast type"
##########
File path: python/pyarrow/_csv.pyx
##########
@@ -59,26 +59,26 @@ cdef class ReadOptions(_Weakrefable):
This will determine multi-threading granularity as well as
the size of individual record batches or table chunks.
Minimum valid value for block size is 1
- skip_rows: int, optional (default 0)
+ skip_rows : int, optional (default 0)
Review comment:
Just for the record, did you have to do these by hand or is there a
mechanical fixer?
##########
File path: python/pyarrow/_compute.pyx
##########
@@ -502,6 +512,17 @@ def call_function(name, args, options=None,
memory_pool=None):
The function is looked up in the global registry
(as returned by `function_registry()`).
+
+ Parameters
+ ----------
+ name : str
+ The name of the function to call.
+ args : list
+ The arguments to the function.
+ options : optional
+ options provided to the function.
+ memory_pool : MemoryPool, optional
+ memory pool to use for allocations during function.
Review comment:
"during function execution"?
##########
File path: python/pyarrow/_compute.pyx
##########
@@ -670,13 +699,29 @@ class CastOptions(_CastOptions):
@staticmethod
def safe(target_type=None):
+     """
+ Cast operation options.
+
+ Parameters
+ ----------
+ target_type : optional
+ Target type for the safe cast.
+ """
self = CastOptions()
self._set_safe()
self._set_type(target_type)
return self
@staticmethod
def unsafe(target_type=None):
+     """
+ Cast operation options.
Review comment:
Same remarks here.
##########
File path: python/pyarrow/io.pxi
##########
@@ -1436,6 +1481,20 @@ class Transcoder:
def transcoding_input_stream(stream, src_encoding, dest_encoding):
+ """
+ Add a transcoding transformation to the stream.
+ Incoming data will be decoded according to ``src_encoding`` and
+ emitted data will be encoded according to ``dest_encoding``.
Review comment:
I would say "decoded according to ... and then re-encoded according to
...".
##########
File path: python/pyarrow/io.pxi
##########
@@ -1357,6 +1380,18 @@ cdef class BufferedInputStream(NativeFile):
cdef class BufferedOutputStream(NativeFile):
+ """
+ Wraps an output stream making it buffered.
+
+ Parameters
+ ----------
+ stream : NativeFile
+ The stream to wrap with the buffer
Review comment:
Perhaps "writable stream" or "output stream"?
##########
File path: python/pyarrow/io.pxi
##########
@@ -1307,6 +1317,19 @@ ctypedef CRandomAccessFile* _RandomAccessFilePtr
cdef class BufferedInputStream(NativeFile):
+ """
+ An InputStream that performs buffered reads from an unbuffered InputStream,
+ which can mitigate the overhead of many small reads in some cases
+
+ Parameters
+ ----------
+ stream : NativeFile
Review comment:
There is a bit of terminology confusion between "NativeFile" and
"InputStream". Perhaps "InputStream" should be spelled "input stream" to make
it clear it doesn't name a base class.
##########
File path: python/pyarrow/io.pxi
##########
@@ -1478,6 +1542,18 @@ def foreign_buffer(address, size, base=None):
The *base* object will be kept alive as long as this buffer is alive,
including across language boundaries (for example if the buffer is
referenced by C++ code).
+
+ Parameters
+ ----------
+ address : int
+ Specify the starting address of the buffer. The address can
+ refer to both device or host memory but it must be
+ accessible from device after mapping it with
+ `get_device_address` method.
+ size : int
+ Specify the size of device buffer in bytes.
+ base : {None, object}
+ Specify object that owns the referenced memory.
Review comment:
I don't think it's necessary to add "Specify" at the start of each
parameter description.
##########
File path: python/pyarrow/tensor.pxi
##########
@@ -252,6 +287,11 @@ shape: {0.shape}""".format(self)
def from_tensor(obj):
"""
Convert arrow::Tensor to arrow::SparseCOOTensor.
+
+ Parameters
+ ----------
+ obj : Tensor
+ The tensor that should be converted.
Review comment:
"The dense tensor", perhaps?
##########
File path: python/pyarrow/tensor.pxi
##########
@@ -225,6 +253,13 @@ shape: {0.shape}""".format(self)
def from_pydata_sparse(obj, dim_names=None):
"""
Convert pydata/sparse.COO to arrow::SparseCOOTensor.
Review comment:
"sparse.COO"?
##########
File path: python/pyarrow/parquet.py
##########
@@ -214,6 +214,8 @@ class ParquetFile:
Coalesce and issue file reads in parallel to improve performance on
high-latency filesystems (e.g. S3). If True, Arrow will use a
background I/O thread pool.
+ read_dictionary : list
+ List of names to read directly as DictionaryArray.
Review comment:
"column names"
##########
File path: python/pyarrow/tensor.pxi
##########
@@ -395,13 +435,34 @@ shape: {0.shape}""".format(self)
def from_dense_numpy(cls, obj, dim_names=None):
"""
Convert numpy.ndarray to arrow::SparseCSRMatrix
+
+ Parameters
+ ----------
+ obj : numpy.ndarray
+ The source numpy array
Review comment:
"The dense numpy array that should be converted"?
##########
File path: python/pyarrow/tensor.pxi
##########
@@ -585,6 +658,20 @@ shape: {0.shape}""".format(self)
def from_numpy(data, indptr, indices, shape, dim_names=None):
"""
Create arrow::SparseCSCMatrix from numpy.ndarrays
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ Data used to populate the rows.
+ indptr : numpy.ndarray
+ Range of the rows,
+ The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data.
Review comment:
"i-th column"?
##########
File path: python/pyarrow/tensor.pxi
##########
@@ -462,6 +530,11 @@ shape: {0.shape}""".format(self)
def from_tensor(obj):
"""
Convert arrow::Tensor to arrow::SparseCSRMatrix.
+
+ Parameters
+ ----------
+ obj : Tensor
+ The tensor that should be converted.
Review comment:
"dense tensor"
##########
File path: python/pyarrow/tensor.pxi
##########
@@ -771,6 +870,22 @@ shape: {0.shape}""".format(self)
dim_names=None):
"""
Create arrow::SparseCSFTensor from numpy.ndarrays
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ Data used to populate the rows.
Review comment:
"to populate the sparse tensor"
##########
File path: python/pyarrow/tensor.pxi
##########
@@ -771,6 +870,22 @@ shape: {0.shape}""".format(self)
dim_names=None):
"""
Create arrow::SparseCSFTensor from numpy.ndarrays
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ Data used to populate the rows.
+ indptr : numpy.ndarray
+ Range of the rows,
+ The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data.
+ indices : numpy.ndarray
+ Column indices of the corresponding non-zero values.
Review comment:
It seems you have copy-pasted all this from the previous `from_numpy`
methods, but it doesn't really correspond to what's expected. Below you can see
that these are lists of ndarrays (CSF is a n-dimensional generalization of the
idea behind CSR and CSC, there's an explanation here:
https://www.boristhebrave.com/2021/01/01/compressed-sparse-fibers-explained/).
##########
File path: python/pyarrow/compute.py
##########
@@ -132,11 +132,15 @@ def _decorate_compute_function(wrapper, exposed_name,
func, option_class):
if option_class is not None:
doc_pieces.append("""\
options : pyarrow.compute.{0}, optional
- Parameters altering compute function semantics
- **kwargs : optional
- Parameters for {0} constructor. Either `options`
- or `**kwargs` can be passed, but not both at the same time.
+ Parameters altering compute function semantics.
""".format(option_class.__name__))
+ options_sig = inspect.signature(option_class)
+ for p in options_sig.parameters.values():
+ doc_pieces.append("""\
+ {0} : optional
+ Parameter for {1} constructor. Either `options`
+ or `{0}` can be passed, but not both at the same time.
+ """.format(p.name, option_class.__name__))
Review comment:
See https://issues.apache.org/jira/browse/ARROW-10317
##########
File path: python/pyarrow/_compute.pyx
##########
@@ -670,13 +699,29 @@ class CastOptions(_CastOptions):
@staticmethod
def safe(target_type=None):
+     """
+ Cast operation options.
Review comment:
Right, the current description is a bit cryptic. Perhaps `Create a
CastOptions with "safe" defaults`?
##########
File path: python/pyarrow/fs.py
##########
@@ -256,6 +256,12 @@ class FSSpecHandler(FileSystemHandler):
https://filesystem-spec.readthedocs.io/en/latest/index.html
+ Parameters
+ ----------
+ fs : The file system implementation according to FSSpec.
+
+ Example
+ -------
Review comment:
Well, there is a single example, isn't there?
##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -1795,6 +1819,20 @@ cdef class IpcFileFormat(FileFormat):
cdef class CsvFileFormat(FileFormat):
+ """
+ FileFormat for CSV files.
+
+ Parameters
+ ----------
+ parse_options : ParseOptions
+ Options regarding parsing of CSV.
Review comment:
"CSV parsing"?
##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -2791,7 +2838,14 @@ cdef class RecordBatchIterator(_Weakrefable):
class TaggedRecordBatch(collections.namedtuple(
"TaggedRecordBatch", ["record_batch", "fragment"])):
- """A combination of a record batch and the fragment it came from."""
+ """
+ A combination of a record batch and the fragment it came from.
+
+ Parameters
+ ----------
+ record_batch : The record batch.
+ fragment : fragment of the record batch.
Review comment:
Speaking of which, is describing the parameters mandatory? Some classes
may not be meant to be instantiated by the user.
##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -1795,6 +1819,20 @@ cdef class IpcFileFormat(FileFormat):
cdef class CsvFileFormat(FileFormat):
+ """
+ FileFormat for CSV files.
+
+ Parameters
+ ----------
+ parse_options : ParseOptions
+ Options regarding parsing of CSV.
+ convert_options : ConvertOptions
+ Options regarding value conversion.
+ read_options : ReadOptions
+ Options regarding the CSV file read operation.
Review comment:
"General read options"?
##########
File path: python/pyarrow/io.pxi
##########
@@ -1307,6 +1317,19 @@ ctypedef CRandomAccessFile* _RandomAccessFilePtr
cdef class BufferedInputStream(NativeFile):
+ """
+ An InputStream that performs buffered reads from an unbuffered InputStream,
+ which can mitigate the overhead of many small reads in some cases
+
+ Parameters
+ ----------
+ stream : NativeFile
+ The stream to wrap with the buffer
Review comment:
"readable stream" or "input stream" perhaps?
##########
File path: python/pyarrow/fs.py
##########
@@ -256,6 +256,12 @@ class FSSpecHandler(FileSystemHandler):
https://filesystem-spec.readthedocs.io/en/latest/index.html
+ Parameters
+ ----------
+ fs : The file system implementation according to FSSpec.
Review comment:
I would say "The FSSpec-compliant filesystem instance"
##########
File path: python/pyarrow/lib.pyx
##########
@@ -60,6 +60,11 @@ def set_cpu_count(int count):
"""
Set the number of threads to use in parallel operations.
+ Parameters
+ ----------
+ count : int
+ The number of concurrent cpu that should be set.
Review comment:
"The number of threads that should be used"?
##########
File path: python/pyarrow/io.pxi
##########
@@ -1357,6 +1380,18 @@ cdef class BufferedInputStream(NativeFile):
cdef class BufferedOutputStream(NativeFile):
+ """
+ Wraps an output stream making it buffered.
Review comment:
Hmm, why not re-use a similar phrasing as for BufferedInputStream?
##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -508,12 +508,13 @@ cdef class Dataset(_Weakrefable):
cdef class InMemoryDataset(Dataset):
- """A Dataset wrapping in-memory data.
+ """
+ A Dataset wrapping in-memory data.
Parameters
----------
- source
- The data for this dataset. Can be a RecordBatch, Table, list of
+ source : The data for this dataset.
Review comment:
I would say "data source"
##########
File path: python/pyarrow/io.pxi
##########
@@ -1724,6 +1800,12 @@ cdef class Codec(_Weakrefable):
"""
Returns true if the compression level parameter is supported
for the given codec.
+
+ Parameters
+ ----------
+ compression : str
+ Type of compression codec, valid values are: gzip, bz2, brotli,
+ lz4, zstd and snappy.
Review comment:
Is there a way we can make it easier to maintain the list of valid
values?
##########
File path: python/pyarrow/tensor.pxi
##########
@@ -160,6 +170,17 @@ shape: {0.shape}""".format(self)
def from_numpy(data, coords, shape, dim_names=None):
"""
Create arrow::SparseCOOTensor from numpy.ndarrays
Review comment:
"arrow::" is not really a Python convention. Just keep "SparseCOOTensor"?
##########
File path: python/pyarrow/tensor.pxi
##########
@@ -160,6 +170,17 @@ shape: {0.shape}""".format(self)
def from_numpy(data, coords, shape, dim_names=None):
"""
Create arrow::SparseCOOTensor from numpy.ndarrays
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ Data used to populate the rows.
+ coords : numpy.ndarray
+ Coordinates of the data.
+ shape : tuple
+ Shape of the tensor.
+ dim_names : list
Review comment:
Add "optional"?
##########
File path: python/pyarrow/tensor.pxi
##########
@@ -585,6 +658,20 @@ shape: {0.shape}""".format(self)
def from_numpy(data, indptr, indices, shape, dim_names=None):
"""
Create arrow::SparseCSCMatrix from numpy.ndarrays
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ Data used to populate the rows.
+ indptr : numpy.ndarray
+ Range of the rows,
Review comment:
"columns"?
##########
File path: python/pyarrow/tensor.pxi
##########
@@ -225,6 +253,13 @@ shape: {0.shape}""".format(self)
def from_pydata_sparse(obj, dim_names=None):
"""
Convert pydata/sparse.COO to arrow::SparseCOOTensor.
+
+ Parameters
+ ----------
+ obj : pydata.sparse.COO
+ The object that should be converted.
Review comment:
"The sparse multidimensional array that should be converted"?
(see
https://sparse.pydata.org/en/stable/generated/sparse.COO.html#sparse.COO )
##########
File path: python/pyarrow/tensor.pxi
##########
@@ -615,6 +702,13 @@ shape: {0.shape}""".format(self)
def from_scipy(obj, dim_names=None):
"""
Convert scipy.sparse.csc_matrix to arrow::SparseCSCMatrix
+
+ Parameters
+ ----------
+ obj : scipy.sparse.csc_matrix
+ The SciPy matrix that should be converted.
Review comment:
"scipy" (lowercased) is used elsewhere
##########
File path: python/pyarrow/tensor.pxi
##########
@@ -585,6 +658,20 @@ shape: {0.shape}""".format(self)
def from_numpy(data, indptr, indices, shape, dim_names=None):
"""
Create arrow::SparseCSCMatrix from numpy.ndarrays
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ Data used to populate the rows.
Review comment:
"to populate the sparse matrix"
##########
File path: python/pyarrow/tensor.pxi
##########
@@ -585,6 +658,20 @@ shape: {0.shape}""".format(self)
def from_numpy(data, indptr, indices, shape, dim_names=None):
"""
Create arrow::SparseCSCMatrix from numpy.ndarrays
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ Data used to populate the rows.
+ indptr : numpy.ndarray
+ Range of the rows,
+ The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data.
+ indices : numpy.ndarray
+ Column indices of the corresponding non-zero values.
Review comment:
"Row indices"?
##########
File path: python/pyarrow/types.py
##########
@@ -44,209 +44,359 @@
def is_null(t):
"""
Return True if value is an instance of a null type.
+
+ Parameters
+ ----------
+ t : DataType
+ type to check
Review comment:
To be honest, this is the kind of example of a rather pointless
description ("type to check" does not say anything very useful). Can we just
omit it?
##########
File path: python/pyarrow/tensor.pxi
##########
@@ -395,13 +435,34 @@ shape: {0.shape}""".format(self)
def from_dense_numpy(cls, obj, dim_names=None):
"""
Convert numpy.ndarray to arrow::SparseCSRMatrix
+
+ Parameters
+ ----------
+ obj : numpy.ndarray
+ The source numpy array
+ dim_names : list, optional
+ The names of the dimensions.
"""
return cls.from_tensor(Tensor.from_numpy(obj, dim_names=dim_names))
@staticmethod
def from_numpy(data, indptr, indices, shape, dim_names=None):
"""
- Create arrow::SparseCSRMatrix from numpy.ndarrays
+ Create arrow::SparseCSRMatrix from numpy.ndarrays.
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ Data used to populate the rows.
Review comment:
"to populate the sparse matrix"
##########
File path: python/pyarrow/tensor.pxi
##########
@@ -817,6 +932,11 @@ shape: {0.shape}""".format(self)
def from_tensor(obj):
"""
Convert arrow::Tensor to arrow::SparseCSFTensor
+
+ Parameters
+ ----------
+ obj : Tensor
+ The tensor that should be converted.
Review comment:
"dense tensor"
##########
File path: python/pyarrow/tensor.pxi
##########
@@ -645,6 +739,11 @@ shape: {0.shape}""".format(self)
def from_tensor(obj):
"""
Convert arrow::Tensor to arrow::SparseCSCMatrix
+
+ Parameters
+ ----------
+ obj : Tensor
+ The tensor that should be converted.
Review comment:
"dense tensor"
##########
File path: python/pyarrow/tensor.pxi
##########
@@ -771,6 +870,22 @@ shape: {0.shape}""".format(self)
dim_names=None):
"""
Create arrow::SparseCSFTensor from numpy.ndarrays
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ Data used to populate the rows.
+ indptr : numpy.ndarray
+ Range of the rows,
+ The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data.
+ indices : numpy.ndarray
+ Column indices of the corresponding non-zero values.
+ shape : tuple
+ Shape of the matrix.
+ axis_order : list, optional
+ The order of the axis.
Review comment:
I would say for example "The dimensions corresponding to each array in
`indptr` and `indices`"
##########
File path: python/pyarrow/io.pxi
##########
@@ -1430,6 +1441,16 @@ cdef void _cb_transform(transform_func, const
shared_ptr[CBuffer]& src,
cdef class TransformInputStream(NativeFile):
+ """
+     Transform an input stream.
Review comment:
"An input stream that applies a function to an underlying input stream's
contents"
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]