This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 9d2079c ARROW-8989: [C++][Doc] Document available compute functions
9d2079c is described below
commit 9d2079c2ead31399b724ecc3775d61432a8096af
Author: Antoine Pitrou <[email protected]>
AuthorDate: Mon Jul 13 12:48:30 2020 -0500
ARROW-8989: [C++][Doc] Document available compute functions
Also fix glaring bugs in arithmetic kernels
(signed overflow detection was broken).
Closes #7695 from pitrou/ARROW-8989-doc-compute-functions
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Wes McKinney <[email protected]>
---
c_glib/arrow-glib/compute.cpp | 5 +-
cpp/src/arrow/array/validate.cc | 7 +-
cpp/src/arrow/compute/api.h | 4 +
cpp/src/arrow/compute/api_aggregate.h | 61 +--
cpp/src/arrow/compute/api_scalar.h | 97 ++--
cpp/src/arrow/compute/api_vector.h | 37 +-
cpp/src/arrow/compute/cast.cc | 2 +-
cpp/src/arrow/compute/cast.h | 5 +
cpp/src/arrow/compute/exec.h | 14 +-
cpp/src/arrow/compute/function.h | 6 +
cpp/src/arrow/compute/kernels/aggregate_basic.cc | 2 +-
cpp/src/arrow/compute/kernels/aggregate_test.cc | 2 +-
cpp/src/arrow/compute/kernels/scalar_arithmetic.cc | 28 +-
.../compute/kernels/scalar_arithmetic_test.cc | 47 +-
cpp/src/arrow/compute/registry.h | 2 +-
cpp/src/arrow/scalar.h | 40 +-
cpp/src/arrow/util/int_util.h | 33 +-
cpp/src/parquet/column_reader.cc | 7 +-
docs/source/conf.py | 7 +-
docs/source/cpp/api.rst | 2 +
.../cpp/{getting_started.rst => api/compute.rst} | 59 ++-
docs/source/cpp/compute.rst | 526 +++++++++++++++++++++
docs/source/cpp/getting_started.rst | 1 +
docs/source/python/api/arrays.rst | 71 +--
docs/source/python/dataset.rst | 4 +-
25 files changed, 883 insertions(+), 186 deletions(-)
diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp
index d8d0bdc..3e31899 100644
--- a/c_glib/arrow-glib/compute.cpp
+++ b/c_glib/arrow-glib/compute.cpp
@@ -676,7 +676,7 @@ garrow_count_options_set_property(GObject *object,
switch (prop_id) {
case PROP_MODE:
priv->options.count_mode =
- static_cast<arrow::compute::CountOptions::mode>(g_value_get_enum(value));
+ static_cast<arrow::compute::CountOptions::Mode>(g_value_get_enum(value));
break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
@@ -706,7 +706,8 @@ static void
garrow_count_options_init(GArrowCountOptions *object)
{
auto priv = GARROW_COUNT_OPTIONS_GET_PRIVATE(object);
- new(&priv->options) arrow::compute::CountOptions(arrow::compute::CountOptions::COUNT_ALL);
+ new(&priv->options) arrow::compute::CountOptions(
+ arrow::compute::CountOptions::COUNT_NON_NULL);
}
static void
diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc
index 3dd0ffd..8fb8b59 100644
--- a/cpp/src/arrow/array/validate.cc
+++ b/cpp/src/arrow/array/validate.cc
@@ -98,7 +98,7 @@ struct ValidateArrayVisitor {
if (value_size < 0) {
return Status::Invalid("FixedSizeListArray has negative value size ",
value_size);
}
- if (HasMultiplyOverflow(len, value_size) ||
+ if (HasPositiveMultiplyOverflow(len, value_size) ||
array.values()->length() != len * value_size) {
return Status::Invalid("Values Length (", array.values()->length(),
") is not equal to the length (", len,
@@ -329,7 +329,7 @@ Status ValidateArray(const Array& array) {
type.ToString(), ", got ", data.buffers.size());
}
// This check is required to avoid addition overflow below
- if (HasAdditionOverflow(array.length(), array.offset())) {
+ if (HasPositiveAdditionOverflow(array.length(), array.offset())) {
return Status::Invalid("Array of type ", type.ToString(),
" has impossibly large length and offset");
}
@@ -346,7 +346,8 @@ Status ValidateArray(const Array& array) {
min_buffer_size = BitUtil::BytesForBits(array.length() + array.offset());
break;
case DataTypeLayout::FIXED_WIDTH:
- if (HasMultiplyOverflow(array.length() + array.offset(), spec.byte_width)) {
+ if (HasPositiveMultiplyOverflow(array.length() + array.offset(),
+ spec.byte_width)) {
return Status::Invalid("Array of type ", type.ToString(),
" has impossibly large length and offset");
}
diff --git a/cpp/src/arrow/compute/api.h b/cpp/src/arrow/compute/api.h
index 3fc6e22..a890cd3 100644
--- a/cpp/src/arrow/compute/api.h
+++ b/cpp/src/arrow/compute/api.h
@@ -20,6 +20,10 @@
#pragma once
+/// \defgroup compute-concrete-options Concrete option classes for compute functions
+/// @{
+/// @}
+
#include "arrow/compute/api_aggregate.h" // IWYU pragma: export
#include "arrow/compute/api_scalar.h" // IWYU pragma: export
#include "arrow/compute/api_vector.h" // IWYU pragma: export
diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h
index 82a4ebf..72b3108 100644
--- a/cpp/src/arrow/compute/api_aggregate.h
+++ b/cpp/src/arrow/compute/api_aggregate.h
@@ -37,25 +37,47 @@ class ExecContext;
// ----------------------------------------------------------------------
// Aggregate functions
-/// \class CountOptions
+/// \addtogroup compute-concrete-options
+/// @{
+
+/// \brief Control Count kernel behavior
///
-/// The user control the Count kernel behavior with this class. By default, the
-/// it will count all non-null values.
+/// By default, all non-null values are counted.
struct ARROW_EXPORT CountOptions : public FunctionOptions {
- enum mode {
- // Count all non-null values.
- COUNT_ALL = 0,
- // Count all null values.
+ enum Mode {
+ /// Count all non-null values.
+ COUNT_NON_NULL = 0,
+ /// Count all null values.
COUNT_NULL,
};
- explicit CountOptions(enum mode count_mode) : count_mode(count_mode) {}
+ explicit CountOptions(enum Mode count_mode) : count_mode(count_mode) {}
+
+ static CountOptions Defaults() { return CountOptions(COUNT_NON_NULL); }
+
+ enum Mode count_mode = COUNT_NON_NULL;
+};
+
+/// \brief Control MinMax kernel behavior
+///
+/// By default, null values are ignored
+struct ARROW_EXPORT MinMaxOptions : public FunctionOptions {
+ enum Mode {
+ /// Skip null values
+ SKIP = 0,
+ /// Any nulls will result in null output
+ OUTPUT_NULL
+ };
+
+ explicit MinMaxOptions(enum Mode null_handling = SKIP) : null_handling(null_handling) {}
- static CountOptions Defaults() { return CountOptions(COUNT_ALL); }
+ static MinMaxOptions Defaults() { return MinMaxOptions{}; }
- enum mode count_mode = COUNT_ALL;
+ enum Mode null_handling = SKIP;
};
+/// @}
+
/// \brief Count non-null (or null) values in an array.
///
/// \param[in] options counting options, see CountOptions for more information
@@ -91,25 +113,6 @@ Result<Datum> Mean(const Datum& value, ExecContext* ctx =
NULLPTR);
ARROW_EXPORT
Result<Datum> Sum(const Datum& value, ExecContext* ctx = NULLPTR);
-/// \class MinMaxOptions
-///
-/// The user can control the MinMax kernel behavior with this class. By default,
-/// it will skip null if there is a null value present.
-struct ARROW_EXPORT MinMaxOptions : public FunctionOptions {
- enum mode {
- /// skip null values
- SKIP = 0,
- /// any nulls will result in null output
- OUTPUT_NULL
- };
-
- explicit MinMaxOptions(enum mode null_handling = SKIP) : null_handling(null_handling) {}
-
- static MinMaxOptions Defaults() { return MinMaxOptions{}; }
-
- enum mode null_handling = SKIP;
-};
-
/// \brief Calculate the min / max of a numeric array
///
/// This function returns both the min and max as a struct scalar, with type
diff --git a/cpp/src/arrow/compute/api_scalar.h
b/cpp/src/arrow/compute/api_scalar.h
index 858e1ff..1d8ef09 100644
--- a/cpp/src/arrow/compute/api_scalar.h
+++ b/cpp/src/arrow/compute/api_scalar.h
@@ -33,13 +33,64 @@
namespace arrow {
namespace compute {
-// ----------------------------------------------------------------------
+/// \addtogroup compute-concrete-options
+///
+/// @{
struct ArithmeticOptions : public FunctionOptions {
ArithmeticOptions() : check_overflow(false) {}
bool check_overflow;
};
+struct ARROW_EXPORT BinaryContainsExactOptions : public FunctionOptions {
+ explicit BinaryContainsExactOptions(std::string pattern)
+ : pattern(std::move(pattern)) {}
+
+ /// The exact pattern to look for inside input values.
+ std::string pattern;
+};
+
+/// Options for IsIn and Match functions
+struct ARROW_EXPORT SetLookupOptions : public FunctionOptions {
+ explicit SetLookupOptions(Datum value_set, bool skip_nulls)
+ : value_set(std::move(value_set)), skip_nulls(skip_nulls) {}
+
+ /// The set of values to look up input values into.
+ Datum value_set;
+ /// Whether nulls in `value_set` count for lookup.
+ ///
+ /// If true, any null in `value_set` is ignored and nulls in the input
+ /// produce null (Match) or false (IsIn) values in the output.
+ /// If false, any null in `value_set` is successfully matched in
+ /// the input.
+ bool skip_nulls;
+};
+
+struct ARROW_EXPORT StrptimeOptions : public FunctionOptions {
+ explicit StrptimeOptions(std::string format, TimeUnit::type unit)
+ : format(format), unit(unit) {}
+
+ std::string format;
+ TimeUnit::type unit;
+};
+
+enum CompareOperator : int8_t {
+ EQUAL,
+ NOT_EQUAL,
+ GREATER,
+ GREATER_EQUAL,
+ LESS,
+ LESS_EQUAL,
+};
+
+struct CompareOptions : public FunctionOptions {
+ explicit CompareOptions(CompareOperator op) : op(op) {}
+
+ enum CompareOperator op;
+};
+
+/// @}
+
/// \brief Add two values together. Array values must be the same length. If
/// either addend is null the result will be null.
///
@@ -79,21 +130,6 @@ Result<Datum> Multiply(const Datum& left, const Datum&
right,
ArithmeticOptions options = ArithmeticOptions(),
ExecContext* ctx = NULLPTR);
-enum CompareOperator {
- EQUAL,
- NOT_EQUAL,
- GREATER,
- GREATER_EQUAL,
- LESS,
- LESS_EQUAL,
-};
-
-struct CompareOptions : public FunctionOptions {
- explicit CompareOptions(CompareOperator op) : op(op) {}
-
- enum CompareOperator op;
-};
-
/// \brief Compare a numeric array with a scalar.
///
/// \param[in] left datum to compare, must be an Array
@@ -185,15 +221,6 @@ Result<Datum> KleeneOr(const Datum& left, const Datum&
right, ExecContext* ctx =
ARROW_EXPORT
Result<Datum> Xor(const Datum& left, const Datum& right, ExecContext* ctx =
NULLPTR);
-/// For set lookup operations like IsIn, Match
-struct ARROW_EXPORT SetLookupOptions : public FunctionOptions {
- explicit SetLookupOptions(Datum value_set, bool skip_nulls)
- : value_set(std::move(value_set)), skip_nulls(skip_nulls) {}
-
- Datum value_set;
- bool skip_nulls;
-};
-
/// \brief IsIn returns true for each element of `values` that is contained in
/// `value_set`
///
@@ -274,25 +301,5 @@ ARROW_EXPORT
Result<Datum> FillNull(const Datum& values, const Datum& fill_value,
ExecContext* ctx = NULLPTR);
-// ----------------------------------------------------------------------
-// String functions
-
-struct ARROW_EXPORT BinaryContainsExactOptions : public FunctionOptions {
- explicit BinaryContainsExactOptions(std::string pattern) : pattern(pattern) {}
-
- std::string pattern;
-};
-
-// ----------------------------------------------------------------------
-// Temporal functions
-
-struct ARROW_EXPORT StrptimeOptions : public FunctionOptions {
- explicit StrptimeOptions(std::string format, TimeUnit::type unit)
- : format(format), unit(unit) {}
-
- std::string format;
- TimeUnit::type unit;
-};
-
} // namespace compute
} // namespace arrow
diff --git a/cpp/src/arrow/compute/api_vector.h
b/cpp/src/arrow/compute/api_vector.h
index 28812c3..c3e9dc9 100644
--- a/cpp/src/arrow/compute/api_vector.h
+++ b/cpp/src/arrow/compute/api_vector.h
@@ -29,6 +29,9 @@ namespace compute {
class ExecContext;
+/// \addtogroup compute-concrete-options
+/// @{
+
struct FilterOptions : public FunctionOptions {
/// Configure the action taken when a slot of the selection mask is null
enum NullSelectionBehavior {
@@ -46,6 +49,25 @@ struct FilterOptions : public FunctionOptions {
NullSelectionBehavior null_selection_behavior = DROP;
};
+struct ARROW_EXPORT TakeOptions : public FunctionOptions {
+ explicit TakeOptions(bool boundscheck = true) : boundscheck(boundscheck) {}
+
+ bool boundscheck = true;
+ static TakeOptions BoundsCheck() { return TakeOptions(true); }
+ static TakeOptions NoBoundsCheck() { return TakeOptions(false); }
+ static TakeOptions Defaults() { return BoundsCheck(); }
+};
+
+/// \brief Partitioning options for NthToIndices
+struct PartitionOptions : public FunctionOptions {
+ explicit PartitionOptions(int64_t pivot) : pivot(pivot) {}
+
+ /// The index into the equivalent sorted array of the partition pivot element.
+ int64_t pivot;
+};
+
+/// @}
+
/// \brief Filter with a boolean selection filter
///
/// The output will be populated with values from the input at positions
@@ -85,15 +107,6 @@ Result<std::shared_ptr<ArrayData>> GetTakeIndices(
} // namespace internal
-struct ARROW_EXPORT TakeOptions : public FunctionOptions {
- explicit TakeOptions(bool boundscheck = true) : boundscheck(boundscheck) {}
-
- bool boundscheck = true;
- static TakeOptions BoundsCheck() { return TakeOptions(true); }
- static TakeOptions NoBoundsCheck() { return TakeOptions(false); }
- static TakeOptions Defaults() { return BoundsCheck(); }
-};
-
/// \brief Take from an array of values at indices in another array
///
/// The output array will be of the same type as the input values
@@ -121,11 +134,6 @@ Result<std::shared_ptr<Array>> Take(const Array& values,
const Array& indices,
const TakeOptions& options =
TakeOptions::Defaults(),
ExecContext* ctx = NULLPTR);
-struct PartitionOptions : public FunctionOptions {
- explicit PartitionOptions(int64_t pivot) : pivot(pivot) {}
- int64_t pivot;
-};
-
/// \brief Returns indices that partition an array around n-th
/// sorted element.
///
@@ -178,6 +186,7 @@ ARROW_EXPORT extern const char kValuesFieldName[];
ARROW_EXPORT extern const char kCountsFieldName[];
ARROW_EXPORT extern const int32_t kValuesFieldIndex;
ARROW_EXPORT extern const int32_t kCountsFieldIndex;
+
/// \brief Return counts of unique elements from an array-like object.
///
/// Note that the counts do not include counts for nulls in the array. These
can be
diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc
index 9c8ea66..211e5a2 100644
--- a/cpp/src/arrow/compute/cast.cc
+++ b/cpp/src/arrow/compute/cast.cc
@@ -136,7 +136,7 @@ Result<const ScalarKernel*> CastFunction::DispatchExact(
// Validate arity
if (passed_num_args != 1) {
- return Status::Invalid("Cast sunctions accept 1 argument but passed ",
+ return Status::Invalid("Cast functions accept 1 argument but passed ",
passed_num_args);
}
std::vector<const ScalarKernel*> candidate_kernels;
diff --git a/cpp/src/arrow/compute/cast.h b/cpp/src/arrow/compute/cast.h
index 907eef3..82dd357 100644
--- a/cpp/src/arrow/compute/cast.h
+++ b/cpp/src/arrow/compute/cast.h
@@ -38,6 +38,9 @@ namespace compute {
class ExecContext;
+/// \addtogroup compute-concrete-options
+/// @{
+
struct ARROW_EXPORT CastOptions : public FunctionOptions {
CastOptions()
: allow_int_overflow(false),
@@ -73,6 +76,8 @@ struct ARROW_EXPORT CastOptions : public FunctionOptions {
bool allow_invalid_utf8;
};
+/// @}
+
// Cast functions are _not_ registered in the FunctionRegistry, though they use
// the same execution machinery
class CastFunction : public ScalarFunction {
diff --git a/cpp/src/arrow/compute/exec.h b/cpp/src/arrow/compute/exec.h
index aae37c7..142e149 100644
--- a/cpp/src/arrow/compute/exec.h
+++ b/cpp/src/arrow/compute/exec.h
@@ -211,18 +211,26 @@ struct ExecBatch {
}
};
-/// \brief One-shot invoker for all types of functions. Does kernel dispatch,
-/// argument checking, iteration of ChunkedArray inputs, and wrapping of
-/// outputs
+/// \defgroup compute-call-function One-shot calls to compute functions
+///
+/// @{
+
+/// \brief One-shot invoker for all types of functions.
+///
+/// Does kernel dispatch, argument checking, iteration of ChunkedArray inputs,
+/// and wrapping of outputs.
ARROW_EXPORT
Result<Datum> CallFunction(const std::string& func_name, const
std::vector<Datum>& args,
const FunctionOptions* options, ExecContext* ctx =
NULLPTR);
/// \brief Variant of CallFunction which uses a function's default options.
+///
/// NB: Some functions require FunctionOptions be provided.
ARROW_EXPORT
Result<Datum> CallFunction(const std::string& func_name, const
std::vector<Datum>& args,
ExecContext* ctx = NULLPTR);
+/// @}
+
} // namespace compute
} // namespace arrow
diff --git a/cpp/src/arrow/compute/function.h b/cpp/src/arrow/compute/function.h
index 67af4df..93a200e 100644
--- a/cpp/src/arrow/compute/function.h
+++ b/cpp/src/arrow/compute/function.h
@@ -35,6 +35,10 @@
namespace arrow {
namespace compute {
+/// \defgroup compute-functions Abstract compute function API
+///
+/// @{
+
/// \brief Base class for specifying options configuring a function's behavior,
/// such as error handling.
struct ARROW_EXPORT FunctionOptions {};
@@ -277,5 +281,7 @@ class ARROW_EXPORT MetaFunction : public Function {
: Function(std::move(name), Function::META, arity, default_options) {}
};
+/// @}
+
} // namespace compute
} // namespace arrow
diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc
b/cpp/src/arrow/compute/kernels/aggregate_basic.cc
index 53e89ce..8765914 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc
@@ -61,7 +61,7 @@ struct CountImpl : public ScalarAggregator {
void Finalize(KernelContext* ctx, Datum* out) override {
const auto& state = checked_cast<const CountImpl&>(*ctx->state());
switch (state.options.count_mode) {
- case CountOptions::COUNT_ALL:
+ case CountOptions::COUNT_NON_NULL:
*out = Datum(state.non_nulls);
break;
case CountOptions::COUNT_NULL:
diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc
b/cpp/src/arrow/compute/kernels/aggregate_test.cc
index 3b2d4e0..db548f2 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_test.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc
@@ -277,7 +277,7 @@ static CountPair NaiveCount(const Array& array) {
}
void ValidateCount(const Array& input, CountPair expected) {
- CountOptions all = CountOptions(CountOptions::COUNT_ALL);
+ CountOptions all = CountOptions(CountOptions::COUNT_NON_NULL);
CountOptions nulls = CountOptions(CountOptions::COUNT_NULL);
ASSERT_OK_AND_ASSIGN(Datum result, Count(input, all));
diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
index 82a8f15..1f0cd37 100644
--- a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
@@ -17,6 +17,7 @@
#include "arrow/compute/kernels/common.h"
#include "arrow/util/int_util.h"
+#include "arrow/util/macros.h"
#ifndef __has_builtin
#define __has_builtin(x) 0
@@ -66,7 +67,7 @@ struct Add {
template <typename T>
static constexpr enable_if_signed_integer<T> Call(KernelContext*, T left, T
right) {
- return to_unsigned(left) + to_unsigned(right);
+ return arrow::internal::SafeSignedAdd(left, right);
}
};
@@ -75,7 +76,7 @@ struct AddChecked {
template <typename T>
static enable_if_integer<T> Call(KernelContext* ctx, T left, T right) {
T result;
- if (__builtin_add_overflow(left, right, &result)) {
+ if (ARROW_PREDICT_FALSE(__builtin_add_overflow(left, right, &result))) {
ctx->SetStatus(Status::Invalid("overflow"));
}
return result;
@@ -83,7 +84,7 @@ struct AddChecked {
#else
template <typename T>
static enable_if_unsigned_integer<T> Call(KernelContext* ctx, T left, T
right) {
- if (arrow::internal::HasAdditionOverflow(left, right)) {
+ if (ARROW_PREDICT_FALSE(arrow::internal::HasPositiveAdditionOverflow(left, right))) {
ctx->SetStatus(Status::Invalid("overflow"));
}
return left + right;
@@ -91,12 +92,10 @@ struct AddChecked {
template <typename T>
static enable_if_signed_integer<T> Call(KernelContext* ctx, T left, T right)
{
- auto unsigned_left = to_unsigned(left);
- auto unsigned_right = to_unsigned(right);
- if (arrow::internal::HasAdditionOverflow(unsigned_left, unsigned_right)) {
+ if (ARROW_PREDICT_FALSE(arrow::internal::HasSignedAdditionOverflow(left, right))) {
ctx->SetStatus(Status::Invalid("overflow"));
}
- return unsigned_left + unsigned_right;
+ return left + right;
}
#endif
@@ -119,7 +118,7 @@ struct Subtract {
template <typename T>
static constexpr enable_if_signed_integer<T> Call(KernelContext*, T left, T
right) {
- return to_unsigned(left) - to_unsigned(right);
+ return arrow::internal::SafeSignedSubtract(left, right);
}
};
@@ -128,7 +127,7 @@ struct SubtractChecked {
template <typename T>
static enable_if_integer<T> Call(KernelContext* ctx, T left, T right) {
T result;
- if (__builtin_sub_overflow(left, right, &result)) {
+ if (ARROW_PREDICT_FALSE(__builtin_sub_overflow(left, right, &result))) {
ctx->SetStatus(Status::Invalid("overflow"));
}
return result;
@@ -136,7 +135,8 @@ struct SubtractChecked {
#else
template <typename T>
static enable_if_unsigned_integer<T> Call(KernelContext* ctx, T left, T
right) {
- if (arrow::internal::HasSubtractionOverflow(left, right)) {
+ if (ARROW_PREDICT_FALSE(
+ arrow::internal::HasPositiveSubtractionOverflow(left, right))) {
ctx->SetStatus(Status::Invalid("overflow"));
}
return left - right;
@@ -144,10 +144,10 @@ struct SubtractChecked {
template <typename T>
static enable_if_signed_integer<T> Call(KernelContext* ctx, T left, T right)
{
- if (arrow::internal::HasSubtractionOverflow(left, right)) {
+ if (ARROW_PREDICT_FALSE(arrow::internal::HasSignedSubtractionOverflow(left, right))) {
ctx->SetStatus(Status::Invalid("overflow"));
}
- return to_unsigned(left) - to_unsigned(right);
+ return left - right;
}
#endif
@@ -201,12 +201,12 @@ struct MultiplyChecked {
static enable_if_integer<T> Call(KernelContext* ctx, T left, T right) {
T result;
#if __has_builtin(__builtin_mul_overflow)
- if (__builtin_mul_overflow(left, right, &result)) {
+ if (ARROW_PREDICT_FALSE(__builtin_mul_overflow(left, right, &result))) {
ctx->SetStatus(Status::Invalid("overflow"));
}
#else
result = Multiply::Call(ctx, left, right);
- if (left != 0 && result / left != right) {
+ if (left != 0 && ARROW_PREDICT_FALSE(result / left != right)) {
ctx->SetStatus(Status::Invalid("overflow"));
}
#endif
diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc
b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc
index e0f4890..ceb4623 100644
--- a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc
@@ -235,9 +235,6 @@ TYPED_TEST(TestBinaryArithmeticSigned, OverflowWraps) {
auto min = std::numeric_limits<CType>::lowest();
auto max = std::numeric_limits<CType>::max();
- this->AssertBinop(Add, MakeArray(min, max, max), MakeArray(CType(-1), 1, max),
- MakeArray(max, min, CType(-2)));
-
this->AssertBinop(Subtract, MakeArray(min, max, min), MakeArray(1, max, max),
MakeArray(max, 0, 1));
this->AssertBinop(Multiply, MakeArray(min, max, max), MakeArray(max, 2, max),
@@ -261,7 +258,41 @@ TYPED_TEST(TestBinaryArithmeticIntegral, OverflowRaises) {
"overflow");
}
-TYPED_TEST(TestBinaryArithmeticSigned, OverflowRaises) {
+TYPED_TEST(TestBinaryArithmeticSigned, AddOverflowRaises) {
+ using CType = typename TestFixture::CType;
+
+ auto min = std::numeric_limits<CType>::lowest();
+ auto max = std::numeric_limits<CType>::max();
+
+ this->SetOverflowCheck(true);
+
+ this->AssertBinop(Add, MakeArray(max), MakeArray(-1), MakeArray(max - 1));
+ this->AssertBinop(Add, MakeArray(min), MakeArray(1), MakeArray(min + 1));
+ this->AssertBinop(Add, MakeArray(-1), MakeArray(2), MakeArray(1));
+ this->AssertBinop(Add, MakeArray(1), MakeArray(-2), MakeArray(-1));
+
+ this->AssertBinopRaises(Add, MakeArray(max), MakeArray(1), "overflow");
+ this->AssertBinopRaises(Add, MakeArray(min), MakeArray(-1), "overflow");
+}
+
+TYPED_TEST(TestBinaryArithmeticSigned, SubOverflowRaises) {
+ using CType = typename TestFixture::CType;
+
+ auto min = std::numeric_limits<CType>::lowest();
+ auto max = std::numeric_limits<CType>::max();
+
+ this->SetOverflowCheck(true);
+
+ this->AssertBinop(Subtract, MakeArray(max), MakeArray(1), MakeArray(max - 1));
+ this->AssertBinop(Subtract, MakeArray(min), MakeArray(-1), MakeArray(min + 1));
+ this->AssertBinop(Subtract, MakeArray(-1), MakeArray(-2), MakeArray(1));
+ this->AssertBinop(Subtract, MakeArray(1), MakeArray(2), MakeArray(-1));
+
+ this->AssertBinopRaises(Subtract, MakeArray(max), MakeArray(-1), "overflow");
+ this->AssertBinopRaises(Subtract, MakeArray(min), MakeArray(1), "overflow");
+}
+
+TYPED_TEST(TestBinaryArithmeticSigned, MulOverflowRaises) {
using CType = typename TestFixture::CType;
auto min = std::numeric_limits<CType>::lowest();
@@ -270,8 +301,16 @@ TYPED_TEST(TestBinaryArithmeticSigned, OverflowRaises) {
this->SetOverflowCheck(true);
this->AssertBinop(Multiply, MakeArray(max), MakeArray(-1), MakeArray(min + 1));
+ this->AssertBinop(Multiply, MakeArray(max / 2), MakeArray(-2), MakeArray(min + 2));
+
this->AssertBinopRaises(Multiply, MakeArray(max), MakeArray(2), "overflow");
+ this->AssertBinopRaises(Multiply, MakeArray(max / 2), MakeArray(3), "overflow");
+ this->AssertBinopRaises(Multiply, MakeArray(max / 2), MakeArray(-3), "overflow");
+
+ this->AssertBinopRaises(Multiply, MakeArray(min), MakeArray(2), "overflow");
+ this->AssertBinopRaises(Multiply, MakeArray(min / 2), MakeArray(3), "overflow");
this->AssertBinopRaises(Multiply, MakeArray(min), MakeArray(-1), "overflow");
+ this->AssertBinopRaises(Multiply, MakeArray(min / 2), MakeArray(-2), "overflow");
}
TYPED_TEST(TestBinaryArithmeticUnsigned, OverflowWraps) {
diff --git a/cpp/src/arrow/compute/registry.h b/cpp/src/arrow/compute/registry.h
index bb3ded4..2d4c40b 100644
--- a/cpp/src/arrow/compute/registry.h
+++ b/cpp/src/arrow/compute/registry.h
@@ -72,7 +72,7 @@ class ARROW_EXPORT FunctionRegistry {
std::unique_ptr<FunctionRegistryImpl> impl_;
};
-// \brief Return the process-global function registry
+/// \brief Return the process-global function registry
ARROW_EXPORT FunctionRegistry* GetFunctionRegistry();
} // namespace compute
diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h
index 81516eb..1a079bb 100644
--- a/cpp/src/arrow/scalar.h
+++ b/cpp/src/arrow/scalar.h
@@ -42,8 +42,12 @@ namespace arrow {
class Array;
-/// \brief Base class for scalar values, representing a single value occupying
-/// an array "slot"
+/// \brief Base class for scalar values
+///
+/// A Scalar represents a single value with a specific DataType.
+/// Scalars are useful for passing single value inputs to compute functions,
+/// or for representing individual array elements (with a non-trivial
+/// wrapping cost, though).
struct ARROW_EXPORT Scalar : public util::EqualityComparable<Scalar> {
virtual ~Scalar() = default;
@@ -82,6 +86,10 @@ struct ARROW_EXPORT Scalar : public
util::EqualityComparable<Scalar> {
: type(std::move(type)), is_valid(is_valid) {}
};
+/// \defgroup concrete-scalar-classes Concrete Scalar subclasses
+///
+/// @{
+
/// \brief A scalar value for NullType. Never valid
struct ARROW_EXPORT NullScalar : public Scalar {
public:
@@ -90,6 +98,8 @@ struct ARROW_EXPORT NullScalar : public Scalar {
NullScalar() : Scalar{null(), false} {}
};
+/// @}
+
namespace internal {
struct ARROW_EXPORT PrimitiveScalarBase : public Scalar {
@@ -119,6 +129,10 @@ struct ARROW_EXPORT PrimitiveScalar : public
PrimitiveScalarBase {
} // namespace internal
+/// \addtogroup concrete-scalar-classes Concrete Scalar subclasses
+///
+/// @{
+
struct ARROW_EXPORT BooleanScalar : public
internal::PrimitiveScalar<BooleanType, bool> {
using Base = internal::PrimitiveScalar<BooleanType, bool>;
using Base::Base;
@@ -423,9 +437,18 @@ struct ARROW_EXPORT ExtensionScalar : public Scalar {
using TypeClass = ExtensionType;
};
+/// @}
+
+/// \defgroup scalar-factories Scalar factory functions
+///
+/// @{
+
+/// \brief Scalar factory for null scalars
ARROW_EXPORT
std::shared_ptr<Scalar> MakeNullScalar(std::shared_ptr<DataType> type);
+/// @}
+
namespace internal {
inline Status CheckBufferLength(...) { return Status::OK(); }
@@ -465,13 +488,22 @@ struct MakeScalarImpl {
std::shared_ptr<Scalar> out_;
};
+/// \addtogroup scalar-factories
+///
+/// @{
+
+/// \brief Scalar factory for non-null scalars
template <typename Value>
Result<std::shared_ptr<Scalar>> MakeScalar(std::shared_ptr<DataType> type,
Value&& value) {
return MakeScalarImpl<Value&&>{type, std::forward<Value>(value),
NULLPTR}.Finish();
}
-/// \brief type inferring scalar factory
+/// \brief Type-inferring scalar factory for non-null scalars
+///
+/// Construct a Scalar instance with a DataType determined by the input C++ type.
+/// (for example Int8Scalar for a int8_t input).
+/// Only non-parametric primitive types and String are supported.
template <typename Value, typename Traits = CTypeTraits<typename
std::decay<Value>::type>,
typename ScalarType = typename Traits::ScalarType,
typename Enable = decltype(ScalarType(std::declval<Value>(),
@@ -484,4 +516,6 @@ inline std::shared_ptr<Scalar> MakeScalar(std::string
value) {
return std::make_shared<StringScalar>(std::move(value));
}
+/// @}
+
} // namespace arrow
diff --git a/cpp/src/arrow/util/int_util.h b/cpp/src/arrow/util/int_util.h
index c4ed0eb..59e4f9c 100644
--- a/cpp/src/arrow/util/int_util.h
+++ b/cpp/src/arrow/util/int_util.h
@@ -83,6 +83,14 @@ SignedInt SafeSignedAdd(SignedInt u, SignedInt v) {
static_cast<UnsignedInt>(v));
}
+/// Signed subtraction with well-defined behaviour on overflow (as unsigned)
+template <typename SignedInt>
+SignedInt SafeSignedSubtract(SignedInt u, SignedInt v) {
+ using UnsignedInt = typename std::make_unsigned<SignedInt>::type;
+ return static_cast<SignedInt>(static_cast<UnsignedInt>(u) -
+ static_cast<UnsignedInt>(v));
+}
+
/// Signed left shift with well-defined behaviour on negative numbers or overflow
template <typename SignedInt, typename Shift>
SignedInt SafeLeftShift(SignedInt u, Shift shift) {
@@ -90,25 +98,42 @@ SignedInt SafeLeftShift(SignedInt u, Shift shift) {
return static_cast<SignedInt>(static_cast<UnsignedInt>(u) << shift);
}
+// TODO Add portable wrappers for __builtin_add_overflow and friends
+// see http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2428.pdf
+
/// Detect multiplication overflow between *positive* integers
template <typename Integer>
-bool HasMultiplyOverflow(Integer value, Integer multiplicand) {
+bool HasPositiveMultiplyOverflow(Integer value, Integer multiplicand) {
return (multiplicand != 0 &&
value > std::numeric_limits<Integer>::max() / multiplicand);
}
/// Detect addition overflow between *positive* integers
template <typename Integer>
-bool HasAdditionOverflow(Integer value, Integer addend) {
+bool HasPositiveAdditionOverflow(Integer value, Integer addend) {
return (value > std::numeric_limits<Integer>::max() - addend);
}
-/// Detect addition overflow between integers
+/// Detect addition overflow between signed integers
template <typename Integer>
-bool HasSubtractionOverflow(Integer value, Integer minuend) {
+bool HasSignedAdditionOverflow(Integer value, Integer addend) {
+ return (addend > 0) ? (value > std::numeric_limits<Integer>::max() - addend)
+ : (value < std::numeric_limits<Integer>::min() - addend);
+}
+
+/// Detect subtraction overflow between *positive* integers
+template <typename Integer>
+bool HasPositiveSubtractionOverflow(Integer value, Integer minuend) {
return (value < minuend);
}
+/// Detect subtraction overflow between signed integers
+template <typename Integer>
+bool HasSignedSubtractionOverflow(Integer value, Integer subtrahend) {
+ return (subtrahend > 0) ? (value < std::numeric_limits<Integer>::min() + subtrahend)
+ : (value > std::numeric_limits<Integer>::max() + subtrahend);
+}
+
/// Upcast an integer to the largest possible width (currently 64 bits)
template <typename Integer>
diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index 27a3a92..0bfc303 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -1028,7 +1028,7 @@ class TypedRecordReader : public
ColumnReaderImplBase<DType>,
// Compute the values capacity in bytes for the given number of elements
int64_t bytes_for_values(int64_t nitems) const {
int64_t type_size = GetTypeByteSize(this->descr_->physical_type());
- if (::arrow::internal::HasMultiplyOverflow(nitems, type_size)) {
+ if (::arrow::internal::HasPositiveMultiplyOverflow(nitems, type_size)) {
throw ParquetException("Total size of items too large");
}
return nitems * type_size;
@@ -1184,7 +1184,7 @@ class TypedRecordReader : public
ColumnReaderImplBase<DType>,
if (extra_size < 0) {
throw ParquetException("Negative size (corrupt file?)");
}
- if (::arrow::internal::HasAdditionOverflow(size, extra_size)) {
+ if (::arrow::internal::HasPositiveAdditionOverflow(size, extra_size)) {
throw ParquetException("Allocation size too large (corrupt file?)");
}
const int64_t target_size = size + extra_size;
@@ -1203,7 +1203,8 @@ class TypedRecordReader : public
ColumnReaderImplBase<DType>,
UpdateCapacity(levels_capacity_, levels_written_, extra_levels);
if (new_levels_capacity > levels_capacity_) {
constexpr auto kItemSize = static_cast<int64_t>(sizeof(int16_t));
- if (::arrow::internal::HasMultiplyOverflow(new_levels_capacity,
kItemSize)) {
+ if (::arrow::internal::HasPositiveMultiplyOverflow(new_levels_capacity,
+ kItemSize)) {
throw ParquetException("Allocation size too large (corrupt file?)");
}
PARQUET_THROW_NOT_OK(def_levels_->Resize(new_levels_capacity *
kItemSize, false));
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 2013a72..4508faa 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -74,6 +74,10 @@ autodoc_default_options = {
'inherited-members': None
}
+# Breathe configuration
+breathe_projects = {"arrow_cpp": "../../cpp/apidoc/xml"}
+breathe_default_project = "arrow_cpp"
+
# Overriden conditionally below
autodoc_mock_imports = []
@@ -86,9 +90,6 @@ napoleon_use_rtype = False
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
-breathe_projects = {"arrow_cpp": "../../cpp/apidoc/xml"}
-breathe_default_project = "arrow_cpp"
-
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
diff --git a/docs/source/cpp/api.rst b/docs/source/cpp/api.rst
index 9b7d356..59d2210 100644
--- a/docs/source/cpp/api.rst
+++ b/docs/source/cpp/api.rst
@@ -26,8 +26,10 @@ API Reference
api/memory
api/datatype
api/array
+ api/scalar
api/builder
api/table
+ api/compute
api/tensor
api/utilities
api/io
diff --git a/docs/source/cpp/getting_started.rst
b/docs/source/cpp/api/compute.rst
similarity index 53%
copy from docs/source/cpp/getting_started.rst
copy to docs/source/cpp/api/compute.rst
index 5ec0dec..3b0a89f 100644
--- a/docs/source/cpp/getting_started.rst
+++ b/docs/source/cpp/api/compute.rst
@@ -15,23 +15,42 @@
.. specific language governing permissions and limitations
.. under the License.
-.. default-domain:: cpp
-.. highlight:: cpp
-
-User Guide
-==========
-
-.. toctree::
-
- overview
- conventions
- cmake
- memory
- arrays
- datatypes
- tables
- io
- parquet
- csv
- json
- flight
+Compute Functions
+=================
+
+Datum class
+-----------
+
+.. doxygenclass:: arrow::Datum
+ :members:
+
+Abstract Function classes
+-------------------------
+
+.. doxygengroup:: compute-functions
+ :content-only:
+ :members:
+
+Function registry
+-----------------
+
+.. doxygenclass:: arrow::compute::FunctionRegistry
+ :members:
+
+.. doxygenfunction:: arrow::compute::GetFunctionRegistry
+
+Convenience functions
+---------------------
+
+.. doxygengroup:: compute-call-function
+ :content-only:
+
+Concrete options classes
+------------------------
+
+.. doxygengroup:: compute-concrete-options
+ :content-only:
+ :members:
+ :undoc-members:
+
+.. TODO: List concrete function invocation shortcuts?
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
new file mode 100644
index 0000000..149dbb3
--- /dev/null
+++ b/docs/source/cpp/compute.rst
@@ -0,0 +1,526 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+.. cpp:namespace:: arrow::compute
+
+=================
+Compute Functions
+=================
+
+The generic Compute API
+=======================
+
+.. TODO: describe API and how to invoke compute functions
+
+Functions and function registry
+-------------------------------
+
+Functions represent logical compute operations over inputs of possibly
+varying types. Internally, a function is implemented by one or several
+"kernels", depending on the concrete input types (for example, a function
+adding values from two inputs can have different kernels depending on
+whether the inputs are integral or floating-point).
+
+Functions are stored in a global :class:`FunctionRegistry` where
+they can be looked up by name.
+
+Input shapes
+------------
+
+Computation inputs are represented as a general :class:`Datum` class,
+which is a tagged union of several shapes of data such as :class:`Scalar`,
+:class:`Array` and :class:`ChunkedArray`. Many compute functions support
+both array (chunked or not) and scalar inputs, however some will mandate
+either. For example, the ``fill_null`` function requires its second input
+to be a scalar, while ``sort_indices`` requires its first and only input to
+be an array.
+
+Invoking functions
+------------------
+
+Compute functions can be invoked by name using
+:func:`arrow::compute::CallFunction`::
+
+ std::shared_ptr<arrow::Array> numbers_array = ...;
+ std::shared_ptr<arrow::Scalar> increment = ...;
+ arrow::Datum incremented_datum;
+
+ ARROW_ASSIGN_OR_RAISE(incremented_datum,
+ arrow::compute::CallFunction("add", {numbers_array,
increment}));
+ std::shared_ptr<Array> incremented_array =
std::move(incremented_datum).make_array();
+
+(note this example uses implicit conversion from ``std::shared_ptr<Array>``
+to ``Datum``)
+
+Many compute functions are also available directly as concrete APIs, here
+:func:`arrow::compute::Add`::
+
+ std::shared_ptr<arrow::Array> numbers_array = ...;
+ std::shared_ptr<arrow::Scalar> increment = ...;
+ arrow::Datum incremented_datum;
+
+ ARROW_ASSIGN_OR_RAISE(incremented_datum,
+ arrow::compute::Add(numbers_array, increment));
+ std::shared_ptr<Array> incremented_array =
std::move(incremented_datum).make_array();
+
+Some functions accept or require an options structure that determines the
+exact semantics of the function::
+
+ MinMaxOptions options;
+ options.null_handling = MinMaxOptions::OUTPUT_NULL;
+
+ std::shared_ptr<arrow::Array> array = ...;
+ arrow::Datum minmax_datum;
+
+ ARROW_ASSIGN_OR_RAISE(minmax_datum,
+ arrow::compute::CallFunction("minmax", {array},
&options));
+
+ // Unpack struct scalar result (a two-field {"min", "max"} scalar)
+ const auto& minmax_scalar = \
+ static_cast<const arrow::StructScalar&>(*minmax_datum.scalar());
+ const auto min_value = minmax_scalar.value[0];
+ const auto max_value = minmax_scalar.value[1];
+
+.. seealso::
+ :doc:`Compute API reference <api/compute>`
+
+
+Available functions
+===================
+
+Type categories
+---------------
+
+To avoid exhaustively listing supported types, the tables below use a number
+of general type categories:
+
+* "Numeric": Integer types (Int8, etc.) and Floating-point types (Float32,
+ Float64, sometimes Float16). Some functions also accept Decimal128 input.
+
+* "Temporal": Date types (Date32, Date64), Time types (Time32, Time64),
+ Timestamp, Duration, Interval.
+
+* "Binary-like": Binary, LargeBinary, sometimes also FixedSizeBinary.
+
+* "String-like": String, LargeString.
+
+* "List-like": List, LargeList, sometimes also FixedSizeList.
+
+If you are unsure whether a function supports a concrete input type, we
+recommend you try it out. Unsupported input types return a ``TypeError``
+:class:`Status`.
+
+Aggregations
+------------
+
++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
+| Function name | Arity | Input types | Output type
| Options class |
++==========================+============+====================+=======================+============================================+
+| count | Unary | Any | Scalar Int64
| :struct:`CountOptions` |
++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
+| mean | Unary | Numeric | Scalar Float64
| |
++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
+| minmax | Unary | Numeric | Scalar Struct
(1) | :struct:`MinMaxOptions` |
++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
+| sum | Unary | Numeric | Scalar Numeric
(2) | |
++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
+
+Notes:
+
+* \(1) Output is a ``{"min": input type, "max": input type}`` Struct
+
+* \(2) Output is Int64, UInt64 or Float64, depending on the input type
+
+
+Element-wise ("scalar") functions
+---------------------------------
+
+All element-wise functions accept both arrays and scalars as input. The
+semantics for unary functions are as follows:
+
+* scalar inputs produce a scalar output
+* array inputs produce an array output
+
+Binary functions have the following semantics (which is sometimes called
+"broadcasting" in other systems such as NumPy):
+
+* ``(scalar, scalar)`` inputs produce a scalar output
+* ``(array, array)`` inputs produce an array output (and both inputs must
+ be of the same length)
+* ``(scalar, array)`` and ``(array, scalar)`` produce an array output.
+ The scalar input is handled as if it were an array of the same length N
+ as the other input, with the same value repeated N times.
+
+Arithmetic functions
+~~~~~~~~~~~~~~~~~~~~
+
+These functions expect two inputs of the same type and apply a given binary
+operation to each pair of elements gathered from the inputs. If any of the
+input elements in a pair is null, the corresponding output element is null.
+
+The default variant of these functions does not detect overflow (the result
+then typically wraps around). Each function is also available in an
+overflow-checking variant, suffixed ``_checked``, which returns
+an ``Invalid`` :class:`Status` when overflow is detected.
+
++--------------------------+------------+--------------------+---------------------+
+| Function name | Arity | Input types | Output type
|
++==========================+============+====================+=====================+
+| add | Binary | Numeric | Numeric
|
++--------------------------+------------+--------------------+---------------------+
+| add_checked | Binary | Numeric | Numeric
|
++--------------------------+------------+--------------------+---------------------+
+| multiply | Binary | Numeric | Numeric
|
++--------------------------+------------+--------------------+---------------------+
+| multiply_checked | Binary | Numeric | Numeric
|
++--------------------------+------------+--------------------+---------------------+
+| subtract | Binary | Numeric | Numeric
|
++--------------------------+------------+--------------------+---------------------+
+| subtract_checked | Binary | Numeric | Numeric
|
++--------------------------+------------+--------------------+---------------------+
+
+Comparisons
+~~~~~~~~~~~
+
+Those functions expect two inputs of the same type and apply a given
+comparison operator. If any of the input elements in a pair is null,
+the corresponding output element is null.
+
++--------------------------+------------+---------------------------------------------+---------------------+
+| Function names | Arity | Input types
| Output type |
++==========================+============+=============================================+=====================+
+| equal, not_equal | Binary | Numeric, Temporal, Binary- and
String-like | Boolean |
++--------------------------+------------+---------------------------------------------+---------------------+
+| greater, greater_equal, | Binary | Numeric, Temporal, Binary- and
String-like | Boolean |
+| less, less_equal | |
| |
++--------------------------+------------+---------------------------------------------+---------------------+
+
+Logical functions
+~~~~~~~~~~~~~~~~~~
+
+The normal behaviour for these functions is to emit a null if any of the
+inputs is null (similar to the semantics of ``NaN`` in floating-point
+computations).
+
+Some of them are also available in a `Kleene logic`_ variant (suffixed
+``_kleene``) where null is taken to mean "undefined". This is the
+interpretation of null used in SQL systems as well as R and Julia,
+for example.
+
+For the Kleene logic variants, therefore:
+
+* "true AND null", "null AND true" give "null" (the result is undefined)
+* "true OR null", "null OR true" give "true"
+* "false AND null", "null AND false" give "false"
+* "false OR null", "null OR false" give "null" (the result is undefined)
+
++--------------------------+------------+--------------------+---------------------+
+| Function name | Arity | Input types | Output type
|
++==========================+============+====================+=====================+
+| and | Binary | Boolean | Boolean
|
++--------------------------+------------+--------------------+---------------------+
+| and_kleene | Binary | Boolean | Boolean
|
++--------------------------+------------+--------------------+---------------------+
+| invert | Unary | Boolean | Boolean
|
++--------------------------+------------+--------------------+---------------------+
+| or | Binary | Boolean | Boolean
|
++--------------------------+------------+--------------------+---------------------+
+| or_kleene | Binary | Boolean | Boolean
|
++--------------------------+------------+--------------------+---------------------+
+| xor | Binary | Boolean | Boolean
|
++--------------------------+------------+--------------------+---------------------+
+
+.. _Kleene logic:
https://en.wikipedia.org/wiki/Three-valued_logic#Kleene_and_Priest_logics
+
+String functions
+~~~~~~~~~~~~~~~~
+
++--------------------------+------------+--------------------+---------------------+---------+
+| Function name | Arity | Input types | Output type
| Notes |
++==========================+============+====================+=====================+=========+
+| ascii_length | Unary | String-like | Int32 or Int64
| \(1) |
++--------------------------+------------+--------------------+---------------------+---------+
+| ascii_lower | Unary | String-like | String-like
| \(2) |
++--------------------------+------------+--------------------+---------------------+---------+
+| ascii_upper | Unary | String-like | String-like
| \(2) |
++--------------------------+------------+--------------------+---------------------+---------+
+| utf8_lower | Unary | String-like | String-like
| \(3) |
++--------------------------+------------+--------------------+---------------------+---------+
+| utf8_upper | Unary | String-like | String-like
| \(3) |
++--------------------------+------------+--------------------+---------------------+---------+
+
+* \(1) Output is the physical length in bytes of each input element. Output
+ type is Int32 for String, Int64 for LargeString.
+
+* \(2) Each ASCII character in the input is converted to lowercase or
+ uppercase. Non-ASCII characters are left untouched.
+
+* \(3) Each UTF8-encoded character in the input is converted to lowercase or
+ uppercase.
+
+Containment tests
+~~~~~~~~~~~~~~~~~
+
++--------------------------+------------+------------------------------------+---------------+----------------------------------------+
+| Function name | Arity | Input types |
Output type | Options class |
++==========================+============+====================================+===============+========================================+
+| binary_contains_exact | Unary | String-like |
Boolean (1) | :struct:`BinaryContainsExactOptions` |
++--------------------------+------------+------------------------------------+---------------+----------------------------------------+
+| isin | Unary | Boolean, Null, Numeric, Temporal, |
Boolean (2) | :struct:`SetLookupOptions` |
+| | | Binary- and String-like |
| |
++--------------------------+------------+------------------------------------+---------------+----------------------------------------+
+| match | Unary | Boolean, Null, Numeric, Temporal, |
Int32 (3) | :struct:`SetLookupOptions` |
+| | | Binary- and String-like |
| |
++--------------------------+------------+------------------------------------+---------------+----------------------------------------+
+
+* \(1) Output is true iff :member:`BinaryContainsExactOptions::pattern`
+ is a substring of the corresponding input element.
+
+* \(2) Output is true iff the corresponding input element is equal to one
+ of the elements in :member:`SetLookupOptions::value_set`.
+
+* \(3) Output is the index of the corresponding input element in
+ :member:`SetLookupOptions::value_set`, if found there. Otherwise,
+ output is null.
+
+Structural transforms
+~~~~~~~~~~~~~~~~~~~~~
+
+.. XXX (this category is a bit of a hodgepodge)
+
++--------------------------+------------+---------------------------------------+---------------------+---------+
+| Function name | Arity | Input types
| Output type | Notes |
++==========================+============+=======================================+=====================+=========+
+| fill_null | Binary | Boolean, Null, Numeric, Temporal
| Input type | \(1) |
++--------------------------+------------+---------------------------------------+---------------------+---------+
+| is_null | Unary | Any
| Boolean | \(3) |
++--------------------------+------------+---------------------------------------+---------------------+---------+
+| is_valid | Unary | Any
| Boolean | \(2) |
++--------------------------+------------+---------------------------------------+---------------------+---------+
+| list_value_lengths | Unary | List-like
| Int32 or Int64 | \(4) |
++--------------------------+------------+---------------------------------------+---------------------+---------+
+
+* \(1) First input must be an array, second input a scalar of the same type.
+ Output is an array of the same type as the inputs, and with the same values
+ as the first input, except for nulls replaced with the second input value.
+
+* \(2) Output is true iff the corresponding input element is non-null.
+
+* \(3) Output is true iff the corresponding input element is null.
+
+* \(4) Each output element is the length of the corresponding input element
+ (null if input is null). Output type is Int32 for List, Int64 for LargeList.
+
+Conversions
+~~~~~~~~~~~
+
+A general conversion function named ``cast`` is provided which accepts a large
+number of input and output types. The type to cast to can be passed in a
+:struct:`CastOptions` instance. As an alternative, the same service is
+provided by a concrete function :func:`~arrow::compute::Cast`.
+
++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
+| Function name | Arity | Input types | Output type
| Options class |
++==========================+============+====================+=======================+============================================+
+| cast | Unary | Many | Variable
| :struct:`CastOptions` |
++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
+| strptime | Unary | String-like | Timestamp
| :struct:`StrptimeOptions` |
++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
+
+The conversions available with ``cast`` are listed below. In all cases, a
+null input value is converted into a null output value.
+
+**Truth value extraction**
+
++-----------------------------+------------------------------------+--------------+
+| Input type | Output type | Notes
|
++=============================+====================================+==============+
+| Binary- and String-like | Boolean | \(1)
|
++-----------------------------+------------------------------------+--------------+
+| Numeric | Boolean | \(2)
|
++-----------------------------+------------------------------------+--------------+
+
+* \(1) Output is true iff the corresponding input value has non-zero length.
+
+* \(2) Output is true iff the corresponding input value is non-zero.
+
+**Same-kind conversion**
+
++-----------------------------+------------------------------------+--------------+
+| Input type | Output type | Notes
|
++=============================+====================================+==============+
+| Int32 | 32-bit Temporal | \(1)
|
++-----------------------------+------------------------------------+--------------+
+| Int64 | 64-bit Temporal | \(1)
|
++-----------------------------+------------------------------------+--------------+
+| (Large)Binary | (Large)String | \(2)
|
++-----------------------------+------------------------------------+--------------+
+| (Large)String | (Large)Binary | \(3)
|
++-----------------------------+------------------------------------+--------------+
+| Numeric | Numeric | \(4) \(5)
|
++-----------------------------+------------------------------------+--------------+
+| 32-bit Temporal | Int32 | \(1)
|
++-----------------------------+------------------------------------+--------------+
+| 64-bit Temporal | Int64 | \(1)
|
++-----------------------------+------------------------------------+--------------+
+| Temporal | Temporal | \(4) \(5)
|
++-----------------------------+------------------------------------+--------------+
+
+* \(1) No-operation cast: the raw values are kept identical, only
+ the type is changed.
+
+* \(2) Validates the contents if :member:`CastOptions::allow_invalid_utf8`
+ is false.
+
+* \(3) No-operation cast: only the type is changed.
+
+* \(4) Overflow and truncation checks are enabled depending on
+ the given :struct:`CastOptions`.
+
+* \(5) Not all such casts have been implemented.
+
+**String representations**
+
++-----------------------------+------------------------------------+---------+
+| Input type | Output type | Notes |
++=============================+====================================+=========+
+| Boolean | String-like | |
++-----------------------------+------------------------------------+---------+
+| Numeric | String-like | |
++-----------------------------+------------------------------------+---------+
+
+**Generic conversions**
+
++-----------------------------+------------------------------------+---------+
+| Input type | Output type | Notes |
++=============================+====================================+=========+
+| Dictionary | Dictionary value type | \(1) |
++-----------------------------+------------------------------------+---------+
+| Extension | Extension storage type | |
++-----------------------------+------------------------------------+---------+
+| List-like | List-like | \(2) |
++-----------------------------+------------------------------------+---------+
+| Null | Any | |
++-----------------------------+------------------------------------+---------+
+
+* \(1) The dictionary indices are unchanged, the dictionary values are
+ cast from the input value type to the output value type (if a conversion
+ is available).
+
+* \(2) The list offsets are unchanged, the list values are cast from the
+ input value type to the output value type (if a conversion is
+ available).
+
+
+Array-wise ("vector") functions
+-------------------------------
+
+Associative transforms
+~~~~~~~~~~~~~~~~~~~~~~
+
++--------------------------+------------+------------------------------------+----------------------------+
+| Function name | Arity | Input types |
Output type |
++==========================+============+====================================+============================+
+| dictionary_encode | Unary | Boolean, Null, Numeric, |
Dictionary (1) |
+| | | Temporal, Binary- and String-like |
|
++--------------------------+------------+------------------------------------+----------------------------+
+| unique | Unary | Boolean, Null, Numeric, |
Input type (2) |
+| | | Temporal, Binary- and String-like |
|
++--------------------------+------------+------------------------------------+----------------------------+
+| value_counts | Unary | Boolean, Null, Numeric, |
Struct (3) |
+| | | Temporal, Binary- and String-like |
|
++--------------------------+------------+------------------------------------+----------------------------+
+
+* \(1) Output is ``Dictionary(Int32, input type)``.
+
+* \(2) Duplicates are removed from the output while the original order is
+ maintained.
+
+* \(3) Output is a ``{"values": input type, "counts": Int64}`` Struct.
+ Each output element corresponds to a unique value in the input, along
+ with the number of times this value has appeared.
+
+Selections
+~~~~~~~~~~
+
+These functions select a subset of the first input defined by the second input.
+
++-----------------+------------+---------------+--------------+------------------+-------------------------+-------------+
+| Function name | Arity | Input type 1 | Input type 2 | Output type
| Options class | Notes |
++=================+============+===============+==============+==================+=========================+=============+
+| filter | Binary | Any (1) | Boolean | Input type 1
| :struct:`FilterOptions` | \(2) |
++-----------------+------------+---------------+--------------+------------------+-------------------------+-------------+
+| take | Binary | Any (1) | Integer | Input type 1
| :struct:`TakeOptions` | \(3) |
++-----------------+------------+---------------+--------------+------------------+-------------------------+-------------+
+
+* \(1) Unions are unsupported.
+
+* \(2) Each element in input 1 is appended to the output iff the corresponding
+ element in input 2 is true.
+
+* \(3) For each element *i* in input 2, the *i*'th element in input 1 is
+ appended to the output.
+
+Sorts and partitions
+~~~~~~~~~~~~~~~~~~~~
+
+In these functions, nulls are considered greater than any other value
+(they will be sorted or partitioned at the end of the array).
+
++-----------------------+------------+-------------------------+-------------------+--------------------------------+-------------+
+| Function name | Arity | Input types | Output type
| Options class | Notes |
++=======================+============+=========================+===================+================================+=============+
+| partition_indices | Unary | Binary- and String-like | UInt64
| :struct:`PartitionOptions` | \(1) \(3) |
++-----------------------+------------+-------------------------+-------------------+--------------------------------+-------------+
+| partition_indices | Unary | Numeric | UInt64
| :struct:`PartitionOptions` | \(1) |
++-----------------------+------------+-------------------------+-------------------+--------------------------------+-------------+
+| sort_indices | Unary | Binary- and String-like | UInt64
| | \(2) \(3) |
++-----------------------+------------+-------------------------+-------------------+--------------------------------+-------------+
+| sort_indices | Unary | Numeric | UInt64
| | \(2) |
++-----------------------+------------+-------------------------+-------------------+--------------------------------+-------------+
+
+* \(1) The output is an array of indices into the input array, that define
+ a partition around the *N*'th input array element in sorted order. *N* is
+ given in :member:`PartitionOptions::pivot`.
+
+* \(2) The output is an array of indices into the input array, that define
+ a non-stable sort of the input array.
+
+* \(3) Input values are ordered lexicographically as bytestrings (even
+ for String arrays).
+
+
+Structural transforms
+~~~~~~~~~~~~~~~~~~~~~
+
++--------------------------+------------+--------------------+---------------------+---------+
+| Function name | Arity | Input types | Output type
| Notes |
++==========================+============+====================+=====================+=========+
+| list_flatten | Unary | List-like | List value type
| \(1) |
++--------------------------+------------+--------------------+---------------------+---------+
+| list_parent_indices | Unary | List-like | Int32 or Int64
| \(2) |
++--------------------------+------------+--------------------+---------------------+---------+
+
+* \(1) The top level of nesting is removed: all values in the list child array,
+ including nulls, are appended to the output. However, nulls in the parent
+ list array are discarded.
+
+* \(2) For each value in the list child array, the index at which it is found
+ in the list array is appended to the output. Nulls in the parent list array
+ are discarded.
diff --git a/docs/source/cpp/getting_started.rst
b/docs/source/cpp/getting_started.rst
index 5ec0dec..0927e7f 100644
--- a/docs/source/cpp/getting_started.rst
+++ b/docs/source/cpp/getting_started.rst
@@ -30,6 +30,7 @@ User Guide
arrays
datatypes
tables
+ compute
io
parquet
csv
diff --git a/docs/source/python/api/arrays.rst
b/docs/source/python/api/arrays.rst
index fa4902d..81a00d8 100644
--- a/docs/source/python/api/arrays.rst
+++ b/docs/source/python/api/arrays.rst
@@ -75,43 +75,48 @@ may expose data type-specific methods or properties.
.. _api.scalar:
-Array Scalars
--------------
+Scalars
+-------
-Indexing an array wraps the represented value in a scalar object whose
-concrete type depends on the array data type. You shouldn't instantiate
-any of those classes directly.
+This function constructs a new Arrow scalar:
+
+.. autosummary::
+ :toctree: ../generated/
+
+ scalar
+
+A scalar's Python class depends on its data type. Concrete scalar
+classes may expose data type-specific methods or properties.
.. autosummary::
:toctree: ../generated/
NA
Scalar
- ArrayValue
- BooleanValue
- Int8Value
- Int16Value
- Int32Value
- Int64Value
- UInt8Value
- UInt16Value
- UInt32Value
- UInt64Value
- FloatValue
- DoubleValue
- BinaryValue
- StringValue
- FixedSizeBinaryValue
- LargeBinaryValue
- LargeStringValue
- Time32Value
- Time64Value
- Date32Value
- Date64Value
- TimestampValue
- DecimalValue
- DictionaryValue
- ListValue
- LargeListValue
- StructValue
- UnionValue
+ BooleanScalar
+ Int8Scalar
+ Int16Scalar
+ Int32Scalar
+ Int64Scalar
+ UInt8Scalar
+ UInt16Scalar
+ UInt32Scalar
+ UInt64Scalar
+ FloatScalar
+ DoubleScalar
+ BinaryScalar
+ StringScalar
+ FixedSizeBinaryScalar
+ LargeBinaryScalar
+ LargeStringScalar
+ Time32Scalar
+ Time64Scalar
+ Date32Scalar
+ Date64Scalar
+ TimestampScalar
+ Decimal128Scalar
+ DictionaryScalar
+ ListScalar
+ LargeListScalar
+ StructScalar
+ UnionScalar
diff --git a/docs/source/python/dataset.rst b/docs/source/python/dataset.rst
index 33d3123..6c07ad9 100644
--- a/docs/source/python/dataset.rst
+++ b/docs/source/python/dataset.rst
@@ -329,8 +329,8 @@ Reading from Minio
------------------
In addition to cloud storage, pyarrow also supports reading from a
-`MinIO https://github.com/minio/minio`_ object storage instance emulating S3
-APIs. Paired with `toxiproxy https://github.com/shopify/toxiproxy`_, this is
+`MinIO <https://github.com/minio/minio>`_ object storage instance emulating S3
+APIs. Paired with `toxiproxy <https://github.com/shopify/toxiproxy>`_, this is
useful for testing or benchmarking.
.. code-block:: python