This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new a8d3fae21d Migrate documentation for Aggregate Functions to code
(#12861)
a8d3fae21d is described below
commit a8d3fae21d12a9caafad77a11b8a22bcfe9555fd
Author: Jonathan Chen <[email protected]>
AuthorDate: Fri Oct 11 09:58:41 2024 -0400
Migrate documentation for Aggregate Functions to code (#12861)
* aggregate function migration
* fmt fix
---
.../functions-aggregate/src/approx_distinct.rs | 35 +-
.../functions-aggregate/src/approx_median.rs | 35 +-
.../src/approx_percentile_cont.rs | 37 +-
.../src/approx_percentile_cont_with_weight.rs | 38 +-
datafusion/functions-aggregate/src/array_agg.rs | 34 +-
datafusion/functions-aggregate/src/average.rs | 36 +-
datafusion/functions-aggregate/src/bool_and_or.rs | 59 +-
datafusion/functions-aggregate/src/correlation.rs | 34 +-
datafusion/functions-aggregate/src/count.rs | 41 +-
datafusion/functions-aggregate/src/covariance.rs | 62 ++-
datafusion/functions-aggregate/src/first_last.rs | 63 ++-
datafusion/functions-aggregate/src/grouping.rs | 39 +-
datafusion/functions-aggregate/src/median.rs | 36 +-
datafusion/functions-aggregate/src/min_max.rs | 62 ++-
datafusion/functions-aggregate/src/nth_value.rs | 41 +-
datafusion/functions-aggregate/src/stddev.rs | 62 ++-
datafusion/functions-aggregate/src/string_agg.rs | 35 +-
datafusion/functions-aggregate/src/sum.rs | 34 +-
docs/source/user-guide/sql/aggregate_functions.md | 329 -----------
.../user-guide/sql/aggregate_functions_new.md | 612 +++++++++++++++++++++
20 files changed, 1363 insertions(+), 361 deletions(-)
diff --git a/datafusion/functions-aggregate/src/approx_distinct.rs
b/datafusion/functions-aggregate/src/approx_distinct.rs
index cf8217fe98..efa9a6d8da 100644
--- a/datafusion/functions-aggregate/src/approx_distinct.rs
+++ b/datafusion/functions-aggregate/src/approx_distinct.rs
@@ -31,13 +31,17 @@ use datafusion_common::ScalarValue;
use datafusion_common::{
downcast_value, internal_err, not_impl_err, DataFusionError, Result,
};
+use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE;
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::utils::format_state_name;
-use datafusion_expr::{Accumulator, AggregateUDFImpl, Signature, Volatility};
+use datafusion_expr::{
+ Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
+};
use std::any::Any;
use std::fmt::{Debug, Formatter};
use std::hash::Hash;
use std::marker::PhantomData;
+use std::sync::OnceLock;
make_udaf_expr_and_func!(
ApproxDistinct,
approx_distinct,
@@ -303,4 +307,33 @@ impl AggregateUDFImpl for ApproxDistinct {
};
Ok(accumulator)
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_approx_distinct_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_approx_distinct_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_APPROXIMATE)
+ .with_description(
+ "Returns the approximate number of distinct input values
calculated using the HyperLogLog algorithm.",
+ )
+ .with_syntax_example("approx_distinct(expression)")
+ .with_sql_example(r#"```sql
+> SELECT approx_distinct(column_name) FROM table_name;
++-----------------------------------+
+| approx_distinct(column_name) |
++-----------------------------------+
+| 42 |
++-----------------------------------+
+```"#,
+ )
+ .with_argument("expression", "Expression to operate on. Can be a
constant, column, or function, and any combination of arithmetic operators.")
+ .build()
+ .unwrap()
+ })
}
diff --git a/datafusion/functions-aggregate/src/approx_median.rs
b/datafusion/functions-aggregate/src/approx_median.rs
index 7a7b124325..dd5bb8d441 100644
--- a/datafusion/functions-aggregate/src/approx_median.rs
+++ b/datafusion/functions-aggregate/src/approx_median.rs
@@ -19,15 +19,19 @@
use std::any::Any;
use std::fmt::Debug;
+use std::sync::OnceLock;
use arrow::{datatypes::DataType, datatypes::Field};
use arrow_schema::DataType::{Float64, UInt64};
use datafusion_common::{not_impl_err, plan_err, Result};
+use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE;
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::type_coercion::aggregates::NUMERICS;
use datafusion_expr::utils::format_state_name;
-use datafusion_expr::{Accumulator, AggregateUDFImpl, Signature, Volatility};
+use datafusion_expr::{
+ Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
+};
use crate::approx_percentile_cont::ApproxPercentileAccumulator;
@@ -116,4 +120,33 @@ impl AggregateUDFImpl for ApproxMedian {
acc_args.exprs[0].data_type(acc_args.schema)?,
)))
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_approx_median_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_approx_median_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_APPROXIMATE)
+ .with_description(
+ "Returns the approximate median (50th percentile) of input
values. It is an alias of `approx_percentile_cont(x, 0.5)`.",
+ )
+ .with_syntax_example("approx_median(expression)")
+ .with_sql_example(r#"```sql
+> SELECT approx_median(column_name) FROM table_name;
++-----------------------------------+
+| approx_median(column_name) |
++-----------------------------------+
+| 23.5 |
++-----------------------------------+
+```"#,
+ )
+ .with_argument("expression", "Expression to operate on. Can be a
constant, column, or function, and any combination of arithmetic operators.")
+ .build()
+ .unwrap()
+ })
}
diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont.rs
b/datafusion/functions-aggregate/src/approx_percentile_cont.rs
index 5578aebbf4..b4488d6d9e 100644
--- a/datafusion/functions-aggregate/src/approx_percentile_cont.rs
+++ b/datafusion/functions-aggregate/src/approx_percentile_cont.rs
@@ -17,7 +17,7 @@
use std::any::Any;
use std::fmt::{Debug, Formatter};
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};
use arrow::array::{Array, RecordBatch};
use arrow::compute::{filter, is_not_null};
@@ -34,12 +34,13 @@ use datafusion_common::{
downcast_value, internal_err, not_impl_datafusion_err, not_impl_err,
plan_err,
DataFusionError, Result, ScalarValue,
};
+use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE;
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS};
use datafusion_expr::utils::format_state_name;
use datafusion_expr::{
- Accumulator, AggregateUDFImpl, ColumnarValue, Expr, Signature,
TypeSignature,
- Volatility,
+ Accumulator, AggregateUDFImpl, ColumnarValue, Documentation, Expr,
Signature,
+ TypeSignature, Volatility,
};
use datafusion_functions_aggregate_common::tdigest::{
TDigest, TryIntoF64, DEFAULT_MAX_SIZE,
@@ -268,6 +269,36 @@ impl AggregateUDFImpl for ApproxPercentileCont {
}
Ok(arg_types[0].clone())
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_approx_percentile_cont_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_approx_percentile_cont_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_APPROXIMATE)
+ .with_description(
+ "Returns the approximate percentile of input values using the
t-digest algorithm.",
+ )
+ .with_syntax_example("approx_percentile_cont(expression,
percentile, centroids)")
+ .with_sql_example(r#"```sql
+> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name;
++-------------------------------------------------+
+| approx_percentile_cont(column_name, 0.75, 100) |
++-------------------------------------------------+
+| 65.0 |
++-------------------------------------------------+
+```"#)
+ .with_argument("expression", "Expression to operate on. Can be a
constant, column, or function, and any combination of arithmetic operators.")
+ .with_argument("percentile", "Percentile to compute. Must be a
float value between 0 and 1 (inclusive).")
+ .with_argument("centroids", "Number of centroids to use in the
t-digest algorithm. _Default is 100_. A higher number results in more accurate
approximation but requires more memory.")
+ .build()
+ .unwrap()
+ })
}
#[derive(Debug)]
diff --git
a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs
b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs
index fee67ba162..8cbf9587a7 100644
--- a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs
+++ b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs
@@ -17,7 +17,7 @@
use std::any::Any;
use std::fmt::{Debug, Formatter};
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};
use arrow::{
array::ArrayRef,
@@ -26,10 +26,13 @@ use arrow::{
use datafusion_common::ScalarValue;
use datafusion_common::{not_impl_err, plan_err, Result};
+use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE;
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::type_coercion::aggregates::NUMERICS;
use datafusion_expr::Volatility::Immutable;
-use datafusion_expr::{Accumulator, AggregateUDFImpl, Signature, TypeSignature};
+use datafusion_expr::{
+ Accumulator, AggregateUDFImpl, Documentation, Signature, TypeSignature,
+};
use datafusion_functions_aggregate_common::tdigest::{
Centroid, TDigest, DEFAULT_MAX_SIZE,
};
@@ -151,6 +154,37 @@ impl AggregateUDFImpl for ApproxPercentileContWithWeight {
fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<Field>> {
self.approx_percentile_cont.state_fields(args)
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_approx_percentile_cont_with_weight_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_approx_percentile_cont_with_weight_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_APPROXIMATE)
+ .with_description(
+ "Returns the weighted approximate percentile of input values
using the t-digest algorithm.",
+ )
+
.with_syntax_example("approx_percentile_cont_with_weight(expression, weight,
percentile)")
+ .with_sql_example(r#"```sql
+> SELECT approx_percentile_cont_with_weight(column_name, weight_column, 0.90)
FROM table_name;
++----------------------------------------------------------------------+
+| approx_percentile_cont_with_weight(column_name, weight_column, 0.90) |
++----------------------------------------------------------------------+
+| 78.5 |
++----------------------------------------------------------------------+
+```"#,
+ )
+ .with_argument("expression", "Expression to operate on. Can be a
constant, column, or function, and any combination of arithmetic operators.")
+ .with_argument("weight", "Expression to use as weight. Can be a
constant, column, or function, and any combination of arithmetic operators.")
+ .with_argument("percentile", "Percentile to compute. Must be a
float value between 0 and 1 (inclusive).")
+ .build()
+ .unwrap()
+ })
}
#[derive(Debug)]
diff --git a/datafusion/functions-aggregate/src/array_agg.rs
b/datafusion/functions-aggregate/src/array_agg.rs
index 15146fc4a2..b44c4e6874 100644
--- a/datafusion/functions-aggregate/src/array_agg.rs
+++ b/datafusion/functions-aggregate/src/array_agg.rs
@@ -25,15 +25,16 @@ use datafusion_common::cast::as_list_array;
use datafusion_common::utils::{array_into_list_array_nullable, get_row_at_idx};
use datafusion_common::{exec_err, ScalarValue};
use datafusion_common::{internal_err, Result};
+use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL;
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::utils::format_state_name;
-use datafusion_expr::AggregateUDFImpl;
use datafusion_expr::{Accumulator, Signature, Volatility};
+use datafusion_expr::{AggregateUDFImpl, Documentation};
use datafusion_functions_aggregate_common::merge_arrays::merge_ordered_arrays;
use datafusion_functions_aggregate_common::utils::ordering_fields;
use datafusion_physical_expr_common::sort_expr::{LexOrdering,
PhysicalSortExpr};
use std::collections::{HashSet, VecDeque};
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};
make_udaf_expr_and_func!(
ArrayAgg,
@@ -142,6 +143,35 @@ impl AggregateUDFImpl for ArrayAgg {
fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF {
datafusion_expr::ReversedUDAF::Reversed(array_agg_udaf())
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_array_agg_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_array_agg_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_GENERAL)
+ .with_description(
+ "Returns an array created from the expression elements. If
ordering is required, elements are inserted in the specified order.",
+ )
+ .with_syntax_example("array_agg(expression [ORDER BY expression])")
+ .with_sql_example(r#"```sql
+> SELECT array_agg(column_name ORDER BY other_column) FROM table_name;
++-----------------------------------------------+
+| array_agg(column_name ORDER BY other_column) |
++-----------------------------------------------+
+| [element1, element2, element3] |
++-----------------------------------------------+
+```"#,
+ )
+ .with_argument("expression", "Expression to operate on. Can be a
constant, column, or function, and any combination of arithmetic operators.")
+ .build()
+ .unwrap()
+ })
}
#[derive(Debug)]
diff --git a/datafusion/functions-aggregate/src/average.rs
b/datafusion/functions-aggregate/src/average.rs
index ddad76a873..ad58eecdf9 100644
--- a/datafusion/functions-aggregate/src/average.rs
+++ b/datafusion/functions-aggregate/src/average.rs
@@ -28,12 +28,14 @@ use arrow::datatypes::{
Float64Type, UInt64Type,
};
use datafusion_common::{exec_err, not_impl_err, Result, ScalarValue};
+use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL;
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::type_coercion::aggregates::{avg_return_type,
coerce_avg_type};
use datafusion_expr::utils::format_state_name;
use datafusion_expr::Volatility::Immutable;
use datafusion_expr::{
- Accumulator, AggregateUDFImpl, EmitTo, GroupsAccumulator, ReversedUDAF,
Signature,
+ Accumulator, AggregateUDFImpl, Documentation, EmitTo, GroupsAccumulator,
+ ReversedUDAF, Signature,
};
use
datafusion_functions_aggregate_common::aggregate::groups_accumulator::accumulate::NullState;
@@ -45,7 +47,7 @@ use
datafusion_functions_aggregate_common::utils::DecimalAverager;
use log::debug;
use std::any::Any;
use std::fmt::Debug;
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};
make_udaf_expr_and_func!(
Avg,
@@ -235,6 +237,36 @@ impl AggregateUDFImpl for Avg {
}
coerce_avg_type(self.name(), arg_types)
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_avg_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_avg_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_GENERAL)
+ .with_description(
+ "Returns the average of numeric values in the specified
column.",
+ )
+ .with_syntax_example("avg(expression)")
+ .with_sql_example(r#"```sql
+> SELECT avg(column_name) FROM table_name;
++---------------------------+
+| avg(column_name) |
++---------------------------+
+| 42.75 |
++---------------------------+
+```"#,
+ )
+ .with_argument("expression", "Expression to operate on. Can be a
constant, column, or function, and any combination of arithmetic operators.")
+ .with_argument("Aliases: ", "`mean`")
+ .build()
+ .unwrap()
+ })
}
/// An accumulator to compute the average
diff --git a/datafusion/functions-aggregate/src/bool_and_or.rs
b/datafusion/functions-aggregate/src/bool_and_or.rs
index 7cc7d9ff7f..e212ba8d61 100644
--- a/datafusion/functions-aggregate/src/bool_and_or.rs
+++ b/datafusion/functions-aggregate/src/bool_and_or.rs
@@ -18,6 +18,7 @@
//! Defines physical expressions that can evaluated at runtime during query
execution
use std::any::Any;
+use std::sync::OnceLock;
use arrow::array::ArrayRef;
use arrow::array::BooleanArray;
@@ -29,10 +30,12 @@ use arrow::datatypes::Field;
use datafusion_common::internal_err;
use datafusion_common::{downcast_value, not_impl_err};
use datafusion_common::{DataFusionError, Result, ScalarValue};
+use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL;
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::utils::{format_state_name, AggregateOrderSensitivity};
use datafusion_expr::{
- Accumulator, AggregateUDFImpl, GroupsAccumulator, ReversedUDAF, Signature,
Volatility,
+ Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator,
ReversedUDAF,
+ Signature, Volatility,
};
use
datafusion_functions_aggregate_common::aggregate::groups_accumulator::bool_op::BooleanGroupsAccumulator;
@@ -172,6 +175,34 @@ impl AggregateUDFImpl for BoolAnd {
fn reverse_expr(&self) -> ReversedUDAF {
ReversedUDAF::Identical
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_bool_and_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_bool_and_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_GENERAL)
+ .with_description(
+ "Returns true if all non-null input values are true, otherwise
false.",
+ )
+ .with_syntax_example("bool_and(expression)")
+ .with_sql_example(r#"```sql
+> SELECT bool_and(column_name) FROM table_name;
++----------------------------+
+| bool_and(column_name) |
++----------------------------+
+| true |
++----------------------------+
+```"#)
+ .with_argument("expression", "Expression to operate on. Can be a
constant, column, or function, and any combination of arithmetic operators.")
+ .build()
+ .unwrap()
+ })
}
#[derive(Debug, Default)]
@@ -293,6 +324,32 @@ impl AggregateUDFImpl for BoolOr {
fn reverse_expr(&self) -> ReversedUDAF {
ReversedUDAF::Identical
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_bool_or_doc())
+ }
+}
+
+fn get_bool_or_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_GENERAL)
+ .with_description(
+ "Returns true if any non-null input value is true, otherwise
false.",
+ )
+ .with_syntax_example("bool_or(expression)")
+ .with_sql_example(r#"```sql
+> SELECT bool_or(column_name) FROM table_name;
++----------------------------+
+| bool_or(column_name) |
++----------------------------+
+| true |
++----------------------------+
+```"#)
+ .with_standard_argument("expression", "Expression to operate on.
Can be a constant, column, or function, and any combination of arithmetic
operators.")
+ .build()
+ .unwrap()
+ })
}
#[derive(Debug, Default)]
diff --git a/datafusion/functions-aggregate/src/correlation.rs
b/datafusion/functions-aggregate/src/correlation.rs
index 88f01b06d2..60be3608e9 100644
--- a/datafusion/functions-aggregate/src/correlation.rs
+++ b/datafusion/functions-aggregate/src/correlation.rs
@@ -19,7 +19,7 @@
use std::any::Any;
use std::fmt::Debug;
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};
use arrow::compute::{and, filter, is_not_null};
use arrow::{
@@ -30,11 +30,12 @@ use arrow::{
use crate::covariance::CovarianceAccumulator;
use crate::stddev::StddevAccumulator;
use datafusion_common::{plan_err, Result, ScalarValue};
+use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL;
use datafusion_expr::{
function::{AccumulatorArgs, StateFieldsArgs},
type_coercion::aggregates::NUMERICS,
utils::format_state_name,
- Accumulator, AggregateUDFImpl, Signature, Volatility,
+ Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
};
use datafusion_functions_aggregate_common::stats::StatsType;
@@ -107,6 +108,35 @@ impl AggregateUDFImpl for Correlation {
),
])
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_corr_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_corr_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_STATISTICAL)
+ .with_description(
+ "Returns the coefficient of correlation between two numeric
values.",
+ )
+ .with_syntax_example("corr(expression1, expression2)")
+ .with_sql_example(r#"```sql
+> SELECT corr(column1, column2) FROM table_name;
++--------------------------------+
+| corr(column1, column2) |
++--------------------------------+
+| 0.85 |
++--------------------------------+
+```"#)
+ .with_argument("expression1", "First expression to operate on. Can
be a constant, column, or function, and any combination of arithmetic
operators.")
+ .with_argument("expression2", "Second expression to operate on.
Can be a constant, column, or function, and any combination of arithmetic
operators.")
+ .build()
+ .unwrap()
+ })
}
/// An accumulator to compute correlation
diff --git a/datafusion/functions-aggregate/src/count.rs
b/datafusion/functions-aggregate/src/count.rs
index cc245b3572..23dd5b65bf 100644
--- a/datafusion/functions-aggregate/src/count.rs
+++ b/datafusion/functions-aggregate/src/count.rs
@@ -20,8 +20,9 @@ use datafusion_common::stats::Precision;
use
datafusion_functions_aggregate_common::aggregate::count_distinct::BytesViewDistinctCountAccumulator;
use datafusion_physical_expr::expressions;
use std::collections::HashSet;
+use std::fmt::Debug;
use std::ops::BitAnd;
-use std::{fmt::Debug, sync::Arc};
+use std::sync::{Arc, OnceLock};
use arrow::{
array::{ArrayRef, AsArray},
@@ -43,10 +44,11 @@ use arrow::{
use datafusion_common::{
downcast_value, internal_err, not_impl_err, DataFusionError, Result,
ScalarValue,
};
+use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL;
use datafusion_expr::function::StateFieldsArgs;
use datafusion_expr::{
function::AccumulatorArgs, utils::format_state_name, Accumulator,
AggregateUDFImpl,
- EmitTo, GroupsAccumulator, Signature, Volatility,
+ Documentation, EmitTo, GroupsAccumulator, Signature, Volatility,
};
use datafusion_expr::{Expr, ReversedUDAF, StatisticsArgs, TypeSignature};
use datafusion_functions_aggregate_common::aggregate::count_distinct::{
@@ -324,6 +326,41 @@ impl AggregateUDFImpl for Count {
}
None
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_count_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_count_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_GENERAL)
+ .with_description(
+ "Returns the number of non-null values in the specified
column. To include null values in the total count, use `count(*)`.",
+ )
+ .with_syntax_example("count(expression)")
+ .with_sql_example(r#"```sql
+> SELECT count(column_name) FROM table_name;
++-----------------------+
+| count(column_name) |
++-----------------------+
+| 100 |
++-----------------------+
+
+> SELECT count(*) FROM table_name;
++------------------+
+| count(*) |
++------------------+
+| 120 |
++------------------+
+```"#)
+ .with_argument("expression", "Expression to operate on. Can be a
constant, column, or function, and any combination of arithmetic operators.")
+ .build()
+ .unwrap()
+ })
}
#[derive(Debug)]
diff --git a/datafusion/functions-aggregate/src/covariance.rs
b/datafusion/functions-aggregate/src/covariance.rs
index d0abb079ef..c599b58ed2 100644
--- a/datafusion/functions-aggregate/src/covariance.rs
+++ b/datafusion/functions-aggregate/src/covariance.rs
@@ -18,6 +18,7 @@
//! [`CovarianceSample`]: covariance sample aggregations.
use std::fmt::Debug;
+use std::sync::OnceLock;
use arrow::{
array::{ArrayRef, Float64Array, UInt64Array},
@@ -29,11 +30,12 @@ use datafusion_common::{
downcast_value, plan_err, unwrap_or_internal_err, DataFusionError, Result,
ScalarValue,
};
+use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL;
use datafusion_expr::{
function::{AccumulatorArgs, StateFieldsArgs},
type_coercion::aggregates::NUMERICS,
utils::format_state_name,
- Accumulator, AggregateUDFImpl, Signature, Volatility,
+ Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
};
use datafusion_functions_aggregate_common::stats::StatsType;
@@ -124,6 +126,36 @@ impl AggregateUDFImpl for CovarianceSample {
fn aliases(&self) -> &[String] {
&self.aliases
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_covar_samp_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_covar_samp_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_STATISTICAL)
+ .with_description(
+ "Returns the sample covariance of a set of number pairs.",
+ )
+ .with_syntax_example("covar_samp(expression1, expression2)")
+ .with_sql_example(r#"```sql
+> SELECT covar_samp(column1, column2) FROM table_name;
++-----------------------------------+
+| covar_samp(column1, column2) |
++-----------------------------------+
+| 8.25 |
++-----------------------------------+
+```"#,
+ )
+ .with_argument("expression1", "First expression to operate on. Can
be a constant, column, or function, and any combination of arithmetic
operators.")
+ .with_argument("expression2", "Second expression to operate on.
Can be a constant, column, or function, and any combination of arithmetic
operators.")
+ .build()
+ .unwrap()
+ })
}
pub struct CovariancePopulation {
@@ -193,6 +225,34 @@ impl AggregateUDFImpl for CovariancePopulation {
StatsType::Population,
)?))
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_covar_pop_doc())
+ }
+}
+
+fn get_covar_pop_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_STATISTICAL)
+ .with_description(
+ "Returns the population covariance of a set of number pairs.",
+ )
+ .with_syntax_example("covar_pop(expression1, expression2)")
+ .with_sql_example(r#"```sql
+> SELECT covar_pop(column1, column2) FROM table_name;
++-----------------------------------+
+| covar_pop(column1, column2) |
++-----------------------------------+
+| 7.63 |
++-----------------------------------+
+```"#,
+ )
+ .with_argument("expression1", "First expression to operate on. Can
be a constant, column, or function, and any combination of arithmetic
operators.")
+ .with_argument("expression2", "Second expression to operate on.
Can be a constant, column, or function, and any combination of arithmetic
operators.")
+ .build()
+ .unwrap()
+ })
}
/// An accumulator to compute covariance
diff --git a/datafusion/functions-aggregate/src/first_last.rs
b/datafusion/functions-aggregate/src/first_last.rs
index 41ac787579..02b8c52282 100644
--- a/datafusion/functions-aggregate/src/first_last.rs
+++ b/datafusion/functions-aggregate/src/first_last.rs
@@ -19,7 +19,7 @@
use std::any::Any;
use std::fmt::Debug;
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};
use arrow::array::{ArrayRef, AsArray, BooleanArray};
use arrow::compute::{self, lexsort_to_indices, SortColumn};
@@ -28,11 +28,12 @@ use datafusion_common::utils::{compare_rows,
get_row_at_idx, take_arrays};
use datafusion_common::{
arrow_datafusion_err, internal_err, DataFusionError, Result, ScalarValue,
};
+use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL;
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::utils::{format_state_name, AggregateOrderSensitivity};
use datafusion_expr::{
- Accumulator, AggregateUDFImpl, ArrayFunctionSignature, Expr,
ExprFunctionExt,
- Signature, SortExpr, TypeSignature, Volatility,
+ Accumulator, AggregateUDFImpl, ArrayFunctionSignature, Documentation, Expr,
+ ExprFunctionExt, Signature, SortExpr, TypeSignature, Volatility,
};
use datafusion_functions_aggregate_common::utils::get_sort_options;
use datafusion_physical_expr_common::sort_expr::{LexOrdering,
PhysicalSortExpr};
@@ -165,6 +166,35 @@ impl AggregateUDFImpl for FirstValue {
fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF {
datafusion_expr::ReversedUDAF::Reversed(last_value_udaf())
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_first_value_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_first_value_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_GENERAL)
+ .with_description(
+ "Returns the first element in an aggregation group according
to the requested ordering. If no ordering is given, returns an arbitrary
element from the group.",
+ )
+ .with_syntax_example("first_value(expression [ORDER BY
expression])")
+ .with_sql_example(r#"```sql
+> SELECT first_value(column_name ORDER BY other_column) FROM table_name;
++-----------------------------------------------+
+| first_value(column_name ORDER BY other_column)|
++-----------------------------------------------+
+| first_element |
++-----------------------------------------------+
+```"#,
+ )
+ .with_argument("expression", "Expression to operate on. Can be a
constant, column, or function, and any combination of arithmetic operators.")
+ .build()
+ .unwrap()
+ })
}
#[derive(Debug)]
@@ -466,6 +496,33 @@ impl AggregateUDFImpl for LastValue {
fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF {
datafusion_expr::ReversedUDAF::Reversed(first_value_udaf())
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_last_value_doc())
+ }
+}
+
+fn get_last_value_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_GENERAL)
+ .with_description(
+ "Returns the last element in an aggregation group according to
the requested ordering. If no ordering is given, returns an arbitrary element
from the group.",
+ )
+ .with_syntax_example("last_value(expression [ORDER BY
expression])")
+ .with_sql_example(r#"```sql
+> SELECT last_value(column_name ORDER BY other_column) FROM table_name;
++-----------------------------------------------+
+| last_value(column_name ORDER BY other_column) |
++-----------------------------------------------+
+| last_element |
++-----------------------------------------------+
+```"#,
+ )
+ .with_argument("expression", "Expression to operate on. Can be a
constant, column, or function, and any combination of arithmetic operators.")
+ .build()
+ .unwrap()
+ })
}
#[derive(Debug)]
diff --git a/datafusion/functions-aggregate/src/grouping.rs
b/datafusion/functions-aggregate/src/grouping.rs
index 6fb7c3800f..09e9b90b2e 100644
--- a/datafusion/functions-aggregate/src/grouping.rs
+++ b/datafusion/functions-aggregate/src/grouping.rs
@@ -19,14 +19,18 @@
use std::any::Any;
use std::fmt;
+use std::sync::OnceLock;
use arrow::datatypes::DataType;
use arrow::datatypes::Field;
use datafusion_common::{not_impl_err, Result};
+use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL;
use datafusion_expr::function::AccumulatorArgs;
use datafusion_expr::function::StateFieldsArgs;
use datafusion_expr::utils::format_state_name;
-use datafusion_expr::{Accumulator, AggregateUDFImpl, Signature, Volatility};
+use datafusion_expr::{
+ Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
+};
make_udaf_expr_and_func!(
Grouping,
@@ -94,4 +98,37 @@ impl AggregateUDFImpl for Grouping {
"physical plan is not yet implemented for GROUPING aggregate
function"
)
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_grouping_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_grouping_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_GENERAL)
+ .with_description(
+ "Returns 1 if the data is aggregated across the specified
column, or 0 if it is not aggregated in the result set.",
+ )
+ .with_syntax_example("grouping(expression)")
+ .with_sql_example(r#"```sql
+> SELECT column_name, GROUPING(column_name) AS group_column
+ FROM table_name
+ GROUP BY GROUPING SETS ((column_name), ());
++-------------+-------------+
+| column_name | group_column |
++-------------+-------------+
+| value1 | 0 |
+| value2 | 0 |
+| NULL | 1 |
++-------------+-------------+
+```"#,
+ )
+ .with_argument("expression", "Expression to evaluate whether data
is aggregated across the specified column. Can be a constant, column, or
function.")
+ .build()
+ .unwrap()
+ })
}
diff --git a/datafusion/functions-aggregate/src/median.rs
b/datafusion/functions-aggregate/src/median.rs
index 7dd0de14c3..8eb17db1ec 100644
--- a/datafusion/functions-aggregate/src/median.rs
+++ b/datafusion/functions-aggregate/src/median.rs
@@ -16,8 +16,8 @@
// under the License.
use std::collections::HashSet;
-use std::fmt::Formatter;
-use std::{fmt::Debug, sync::Arc};
+use std::fmt::{Debug, Formatter};
+use std::sync::{Arc, OnceLock};
use arrow::array::{downcast_integer, ArrowNumericType};
use arrow::{
@@ -33,10 +33,11 @@ use arrow::array::ArrowNativeTypeOp;
use arrow::datatypes::ArrowNativeType;
use datafusion_common::{DataFusionError, Result, ScalarValue};
+use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL;
use datafusion_expr::function::StateFieldsArgs;
use datafusion_expr::{
function::AccumulatorArgs, utils::format_state_name, Accumulator,
AggregateUDFImpl,
- Signature, Volatility,
+ Documentation, Signature, Volatility,
};
use datafusion_functions_aggregate_common::utils::Hashable;
@@ -152,6 +153,35 @@ impl AggregateUDFImpl for Median {
fn aliases(&self) -> &[String] {
&[]
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_median_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_median_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_GENERAL)
+ .with_description(
+ "Returns the median value in the specified column.",
+ )
+ .with_syntax_example("median(expression)")
+ .with_sql_example(r#"```sql
+> SELECT median(column_name) FROM table_name;
++----------------------+
+| median(column_name) |
++----------------------+
+| 45.5 |
++----------------------+
+```"#,
+ )
+ .with_argument("expression", "Expression to operate on. Can be a
constant, column, or function, and any combination of arithmetic operators.")
+ .build()
+ .unwrap()
+ })
}
/// The median accumulator accumulates the raw input values
diff --git a/datafusion/functions-aggregate/src/min_max.rs
b/datafusion/functions-aggregate/src/min_max.rs
index 7d2e8e66e2..3d2915fd09 100644
--- a/datafusion/functions-aggregate/src/min_max.rs
+++ b/datafusion/functions-aggregate/src/min_max.rs
@@ -38,6 +38,7 @@ use datafusion_common::stats::Precision;
use datafusion_common::{
downcast_value, exec_err, internal_err, ColumnStatistics, DataFusionError,
Result,
};
+use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL;
use
datafusion_functions_aggregate_common::aggregate::groups_accumulator::prim_op::PrimitiveGroupsAccumulator;
use datafusion_physical_expr::expressions;
use std::fmt::Debug;
@@ -51,11 +52,13 @@ use arrow::datatypes::{
use datafusion_common::ScalarValue;
use datafusion_expr::{
- function::AccumulatorArgs, Accumulator, AggregateUDFImpl, Signature,
Volatility,
+ function::AccumulatorArgs, Accumulator, AggregateUDFImpl, Documentation,
Signature,
+ Volatility,
};
use datafusion_expr::{GroupsAccumulator, StatisticsArgs};
use half::f16;
use std::ops::Deref;
+use std::sync::OnceLock;
fn get_min_max_result_type(input_types: &[DataType]) -> Result<Vec<DataType>> {
// make sure that the input types only has one element.
@@ -321,6 +324,35 @@ impl AggregateUDFImpl for Max {
fn value_from_stats(&self, statistics_args: &StatisticsArgs) ->
Option<ScalarValue> {
self.value_from_statistics(statistics_args)
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_max_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_max_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_GENERAL)
+ .with_description(
+ "Returns the maximum value in the specified column.",
+ )
+ .with_syntax_example("max(expression)")
+ .with_sql_example(r#"```sql
+> SELECT max(column_name) FROM table_name;
++----------------------+
+| max(column_name) |
++----------------------+
+| 150 |
++----------------------+
+```"#,
+ )
+ .with_argument("expression", "Expression to operate on. Can be a
constant, column, or function, and any combination of arithmetic operators.")
+ .build()
+ .unwrap()
+ })
}
// Statically-typed version of min/max(array) -> ScalarValue for string types
@@ -1119,7 +1151,35 @@ impl AggregateUDFImpl for Min {
fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF {
datafusion_expr::ReversedUDAF::Identical
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_min_doc())
+ }
+}
+
+fn get_min_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_GENERAL)
+ .with_description(
+ "Returns the minimum value in the specified column.",
+ )
+ .with_syntax_example("min(expression)")
+ .with_sql_example(r#"```sql
+> SELECT min(column_name) FROM table_name;
++----------------------+
+| min(column_name) |
++----------------------+
+| 12 |
++----------------------+
+```"#,
+ )
+ .with_argument("expression", "Expression to operate on. Can be a
constant, column, or function, and any combination of arithmetic operators.")
+ .build()
+ .unwrap()
+ })
}
+
/// An accumulator to compute the minimum value
#[derive(Debug)]
pub struct MinAccumulator {
diff --git a/datafusion/functions-aggregate/src/nth_value.rs
b/datafusion/functions-aggregate/src/nth_value.rs
index bbfe56914c..6d8cea8f05 100644
--- a/datafusion/functions-aggregate/src/nth_value.rs
+++ b/datafusion/functions-aggregate/src/nth_value.rs
@@ -20,18 +20,19 @@
use std::any::Any;
use std::collections::VecDeque;
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};
use arrow::array::{new_empty_array, ArrayRef, AsArray, StructArray};
use arrow_schema::{DataType, Field, Fields};
use datafusion_common::utils::{array_into_list_array_nullable, get_row_at_idx};
use datafusion_common::{exec_err, internal_err, not_impl_err, Result,
ScalarValue};
+use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL;
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::utils::format_state_name;
use datafusion_expr::{
- lit, Accumulator, AggregateUDFImpl, ExprFunctionExt, ReversedUDAF,
Signature,
- SortExpr, Volatility,
+ lit, Accumulator, AggregateUDFImpl, Documentation, ExprFunctionExt,
ReversedUDAF,
+ Signature, SortExpr, Volatility,
};
use datafusion_functions_aggregate_common::merge_arrays::merge_ordered_arrays;
use datafusion_functions_aggregate_common::utils::ordering_fields;
@@ -161,6 +162,40 @@ impl AggregateUDFImpl for NthValueAgg {
fn reverse_expr(&self) -> ReversedUDAF {
ReversedUDAF::Reversed(nth_value_udaf())
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_nth_value_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_nth_value_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_STATISTICAL)
+ .with_description(
+ "Returns the nth value in a group of values.",
+ )
+ .with_syntax_example("nth_value(expression, n ORDER BY
expression)")
+ .with_sql_example(r#"```sql
+> SELECT dept_id, salary, NTH_VALUE(salary, 2) OVER (PARTITION BY dept_id
ORDER BY salary ASC) AS second_salary_by_dept
+ FROM employee;
++---------+--------+-------------------------+
+| dept_id | salary | second_salary_by_dept |
++---------+--------+-------------------------+
+| 1 | 30000 | NULL |
+| 1 | 40000 | 40000 |
+| 1 | 50000 | 40000 |
+| 2 | 35000 | NULL |
+| 2 | 45000 | 45000 |
++---------+--------+-------------------------+
+```"#)
+ .with_standard_argument("expression", "The column or expression to
retrieve the nth value from.")
+ .with_argument("n", "The position (nth) of the value to retrieve,
based on the ordering.")
+ .build()
+ .unwrap()
+ })
}
#[derive(Debug)]
diff --git a/datafusion/functions-aggregate/src/stddev.rs
b/datafusion/functions-aggregate/src/stddev.rs
index a25ab5e319..9f9da0c585 100644
--- a/datafusion/functions-aggregate/src/stddev.rs
+++ b/datafusion/functions-aggregate/src/stddev.rs
@@ -19,17 +19,19 @@
use std::any::Any;
use std::fmt::{Debug, Formatter};
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};
use arrow::array::Float64Array;
use arrow::{array::ArrayRef, datatypes::DataType, datatypes::Field};
use datafusion_common::{internal_err, not_impl_err, Result};
use datafusion_common::{plan_err, ScalarValue};
+use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL;
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::utils::format_state_name;
use datafusion_expr::{
- Accumulator, AggregateUDFImpl, GroupsAccumulator, Signature, Volatility,
+ Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator, Signature,
+ Volatility,
};
use datafusion_functions_aggregate_common::stats::StatsType;
@@ -132,6 +134,35 @@ impl AggregateUDFImpl for Stddev {
) -> Result<Box<dyn GroupsAccumulator>> {
Ok(Box::new(StddevGroupsAccumulator::new(StatsType::Sample)))
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_stddev_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_stddev_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_STATISTICAL)
+ .with_description(
+ "Returns the standard deviation of a set of numbers.",
+ )
+ .with_syntax_example("stddev(expression)")
+ .with_sql_example(r#"```sql
+> SELECT stddev(column_name) FROM table_name;
++----------------------+
+| stddev(column_name) |
++----------------------+
+| 12.34 |
++----------------------+
+```"#,
+ )
+ .with_argument("expression", "Expression to operate on. Can be a
constant, column, or function, and any combination of arithmetic operators.")
+ .build()
+ .unwrap()
+ })
}
make_udaf_expr_and_func!(
@@ -228,6 +259,33 @@ impl AggregateUDFImpl for StddevPop {
StatsType::Population,
)))
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_stddev_pop_doc())
+ }
+}
+
+fn get_stddev_pop_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_STATISTICAL)
+ .with_description(
+ "Returns the population standard deviation of a set of
numbers.",
+ )
+ .with_syntax_example("stddev_pop(expression)")
+ .with_sql_example(r#"```sql
+> SELECT stddev_pop(column_name) FROM table_name;
++--------------------------+
+| stddev_pop(column_name) |
++--------------------------+
+| 10.56 |
++--------------------------+
+```"#,
+ )
+ .with_argument("expression", "Expression to operate on. Can be a
constant, column, or function, and any combination of arithmetic operators.")
+ .build()
+ .unwrap()
+ })
}
/// An accumulator to compute the average
diff --git a/datafusion/functions-aggregate/src/string_agg.rs
b/datafusion/functions-aggregate/src/string_agg.rs
index a7e9a37e23..66fc199106 100644
--- a/datafusion/functions-aggregate/src/string_agg.rs
+++ b/datafusion/functions-aggregate/src/string_agg.rs
@@ -22,12 +22,14 @@ use arrow_schema::DataType;
use datafusion_common::cast::as_generic_string_array;
use datafusion_common::Result;
use datafusion_common::{not_impl_err, ScalarValue};
+use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL;
use datafusion_expr::function::AccumulatorArgs;
use datafusion_expr::{
- Accumulator, AggregateUDFImpl, Signature, TypeSignature, Volatility,
+ Accumulator, AggregateUDFImpl, Documentation, Signature, TypeSignature,
Volatility,
};
use datafusion_physical_expr::expressions::Literal;
use std::any::Any;
+use std::sync::OnceLock;
make_udaf_expr_and_func!(
StringAgg,
@@ -98,6 +100,37 @@ impl AggregateUDFImpl for StringAgg {
not_impl_err!("expect literal")
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_string_agg_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_string_agg_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_GENERAL)
+ .with_description(
+ "Concatenates the values of string expressions and places
separator values between them."
+ )
+ .with_syntax_example("string_agg(expression, delimiter)")
+ .with_sql_example(r#"```sql
+> SELECT string_agg(name, ', ') AS names_list
+ FROM employee;
++--------------------------+
+| names_list |
++--------------------------+
+| Alice, Bob, Charlie |
++--------------------------+
+```"#,
+ )
+ .with_argument("expression", "The string expression to
concatenate. Can be a column or any valid string expression.")
+ .with_argument("delimiter", "A literal string used as a separator
between the concatenated values.")
+ .build()
+ .unwrap()
+ })
}
#[derive(Debug)]
diff --git a/datafusion/functions-aggregate/src/sum.rs
b/datafusion/functions-aggregate/src/sum.rs
index 7e40c1bd17..91e777dd2a 100644
--- a/datafusion/functions-aggregate/src/sum.rs
+++ b/datafusion/functions-aggregate/src/sum.rs
@@ -21,6 +21,7 @@ use ahash::RandomState;
use datafusion_expr::utils::AggregateOrderSensitivity;
use std::any::Any;
use std::collections::HashSet;
+use std::sync::OnceLock;
use arrow::array::Array;
use arrow::array::ArrowNativeTypeOp;
@@ -33,11 +34,13 @@ use arrow::datatypes::{
};
use arrow::{array::ArrayRef, datatypes::Field};
use datafusion_common::{exec_err, not_impl_err, Result, ScalarValue};
+use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL;
use datafusion_expr::function::AccumulatorArgs;
use datafusion_expr::function::StateFieldsArgs;
use datafusion_expr::utils::format_state_name;
use datafusion_expr::{
- Accumulator, AggregateUDFImpl, GroupsAccumulator, ReversedUDAF, Signature,
Volatility,
+ Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator,
ReversedUDAF,
+ Signature, Volatility,
};
use
datafusion_functions_aggregate_common::aggregate::groups_accumulator::prim_op::PrimitiveGroupsAccumulator;
use datafusion_functions_aggregate_common::utils::Hashable;
@@ -233,6 +236,35 @@ impl AggregateUDFImpl for Sum {
fn order_sensitivity(&self) -> AggregateOrderSensitivity {
AggregateOrderSensitivity::Insensitive
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_sum_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_sum_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_GENERAL)
+ .with_description(
+ "Returns the sum of all values in the specified column.",
+ )
+ .with_syntax_example("sum(expression)")
+ .with_sql_example(r#"```sql
+> SELECT sum(column_name) FROM table_name;
++-----------------------+
+| sum(column_name) |
++-----------------------+
+| 12345 |
++-----------------------+
+```"#,
+ )
+ .with_argument("expression", "Expression to operate on. Can be a
constant, column, or function, and any combination of arithmetic operators.")
+ .build()
+ .unwrap()
+ })
}
/// This accumulator computes SUM incrementally
diff --git a/docs/source/user-guide/sql/aggregate_functions.md
b/docs/source/user-guide/sql/aggregate_functions.md
index 1c25874c08..4f774fe6d0 100644
--- a/docs/source/user-guide/sql/aggregate_functions.md
+++ b/docs/source/user-guide/sql/aggregate_functions.md
@@ -27,183 +27,9 @@ the rest of the documentation.
[automatically created from the codebase]:
https://github.com/apache/datafusion/issues/12740
-## General
-
-- [avg](#avg)
-- [bool_and](#bool_and)
-- [bool_or](#bool_or)
-- [count](#count)
-- [max](#max)
-- [mean](#mean)
-- [median](#median)
-- [min](#min)
-- [sum](#sum)
-- [array_agg](#array_agg)
-- [first_value](#first_value)
-- [last_value](#last_value)
-
-### `avg`
-
-Returns the average of numeric values in the specified column.
-
-```
-avg(expression)
-```
-
-#### Arguments
-
-- **expression**: Expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
-#### Aliases
-
-- `mean`
-
-### `bool_and`
-
-Returns true if all non-null input values are true, otherwise false.
-
-```
-bool_and(expression)
-```
-
-#### Arguments
-
-- **expression**: Expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
-### `bool_or`
-
-Returns true if any non-null input value is true, otherwise false.
-
-```
-bool_or(expression)
-```
-
-#### Arguments
-
-- **expression**: Expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
-### `count`
-
-Returns the number of non-null values in the specified column.
-
-To include _null_ values in the total count, use `count(*)`.
-
-```
-count(expression)
-```
-
-#### Arguments
-
-- **expression**: Expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
-### `max`
-
-Returns the maximum value in the specified column.
-
-```
-max(expression)
-```
-
-#### Arguments
-
-- **expression**: Expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
-### `mean`
-
-_Alias of [avg](#avg)._
-
-### `median`
-
-Returns the median value in the specified column.
-
-```
-median(expression)
-```
-
-#### Arguments
-
-- **expression**: Expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
-### `min`
-
-Returns the minimum value in the specified column.
-
-```
-min(expression)
-```
-
-#### Arguments
-
-- **expression**: Expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
-### `sum`
-
-Returns the sum of all values in the specified column.
-
-```
-sum(expression)
-```
-
-#### Arguments
-
-- **expression**: Expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
-### `array_agg`
-
-Returns an array created from the expression elements. If ordering requirement
is given, elements are inserted in the order of required ordering.
-
-```
-array_agg(expression [ORDER BY expression])
-```
-
-#### Arguments
-
-- **expression**: Expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
-### `first_value`
-
-Returns the first element in an aggregation group according to the requested
ordering. If no ordering is given, returns an arbitrary element from the group.
-
-```
-first_value(expression [ORDER BY expression])
-```
-
-#### Arguments
-
-- **expression**: Expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
-### `last_value`
-
-Returns the last element in an aggregation group according to the requested
ordering. If no ordering is given, returns an arbitrary element from the group.
-
-```
-last_value(expression [ORDER BY expression])
-```
-
-#### Arguments
-
-- **expression**: Expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
## Statistical
-- [corr](#corr)
- [covar](#covar)
-- [covar_pop](#covar_pop)
-- [covar_samp](#covar_samp)
-- [stddev](#stddev)
-- [stddev_pop](#stddev_pop)
-- [stddev_samp](#stddev_samp)
- [regr_avgx](#regr_avgx)
- [regr_avgy](#regr_avgy)
- [regr_count](#regr_count)
@@ -214,21 +40,6 @@ last_value(expression [ORDER BY expression])
- [regr_syy](#regr_syy)
- [regr_sxy](#regr_sxy)
-### `corr`
-
-Returns the coefficient of correlation between two numeric values.
-
-```
-corr(expression1, expression2)
-```
-
-#### Arguments
-
-- **expression1**: First expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-- **expression2**: Second expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
### `covar`
Returns the covariance of a set of number pairs.
@@ -244,75 +55,6 @@ covar(expression1, expression2)
- **expression2**: Second expression to operate on.
Can be a constant, column, or function, and any combination of arithmetic
operators.
-### `covar_pop`
-
-Returns the population covariance of a set of number pairs.
-
-```
-covar_pop(expression1, expression2)
-```
-
-#### Arguments
-
-- **expression1**: First expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-- **expression2**: Second expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
-### `covar_samp`
-
-Returns the sample covariance of a set of number pairs.
-
-```
-covar_samp(expression1, expression2)
-```
-
-#### Arguments
-
-- **expression1**: First expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-- **expression2**: Second expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
-### `stddev`
-
-Returns the standard deviation of a set of numbers.
-
-```
-stddev(expression)
-```
-
-#### Arguments
-
-- **expression**: Expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
-### `stddev_pop`
-
-Returns the population standard deviation of a set of numbers.
-
-```
-stddev_pop(expression)
-```
-
-#### Arguments
-
-- **expression**: Expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
-### `stddev_samp`
-
-Returns the sample standard deviation of a set of numbers.
-
-```
-stddev_samp(expression)
-```
-
-#### Arguments
-
-- **expression**: Expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
### `regr_slope`
Returns the slope of the linear regression line for non-null pairs in
aggregate columns.
@@ -448,74 +190,3 @@ regr_sxy(expression_y, expression_x)
Can be a constant, column, or function, and any combination of arithmetic
operators.
- **expression_x**: Independent variable.
Can be a constant, column, or function, and any combination of arithmetic
operators.
-
-## Approximate
-
-- [approx_distinct](#approx_distinct)
-- [approx_median](#approx_median)
-- [approx_percentile_cont](#approx_percentile_cont)
-- [approx_percentile_cont_with_weight](#approx_percentile_cont_with_weight)
-
-### `approx_distinct`
-
-Returns the approximate number of distinct input values calculated using the
-HyperLogLog algorithm.
-
-```
-approx_distinct(expression)
-```
-
-#### Arguments
-
-- **expression**: Expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
-### `approx_median`
-
-Returns the approximate median (50th percentile) of input values.
-It is an alias of `approx_percentile_cont(x, 0.5)`.
-
-```
-approx_median(expression)
-```
-
-#### Arguments
-
-- **expression**: Expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-
-### `approx_percentile_cont`
-
-Returns the approximate percentile of input values using the t-digest
algorithm.
-
-```
-approx_percentile_cont(expression, percentile, centroids)
-```
-
-#### Arguments
-
-- **expression**: Expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-- **percentile**: Percentile to compute. Must be a float value between 0 and 1
(inclusive).
-- **centroids**: Number of centroids to use in the t-digest algorithm.
_Default is 100_.
-
- If there are this number or fewer unique values, you can expect an exact
result.
- A higher number of centroids results in a more accurate approximation, but
- requires more memory to compute.
-
-### `approx_percentile_cont_with_weight`
-
-Returns the weighted approximate percentile of input values using the
-t-digest algorithm.
-
-```
-approx_percentile_cont_with_weight(expression, weight, percentile)
-```
-
-#### Arguments
-
-- **expression**: Expression to operate on.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-- **weight**: Expression to use as weight.
- Can be a constant, column, or function, and any combination of arithmetic
operators.
-- **percentile**: Percentile to compute. Must be a float value between 0 and 1
(inclusive).
diff --git a/docs/source/user-guide/sql/aggregate_functions_new.md
b/docs/source/user-guide/sql/aggregate_functions_new.md
index 08cdb0a986..fc918c3b15 100644
--- a/docs/source/user-guide/sql/aggregate_functions_new.md
+++ b/docs/source/user-guide/sql/aggregate_functions_new.md
@@ -37,15 +37,80 @@ Aggregate functions operate on a set of values to compute a
single result.
## General Functions
+- [array_agg](#array_agg)
+- [avg](#avg)
- [bit_and](#bit_and)
- [bit_or](#bit_or)
- [bit_xor](#bit_xor)
+- [bool_and](#bool_and)
+- [bool_or](#bool_or)
+- [count](#count)
+- [first_value](#first_value)
+- [grouping](#grouping)
+- [last_value](#last_value)
+- [max](#max)
+- [mean](#mean)
+- [median](#median)
+- [min](#min)
+- [string_agg](#string_agg)
+- [sum](#sum)
- [var](#var)
- [var_pop](#var_pop)
- [var_population](#var_population)
- [var_samp](#var_samp)
- [var_sample](#var_sample)
+### `array_agg`
+
+Returns an array created from the expression elements. If ordering is
required, elements are inserted in the specified order.
+
+```
+array_agg(expression [ORDER BY expression])
+```
+
+#### Arguments
+
+- **expression**: Expression to operate on. Can be a constant, column, or
function, and any combination of arithmetic operators.
+
+#### Example
+
+```sql
+> SELECT array_agg(column_name ORDER BY other_column) FROM table_name;
++-----------------------------------------------+
+| array_agg(column_name ORDER BY other_column) |
++-----------------------------------------------+
+| [element1, element2, element3] |
++-----------------------------------------------+
+```
+
+### `avg`
+
+Returns the average of numeric values in the specified column.
+
+```
+avg(expression)
+```
+
+#### Arguments
+
+- **expression**: Expression to operate on. Can be a constant, column, or
function, and any combination of arithmetic operators.
+- **Aliases: **: `mean`
+
+#### Example
+
+```sql
+> SELECT avg(column_name) FROM table_name;
++---------------------------+
+| avg(column_name) |
++---------------------------+
+| 42.75 |
++---------------------------+
+```
+
+#### Aliases
+
+- mean
+
### `bit_and`
Computes the bitwise AND of all non-null input values.
@@ -82,6 +147,276 @@ bit_xor(expression)
- **expression**: Integer expression to operate on. Can be a constant, column,
or function, and any combination of operators.
+### `bool_and`
+
+Returns true if all non-null input values are true, otherwise false.
+
+```
+bool_and(expression)
+```
+
+#### Arguments
+
+- **expression**: Expression to operate on. Can be a constant, column, or
function, and any combination of arithmetic operators.
+
+#### Example
+
+```sql
+> SELECT bool_and(column_name) FROM table_name;
++----------------------------+
+| bool_and(column_name) |
++----------------------------+
+| true |
++----------------------------+
+```
+
+### `bool_or`
+
+Returns true if all non-null input values are true, otherwise false.
+
+```
+bool_and(expression)
+```
+
+#### Arguments
+
+- **expression**: Expression to operate on. Can be a constant, column, or
function, and any combination of arithmetic operators.
+
+#### Example
+
+```sql
+> SELECT bool_and(column_name) FROM table_name;
++----------------------------+
+| bool_and(column_name) |
++----------------------------+
+| true |
++----------------------------+
+```
+
+### `count`
+
+Returns the number of non-null values in the specified column. To include null
values in the total count, use `count(*)`.
+
+```
+count(expression)
+```
+
+#### Arguments
+
+- **expression**: Expression to operate on. Can be a constant, column, or
function, and any combination of arithmetic operators.
+
+#### Example
+
+```sql
+> SELECT count(column_name) FROM table_name;
++-----------------------+
+| count(column_name) |
++-----------------------+
+| 100 |
++-----------------------+
+
+> SELECT count(*) FROM table_name;
++------------------+
+| count(*) |
++------------------+
+| 120 |
++------------------+
+```
+
+### `first_value`
+
+Returns the first element in an aggregation group according to the requested
ordering. If no ordering is given, returns an arbitrary element from the group.
+
+```
+first_value(expression [ORDER BY expression])
+```
+
+#### Arguments
+
+- **expression**: Expression to operate on. Can be a constant, column, or
function, and any combination of arithmetic operators.
+
+#### Example
+
+```sql
+> SELECT first_value(column_name ORDER BY other_column) FROM table_name;
++-----------------------------------------------+
+| first_value(column_name ORDER BY other_column)|
++-----------------------------------------------+
+| first_element |
++-----------------------------------------------+
+```
+
+### `grouping`
+
+Returns 1 if the data is aggregated across the specified column, or 0 if it is
not aggregated in the result set.
+
+```
+grouping(expression)
+```
+
+#### Arguments
+
+- **expression**: Expression to evaluate whether data is aggregated across the
specified column. Can be a constant, column, or function.
+
+#### Example
+
+```sql
+> SELECT column_name, GROUPING(column_name) AS group_column
+ FROM table_name
+ GROUP BY GROUPING SETS ((column_name), ());
++-------------+-------------+
+| column_name | group_column |
++-------------+-------------+
+| value1 | 0 |
+| value2 | 0 |
+| NULL | 1 |
++-------------+-------------+
+```
+
+### `last_value`
+
+Returns the first element in an aggregation group according to the requested
ordering. If no ordering is given, returns an arbitrary element from the group.
+
+```
+first_value(expression [ORDER BY expression])
+```
+
+#### Arguments
+
+- **expression**: Expression to operate on. Can be a constant, column, or
function, and any combination of arithmetic operators.
+
+#### Example
+
+```sql
+> SELECT first_value(column_name ORDER BY other_column) FROM table_name;
++-----------------------------------------------+
+| first_value(column_name ORDER BY other_column)|
++-----------------------------------------------+
+| first_element |
++-----------------------------------------------+
+```
+
+### `max`
+
+Returns the maximum value in the specified column.
+
+```
+max(expression)
+```
+
+#### Arguments
+
+- **expression**: Expression to operate on. Can be a constant, column, or
function, and any combination of arithmetic operators.
+
+#### Example
+
+```sql
+> SELECT max(column_name) FROM table_name;
++----------------------+
+| max(column_name) |
++----------------------+
+| 150 |
++----------------------+
+```
+
+### `mean`
+
+_Alias of [avg](#avg)._
+
+### `median`
+
+Returns the median value in the specified column.
+
+```
+median(expression)
+```
+
+#### Arguments
+
+- **expression**: Expression to operate on. Can be a constant, column, or
function, and any combination of arithmetic operators.
+
+#### Example
+
+```sql
+> SELECT median(column_name) FROM table_name;
++----------------------+
+| median(column_name) |
++----------------------+
+| 45.5 |
++----------------------+
+```
+
+### `min`
+
+Returns the maximum value in the specified column.
+
+```
+max(expression)
+```
+
+#### Arguments
+
+- **expression**: Expression to operate on. Can be a constant, column, or
function, and any combination of arithmetic operators.
+
+#### Example
+
+```sql
+> SELECT max(column_name) FROM table_name;
++----------------------+
+| max(column_name) |
++----------------------+
+| 150 |
++----------------------+
+```
+
+### `string_agg`
+
+Concatenates the values of string expressions and places separator values
between them.
+
+```
+string_agg(expression, delimiter)
+```
+
+#### Arguments
+
+- **expression**: The string expression to concatenate. Can be a column or any
valid string expression.
+- **delimiter**: A literal string used as a separator between the concatenated
values.
+
+#### Example
+
+```sql
+> SELECT string_agg(name, ', ') AS names_list
+ FROM employee;
++--------------------------+
+| names_list |
++--------------------------+
+| Alice, Bob, Charlie |
++--------------------------+
+```
+
+### `sum`
+
+Returns the sum of all values in the specified column.
+
+```
+sum(expression)
+```
+
+#### Arguments
+
+- **expression**: Expression to operate on. Can be a constant, column, or
function, and any combination of arithmetic operators.
+
+#### Example
+
+```sql
+> SELECT sum(column_name) FROM table_name;
++-----------------------+
+| sum(column_name) |
++-----------------------+
+| 12345 |
++-----------------------+
+```
+
### `var`
Returns the statistical sample variance of a set of numbers.
@@ -126,3 +461,280 @@ _Alias of [var](#var)._
### `var_sample`
_Alias of [var](#var)._
+
+## Statistical Functions
+
+- [corr](#corr)
+- [covar](#covar)
+- [covar_pop](#covar_pop)
+- [covar_samp](#covar_samp)
+- [nth_value](#nth_value)
+- [stddev](#stddev)
+- [stddev_pop](#stddev_pop)
+- [stddev_samp](#stddev_samp)
+
+### `corr`
+
+Returns the coefficient of correlation between two numeric values.
+
+```
+corr(expression1, expression2)
+```
+
+#### Arguments
+
+- **expression1**: First expression to operate on. Can be a constant, column,
or function, and any combination of arithmetic operators.
+- **expression2**: Second expression to operate on. Can be a constant, column,
or function, and any combination of arithmetic operators.
+
+#### Example
+
+```sql
+> SELECT corr(column1, column2) FROM table_name;
++--------------------------------+
+| corr(column1, column2) |
++--------------------------------+
+| 0.85 |
++--------------------------------+
+```
+
+### `covar`
+
+_Alias of [covar_samp](#covar_samp)._
+
+### `covar_pop`
+
+Returns the sample covariance of a set of number pairs.
+
+```
+covar_samp(expression1, expression2)
+```
+
+#### Arguments
+
+- **expression1**: First expression to operate on. Can be a constant, column,
or function, and any combination of arithmetic operators.
+- **expression2**: Second expression to operate on. Can be a constant, column,
or function, and any combination of arithmetic operators.
+
+#### Example
+
+```sql
+> SELECT covar_samp(column1, column2) FROM table_name;
++-----------------------------------+
+| covar_samp(column1, column2) |
++-----------------------------------+
+| 8.25 |
++-----------------------------------+
+```
+
+### `covar_samp`
+
+Returns the sample covariance of a set of number pairs.
+
+```
+covar_samp(expression1, expression2)
+```
+
+#### Arguments
+
+- **expression1**: First expression to operate on. Can be a constant, column,
or function, and any combination of arithmetic operators.
+- **expression2**: Second expression to operate on. Can be a constant, column,
or function, and any combination of arithmetic operators.
+
+#### Example
+
+```sql
+> SELECT covar_samp(column1, column2) FROM table_name;
++-----------------------------------+
+| covar_samp(column1, column2) |
++-----------------------------------+
+| 8.25 |
++-----------------------------------+
+```
+
+#### Aliases
+
+- covar
+
+### `nth_value`
+
+Returns the nth value in a group of values.
+
+```
+nth_value(expression, n ORDER BY expression)
+```
+
+#### Arguments
+
+- **expression**: The column or expression to retrieve the nth value from.
expression to operate on. Can be a constant, column, or function, and any
combination of operators.
+- **n**: The position (nth) of the value to retrieve, based on the ordering.
+
+#### Example
+
+```sql
+> SELECT dept_id, salary, NTH_VALUE(salary, 2) OVER (PARTITION BY dept_id
ORDER BY salary ASC) AS second_salary_by_dept
+ FROM employee;
++---------+--------+-------------------------+
+| dept_id | salary | second_salary_by_dept |
++---------+--------+-------------------------+
+| 1 | 30000 | NULL |
+| 1 | 40000 | 40000 |
+| 1 | 50000 | 40000 |
+| 2 | 35000 | NULL |
+| 2 | 45000 | 45000 |
++---------+--------+-------------------------+
+```
+
+### `stddev`
+
+Returns the standard deviation of a set of numbers.
+
+```
+stddev(expression)
+```
+
+#### Arguments
+
+- **expression**: Expression to operate on. Can be a constant, column, or
function, and any combination of arithmetic operators.
+
+#### Example
+
+```sql
+> SELECT stddev(column_name) FROM table_name;
++----------------------+
+| stddev(column_name) |
++----------------------+
+| 12.34 |
++----------------------+
+```
+
+#### Aliases
+
+- stddev_samp
+
+### `stddev_pop`
+
+Returns the standard deviation of a set of numbers.
+
+```
+stddev(expression)
+```
+
+#### Arguments
+
+- **expression**: Expression to operate on. Can be a constant, column, or
function, and any combination of arithmetic operators.
+
+#### Example
+
+```sql
+> SELECT stddev(column_name) FROM table_name;
++----------------------+
+| stddev(column_name) |
++----------------------+
+| 12.34 |
++----------------------+
+```
+
+### `stddev_samp`
+
+_Alias of [stddev](#stddev)._
+
+## Approximate Functions
+
+- [approx_distinct](#approx_distinct)
+- [approx_median](#approx_median)
+- [approx_percentile_cont](#approx_percentile_cont)
+- [approx_percentile_cont_with_weight](#approx_percentile_cont_with_weight)
+
+### `approx_distinct`
+
+Returns the approximate number of distinct input values calculated using the
HyperLogLog algorithm.
+
+```
+approx_distinct(expression)
+```
+
+#### Arguments
+
+- **expression**: Expression to operate on. Can be a constant, column, or
function, and any combination of arithmetic operators.
+
+#### Example
+
+```sql
+> SELECT approx_distinct(column_name) FROM table_name;
++-----------------------------------+
+| approx_distinct(column_name) |
++-----------------------------------+
+| 42 |
++-----------------------------------+
+```
+
+### `approx_median`
+
+Returns the approximate median (50th percentile) of input values. It is an
alias of `approx_percentile_cont(x, 0.5)`.
+
+```
+approx_median(expression)
+```
+
+#### Arguments
+
+- **expression**: Expression to operate on. Can be a constant, column, or
function, and any combination of arithmetic operators.
+
+#### Example
+
+```sql
+> SELECT approx_median(column_name) FROM table_name;
++-----------------------------------+
+| approx_median(column_name) |
++-----------------------------------+
+| 23.5 |
++-----------------------------------+
+```
+
+### `approx_percentile_cont`
+
+Returns the approximate percentile of input values using the t-digest
algorithm.
+
+```
+approx_percentile_cont(expression, percentile, centroids)
+```
+
+#### Arguments
+
+- **expression**: Expression to operate on. Can be a constant, column, or
function, and any combination of arithmetic operators.
+- **percentile**: Percentile to compute. Must be a float value between 0 and 1
(inclusive).
+- **centroids**: Number of centroids to use in the t-digest algorithm.
_Default is 100_. A higher number results in more accurate approximation but
requires more memory.
+
+#### Example
+
+```sql
+> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name;
++-------------------------------------------------+
+| approx_percentile_cont(column_name, 0.75, 100) |
++-------------------------------------------------+
+| 65.0 |
++-------------------------------------------------+
+```
+
+### `approx_percentile_cont_with_weight`
+
+Returns the weighted approximate percentile of input values using the t-digest
algorithm.
+
+```
+approx_percentile_cont_with_weight(expression, weight, percentile)
+```
+
+#### Arguments
+
+- **expression**: Expression to operate on. Can be a constant, column, or
function, and any combination of arithmetic operators.
+- **weight**: Expression to use as weight. Can be a constant, column, or
function, and any combination of arithmetic operators.
+- **percentile**: Percentile to compute. Must be a float value between 0 and 1
(inclusive).
+
+#### Example
+
+```sql
+> SELECT approx_percentile_cont_with_weight(column_name, weight_column, 0.90)
FROM table_name;
++----------------------------------------------------------------------+
+| approx_percentile_cont_with_weight(column_name, weight_column, 0.90) |
++----------------------------------------------------------------------+
+| 78.5 |
++----------------------------------------------------------------------+
+```
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]