alamb commented on code in PR #3146:
URL: https://github.com/apache/arrow-datafusion/pull/3146#discussion_r947653826
##########
datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs:
##########
@@ -52,39 +53,35 @@ impl ApproxPercentileCont {
// Arguments should be [ColumnExpr, DesiredPercentileLiteral]
debug_assert_eq!(expr.len(), 2);
- // Extract the desired percentile literal
- let lit = expr[1]
- .as_any()
- .downcast_ref::<Literal>()
- .ok_or_else(|| {
- DataFusionError::Internal(
- "desired percentile argument must be float
literal".to_string(),
- )
- })?
- .value();
- let percentile = match lit {
- ScalarValue::Float32(Some(q)) => *q as f64,
- ScalarValue::Float64(Some(q)) => *q as f64,
- got => return Err(DataFusionError::NotImplemented(format!(
- "Percentile value for 'APPROX_PERCENTILE_CONT' must be Float32
or Float64 literal (got data type {})",
- got
- )))
- };
+ let percentile = validate_input_percentile_expr(&expr[1])?;
Review Comment:
👍
##########
datafusion/core/tests/sql/aggregates.rs:
##########
@@ -925,6 +925,54 @@ async fn csv_query_approx_percentile_cont_with_weight() ->
Result<()> {
Ok(())
}
+#[tokio::test]
+async fn csv_query_approx_percentile_cont_with_histogram_bins() -> Result<()> {
+ let ctx = SessionContext::new();
+ register_aggregate_csv(&ctx).await?;
+
+ // compare approx_percentile_cont and approx_percentile_cont_with_weight
+ let sql = "SELECT c1, approx_percentile_cont(c3, 0.95, 200) AS c3_p95 FROM
aggregate_test_100 GROUP BY 1 ORDER BY 1";
+ let actual = execute_to_batches(&ctx, sql).await;
+ let expected = vec![
+ "+----+--------+",
Review Comment:
I am not a statistics expert but I verified that this is different than the
output generated by `SELECT c1, approx_percentile_cont_with_weight(c3, c2,
0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1` a few lines above
##########
datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs:
##########
@@ -113,6 +116,64 @@ impl ApproxPercentileCont {
}
}
+fn validate_input_percentile_expr(expr: &Arc<dyn PhysicalExpr>) -> Result<f64>
{
+ // Extract the desired percentile literal
+ let lit = expr
+ .as_any()
+ .downcast_ref::<Literal>()
+ .ok_or_else(|| {
+ DataFusionError::Internal(
+ "desired percentile argument must be float
literal".to_string(),
+ )
+ })?
+ .value();
+ let percentile = match lit {
+ ScalarValue::Float32(Some(q)) => *q as f64,
+ ScalarValue::Float64(Some(q)) => *q as f64,
+ got => return Err(DataFusionError::NotImplemented(format!(
+ "Percentile value for 'APPROX_PERCENTILE_CONT' must be Float32 or
Float64 literal (got data type {})",
+ got.get_datatype()
+ )))
+ };
+
+ // Ensure the percentile is between 0 and 1.
+ if !(0.0..=1.0).contains(&percentile) {
+ return Err(DataFusionError::Plan(format!(
+ "Percentile value must be between 0.0 and 1.0 inclusive, {} is
invalid",
+ percentile
+ )));
+ }
+ Ok(percentile)
+}
+
+fn validate_input_max_size_expr(expr: &Arc<dyn PhysicalExpr>) -> Result<usize>
{
+ // Extract the desired percentile literal
+ let lit = expr
+ .as_any()
+ .downcast_ref::<Literal>()
+ .ok_or_else(|| {
+ DataFusionError::Internal(
+ "desired percentile argument must be float
literal".to_string(),
+ )
+ })?
+ .value();
+ let max_size = match lit {
+ ScalarValue::UInt8(Some(q)) => *q as usize,
Review Comment:
Yeah, I think we should file a ticket to allow creating unsigned int types
from SQL (like in `CREATE TABLE`) statements.
I ran into this limitation while I was fixing
https://github.com/apache/arrow-datafusion/pull/3167 --
https://github.com/apache/arrow-datafusion/pull/3167/files#r946145318
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]