realno commented on a change in pull request #1525:
URL: https://github.com/apache/arrow-datafusion/pull/1525#discussion_r780582296



##########
File path: datafusion/src/physical_plan/expressions/variance.rs
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Defines physical expressions that can evaluated at runtime during query 
execution
+
+use std::any::Any;
+use std::sync::Arc;
+
+use crate::error::{DataFusionError, Result};
+use crate::physical_plan::{Accumulator, AggregateExpr, PhysicalExpr};
+use crate::scalar::ScalarValue;
+use arrow::datatypes::DataType;
+use arrow::datatypes::Field;
+
+use super::format_state_name;
+
+/// VARIANCE aggregate expression
+#[derive(Debug)]
+pub struct Variance {
+    name: String,
+    expr: Arc<dyn PhysicalExpr>,
+}
+
+/// function return type of variance
+pub fn variance_return_type(arg_type: &DataType) -> Result<DataType> {
+    match arg_type {
+        DataType::Int8
+        | DataType::Int16
+        | DataType::Int32
+        | DataType::Int64
+        | DataType::UInt8
+        | DataType::UInt16
+        | DataType::UInt32
+        | DataType::UInt64
+        | DataType::Float32
+        | DataType::Float64 => Ok(DataType::Float64),
+        other => Err(DataFusionError::Plan(format!(
+            "VARIANCE does not support {:?}",
+            other
+        ))),
+    }
+}
+
+pub(crate) fn is_variance_support_arg_type(arg_type: &DataType) -> bool {
+    matches!(
+        arg_type,
+        DataType::UInt8
+            | DataType::UInt16
+            | DataType::UInt32
+            | DataType::UInt64
+            | DataType::Int8
+            | DataType::Int16
+            | DataType::Int32
+            | DataType::Int64
+            | DataType::Float32
+            | DataType::Float64
+    )
+}
+
+impl Variance {
+    /// Create a new VARIANCE aggregate function
+    pub fn new(
+        expr: Arc<dyn PhysicalExpr>,
+        name: impl Into<String>,
+        data_type: DataType,
+    ) -> Self {
+        // the result of variance just support FLOAT64 data type.
+        assert!(matches!(data_type, DataType::Float64));
+        Self {
+            name: name.into(),
+            expr,
+        }
+    }
+}
+
+impl AggregateExpr for Variance {
+    /// Return a reference to Any that can be used for downcasting
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn field(&self) -> Result<Field> {
+        Ok(Field::new(&self.name, DataType::Float64, true))
+    }
+
+    fn create_accumulator(&self) -> Result<Box<dyn Accumulator>> {
+        Ok(Box::new(VarianceAccumulator::try_new()?))
+    }
+
+    fn state_fields(&self) -> Result<Vec<Field>> {
+        Ok(vec![
+            Field::new(
+                &format_state_name(&self.name, "count"),
+                DataType::UInt64,
+                true,
+            ),
+            Field::new(
+                &format_state_name(&self.name, "mean"),
+                DataType::Float64,
+                true,
+            ),
+            Field::new(
+                &format_state_name(&self.name, "m2"),
+                DataType::Float64,
+                true,
+            ),
+        ])
+    }
+
+    fn expressions(&self) -> Vec<Arc<dyn PhysicalExpr>> {
+        vec![self.expr.clone()]
+    }
+
+    fn name(&self) -> &str {
+        &self.name
+    }
+}
+
+/// An accumulator to compute variance
+#[derive(Debug)]
+pub struct VarianceAccumulator {
+    m2: ScalarValue,
+    mean: ScalarValue,
+    count: u64,
+}
+
+impl VarianceAccumulator {
+    /// Creates a new `VarianceAccumulator`
+    pub fn try_new() -> Result<Self> {
+        Ok(Self {
+            m2: ScalarValue::from(0 as f64),
+            mean: ScalarValue::from(0 as f64),
+            count: 0,
+        })
+    }
+
+    pub fn get_count(&self) -> u64 {
+        self.count
+    }
+
+    pub fn get_mean(&self) -> ScalarValue {
+        self.mean.clone()
+    }
+
+    pub fn get_m2(&self) -> ScalarValue {
+        self.m2.clone()
+    }
+}
+
+impl Accumulator for VarianceAccumulator {
+    fn state(&self) -> Result<Vec<ScalarValue>> {
+        Ok(vec![
+            ScalarValue::from(self.count),
+            self.mean.clone(),
+            self.m2.clone(),
+        ])
+    }
+
+    fn update(&mut self, values: &[ScalarValue]) -> Result<()> {
+        let values = &values[0];
+        let is_empty = values.is_null();
+
+        if !is_empty {
+            let new_count = self.count + 1;
+            let delta1 = ScalarValue::add(values, 
&self.mean.arithmetic_negate())?;

Review comment:
       Completely agree. I plan to investigate this in a follow-up PR. The 
current challenge is that the algorithm will lose parallelizability if it uses 
a batch-friendly approach, and I need to spend more time understanding the code. 
One question I have: is there a chance that `update` and `batch_update` can 
be used in the same job, i.e. can one job call `update` on some data and 
`batch_update` on some other data? The reason I ask is that the online version 
of the algorithm requires an intermediate value to be calculated, so it is not 
compatible with batch mode; that is, we can only do all batch or all online. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to