jychen7 commented on a change in pull request #2031:
URL: https://github.com/apache/arrow-datafusion/pull/2031#discussion_r829662200



##########
File path: datafusion-physical-expr/src/tdigest/mod.rs
##########
@@ -708,29 +714,12 @@ mod tests {
 
     #[test]
     fn test_int64_uniform() {
-        let values = (1i64..=1000).map(|v| ScalarValue::Int64(Some(v)));
-
-        let t = TDigest::new(100);
-        let t = t.merge_unsorted(values).unwrap();
-
-        assert_error_bounds!(t, quantile = 0.1, want = 100.0);
-        assert_error_bounds!(t, quantile = 0.5, want = 500.0);
-        assert_error_bounds!(t, quantile = 0.9, want = 900.0);
-        assert_state_roundtrip!(t);
-    }
-
-    #[test]
-    fn test_int64_uniform_with_nulls() {

Review comment:
       Remove this test case because `merge_unsorted_f64` no longer accepts
None; the None filter should be tested in `try_as_f64`.
   
   
https://github.com/apache/arrow-datafusion/blob/136e9e9803b17c0a248939e904e22f218f3efb4b/datafusion-physical-expr/src/tdigest/mod.rs#L53-L60

##########
File path: datafusion-physical-expr/src/tdigest/mod.rs
##########
@@ -708,29 +714,12 @@ mod tests {
 
     #[test]
     fn test_int64_uniform() {
-        let values = (1i64..=1000).map(|v| ScalarValue::Int64(Some(v)));
-
-        let t = TDigest::new(100);
-        let t = t.merge_unsorted(values).unwrap();
-
-        assert_error_bounds!(t, quantile = 0.1, want = 100.0);
-        assert_error_bounds!(t, quantile = 0.5, want = 500.0);
-        assert_error_bounds!(t, quantile = 0.9, want = 900.0);
-        assert_state_roundtrip!(t);
-    }
-
-    #[test]
-    fn test_int64_uniform_with_nulls() {

Review comment:
       Remove this test case because `merge_unsorted_f64` no longer accepts
None; the None filter may be tested in `try_as_f64`.
   
   
https://github.com/apache/arrow-datafusion/blob/136e9e9803b17c0a248939e904e22f218f3efb4b/datafusion-physical-expr/src/tdigest/mod.rs#L53-L60

##########
File path: datafusion-physical-expr/src/expressions/approx_percentile_cont.rs
##########
@@ -194,75 +204,125 @@ pub struct ApproxPercentileAccumulator {
 impl ApproxPercentileAccumulator {
     pub fn new(percentile: f64, return_type: DataType) -> Self {
         Self {
-            digest: TDigest::new(100),
+            digest: TDigest::new(DEFAULT_MAX_SIZE),
             percentile,
             return_type,
         }
     }
-}
 
-impl Accumulator for ApproxPercentileAccumulator {
-    fn state(&self) -> Result<Vec<ScalarValue>> {
-        Ok(self.digest.to_scalar_state())
+    pub(crate) fn merge_digests(&mut self, digests: &[TDigest]) {
+        self.digest = TDigest::merge_digests(digests);
     }
 
-    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
-        debug_assert_eq!(
-            values.len(),
-            1,
-            "invalid number of values in batch percentile update"
-        );
-        let values = &values[0];
-
-        self.digest = match values.data_type() {
+    pub(crate) fn convert_to_ordered_float(
+        values: &ArrayRef,
+    ) -> Result<Vec<OrderedFloat<f64>>> {
+        match values.data_type() {
             DataType::Float64 => {
                 let array = 
values.as_any().downcast_ref::<Float64Array>().unwrap();
-                self.digest.merge_unsorted(array.values().iter().cloned())?
+                Ok(array
+                    .values()
+                    .iter()
+                    .filter_map(|v| v.try_as_f64().transpose())
+                    .collect::<Result<Vec<_>>>()?)

Review comment:
       Originally, I wanted to just return `array.values().iter().cloned()` and
then pass it to `merge_sorted` as `impl IntoIterator<Item = T>`, but I couldn't
figure out how to type it correctly, so I changed it to `Vec<OrderedFloat<f64>>`.
   
   
   
https://github.com/apache/arrow-datafusion/blob/485bfa6ffcf8dc836a482127ecd367d9abfa7248/datafusion-physical-expr/src/tdigest/mod.rs#L252-L255




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to