2010YOUY01 commented on code in PR #19278:
URL: https://github.com/apache/datafusion/pull/19278#discussion_r2617868417
##########
datafusion/functions-aggregate/src/median.rs:
##########
@@ -289,14 +290,51 @@ impl<T: ArrowNumericType> Accumulator for MedianAccumulator<T> {
}
fn evaluate(&mut self) -> Result<ScalarValue> {
- let d = std::mem::take(&mut self.all_values);
- let median = calculate_median::<T>(d);
+ let median = calculate_median::<T>(&mut self.all_values);
ScalarValue::new_primitive::<T>(median, &self.data_type)
}
fn size(&self) -> usize {
size_of_val(self) + self.all_values.capacity() * size_of::<T::Native>()
}
+
+ fn retract_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+ let mut to_remove: HashMap<ScalarValue, usize> = HashMap::new();
Review Comment:
This seems like a good optimization with minimal added complexity.
##########
datafusion/sqllogictest/test_files/aggregate.slt:
##########
@@ -991,6 +991,89 @@ SELECT approx_median(col_f64_nan) FROM median_table
----
NaN
+# median_sliding_window
+statement ok
+CREATE TABLE median_window_test (
+ timestamp INT,
+ tags VARCHAR,
+ value DOUBLE
+);
+
+statement ok
+INSERT INTO median_window_test (timestamp, tags, value) VALUES
+(1, 'tag1', 10.0),
+(2, 'tag1', 20.0),
+(3, 'tag1', 30.0),
+(4, 'tag1', 40.0),
+(5, 'tag1', 50.0),
+(1, 'tag2', 60.0),
+(2, 'tag2', 70.0),
+(3, 'tag2', 80.0),
+(4, 'tag2', 90.0),
+(5, 'tag2', 100.0);
+
+query ITRR
+SELECT
+ timestamp,
+ tags,
+ value,
+ median(value) OVER (
+ PARTITION BY tags
+ ORDER BY timestamp
+ ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
+ ) AS value_median_3
+FROM median_window_test
+ORDER BY tags, timestamp;
+----
+1 tag1 10 15
+2 tag1 20 20
+3 tag1 30 30
+4 tag1 40 40
+5 tag1 50 45
+1 tag2 60 65
+2 tag2 70 70
+3 tag2 80 80
+4 tag2 90 90
+5 tag2 100 95
+
+# median_non_sliding_window
+query ITRRRR
+SELECT
+ timestamp,
+ tags,
+ value,
+ median(value) OVER (
+ PARTITION BY tags
+ ORDER BY timestamp
+ ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+ ) AS value_median_unbounded_preceding,
+ median(value) OVER (
+ PARTITION BY tags
+ ORDER BY timestamp
+ ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
+ ) AS value_median_unbounded_both,
Review Comment:
I think we should file a ticket: the previous impl should be able to handle
`unbounded preceding`, as @Jefffrey explained, and the inconsistent results
likely indicate a bug.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]