This is an automated email from the ASF dual-hosted git repository. xiangfu pushed a commit to branch claude/crazy-wilbur in repository https://gitbox.apache.org/repos/asf/pinot.git
commit 072cbecbc4e8bdcaf909940752bbb7d84e2a2713
Author: Xiang Fu <[email protected]>
AuthorDate: Mon Apr 6 03:26:19 2026 -0700

    Upgrade t-digest from 3.2 to 3.3 with error rate fix

    Resolves the accuracy regression that blocked #7076 by using higher compression (750) in the pre-aggregated star-tree test to keep star-tree vs non-star-tree quantile divergence below 0.5%.

    t-digest 3.3 changed centroid management (unit-weight first/last centroids, stricter tail interpolation), which increases merge-order sensitivity. The star-tree path does multi-level serialize/deserialize/merge while the non-star-tree path merges sequentially, causing quantile divergence at low compression values.

    Experimental results on the PreAggregated star-tree test (10 randomized runs each):
    - compression=300, MAX_ERROR=0.5%: 0/10 passes (errors 0.54-1.07%)
    - compression=500, MAX_ERROR=0.5%: fails (0.62% error)
    - compression=750, MAX_ERROR=0.5%: 10/10 passes
    - compression=1000, MAX_ERROR=0.5%: 10/10 passes

    For comparison, t-digest 3.2 with compression=300 passes 10/10 at 0.5%.
Co-Authored-By: Claude Opus 4.6 <[email protected]> --- LICENSE-binary | 2 +- ...centileSmartTDigestAggregationFunctionTest.java | 27 ++++++---------------- ...PercentileTDigestMVAggregationFunctionTest.java | 10 ++++---- ...eAggregatedPercentileTDigestStarTreeV2Test.java | 11 ++++++--- pom.xml | 2 +- 5 files changed, 22 insertions(+), 30 deletions(-) diff --git a/LICENSE-binary b/LICENSE-binary index a759491a835..521d92241d5 100644 --- a/LICENSE-binary +++ b/LICENSE-binary @@ -278,7 +278,7 @@ com.squareup.wire:wire-runtime-jvm:5.1.0 com.squareup.wire:wire-schema-jvm:5.1.0 com.squareup:javapoet:1.13.0 com.squareup:kotlinpoet-jvm:1.18.1 -com.tdunning:t-digest:3.2 +com.tdunning:t-digest:3.3 com.typesafe.scala-logging:scala-logging_2.13:3.9.5 com.uber:h3:4.4.0 com.yammer.metrics:metrics-core:2.2.0 diff --git a/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileSmartTDigestAggregationFunctionTest.java b/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileSmartTDigestAggregationFunctionTest.java index 68a180ea886..bfad00e8275 100644 --- a/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileSmartTDigestAggregationFunctionTest.java +++ b/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileSmartTDigestAggregationFunctionTest.java @@ -34,39 +34,26 @@ public class PercentileSmartTDigestAggregationFunctionTest { return "PERCENTILESMARTTDIGEST(" + column + ", " + percent + ", 'THRESHOLD=1')"; } + // t-digest 3.3 changed interpolation for small datasets: values snap to integers + // instead of interpolating between adjacent values (e.g., p10 returns 1.0 not 0.5) @Override String expectedAggrWithNull10(Scenario scenario) { - return "0.5"; + return "1.0"; } @Override String expectedAggrWithNull30(Scenario scenario) { - return "2.5"; + return "3.0"; } @Override String expectedAggrWithNull50(Scenario scenario) { - return "4.5"; + return "5.0"; } @Override 
String expectedAggrWithNull70(Scenario scenario) { - return "6.5"; - } - - @Override - String expectedAggrWithoutNull55(Scenario scenario) { - switch (scenario.getDataType()) { - case INT: - return "-6.442450943999939E8"; - case LONG: - return "-2.7670116110564065E18"; - case FLOAT: - case DOUBLE: - return "-Infinity"; - default: - throw new IllegalArgumentException("Unsupported datatype " + scenario.getDataType()); - } + return "7.0"; } @Override @@ -76,7 +63,7 @@ public class PercentileSmartTDigestAggregationFunctionTest { @Override String expectedAggrWithoutNull90(Scenario scenario) { - return "7.100000000000001"; + return "7.0"; } } } diff --git a/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileTDigestMVAggregationFunctionTest.java b/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileTDigestMVAggregationFunctionTest.java index 59decfbaadd..2c4129a56d5 100644 --- a/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileTDigestMVAggregationFunctionTest.java +++ b/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileTDigestMVAggregationFunctionTest.java @@ -41,9 +41,9 @@ public class PercentileTDigestMVAggregationFunctionTest extends AbstractAggregat .andOnSecondInstance( new Object[]{"6.0;7.0;8.0;9.0;10.0"} ) - // All values: 1-10, p50 should be around 5 + // All values: 1-10, p50 (t-digest approximate) .whenQuery("select percentiletdigest(mv, 50) from testTable") - .thenResultIs("DOUBLE", "5.5"); + .thenResultIs("DOUBLE", "6.0"); } @Test @@ -66,7 +66,7 @@ public class PercentileTDigestMVAggregationFunctionTest extends AbstractAggregat ) .whenQuery("select sv, percentiletdigest(mv, 50) from testTable group by sv order by sv") .thenResultIs("STRING | DOUBLE", - "k1 | 5.5", // values: 1-10, p50 ~= 5.5 + "k1 | 6.0", // values: 1-10, p50 (t-digest approximate) "k2 | 30.0"); // values: 10, 20, 30, 40, 50, p50 ~= 30 } @@ -89,7 +89,7 @@ public 
class PercentileTDigestMVAggregationFunctionTest extends AbstractAggregat ) .whenQuery("select tags, percentiletdigest(nums, 50) from testTable group by tags order by tags") .thenResultIs("STRING | DOUBLE", - "tag1 | 3.5", // nums: 1, 2, 3, 4, 5, 6, p50 ~= 3.5 - "tag2 | 3.5"); // nums: 1, 2, 3, 4, 5, 6, p50 ~= 3.5 + "tag1 | 4.0", // nums: 1, 2, 3, 4, 5, 6, p50 (t-digest approximate) + "tag2 | 4.0"); // nums: 1, 2, 3, 4, 5, 6, p50 (t-digest approximate) } } diff --git a/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/PreAggregatedPercentileTDigestStarTreeV2Test.java b/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/PreAggregatedPercentileTDigestStarTreeV2Test.java index 356eed978cb..b1da943da15 100644 --- a/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/PreAggregatedPercentileTDigestStarTreeV2Test.java +++ b/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/PreAggregatedPercentileTDigestStarTreeV2Test.java @@ -30,8 +30,13 @@ import static org.testng.Assert.assertEquals; public class PreAggregatedPercentileTDigestStarTreeV2Test extends BaseStarTreeV2Test<Object, TDigest> { - // Use non-default compression - private static final double COMPRESSION = 50; + // Use high compression to keep star-tree vs non-star-tree quantile divergence within 0.5%. + // t-digest 3.3 changed centroid management (unit-weight first/last centroids, stricter tail interpolation), + // which increases merge-order sensitivity. The star-tree path does multi-level serialize/deserialize/merge + // while the non-star-tree path merges sequentially, causing quantile divergence at low compression values. + // Experimentally verified: compression >= 750 keeps error < 0.5% across 10 randomized runs. 
+ private static final double COMPRESSION = 750; + private static final double MAX_ERROR = 0.005; private static final int MAX_VALUE = 10000; @Override @@ -54,7 +59,7 @@ public class PreAggregatedPercentileTDigestStarTreeV2Test extends BaseStarTreeV2 @Override void assertAggregatedValue(TDigest starTreeResult, TDigest nonStarTreeResult) { - double delta = MAX_VALUE * 0.05; + double delta = MAX_VALUE * MAX_ERROR; for (int i = 0; i <= 100; i++) { assertEquals(starTreeResult.quantile(i / 100.0), nonStarTreeResult.quantile(i / 100.0), delta); } diff --git a/pom.xml b/pom.xml index df7c09beede..7c1ca9eb586 100644 --- a/pom.xml +++ b/pom.xml @@ -207,7 +207,7 @@ <hadoop-shaded-protobuf_3_25.version>1.5.0</hadoop-shaded-protobuf_3_25.version> <clearspring-stream-lib.version>2.9.8</clearspring-stream-lib.version> <datasketches-java.version>6.2.0</datasketches-java.version> - <t-digest.version>3.2</t-digest.version> + <t-digest.version>3.3</t-digest.version> <picocli.version>4.7.7</picocli.version> <tyrus-standalone-client.version>2.2.2</tyrus-standalone-client.version> <jopt-simple.version>5.0.4</jopt-simple.version> --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
