okumin commented on code in PR #6294:
URL: https://github.com/apache/hive/pull/6294#discussion_r2922700307
##########
ql/src/test/org/apache/hadoop/hive/ql/optimizer/stats/annotation/TestStatsRulesProcFactory.java:
##########
@@ -631,4 +643,153 @@ private static ColStatistics createColStatistics(
return colStatistics;
}
+
+ /**
+ * Test that computeAggregateColumnMinMax properly handles numNulls=-1
(unknown).
+ * With the fix, numNulls=-1 should be treated as 0, giving valuesCount =
numRows.
+ * Without the fix, valuesCount = numRows - (-1) = numRows + 1 (wrong).
+ */
+ @Test
+ public void testComputeAggregateColumnMinMaxWithUnknownNumNulls() throws
Exception {
+ // Get the private static method via reflection
+ Class<?> groupByStatsRuleClass = Class.forName(
+ StatsRulesProcFactory.class.getName() + "$GroupByStatsRule");
+
+ Method method = groupByStatsRuleClass.getDeclaredMethod(
+ "computeAggregateColumnMinMax",
+ ColStatistics.class, HiveConf.class, AggregationDesc.class,
String.class, Statistics.class);
+ method.setAccessible(true);
Review Comment:
Could you please use `@VisibleForTesting`? It is more aligned with the
coding style of Hive and convenient when we use an IDE.
https://github.com/apache/hive/blob/7ef64d3e4662ce92f7008ef4c4a6dc6139eb7143/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java#L1241
##########
ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java:
##########
@@ -240,29 +240,30 @@ public void setColumnStats(List<ColStatistics> colStats) {
}
public void addToColumnStats(List<ColStatistics> colStats) {
-
+ if (colStats == null) {
+ return;
+ }
if (columnStats == null) {
columnStats = Maps.newHashMap();
}
- if (colStats != null) {
- for (ColStatistics cs : colStats) {
- ColStatistics updatedCS = null;
- if (cs != null) {
-
- String key = cs.getColumnName();
- // if column statistics for a column is already found then merge the
statistics
- if (columnStats.containsKey(key) && columnStats.get(key) != null) {
- updatedCS = columnStats.get(key);
- updatedCS.setAvgColLen(Math.max(updatedCS.getAvgColLen(),
cs.getAvgColLen()));
- updatedCS.setNumNulls(StatsUtils.safeAdd(updatedCS.getNumNulls(),
cs.getNumNulls()));
- updatedCS.setCountDistint(Math.max(updatedCS.getCountDistint(),
cs.getCountDistint()));
- columnStats.put(key, updatedCS);
- } else {
- columnStats.put(key, cs);
- }
- }
+ for (ColStatistics cs : colStats) {
+ if (cs == null) {
+ continue;
+ }
+ String key = cs.getColumnName();
+ ColStatistics existing = columnStats.get(key);
+ if (existing == null) {
+ columnStats.put(key, cs);
Review Comment:
Let's follow this one.
https://sonarcloud.io/project/issues?sinceLeakPeriod=true&issueStatuses=OPEN%2CCONFIRMED&pullRequest=6294&id=apache_hive&open=AZzhEkKWBs171Xl6dTm0
##########
ql/src/test/org/apache/hadoop/hive/ql/optimizer/stats/annotation/TestStatsRulesProcFactory.java:
##########
@@ -631,4 +643,153 @@ private static ColStatistics createColStatistics(
return colStatistics;
}
+
+ /**
+ * Test that computeAggregateColumnMinMax properly handles numNulls=-1
(unknown).
+ * With the fix, numNulls=-1 should be treated as 0, giving valuesCount =
numRows.
+ * Without the fix, valuesCount = numRows - (-1) = numRows + 1 (wrong).
+ */
+ @Test
+ public void testComputeAggregateColumnMinMaxWithUnknownNumNulls() throws
Exception {
+ // Get the private static method via reflection
+ Class<?> groupByStatsRuleClass = Class.forName(
+ StatsRulesProcFactory.class.getName() + "$GroupByStatsRule");
+
+ Method method = groupByStatsRuleClass.getDeclaredMethod(
+ "computeAggregateColumnMinMax",
+ ColStatistics.class, HiveConf.class, AggregationDesc.class,
String.class, Statistics.class);
+ method.setAccessible(true);
+
+ // Create output ColStatistics for the COUNT result
+ ColStatistics cs = new ColStatistics("_col0", "bigint");
+
+ // Create HiveConf
+ HiveConf conf = new HiveConf();
+
+ // Create parent column stats with numNulls=-1 (unknown) and Range(1, 100)
+ ColStatistics parentColStats = new ColStatistics("val", "int");
+ parentColStats.setNumNulls(-1); // unknown numNulls - this is what we're
testing
+ parentColStats.setCountDistint(100);
+ parentColStats.setRange(1, 100);
+
+ // Create parent Statistics with 100 rows
+ Statistics parentStats = new Statistics(100, 400, 400, 400);
+ parentStats.addToColumnStats(Collections.singletonList(parentColStats));
+
+ // Create ExprNodeColumnDesc for the "val" column
+ ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(
+ TypeInfoFactory.intTypeInfo, "val", "t", false);
+
+ // Create AggregationDesc for COUNT(val) using no-arg constructor to avoid
NPE
+ AggregationDesc agg = new AggregationDesc();
+ agg.setGenericUDAFName("count");
+ agg.setParameters(Collections.singletonList(colExpr));
+ agg.setDistinct(false);
+ agg.setMode(GenericUDAFEvaluator.Mode.COMPLETE);
+
+ // Call the method
+ method.invoke(null, cs, conf, agg, "bigint", parentStats);
+
+ // Verify: With the fix, COUNT Range should be (0, 100)
+ // numNulls=-1 is treated as 0, so valuesCount = 100 - 0 = 100
+ // Without the fix, valuesCount = 100 - (-1) = 101 (WRONG)
+ assertNotNull("Range should be set on COUNT column", cs.getRange());
+ assertEquals("COUNT min should be 0", 0L, ((Number)
cs.getRange().minValue).longValue());
+ assertEquals("COUNT max should be 100 (numRows), not 101",
+ 100L, ((Number) cs.getRange().maxValue).longValue());
+ }
+
+ @Test
+ public void testComputeAggregateColumnMinMaxWithKnownNumNulls() throws
Exception {
+ // Get the private static method via reflection
+ Class<?> groupByStatsRuleClass = Class.forName(
+ StatsRulesProcFactory.class.getName() + "$GroupByStatsRule");
+
+ Method method = groupByStatsRuleClass.getDeclaredMethod(
+ "computeAggregateColumnMinMax",
+ ColStatistics.class, HiveConf.class, AggregationDesc.class,
String.class, Statistics.class);
+ method.setAccessible(true);
+
+ ColStatistics cs = new ColStatistics("_col0", "bigint");
+ HiveConf conf = new HiveConf();
+
+ // Create parent column stats with numNulls=20 (known) and Range
+ ColStatistics parentColStats = new ColStatistics("val", "int");
+ parentColStats.setNumNulls(20); // known numNulls
+ parentColStats.setCountDistint(80);
+ parentColStats.setRange(1, 100);
+
+ Statistics parentStats = new Statistics(100, 400, 400, 400);
+ parentStats.addToColumnStats(Collections.singletonList(parentColStats));
+
+ ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(
+ TypeInfoFactory.intTypeInfo, "val", "t", false);
+ AggregationDesc agg = new AggregationDesc();
+ agg.setGenericUDAFName("count");
+ agg.setParameters(Collections.singletonList(colExpr));
+ agg.setDistinct(false);
+ agg.setMode(GenericUDAFEvaluator.Mode.COMPLETE);
+
+ method.invoke(null, cs, conf, agg, "bigint", parentStats);
+
+ // With known numNulls=20, valuesCount = 100 - 20 = 80
+ assertNotNull("Range should be set", cs.getRange());
+ assertEquals(0L, ((Number) cs.getRange().minValue).longValue());
+ assertEquals("COUNT max should be 80 (numRows - numNulls)",
+ 80L, ((Number) cs.getRange().maxValue).longValue());
+ }
+
+ /**
+ * Test that JoinStatsRule.updateNumNulls preserves unknown numNulls (-1).
+ * With the fix, when numNulls is -1 (unknown), the method returns early
without modification.
+ * Without the fix, LEFT_OUTER_JOIN would calculate: newNumNulls =
oldNumNulls + leftUnmatchedRows = -1 + 100 = 99
+ */
+ @Test
+ public void testUpdateNumNullsPreservesUnknownNumNulls() throws Exception {
+ // Get the private JoinStatsRule inner class
+ Class<?> joinStatsRuleClass = Class.forName(
+ StatsRulesProcFactory.class.getName() + "$JoinStatsRule");
+
+ // Create an instance of JoinStatsRule
+ Constructor<?> ctor = joinStatsRuleClass.getDeclaredConstructor();
+ ctor.setAccessible(true);
+ Object joinStatsRule = ctor.newInstance();
Review Comment:
Ditto
##########
ql/src/test/org/apache/hadoop/hive/ql/optimizer/stats/annotation/TestStatsRulesProcFactory.java:
##########
@@ -631,4 +643,153 @@ private static ColStatistics createColStatistics(
return colStatistics;
}
+
+ /**
+ * Test that computeAggregateColumnMinMax properly handles numNulls=-1
(unknown).
+ * With the fix, numNulls=-1 should be treated as 0, giving valuesCount =
numRows.
+ * Without the fix, valuesCount = numRows - (-1) = numRows + 1 (wrong).
+ */
+ @Test
+ public void testComputeAggregateColumnMinMaxWithUnknownNumNulls() throws
Exception {
+ // Get the private static method via reflection
+ Class<?> groupByStatsRuleClass = Class.forName(
+ StatsRulesProcFactory.class.getName() + "$GroupByStatsRule");
+
+ Method method = groupByStatsRuleClass.getDeclaredMethod(
+ "computeAggregateColumnMinMax",
+ ColStatistics.class, HiveConf.class, AggregationDesc.class,
String.class, Statistics.class);
+ method.setAccessible(true);
+
+ // Create output ColStatistics for the COUNT result
+ ColStatistics cs = new ColStatistics("_col0", "bigint");
+
+ // Create HiveConf
+ HiveConf conf = new HiveConf();
+
+ // Create parent column stats with numNulls=-1 (unknown) and Range(1, 100)
+ ColStatistics parentColStats = new ColStatistics("val", "int");
+ parentColStats.setNumNulls(-1); // unknown numNulls - this is what we're
testing
+ parentColStats.setCountDistint(100);
+ parentColStats.setRange(1, 100);
+
+ // Create parent Statistics with 100 rows
+ Statistics parentStats = new Statistics(100, 400, 400, 400);
+ parentStats.addToColumnStats(Collections.singletonList(parentColStats));
+
+ // Create ExprNodeColumnDesc for the "val" column
+ ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(
+ TypeInfoFactory.intTypeInfo, "val", "t", false);
+
+ // Create AggregationDesc for COUNT(val) using no-arg constructor to avoid
NPE
+ AggregationDesc agg = new AggregationDesc();
+ agg.setGenericUDAFName("count");
+ agg.setParameters(Collections.singletonList(colExpr));
+ agg.setDistinct(false);
+ agg.setMode(GenericUDAFEvaluator.Mode.COMPLETE);
+
+ // Call the method
+ method.invoke(null, cs, conf, agg, "bigint", parentStats);
+
+ // Verify: With the fix, COUNT Range should be (0, 100)
+ // numNulls=-1 is treated as 0, so valuesCount = 100 - 0 = 100
+ // Without the fix, valuesCount = 100 - (-1) = 101 (WRONG)
+ assertNotNull("Range should be set on COUNT column", cs.getRange());
+ assertEquals("COUNT min should be 0", 0L, ((Number)
cs.getRange().minValue).longValue());
+ assertEquals("COUNT max should be 100 (numRows), not 101",
+ 100L, ((Number) cs.getRange().maxValue).longValue());
+ }
+
+ @Test
+ public void testComputeAggregateColumnMinMaxWithKnownNumNulls() throws
Exception {
+ // Get the private static method via reflection
+ Class<?> groupByStatsRuleClass = Class.forName(
+ StatsRulesProcFactory.class.getName() + "$GroupByStatsRule");
+
+ Method method = groupByStatsRuleClass.getDeclaredMethod(
+ "computeAggregateColumnMinMax",
+ ColStatistics.class, HiveConf.class, AggregationDesc.class,
String.class, Statistics.class);
+ method.setAccessible(true);
Review Comment:
Ditto
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]