Re: [PR] HIVE-29275: Stats autogather calculates the min statistic incorrectly [hive]

via GitHub Mon, 17 Nov 2025 04:17:52 -0800


InvisibleProgrammer commented on code in PR #6177:
URL: https://github.com/apache/hive/pull/6177#discussion_r2533840128



##########
standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/DecimalComparatorTest.java:
##########
@@ -0,0 +1,446 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.metastore.columnstats;
+
+import com.google.common.math.BigIntegerMath;
+import org.apache.commons.lang3.exception.ExceptionUtils;
+import org.apache.hadoop.hive.metastore.api.Decimal;
+import org.apache.hadoop.hive.metastore.columnstats.DecimalComparator.Approach;
+import org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils;
+import org.junit.Test;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.math.RoundingMode;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HexFormat;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Random;
+import java.util.function.Consumer;
+
+import static 
org.apache.hadoop.hive.metastore.columnstats.DecimalComparator.PAD_NEG;
+import static 
org.apache.hadoop.hive.metastore.columnstats.DecimalComparator.PAD_POS;
+import static 
org.apache.hadoop.hive.metastore.columnstats.DecimalComparator.bitLog;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+public class DecimalComparatorTest {
+
+  public record ExpectApproach(Approach expected) implements 
DecimalComparator.ApproachInfoCallback {
+
+    @Override
+    public void accept(Approach approach) {
+      assertEquals(expected, approach);
+    }
+  }
+
+  public static final HexFormat FORMAT = HexFormat.ofDelimiter(" ");
+
+  public String toStr(Decimal val) {
+    return Objects.toString(new BigDecimal(new BigInteger(val.getUnscaled()), 
val.getScale()));
+  }
+
+  /**
+   * Create a Decimal.
+   * @param bigDecimal the value
+   * @param scaleDrift multiply by 10^scaleDrift * 10^-scaleDrift.
+   *                   The former goes to the significand, the latter to the 
exponent.
+   *                   In case of a negative scaleDrift, the significand is 
divided by 10^scaleDrift.
+   * @return the Decimal
+   */
+  public static Decimal createDecimal(BigDecimal bigDecimal, int scaleDrift) {
+    BigDecimal powTen = BigDecimal.TEN.pow(Math.abs(scaleDrift));
+    BigDecimal adapted = scaleDrift >= 0 ? bigDecimal.multiply(powTen) : 
bigDecimal.divide(powTen, RoundingMode.DOWN);
+    byte[] byteArray = adapted.unscaledValue().toByteArray();
+    ByteBuffer wrap = ByteBuffer.wrap(byteArray);
+    return new Decimal((short) (bigDecimal.scale() + scaleDrift), wrap);
+  }
+
+  public void check(String n1, String n2, Approach expectedApproach) {
+    check(n1, 0, n2, 0, expectedApproach);
+  }
+
+  public void check(String n1, int scaleDrift1, String n2, int scaleDrift2, 
Approach expectedApproach) {
+    BigDecimal bd1 = new BigDecimal(n1), bd2 = new BigDecimal(n2);
+    checkInner(bd1, scaleDrift1, bd2, scaleDrift2, expectedApproach);
+    checkInner(bd2, scaleDrift2, bd1, scaleDrift1, expectedApproach);
+  }
+
+  public int normalizeCompareTo(int cmp) {
+    return Integer.compare(cmp, 0);
+  }
+
+  public void checkInner(BigDecimal bd1, int scaleDrift1, BigDecimal bd2, int 
scaleDrift2, Approach expectedApproach) {
+    Decimal d1 = createDecimal(bd1, scaleDrift1);
+    Decimal d2 = createDecimal(bd2, scaleDrift2);
+
+    int expected = normalizeCompareTo(bd1.compareTo(bd2));
+    int actual = normalizeCompareTo(new DecimalComparator().compareInner(d1, 
d2, false,
+        expectedApproach == null ? null : new 
ExpectApproach(expectedApproach)));
+    if (expected != actual) {
+      System.out.println("compareTo result was wrong for " + toStr(d1) + " and 
" + toStr(
+          d2) + ": " + expected + ", but was " + actual);
+      assertEquals("compareTo result was wrong for " + bd1 + " and " + bd2, 
expected, actual);
+    }
+  }
+
+  @Test
+  public void testDecimal() {
+    interface Helper {
+      void check(String input, int scaleDrift, double expectedDouble, String 
expectedHiveStr, String expectedJavaStr);
+    }
+    Helper helper = (input, scaleDrift, expectedDouble, expectedHiveStr, 
expectedJavaStr) -> {
+      Decimal d3 = createDecimal(new BigDecimal(input), scaleDrift);
+      assertEquals(expectedDouble, MetaStoreServerUtils.decimalToDouble(d3), 
Double.MIN_VALUE);
+      assertEquals(expectedHiveStr, MetaStoreServerUtils.decimalToString(d3));
+      assertEquals(expectedJavaStr, toStr(d3));
+    };
+
+    helper.check("1.2", 0, 1.2, "1.2", "1.2");
+    helper.check("123.45", 0, 123.45, "123.45", "123.45");
+    helper.check("123.45", 1, 123.45, "123.45", "123.450");
+    helper.check("123.45", 10, 123.45, "123.45", "123.450000000000");
+    helper.check("1", 2, 1, "1", "1.00");
+  }
+
+  @Test
+  public void testApproachSign() {
+    check("1", "-1", Approach.SIGN);
+    check("123", "-123", Approach.SIGN);
+    check("123", "-11.1", Approach.SIGN);
+    check("123.23", "-11.1", Approach.SIGN);
+    check("123.23", "-11", Approach.SIGN);
+  }
+
+  @Test
+  public void testApproachSameScale() {
+    check("2", "8", Approach.EQSCALE);
+    check("-2", "-8", Approach.EQSCALE);
+    check("20", "80", Approach.EQSCALE);
+    check("50", "20", Approach.EQSCALE);
+    check("-20", "-80", Approach.EQSCALE);
+    check("-50", "-20", Approach.EQSCALE);
+    check("1000", "1001", Approach.EQSCALE);
+    check("-1000", "-1001", Approach.EQSCALE);
+    check("100000000", "100000001", Approach.EQSCALE);
+    check("-100000000", "-100000001", Approach.EQSCALE);
+
+    check("10", "1", Approach.EQSCALE);
+    check("1", "10", Approach.EQSCALE);
+    check("-10", "-1", Approach.EQSCALE);
+    check("-1", "-10", Approach.EQSCALE);
+
+    check("-10.2", "-123.2", Approach.EQSCALE);
+    check("-123.2", "-10.2", Approach.EQSCALE);
+
+    check("-2E+1", "-8E+1", Approach.EQSCALE);
+  }
+
+  @Test
+  public void testApproachBitLog() {
+    // positive values
+    check("3", 0, "20", -1, Approach.BITLOG_A);
+    check("9.123", "8113", Approach.BITLOG_A);
+    check("9123", "8.113", Approach.BITLOG_A);
+    check("9123", "1.113", Approach.BITLOG_A);
+    check("1123", "9.113", Approach.BITLOG_A);
+    check("10.21232", "123.2", Approach.BITLOG_A);
+
+    check("9.1", "8113.123", Approach.BITLOG_B);
+    check("9123.123", "8.1", Approach.BITLOG_B);
+    check("9123.123", "1.1", Approach.BITLOG_B);
+    check("1123.123", "9.1", Approach.BITLOG_B);
+
+    // negative values
+    check("-9.123", "-8113", Approach.BITLOG_A);
+    check("-9123", "-8.113", Approach.BITLOG_A);
+    check("-9123", "-1.113", Approach.BITLOG_A);
+    check("-1123", "-9.113", Approach.BITLOG_A);
+    check("-10.21232", "-123.2", Approach.BITLOG_A);
+
+    check("-9.1", "-8113.123", Approach.BITLOG_B);
+    check("-9123.123", "-8.1", Approach.BITLOG_B);
+    check("-9123.123", "-1.1", Approach.BITLOG_B);
+    check("-1123.123", "-9.1", Approach.BITLOG_B);
+
+    // with scale drift
+    check("10.21232", 2, "123.2", 0, Approach.BITLOG_A);
+    check("10.21232", 10, "123.2", 0, Approach.BITLOG_A);
+    check("10.21232", 0, "123.2", 3, Approach.BITLOG_A);
+    // scale (digits after dots) of n1 is 5, scale of n2 is 1,
+    // with a drift of 4 the second number also has 5 digits after the dot
+    check("10.21232", 0, "123.2", 4, Approach.EQSCALE);
+    check("10.21232", 0, "123.2", 5, Approach.BITLOG_B);
+    check("10.21232", 0, "123.2", 10, Approach.BITLOG_B);
+  }
+
+  @Test
+  public void testBitLog1() {
+    interface Helper {
+      void accept(int expected, int input);
+    }
+    Helper check = (expected, input) -> assertEquals(expected,
+        bitLog(BigInteger.valueOf(input).toByteArray(), input >= 0 ? PAD_POS : 
PAD_NEG));
+
+    check.accept(1, 1);
+    check.accept(2, 2);
+    check.accept(2, 3);
+    check.accept(8, 255);
+    check.accept(9, 256);
+    check.accept(9, 511);
+    check.accept(10, 512);
+    check.accept(10, 1023);
+    check.accept(11, 1024);
+
+    check.accept(1, -1);
+    check.accept(2, -2);
+    check.accept(2, -3);
+    check.accept(8, -255);
+    check.accept(9, -256);
+    check.accept(9, -511);
+    check.accept(10, -512);
+    check.accept(10, -1023);
+    check.accept(11, -1024);
+
+    // log2(5800) is about 12.5, bitLog floor(log(...))+1
+    check.accept(13, 5800);
+    check.accept(13, -5800);
+  }
+
+  @Test
+  public void testBitLog2() {
+    interface Helper {
+      void accept(int expected, String input, byte pad);
+    }
+    Helper check = (expected, input, pad) -> assertEquals(expected, 
bitLog(FORMAT.parseHex(input), pad));
+
+    check.accept(8, "FF 16", PAD_NEG);
+
+    check.accept(0, "00 00 00", PAD_POS);
+    check.accept(1, "00 00 01", PAD_POS);
+    check.accept(2, "00 00 02", PAD_POS);
+    check.accept(3, "00 00 04", PAD_POS);
+    check.accept(4, "00 00 08", PAD_POS);
+    check.accept(5, "00 00 10", PAD_POS);
+    check.accept(5, "00 00 11", PAD_POS);
+    check.accept(9, "00 01 11", PAD_POS);
+    check.accept(17, "01 11 11", PAD_POS);
+    check.accept(17, "01 00 00", PAD_POS);
+    check.accept(17, "00 00 00 01 00 00", PAD_POS);
+
+    check.accept(1, "FF FF FF", PAD_NEG);
+    check.accept(2, "FF FF FE", PAD_NEG);
+    check.accept(2, "FF FF FD", PAD_NEG);
+    check.accept(3, "FF FF FB", PAD_NEG);
+    check.accept(4, "FF FF F7", PAD_NEG);
+    check.accept(5, "FF FF EF", PAD_NEG);
+    check.accept(5, "FF FF EE", PAD_NEG);
+    check.accept(9, "FF FE EE", PAD_NEG);
+    check.accept(17, "FE EE EE", PAD_NEG);
+    check.accept(17, "FE FF FF", PAD_NEG);
+    check.accept(17, "FF FF FF FE FF FF", PAD_NEG);
+  }
+
+  @Test
+  public void testBitLenPostcondition() {
+    // define helper which executes the asserts
+    Consumer<BigInteger> check = x -> {
+      int bl = bitLog(x.toByteArray(), x.signum() == -1 ? PAD_NEG : PAD_POS);
+      int log = BigIntegerMath.log2(x.abs(), RoundingMode.DOWN);
+      if (log < 0) {
+        fail("Log may not be negative: " + x);
+      }
+
+      if (log + 1 != bl) {
+        fail(x.toString());
+      }
+      // check inequalities
+      if (!(bl - 1 <= log)) {
+        fail(x.toString());
+      }
+      if (!(log < bl)) {
+        fail(x.toString());
+      }
+    };
+
+    // check all integers from -18 to 18 (inclusive)
+    for (int i = -18; i <= 18; i++) {
+      if (i == 0) {
+        continue;
+      }
+      BigInteger x = BigInteger.valueOf(i);
+      check.accept(x);
+    }
+
+    // check powers of two from 2^5 to 2^200 (about 60 decimal digits)
+    for (int i = 5; i < 200; i++) {
+      BigInteger p = BigInteger.TWO.pow(i);
+      // check negative and positive powers of two: -2^p and +2^p
+      for (int neg = 0; neg < 2; neg++) {
+        if (neg == 1) {
+          p = p.negate();
+        }
+
+        // check the neighborhood of each power of two
+        for (int j = -2; j <= 2; j++) {
+          BigInteger x = p.add(BigInteger.valueOf(j));
+          if (x.compareTo(BigInteger.ZERO) == 0) {
+            continue;
+          }
+          check.accept(x);
+        }
+      }
+    }
+  }
+
+  @Test
+  public void testRandomized() {

Review Comment:
   It is an interesting question: 
   
   I can agree with both of you: As @kasakrisz mentioned, it is a painful 
feeling when somebody has a whole different change and it suddenly fails with a 
test case like this. What should be the expected engineering action in that 
case? 
   And I can agree with @thomasrebele as well - as we cannot possibly test all 
the possible numbers, adding a randomization can identify some edge cases. 
   The question is what should be the expected action if it fails once: If I 
were a decision maker, I would immediately disable the test case and create a 
ticket to solve the test issue. so that my change can move forward. Considering 
our workflow it would led to a couple of days delay. 
   
   I was curious and checked how OpenJDK tests BigDecimal: 
https://github.com/openjdk/jdk/blob/master/test/jdk/java/math/BigDecimal/CompareToTests.java
   
   It seems they have a parameterized test and no randomization. 
   And I have no idea why it runs in a main method - maybe they just ran the 
tests on release and related code change? I don't know. 
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] HIVE-29275: Stats autogather calculates the min statistic incorrectly [hive]

Reply via email to