kbendick commented on code in PR #5513:
URL: https://github.com/apache/iceberg/pull/5513#discussion_r945207959


##########
spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkBucketFunction.java:
##########
@@ -0,0 +1,330 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.spark.sql;
+
+import java.nio.charset.StandardCharsets;
+import org.apache.iceberg.AssertHelpers;
+import org.apache.iceberg.relocated.com.google.common.io.BaseEncoding;
+import org.apache.iceberg.spark.SparkTestBaseWithCatalog;
+import org.apache.spark.sql.AnalysisException;
+import org.assertj.core.api.Assertions;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestSparkBucketFunction extends SparkTestBaseWithCatalog {
+  public TestSparkBucketFunction() {}
+
+  @Before
+  public void useCatalog() {
+    sql("USE %s", catalogName);
+  }
+
+  @Test
+  public void testBucketIntegers() {
+    Assert.assertEquals(
+        "Byte type should bucket similarly to integer",
+        3,
+        scalarSql("SELECT system.bucket(10, 8Y)"));
+    Assert.assertEquals(
+        "Short type should bucket similarly to integer",
+        3,
+        scalarSql("SELECT system.bucket(10, 8S)"));
+    // Integers
+    Assert.assertEquals(3, scalarSql("SELECT system.bucket(10, 8)"));
+    Assert.assertEquals(79, scalarSql("SELECT system.bucket(100, 34)"));
+    Assert.assertNull(scalarSql("SELECT system.bucket(1, CAST(null AS INT))"));
+  }
+
+  @Test
+  public void testBucketDates() {
+    Assert.assertEquals(3, scalarSql("SELECT system.bucket(10, 
date('1970-01-09'))"));
+    Assert.assertEquals(79, scalarSql("SELECT system.bucket(100, 
date('1970-02-04'))"));
+    Assert.assertNull(scalarSql("SELECT system.bucket(1, CAST(null AS 
DATE))"));
+  }
+
+  @Test
+  public void testBucketLong() {
+    Assert.assertEquals(79, scalarSql("SELECT system.bucket(100, 34L)"));
+    Assert.assertEquals(76, scalarSql("SELECT system.bucket(100, 0L)"));
+    Assert.assertEquals(97, scalarSql("SELECT system.bucket(100, -34L)"));
+    Assert.assertEquals(0, scalarSql("SELECT system.bucket(2, -1L)"));
+    Assert.assertNull(scalarSql("SELECT system.bucket(2, CAST(null AS 
LONG))"));
+  }
+
+  @Test
+  public void testBucketDecimal() {
+    Assert.assertEquals(56, scalarSql("SELECT system.bucket(64, CAST('12.34' 
as DECIMAL(9, 2)))"));
+    Assert.assertEquals(13, scalarSql("SELECT system.bucket(18, CAST('12.30' 
as DECIMAL(9, 2)))"));
+    Assert.assertEquals(2, scalarSql("SELECT system.bucket(16, CAST('12.999' 
as DECIMAL(9, 3)))"));
+    Assert.assertEquals(21, scalarSql("SELECT system.bucket(32, CAST('0.05' as 
DECIMAL(5, 2)))"));
+    Assert.assertEquals(85, scalarSql("SELECT system.bucket(128, CAST('0.05' 
as DECIMAL(9, 2)))"));
+    Assert.assertEquals(3, scalarSql("SELECT system.bucket(18, CAST('0.05' as 
DECIMAL(9, 2)))"));
+
+    Assert.assertNull(
+        "Null input should return null",
+        scalarSql("SELECT system.bucket(2, CAST(null AS decimal))"));
+  }
+
+  @Test
+  public void testBucketTimestamp() {
+    Assert.assertEquals(
+        99, scalarSql("SELECT system.bucket(100, TIMESTAMP '1997-01-01 
00:00:00 UTC+00:00')"));
+    Assert.assertEquals(
+        85, scalarSql("SELECT system.bucket(100, TIMESTAMP '1997-01-31 
09:26:56 UTC+00:00')"));
+    Assert.assertEquals(
+        62, scalarSql("SELECT system.bucket(100, TIMESTAMP '2022-08-08 
00:00:00 UTC+00:00')"));
+    Assert.assertNull(scalarSql("SELECT system.bucket(2, CAST(null AS 
timestamp))"));
+  }
+
+  @Test
+  public void testBucketString() {
+    Assert.assertEquals(4, scalarSql("SELECT system.bucket(5, 'abcdefg')"));
+    Assert.assertEquals(122, scalarSql("SELECT system.bucket(128, 'abc')"));
+    Assert.assertEquals(54, scalarSql("SELECT system.bucket(64, 'abcde')"));
+    Assert.assertEquals(8, scalarSql("SELECT system.bucket(12, '测试')"));
+    Assert.assertEquals(1, scalarSql("SELECT system.bucket(16, '测试raul试测')"));
+    Assert.assertEquals(
+        "Varchar should work like string",
+        1,
+        scalarSql("SELECT system.bucket(16, CAST('测试raul试测' AS varchar(8)))"));
+    Assert.assertEquals(
+        "Char should work like string",
+        1,
+        scalarSql("SELECT system.bucket(16, CAST('测试raul试测' AS char(8)))"));
+    Assert.assertEquals(
+        "Should not fail on the empty string", 0, scalarSql("SELECT 
system.bucket(16, '')"));
+    Assert.assertNull(
+        "Null input should return null as output",
+        scalarSql("SELECT system.bucket(16, CAST(null AS string))"));
+  }
+
+  @Test
+  public void testBucketBinary() {
+    Assert.assertEquals(
+        1, scalarSql("SELECT system.bucket(10, 
X'0102030405060708090a0b0c0d0e0f')"));
+    Assert.assertEquals(10, scalarSql("SELECT system.bucket(12, %s)", 
asBytesLiteral("abcdefg")));
+    Assert.assertEquals(13, scalarSql("SELECT system.bucket(18, %s)", 
asBytesLiteral("abc\0\0")));
+    Assert.assertEquals(42, scalarSql("SELECT system.bucket(48, %s)", 
asBytesLiteral("abc")));
+    Assert.assertEquals(3, scalarSql("SELECT system.bucket(16, %s)", 
asBytesLiteral("测试_")));
+
+    Assert.assertNull(
+        "Null input should return null as output",
+        scalarSql("SELECT system.bucket(100, CAST(null AS binary))"));
+  }
+
+  @Test
+  public void testNumBucketsAcceptsShortAndByte() {
+    Assert.assertEquals(
+        "Short types should be usable for the number of buckets field",
+        1,
+        scalarSql("SELECT system.bucket(5S, 1L)"));
+
+    Assert.assertEquals(
+        "Byte types should be allowed for the number of buckets field",
+        1,
+        scalarSql("SELECT system.bucket(5Y, 1)"));
+  }
+
+  @Test
+  public void testWrongNumberOfArguments() {
+    AssertHelpers.assertThrows(
+        "Function resolution should not work with zero arguments",
+        AnalysisException.class,
+        "Function 'bucket' cannot process input: (): Wrong number of inputs 
(expected numBuckets and value)",
+        () -> scalarSql("SELECT system.bucket()"));
+
+    AssertHelpers.assertThrows(
+        "Function resolution should not work with only one argument",
+        AnalysisException.class,
+        "Function 'bucket' cannot process input: (int): Wrong number of inputs 
(expected numBuckets and value)",
+        () -> scalarSql("SELECT system.bucket(1)"));
+
+    AssertHelpers.assertThrows(
+        "Function resolution should not work with more than two arguments",
+        AnalysisException.class,
+        "Function 'bucket' cannot process input: (int, bigint, int): Wrong 
number of inputs (expected numBuckets and value)",
+        () -> scalarSql("SELECT system.bucket(1, 1L, 1)"));
+  }
+
+  @Test
+  public void testInvalidTypesCannotBeUsedForNumberOfBuckets() {
+    AssertHelpers.assertThrows(
+        "Decimal type should not be coercible to the number of buckets",
+        AnalysisException.class,
+        "Function 'bucket' cannot process input: (decimal(9,2), int): Expected 
number of buckets to be tinyint, shortint or int",
+        () -> scalarSql("SELECT system.bucket(CAST('12.34' as DECIMAL(9, 2)), 
10)"));
+
+    AssertHelpers.assertThrows(
+        "Long type should not be coercible to the number of buckets",
+        AnalysisException.class,
+        "Function 'bucket' cannot process input: (bigint, int): Expected 
number of buckets to be tinyint, shortint or int",
+        () -> scalarSql("SELECT system.bucket(12L, 10)"));
+
+    AssertHelpers.assertThrows(
+        "String type should not be coercible to the number of buckets",
+        AnalysisException.class,
+        "Function 'bucket' cannot process input: (string, int): Expected 
number of buckets to be tinyint, shortint or int",
+        () -> scalarSql("SELECT system.bucket('5', 10)"));
+
+    AssertHelpers.assertThrows(
+        "Interval year to month  type should not be coercible to the number of 
buckets",
+        AnalysisException.class,
+        "Function 'bucket' cannot process input: (interval year to month, 
int): Expected number of buckets to be tinyint, shortint or int",
+        () -> scalarSql("SELECT system.bucket(INTERVAL '100-00' YEAR TO MONTH, 
10)"));
+
+    AssertHelpers.assertThrows(
+        "Interval day-time type should not be coercible to the number of 
buckets",
+        AnalysisException.class,
+        "Function 'bucket' cannot process input: (interval day to second, 
int): Expected number of buckets to be tinyint, shortint or int",
+        () -> scalarSql("SELECT system.bucket(CAST('11 23:4:0' AS INTERVAL DAY 
TO SECOND), 10)"));
+  }
+
+  @Test
+  public void testInvalidTypesForBucketColumn() {
+    AssertHelpers.assertThrows(
+        "Double type should not be bucketable",
+        AnalysisException.class,
+        "Function 'bucket' cannot process input: (int, float): Expected 
bucketed column to be date, tinyint, smallint, int, bigint, decimal, timestamp, 
string, or binary",
+        () -> scalarSql("SELECT system.bucket(10, cast(12.3456 as float))"));
+
+    AssertHelpers.assertThrows(
+        "Double type should not be bucketable",
+        AnalysisException.class,
+        "Function 'bucket' cannot process input: (int, double): Expected 
bucketed column to be date, tinyint, smallint, int, bigint, decimal, timestamp, 
string, or binary",
+        () -> scalarSql("SELECT system.bucket(10, cast(12.3456 as double))"));
+
+    AssertHelpers.assertThrows(
+        "Boolean type should not be bucketable",
+        AnalysisException.class,
+        "Function 'bucket' cannot process input: (int, boolean)",
+        () -> scalarSql("SELECT system.bucket(10, true)"));
+
+    AssertHelpers.assertThrows(
+        "Map types should not be bucketable",
+        AnalysisException.class,
+        "Function 'bucket' cannot process input: (int, map<int,int>)",
+        () -> scalarSql("SELECT system.bucket(10, map(1, 1))"));
+
+    AssertHelpers.assertThrows(
+        "Array types should not be bucketable",
+        AnalysisException.class,
+        "Function 'bucket' cannot process input: (int, array<bigint>)",
+        () -> scalarSql("SELECT system.bucket(10, array(1L))"));
+
+    AssertHelpers.assertThrows(
+        "Interval year-to-month type should not be bucketable",
+        AnalysisException.class,
+        "Function 'bucket' cannot process input: (int, interval year to 
month)",
+        () -> scalarSql("SELECT system.bucket(10, INTERVAL '100-00' YEAR TO 
MONTH)"));
+
+    AssertHelpers.assertThrows(
+        "Interval day-time type should not be bucketable",
+        AnalysisException.class,
+        "Function 'bucket' cannot process input: (int, interval day to 
second)",
+        () -> scalarSql("SELECT system.bucket(10, CAST('11 23:4:0' AS INTERVAL 
DAY TO SECOND))"));
+  }
+
+  @Test
+  public void testMagicFunctionsResolveForTinyIntAndSmallIntNumberOfBuckets() {
+    // Magic functions have staticinvoke in the explain output.
+    // Non-magic calls use ApplyFunctionExpression instead and go through produceResults.
+    String tinyIntWidthExplain = (String) scalarSql("EXPLAIN EXTENDED SELECT 
system.bucket(1Y, 6)");
+    Assertions.assertThat(tinyIntWidthExplain)
+        .contains("cast(1 as int)")
+        .contains("staticinvoke(class 
org.apache.iceberg.spark.functions.BucketFunction$BucketInt");

Review Comment:
   I'll make sure there are tests for the result when using smallint / tinyint for the width.
   
   This test is just to ensure that we don't need to add classes that specifically accept smallint / tinyint to get the benefit of the magic function.
   
   Since I've verified that, though, I can remove these assertions, as the explain output is quite likely to change across Spark versions.


