aokolnychyi commented on code in PR #5601:
URL: https://github.com/apache/iceberg/pull/5601#discussion_r959913091
##########
api/src/main/java/org/apache/iceberg/transforms/Bucket.java:
##########
@@ -159,111 +188,61 @@ public Type getResultType(Type sourceType) {
return Types.IntegerType.get();
}
- private static class BucketInteger extends Bucket<Integer> {
+ private static class BucketInteger extends Bucket<Integer>
+ implements SerializableFunction<Integer, Integer> {
+
private BucketInteger(int numBuckets) {
super(numBuckets);
}
@Override
- public int hash(Integer value) {
+ protected int hash(Integer value) {
return BucketUtil.hash(value);
}
-
- @Override
- public boolean canTransform(Type type) {
Review Comment:
I am not sure removing `canTransform` from each specific `BucketXXX` class
was absolutely necessary. We have an unbound generic `Bucket`, which can
transform all types, but `BucketInteger` can't transform `String`, for instance.
##########
api/src/main/java/org/apache/iceberg/transforms/Dates.java:
##########
@@ -27,44 +27,67 @@
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.UnboundPredicate;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
+import org.apache.iceberg.util.SerializableFunction;
enum Dates implements Transform<Integer, Integer> {
YEAR(ChronoUnit.YEARS, "year"),
MONTH(ChronoUnit.MONTHS, "month"),
DAY(ChronoUnit.DAYS, "day");
+ static class Apply implements SerializableFunction<Integer, Integer> {
+ private final ChronoUnit granularity;
+
+ Apply(ChronoUnit granularity) {
+ this.granularity = granularity;
+ }
+
+ @Override
+ public Integer apply(Integer days) {
+ if (days == null) {
+ return null;
+ }
+
+ if (granularity == ChronoUnit.DAYS) {
+ return days;
+ }
+
+ if (days >= 0) {
+ LocalDate date = EPOCH.plusDays(days);
+ return (int) granularity.between(EPOCH, date);
+ } else {
+ // add 1 day to the value to account for the case where there is exactly 1 unit between the
+ // date and epoch
+ // because the result will always be decremented.
Review Comment:
nit: I know you probably copied this from another place, which was
auto-formatted, but I think the comment can fit on two lines.
##########
api/src/main/java/org/apache/iceberg/transforms/Years.java:
##########
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.transforms;
+
+import java.io.ObjectStreamException;
+import org.apache.iceberg.types.Type;
+import org.apache.iceberg.types.Types;
+
+class Years<T> extends TimeTransform<T> {
+ private static final Years<?> INSTANCE = new Years<>();
+
+ @SuppressWarnings("unchecked")
+ static <T> Years<T> get() {
+ return (Years<T>) INSTANCE;
+ }
+
+ @Override
+ @SuppressWarnings("unchecked")
+ protected Transform<T, Integer> toEnum(Type type) {
+ switch (type.typeId()) {
+ case DATE:
+ return (Transform<T, Integer>) Dates.YEAR;
+ case TIMESTAMP:
+ return (Transform<T, Integer>) Timestamps.YEAR;
+ }
+ throw new IllegalArgumentException("Unsupported type: " + type);
Review Comment:
nit: Here too.
##########
api/src/main/java/org/apache/iceberg/transforms/Timestamps.java:
##########
@@ -27,48 +27,72 @@
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.UnboundPredicate;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
+import org.apache.iceberg.util.SerializableFunction;
enum Timestamps implements Transform<Long, Integer> {
YEAR(ChronoUnit.YEARS, "year"),
MONTH(ChronoUnit.MONTHS, "month"),
DAY(ChronoUnit.DAYS, "day"),
HOUR(ChronoUnit.HOURS, "hour");
+ static class Apply implements SerializableFunction<Long, Integer> {
+ private final ChronoUnit granularity;
+
+ Apply(ChronoUnit granularity) {
+ this.granularity = granularity;
+ }
+
+ @Override
+ public Integer apply(Long timestampMicros) {
+ if (timestampMicros == null) {
+ return null;
+ }
+
+ if (timestampMicros >= 0) {
+ OffsetDateTime timestamp =
+ Instant.ofEpochSecond(
+ Math.floorDiv(timestampMicros, 1_000_000),
+ Math.floorMod(timestampMicros, 1_000_000) * 1000)
+ .atOffset(ZoneOffset.UTC);
+ return (int) granularity.between(EPOCH, timestamp);
+ } else {
+ // add 1 micro to the value to account for the case where there is exactly 1 unit between
+ // the
Review Comment:
nit: Formatting of the comment.
##########
api/src/main/java/org/apache/iceberg/PartitionSpec.java:
##########
@@ -465,11 +464,7 @@ public Builder day(String sourceName, String targetName) {
checkAndAddPartitionName(targetName);
Types.NestedField sourceColumn = findSourceColumn(sourceName);
PartitionField field =
- new PartitionField(
- sourceColumn.fieldId(),
- nextFieldId(),
- targetName,
- Transforms.day(sourceColumn.type()));
+ new PartitionField(sourceColumn.fieldId(), nextFieldId(), targetName, Transforms.day());
Review Comment:
The new line length is quite unfortunate.
##########
api/src/main/java/org/apache/iceberg/transforms/Months.java:
##########
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.transforms;
+
+import java.io.ObjectStreamException;
+import org.apache.iceberg.types.Type;
+import org.apache.iceberg.types.Types;
+
+public class Months<T> extends TimeTransform<T> {
+ private static final Months<?> INSTANCE = new Months<>();
+
+ @SuppressWarnings("unchecked")
+ static <T> Months<T> get() {
+ return (Months<T>) INSTANCE;
+ }
+
+ @Override
+ @SuppressWarnings("unchecked")
+ protected Transform<T, Integer> toEnum(Type type) {
+ switch (type.typeId()) {
+ case DATE:
+ return (Transform<T, Integer>) Dates.MONTH;
+ case TIMESTAMP:
+ return (Transform<T, Integer>) Timestamps.MONTH;
+ }
+ throw new IllegalArgumentException("Unsupported type: " + type);
Review Comment:
nit: Here as well.
##########
api/src/main/java/org/apache/iceberg/transforms/Days.java:
##########
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.transforms;
+
+import java.io.ObjectStreamException;
+import org.apache.iceberg.types.Type;
+import org.apache.iceberg.types.Types;
+
+public class Days<T> extends TimeTransform<T> {
+ private static final Days<?> INSTANCE = new Days<>();
+
+ @SuppressWarnings("unchecked")
+ static <T> Days<T> get() {
+ return (Days<T>) INSTANCE;
+ }
+
+ @Override
+ @SuppressWarnings("unchecked")
+ protected Transform<T, Integer> toEnum(Type type) {
+ switch (type.typeId()) {
+ case DATE:
+ return (Transform<T, Integer>) Dates.DAY;
+ case TIMESTAMP:
+ return (Transform<T, Integer>) Timestamps.DAY;
+ }
+ throw new IllegalArgumentException("Unsupported type: " + type);
Review Comment:
nit: It might be a personal thing, but I find having a default branch and
throwing an exception from there a bit more obvious.
##########
api/src/main/java/org/apache/iceberg/transforms/Timestamps.java:
##########
@@ -27,48 +27,72 @@
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.UnboundPredicate;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
+import org.apache.iceberg.util.SerializableFunction;
enum Timestamps implements Transform<Long, Integer> {
YEAR(ChronoUnit.YEARS, "year"),
MONTH(ChronoUnit.MONTHS, "month"),
DAY(ChronoUnit.DAYS, "day"),
HOUR(ChronoUnit.HOURS, "hour");
+ static class Apply implements SerializableFunction<Long, Integer> {
+ private final ChronoUnit granularity;
+
+ Apply(ChronoUnit granularity) {
+ this.granularity = granularity;
+ }
+
+ @Override
+ public Integer apply(Long timestampMicros) {
+ if (timestampMicros == null) {
+ return null;
+ }
+
+ if (timestampMicros >= 0) {
+ OffsetDateTime timestamp =
+ Instant.ofEpochSecond(
Review Comment:
Optional: I know it was copied from another place, but this block is a bit hard
to read because of the formatting. I'd consider adding temp vars:
```
if (timestampMicros >= 0) {
long epochSecond = Math.floorDiv(timestampMicros, 1_000_000);
int nanoAdjustment = Math.floorMod(timestampMicros, 1_000_000) * 1000;
Instant instant = Instant.ofEpochSecond(epochSecond, nanoAdjustment);
return (int) granularity.between(EPOCH, instant.atOffset(ZoneOffset.UTC));
} else {
// ...
long epochSecond = Math.floorDiv(timestampMicros, 1_000_000);
int nanoAdjustment = Math.floorMod(timestampMicros + 1, 1_000_000) * 1000;
Instant instant = Instant.ofEpochSecond(epochSecond, nanoAdjustment);
return (int) granularity.between(EPOCH, instant.atOffset(ZoneOffset.UTC))
- 1;
}
```
##########
api/src/main/java/org/apache/iceberg/transforms/Bucket.java:
##########
@@ -18,52 +18,53 @@
*/
package org.apache.iceberg.transforms;
-import static org.apache.iceberg.types.Type.TypeID;
-
+import java.io.Serializable;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
-import java.util.Set;
import java.util.UUID;
+import java.util.function.Function;
import org.apache.iceberg.expressions.BoundPredicate;
import org.apache.iceberg.expressions.BoundTransform;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.UnboundPredicate;
-import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.base.Objects;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
-import org.apache.iceberg.relocated.com.google.common.collect.Sets;
-import org.apache.iceberg.relocated.com.google.common.hash.HashFunction;
-import org.apache.iceberg.relocated.com.google.common.hash.Hashing;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.BucketUtil;
+import org.apache.iceberg.util.SerializableFunction;
-abstract class Bucket<T> implements Transform<T, Integer> {
- private static final HashFunction MURMUR3 = Hashing.murmur3_32_fixed();
+class Bucket<T> implements Transform<T, Integer>, Serializable {
+ static <T> Bucket<T> get(int numBuckets) {
+ Preconditions.checkArgument(
+ numBuckets > 0, "Invalid number of buckets: %s (must be > 0)", numBuckets);
+ return new Bucket<>(numBuckets);
+ }
@SuppressWarnings("unchecked")
- static <T> Bucket<T> get(Type type, int numBuckets) {
+ static <T, B extends Bucket<T> & SerializableFunction<T, Integer>> B get(
Review Comment:
Shall we deprecate this like we did in `Truncate`?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]