rdblue commented on a change in pull request #1963:
URL: https://github.com/apache/iceberg/pull/1963#discussion_r579895706
##########
File path: core/src/main/java/org/apache/iceberg/avro/ValueWriters.java
##########
@@ -492,4 +561,153 @@ protected Object get(IndexedRecord struct, int pos) {
return struct.get(pos);
}
}
+
+ private abstract static class FloatingPointWriter<T extends Comparable<T>>
+ extends ComparableWriter<T> {
+ private long nanValueCount;
+
+ FloatingPointWriter(int id) {
+ super(id);
+ }
+
+ @Override
+ public void write(T datum, Encoder encoder) throws IOException {
+ valueCount++;
+
+ if (datum == null) {
+ nullValueCount++;
+ } else if (NaNUtil.isNaN(datum)) {
+ nanValueCount++;
+ } else {
+ if (max == null || datum.compareTo(max) > 0) {
+ this.max = datum;
+ }
+ if (min == null || datum.compareTo(min) < 0) {
+ this.min = datum;
+ }
+ }
+
+ writeVal(datum, encoder);
+ }
+
+ @Override
+ public Stream<FieldMetrics> metrics() {
+ return Stream.of(new FieldMetrics<>(id, valueCount, nullValueCount,
nanValueCount, min, max));
+ }
+ }
+
+ public abstract static class MetricsAwareStringWriter<T extends
Comparable<T>> extends ComparableWriter<T> {
+ public MetricsAwareStringWriter(int id) {
+ super(id);
+ }
+
+ @Override
+ public Stream<FieldMetrics> metrics() {
+ // convert min/max to string to allow upper/lower bound truncation when
gathering metrics,
+ // as in different implementations there's no guarantee that input to
string writer will be char sequence
+ return metrics(Object::toString);
+ }
+ }
+
+ private abstract static class MetricsAwareByteArrayWriter extends
MetricsAwareWriter<byte[]> {
+ MetricsAwareByteArrayWriter(int id) {
+ super(id, Comparators.unsignedByteArrays());
+ }
+
+ @Override
+ public Stream<FieldMetrics> metrics() {
+ // convert min/max to byte buffer to allow upper/lower bound truncation
when gathering metrics.
+ return metrics(ByteBuffer::wrap);
+ }
+ }
+
+ public abstract static class ComparableWriter<T extends Comparable<T>>
extends MetricsAwareWriter<T> {
+ public ComparableWriter(int id) {
+ super(id, Comparable::compareTo);
+ }
+ }
+
+ /**
+ * A value writer wrapper that keeps track of column statistics (metrics)
during writing.
+ *
+ * @param <T> Input type
+ */
+ public abstract static class MetricsAwareWriter<T> extends
MetricsAwareTransformWriter<T, T> {
+ public MetricsAwareWriter(int id, Comparator<T> comparator) {
+ super(id, comparator, Function.identity());
+ }
+
+ /**
+ * Helper class to transform the input type when collecting metrics.
+ * The transform function converts the stats information from the specific
type that the underlying writer
+ * understands to a more general type that could be transformed to binary
following iceberg single-value
+ * serialization spec.
+ *
+ * @param func tranformation function
+ * @return a stream of field metrics with bounds converted by the given
transformation
+ */
+ protected Stream<FieldMetrics> metrics(Function<T, ?> func) {
+ return Stream.of(new FieldMetrics<>(id, valueCount, nullValueCount, 0,
+ updateBound(min, func), updateBound(max, func)));
+ }
+
+ private <T3, T4> T4 updateBound(T3 bound, Function<T3, T4> func) {
+ return bound == null ? null : func.apply(bound);
+ }
+ }
+
+ /**
+ * A value writer wrapper that keeps track of column statistics (metrics)
during writing, and accepts a
+ * transformation in its constructor.
+ * The transformation will apply to the input data to produce the type that
the underlying writer accepts.
+ * Stats will also be tracked with the type after transformation.
+ *
+ * @param <T1> Input type
+ * @param <T2> Type after transformation
+ */
+ @SuppressWarnings("checkstyle:VisibilityModifier")
+ public abstract static class MetricsAwareTransformWriter<T1, T2> implements
ValueWriter<T1> {
+ protected final int id;
+ protected long valueCount;
+ protected long nullValueCount;
+ protected T2 max;
+ protected T2 min;
+ protected final Function<T1, T2> transformation;
+
+ private final Comparator<T2> comparator;
+
+ public MetricsAwareTransformWriter(int id, Comparator<T2> comparator,
Function<T1, T2> func) {
+ this.id = id;
+ this.comparator = comparator;
+ this.transformation = func;
+ }
+
+ @Override
+ public void write(T1 datum, Encoder encoder) throws IOException {
+ valueCount++;
+ if (datum == null) {
+ nullValueCount++;
+ writeVal(null, encoder);
+
+ } else {
+ T2 transformedDatum = transformation.apply(datum);
+ if (max == null || comparator.compare(transformedDatum, max) > 0) {
+ max = transformedDatum;
+ }
+ if (min == null || comparator.compare(transformedDatum, min) < 0) {
+ min = transformedDatum;
+ }
+ writeVal(transformedDatum, encoder);
Review comment:
After removing the null handling from this method, this is the main
block of code provided by this class. I think that the class structure here
tries too hard to avoid duplication of these 8 lines and ends up causing an
unnecessary performance slow down.
Most of the subclasses don't actually provide a transform, so it doesn't
make a lot of sense to provide one. In addition, the cases that do provide a
transform only produce `int` and `long` values. Using this inheritance
structure, all writers will apply a function and the `int` and `long` cases
will box the result of that function call.
I think separating this into `StoredAsIntwriter`, `StoredAsLongWriter`, and
`MetricsAwareWriter` would duplicate these lines, but avoid the function call
and boxing for a lot of cases. It would also avoid needing to implement
`writeVal(Integer, Encoder)` in every subclass. Instead, you'd need to add `int
convert(T)` to convert to the int value before writing to the encoder directly
in `write`.
Could you update to use that structure?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]