rdblue commented on a change in pull request #1963:
URL: https://github.com/apache/iceberg/pull/1963#discussion_r579892185



##########
File path: core/src/main/java/org/apache/iceberg/avro/ValueWriters.java
##########
@@ -492,4 +561,153 @@ protected Object get(IndexedRecord struct, int pos) {
       return struct.get(pos);
     }
   }
+
+  private abstract static class FloatingPointWriter<T extends Comparable<T>>
+      extends ComparableWriter<T> {
+    private long nanValueCount;
+
+    FloatingPointWriter(int id) {
+      super(id);
+    }
+
+    @Override
+    public void write(T datum, Encoder encoder) throws IOException {
+      valueCount++;
+
+      if (datum == null) {
+        nullValueCount++;
+      } else if (NaNUtil.isNaN(datum)) {
+        nanValueCount++;
+      } else {
+        if (max == null || datum.compareTo(max) > 0) {
+          this.max = datum;
+        }
+        if (min == null || datum.compareTo(min) < 0) {
+          this.min = datum;
+        }
+      }
+
+      writeVal(datum, encoder);
+    }
+
+    @Override
+    public Stream<FieldMetrics> metrics() {
+      return Stream.of(new FieldMetrics<>(id, valueCount, nullValueCount, 
nanValueCount, min, max));
+    }
+  }
+
+  public abstract static class MetricsAwareStringWriter<T extends 
Comparable<T>> extends ComparableWriter<T> {
+    public MetricsAwareStringWriter(int id) {
+      super(id);
+    }
+
+    @Override
+    public Stream<FieldMetrics> metrics() {
+      // convert min/max to string to allow upper/lower bound truncation when 
gathering metrics,
+      // as in different implementations there's no guarantee that input to 
string writer will be char sequence
+      return metrics(Object::toString);
+    }
+  }
+
+  private abstract static class MetricsAwareByteArrayWriter extends 
MetricsAwareWriter<byte[]> {
+    MetricsAwareByteArrayWriter(int id) {
+      super(id, Comparators.unsignedByteArrays());
+    }
+
+    @Override
+    public Stream<FieldMetrics> metrics() {
+      // convert min/max to byte buffer to allow upper/lower bound truncation 
when gathering metrics.
+      return metrics(ByteBuffer::wrap);
+    }
+  }
+
+  public abstract static class ComparableWriter<T extends Comparable<T>> 
extends MetricsAwareWriter<T> {
+    public ComparableWriter(int id) {
+      super(id, Comparable::compareTo);
+    }
+  }
+
+  /**
+   * A value writer wrapper that keeps track of column statistics (metrics) 
during writing.
+   *
+   * @param <T> Input type
+   */
+  public abstract static class MetricsAwareWriter<T> extends 
MetricsAwareTransformWriter<T, T> {
+    public MetricsAwareWriter(int id, Comparator<T> comparator) {
+      super(id, comparator, Function.identity());
+    }
+
+    /**
+     * Helper class to transform the input type when collecting metrics.
+     * The transform function converts the stats information from the specific 
type that the underlying writer
+     * understands to a more general type that could be transformed to binary 
following iceberg single-value
+     * serialization spec.
+     *
+     * @param func tranformation function
+     * @return a stream of field metrics with bounds converted by the given 
transformation
+     */
+    protected Stream<FieldMetrics> metrics(Function<T, ?> func) {
+      return Stream.of(new FieldMetrics<>(id, valueCount, nullValueCount, 0,
+          updateBound(min, func), updateBound(max, func)));
+    }
+
+    private <T3, T4> T4 updateBound(T3 bound, Function<T3, T4> func) {
+      return bound == null ? null : func.apply(bound);
+    }
+  }
+
+  /**
+   * A value writer wrapper that keeps track of column statistics (metrics) 
during writing, and accepts a
+   * transformation in its constructor.
+   * The transformation will apply to the input data to produce the type that 
the underlying writer accepts.
+   * Stats will also be tracked with the type after transformation.
+   *
+   * @param <T1> Input type
+   * @param <T2> Type after transformation
+   */
+  @SuppressWarnings("checkstyle:VisibilityModifier")
+  public abstract static class MetricsAwareTransformWriter<T1, T2> implements 
ValueWriter<T1> {
+    protected final int id;
+    protected long valueCount;
+    protected long nullValueCount;
+    protected T2 max;
+    protected T2 min;
+    protected final Function<T1, T2> transformation;
+
+    private final Comparator<T2> comparator;
+
+    public MetricsAwareTransformWriter(int id, Comparator<T2> comparator, 
Function<T1, T2> func) {
+      this.id = id;
+      this.comparator = comparator;
+      this.transformation = func;
+    }
+
+    @Override
+    public void write(T1 datum, Encoder encoder) throws IOException {
+      valueCount++;
+      if (datum == null) {

Review comment:
       Is this necessary? Before, all of the type-specific writers assumed that 
the input value was non-null because null isn't allowed unless the type is 
optional. If it is optional, the writer will be wrapped in an option writer, so 
there is no need to handle null.
   
   I think this should similarly assume that the option writer tracks null 
values and that all values will be non-null. That simplifies this class quite a 
bit.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to