szehon-ho commented on code in PR #5376:
URL: https://github.com/apache/iceberg/pull/5376#discussion_r1038602262


##########
core/src/main/java/org/apache/iceberg/MetricsUtil.java:
##########
@@ -56,4 +69,293 @@ public static MetricsModes.MetricsMode metricsMode(
     String columnName = inputSchema.findColumnName(fieldId);
     return metricsConfig.columnMode(columnName);
   }
+
+  public static final List<ReadableMetricColDefinition> READABLE_METRIC_COLS =
+      ImmutableList.of(
+          new ReadableMetricColDefinition(
+              "column_size",
+              "Total size on disk",
+              DataFile.COLUMN_SIZES,
+              field -> Types.LongType.get(),
+              (file, field) ->
+                  file.columnSizes() == null ? null : file.columnSizes().get(field.fieldId())),
+          new ReadableMetricColDefinition(
+              "value_count",
+              "Total count, including null and NaN",
+              DataFile.VALUE_COUNTS,
+              field -> Types.LongType.get(),
+              (file, field) ->
+                  file.valueCounts() == null ? null : file.valueCounts().get(field.fieldId())),
+          new ReadableMetricColDefinition(
+              "null_value_count",
+              "Null value count",
+              DataFile.NULL_VALUE_COUNTS,
+              field -> Types.LongType.get(),
+              (file, field) ->
+                  file.nullValueCounts() == null
+                      ? null
+                      : file.nullValueCounts().get(field.fieldId())),
+          new ReadableMetricColDefinition(
+              "nan_value_count",
+              "NaN value count",
+              DataFile.NAN_VALUE_COUNTS,
+              field -> Types.LongType.get(),
+              (file, field) ->
+                  file.nanValueCounts() == null
+                      ? null
+                      : file.nanValueCounts().get(field.fieldId())),
+          new ReadableMetricColDefinition(
+              "lower_bound",
+              "Lower bound",
+              DataFile.LOWER_BOUNDS,
+              Types.NestedField::type,
+              (file, field) ->
+                  file.lowerBounds() == null
+                      ? null
+                      : Conversions.fromByteBuffer(
+                          field.type(), file.lowerBounds().get(field.fieldId()))),
+          new ReadableMetricColDefinition(
+              "upper_bound",
+              "Upper bound",
+              DataFile.UPPER_BOUNDS,
+              Types.NestedField::type,
+              (file, field) ->
+                  file.upperBounds() == null
+                      ? null
+                      : Conversions.fromByteBuffer(
+                          field.type(), file.upperBounds().get(field.fieldId()))));
+
+  public static final String READABLE_METRICS = "readable_metrics";
+
+  /**
+   * Fixed definition of a readable metric column, i.e., a mapping of a raw metric to a readable
+   * metric
+   */
+  public static class ReadableMetricColDefinition {
+    private final String name;
+    private final String doc;
+    private final Types.NestedField originalCol;
+    private final TypeFunction typeFunction;
+    private final MetricFunction metricFunction;
+
+    public interface TypeFunction {
+      Type type(Types.NestedField originalCol);
+    }
+
+    public interface MetricFunction {
+      Object metric(ContentFile<?> file, Types.NestedField originalCol);
+    }
+
+    /**
+     * @param name column name
+     * @param doc column doc
+     * @param originalCol original (raw) metric column field on metadata table
+     * @param typeFunction function that returns the readable metric column type from original
+     *     field type
+     * @param metricFunction function that returns readable metric from data file
+     */
+    ReadableMetricColDefinition(
+        String name,
+        String doc,
+        Types.NestedField originalCol,
+        TypeFunction typeFunction,
+        MetricFunction metricFunction) {
+      this.name = name;
+      this.doc = doc;
+      this.originalCol = originalCol;
+      this.typeFunction = typeFunction;
+      this.metricFunction = metricFunction;
+    }
+
+    Types.NestedField originalCol() {
+      return originalCol;
+    }
+
+    Type colType(Types.NestedField field) {
+      return typeFunction.type(field);
+    }
+
+    String name() {
+      return name;
+    }
+
+    String doc() {
+      return doc;
+    }
+
+    Object value(ContentFile<?> dataFile, Types.NestedField dataField) {
+      return metricFunction.metric(dataFile, dataField);
+    }
+  }
+
+  /** A struct of readable metric values for a primitive column */
+  public static class ReadableColMetricsStruct implements StructLike {
+
+    private final String columnName;
+    private final Map<Integer, Integer> projectionMap;
+    private final Object[] metrics;
+
+    public ReadableColMetricsStruct(
+        String columnName, Types.NestedField projection, Object... metrics) {
+      this.columnName = columnName;
+      this.projectionMap = readableMetricsProjection(projection);
+      this.metrics = metrics;
+    }
+
+    @Override
+    public int size() {
+      return projectionMap.size();
+    }
+
+    @Override
+    public <T> T get(int pos, Class<T> javaClass) {
+      Object value = get(pos);
+      return value == null ? null : javaClass.cast(value);
+    }
+
+    @Override
+    public <T> void set(int pos, T value) {
+      throw new UnsupportedOperationException("ReadableMetricsStruct is read only");
+    }
+
+    private Object get(int pos) {
+      int projectedPos = projectionMap.get(pos);
+      return metrics[projectedPos];
+    }
+
+    /** Returns map of projected position to actual position of this struct's fields */
+    private Map<Integer, Integer> readableMetricsProjection(Types.NestedField projection) {
+      Map<Integer, Integer> result = Maps.newHashMap();
+
+      Set<String> projectedFields =
+          Sets.newHashSet(
+              projection.type().asStructType().fields().stream()
+                  .map(Types.NestedField::name)
+                  .collect(Collectors.toSet()));
+
+      int projectedIndex = 0;
+      for (int fieldIndex = 0; fieldIndex < READABLE_METRIC_COLS.size(); fieldIndex++) {
+        ReadableMetricColDefinition readableMetric = READABLE_METRIC_COLS.get(fieldIndex);
+
+        if (projectedFields.contains(readableMetric.name())) {
+          result.put(projectedIndex, fieldIndex);
+          projectedIndex++;
+        }
+      }
+      return result;
+    }
+
+    String columnName() {
+      return columnName;
+    }
+  }
+
+  /**
+   * A struct, consisting of all {@link ReadableColMetricsStruct} for all primitive columns of the
+   * table
+   */
+  public static class ReadableMetricsStruct implements StructLike {
+
+    private final List<StructLike> columnMetrics;
+
+    public ReadableMetricsStruct(List<StructLike> columnMetrics) {
+      this.columnMetrics = columnMetrics;
+    }
+
+    @Override
+    public int size() {
+      return columnMetrics.size();
+    }
+
+    @Override
+    public <T> T get(int pos, Class<T> javaClass) {
+      return javaClass.cast(columnMetrics.get(pos));
+    }
+
+    @Override
+    public <T> void set(int pos, T value) {
+      throw new UnsupportedOperationException("ReadableMetricsStruct is read only");
+    }
+  }
+
+  /**
+   * Calculates a dynamic schema for readable_metrics to add to metadata tables. The type will be
+   * the struct {@link ReadableMetricsStruct}, composed of {@link ReadableColMetricsStruct} for all
+   * primitive columns in the data table.
+   *
+   * @param dataTableSchema schema of data table
+   * @param metadataTableSchema schema of existing metadata table (to ensure id uniqueness)
+   * @return schema of readable_metrics struct
+   */
+  public static Schema readableMetricsSchema(Schema dataTableSchema, Schema metadataTableSchema) {
+    List<Types.NestedField> fields = Lists.newArrayList();
+    Map<Integer, String> idToName = dataTableSchema.idToName();
+    AtomicInteger nextId =

Review Comment:
   Re: highestFieldId(), good to know!  Done.
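   For anyone reading along: the hunk above is cut off right at the `nextId` assignment, so here is a minimal, hypothetical sketch of how `Schema#highestFieldId()` can seed that counter so the generated readable_metrics field ids stay clear of ids already used by the metadata table. The helper class name and the empty struct body are placeholders for illustration, not the PR's actual code.

   ```java
   import java.util.concurrent.atomic.AtomicInteger;
   import org.apache.iceberg.Schema;
   import org.apache.iceberg.types.Types;

   class ReadableMetricsIdSketch {
     // Hypothetical helper: allocate the readable_metrics field id above every id
     // already present in the metadata table schema to avoid collisions.
     static Types.NestedField readableMetricsField(Schema metadataTableSchema) {
       AtomicInteger nextId = new AtomicInteger(metadataTableSchema.highestFieldId());
       return Types.NestedField.optional(
           nextId.incrementAndGet(),
           "readable_metrics",
           Types.StructType.of(), // the real schema nests one struct per primitive data column
           "Column metrics in readable form");
     }
   }
   ```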



##########
core/src/test/java/org/apache/iceberg/TestMetadataTableScans.java:
##########
@@ -527,6 +527,63 @@ public void testDeleteFilesTableSelection() throws IOException {
     Assert.assertEquals(expected, scan.schema().asStruct());
   }
 
+  @Test
+  public void testFilesTableReadableMetricsSchema() {
+
+    Table filesTable = new FilesTable(table.ops(), table);
+    Types.StructType actual = filesTable.newScan().schema().select("readable_metrics").asStruct();
+

Review Comment:
   Done
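   To make the assertion concrete, this is roughly the shape the selected readable_metrics struct takes for a single primitive column, say a long column named `id`. The field ids, the sample column name, and the outer doc string are made up for illustration; the six metric names, their docs, and their types follow READABLE_METRIC_COLS in MetricsUtil (counts and size are longs, bounds take the original column's type).

   ```java
   import org.apache.iceberg.types.Types;

   class ReadableMetricsShapeSketch {
     // Hypothetical expected shape: field ids 100-106 are placeholders; the real ids are
     // assigned dynamically above the metadata table's highest field id.
     static Types.StructType expectedReadableMetrics() {
       Types.StructType idMetrics =
           Types.StructType.of(
               Types.NestedField.optional(101, "column_size", Types.LongType.get(), "Total size on disk"),
               Types.NestedField.optional(102, "value_count", Types.LongType.get(), "Total count, including null and NaN"),
               Types.NestedField.optional(103, "null_value_count", Types.LongType.get(), "Null value count"),
               Types.NestedField.optional(104, "nan_value_count", Types.LongType.get(), "NaN value count"),
               Types.NestedField.optional(105, "lower_bound", Types.LongType.get(), "Lower bound"),
               Types.NestedField.optional(106, "upper_bound", Types.LongType.get(), "Upper bound"));
       return Types.StructType.of(
           Types.NestedField.optional(100, "id", idMetrics, "Readable metrics for column id"));
     }
   }
   ```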



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

