rdblue commented on a change in pull request #199:
URL: https://github.com/apache/incubator-iceberg/pull/199#discussion_r429443956



##########
File path: orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java
##########
@@ -19,51 +19,211 @@
 
 package org.apache.iceberg.orc;
 
+import com.google.common.collect.Maps;
 import java.io.IOException;
-import java.util.Collections;
+import java.math.BigDecimal;
+import java.nio.ByteBuffer;
+import java.sql.Timestamp;
+import java.time.Instant;
+import java.time.LocalDate;
+import java.time.OffsetDateTime;
+import java.time.ZoneOffset;
+import java.time.temporal.ChronoUnit;
+import java.util.Map;
+import java.util.Optional;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.iceberg.Metrics;
+import org.apache.iceberg.Schema;
 import org.apache.iceberg.exceptions.RuntimeIOException;
 import org.apache.iceberg.hadoop.HadoopInputFile;
 import org.apache.iceberg.io.InputFile;
+import org.apache.iceberg.types.Conversions;
+import org.apache.iceberg.types.Type;
+import org.apache.iceberg.types.Types;
+import org.apache.orc.BooleanColumnStatistics;
+import org.apache.orc.ColumnStatistics;
+import org.apache.orc.DateColumnStatistics;
+import org.apache.orc.DecimalColumnStatistics;
+import org.apache.orc.DoubleColumnStatistics;
+import org.apache.orc.IntegerColumnStatistics;
 import org.apache.orc.Reader;
+import org.apache.orc.StringColumnStatistics;
+import org.apache.orc.TimestampColumnStatistics;
+import org.apache.orc.TypeDescription;
 import org.apache.orc.Writer;
 
 public class OrcMetrics {
 
   private OrcMetrics() {
   }
 
+  static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC);
+  static final LocalDate EPOCH_DAY = EPOCH.toLocalDate();
+
   public static Metrics fromInputFile(InputFile file) {
     final Configuration config = (file instanceof HadoopInputFile) ?
         ((HadoopInputFile) file).getConf() : new Configuration();
     return fromInputFile(file, config);
   }
 
-  public static Metrics fromInputFile(InputFile file, Configuration config) {
+  static Metrics fromInputFile(InputFile file, Configuration config) {
     try (Reader orcReader = ORC.newFileReader(file, config)) {
-
-      // TODO: implement rest of the methods for ORC metrics
-      // https://github.com/apache/incubator-iceberg/pull/199
-      return new Metrics(orcReader.getNumberOfRows(),
-          null,
-          null,
-          Collections.emptyMap(),
-          null,
-          null);
+      return buildOrcMetrics(orcReader.getNumberOfRows(),
+          orcReader.getSchema(), orcReader.getStatistics());
     } catch (IOException ioe) {
-      throw new RuntimeIOException(ioe, "Failed to read footer of file: %s", file);
+      throw new RuntimeIOException(ioe, "Failed to open file: %s", file.location());
     }
   }
 
+  private static Metrics buildOrcMetrics(final long numOfRows, final TypeDescription orcSchema,
+                                         final ColumnStatistics[] colStats) {
+    final Schema schema = ORCSchemaUtil.convert(orcSchema);
+    Map<Integer, Long> columSizes = Maps.newHashMapWithExpectedSize(colStats.length);
+    Map<Integer, Long> valueCounts = Maps.newHashMapWithExpectedSize(colStats.length);
+    Map<Integer, Long> nullCounts = Maps.newHashMapWithExpectedSize(colStats.length);
+    Map<Integer, ByteBuffer> lowerBounds = Maps.newHashMap();
+    Map<Integer, ByteBuffer> upperBounds = Maps.newHashMap();
+
+    for (int i = 0; i < colStats.length; i++) {
+      final ColumnStatistics colStat = colStats[i];
+      final TypeDescription orcCol = orcSchema.findSubtype(i);
+      final Optional<Types.NestedField> icebergColOpt = ORCSchemaUtil.icebergID(orcCol)
+          .map(schema::findField);
+
+      if (icebergColOpt.isPresent()) {
+        final Types.NestedField icebergCol = icebergColOpt.get();
+        final int fieldId = icebergCol.fieldId();
+
+        if (colStat.hasNull()) {
+          nullCounts.put(fieldId, numOfRows - colStat.getNumberOfValues());
+        } else {
+          nullCounts.put(fieldId, 0L);
+        }
+        columSizes.put(fieldId, colStat.getBytesOnDisk());
+        valueCounts.put(fieldId, colStat.getNumberOfValues() + nullCounts.getOrDefault(fieldId, 0L));

Review comment:
       This should only set the value count if the null count is present. If the null count is unknown (repeated fields), then the value count is also unknown.
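
       A minimal sketch of that guard, kept self-contained for illustration: the helper name `recordColumnCounts`, the class name, and the `nullCountKnown` flag (standing in for however the PR ends up detecting fields under a repeated type) are hypothetical, while the `ColumnStatistics` calls are the same ones already used in this diff.

       import java.util.Map;
       import org.apache.orc.ColumnStatistics;

       class OrcMetricsCountsSketch {
         // Records per-column metrics; derives the value count only when the
         // null count is actually known.
         static void recordColumnCounts(int fieldId, long numOfRows, boolean nullCountKnown,
                                        ColumnStatistics colStat,
                                        Map<Integer, Long> columnSizes,
                                        Map<Integer, Long> nullCounts,
                                        Map<Integer, Long> valueCounts) {
           columnSizes.put(fieldId, colStat.getBytesOnDisk());

           if (!nullCountKnown) {
             // Null count is unknown (e.g. under a repeated field), so the value
             // count is also unknown: leave both maps without an entry.
             return;
           }

           long nullCount = colStat.hasNull() ? numOfRows - colStat.getNumberOfValues() : 0L;
           nullCounts.put(fieldId, nullCount);
           // Value count = non-null values reported by ORC + the nulls derived above.
           valueCounts.put(fieldId, colStat.getNumberOfValues() + nullCount);
         }
       }

       This keeps the existing behavior for plain top-level columns while leaving both counts unset when ORC cannot account for nulls.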



