This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 4c840ce Store split offsets for ORC files (#192)
4c840ce is described below
commit 4c840ce9c68b7a9a4ddecae5b8208edaa8794fda
Author: Ratandeep Ratti <[email protected]>
AuthorDate: Thu Jun 6 09:31:13 2019 -0700
Store split offsets for ORC files (#192)
---
orc/src/main/java/org/apache/iceberg/orc/ORC.java | 3 +-
.../org/apache/iceberg/orc/OrcFileAppender.java | 25 ++++++++-
.../apache/iceberg/spark/data/TestOrcWrite.java | 61 ++++++++++++++++++++++
3 files changed, 86 insertions(+), 3 deletions(-)
diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORC.java b/orc/src/main/java/org/apache/iceberg/orc/ORC.java
index c6cb036..f275594 100644
--- a/orc/src/main/java/org/apache/iceberg/orc/ORC.java
+++ b/orc/src/main/java/org/apache/iceberg/orc/ORC.java
@@ -92,9 +92,8 @@ public class ORC {
public <D> FileAppender<D> build() {
Preconditions.checkNotNull(schema, "Schema is required");
- OrcFile.WriterOptions options = OrcFile.writerOptions(conf);
return new OrcFileAppender<>(TypeConversion.toOrc(schema, new ColumnIdMap()),
- this.file, createWriterFunc, options, metadata,
+ this.file, createWriterFunc, conf, metadata,
conf.getInt(VECTOR_ROW_BATCH_SIZE, DEFAULT_SIZE));
}
}
diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcFileAppender.java b/orc/src/main/java/org/apache/iceberg/orc/OrcFileAppender.java
index 60c738c..381fcf7 100644
--- a/orc/src/main/java/org/apache/iceberg/orc/OrcFileAppender.java
+++ b/orc/src/main/java/org/apache/iceberg/orc/OrcFileAppender.java
@@ -22,15 +22,22 @@ package org.apache.iceberg.orc;
import com.google.common.base.Preconditions;
import java.io.IOException;
import java.nio.ByteBuffer;
+import java.util.Collections;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.function.Function;
+import com.google.common.collect.Lists;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.Metrics;
+import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.io.OutputFile;
import org.apache.orc.ColumnStatistics;
import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.StripeInformation;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;
@@ -47,18 +54,21 @@ class OrcFileAppender<D> implements FileAppender<D> {
private final VectorizedRowBatch batch;
private final OrcValueWriter<D> valueWriter;
private boolean isClosed = false;
+ private final Configuration conf;
private static final String COLUMN_NUMBERS_ATTRIBUTE = "iceberg.column.ids";
OrcFileAppender(TypeDescription schema, OutputFile file,
Function<TypeDescription, OrcValueWriter<?>> createWriterFunc,
- OrcFile.WriterOptions options, Map<String, byte[]> metadata,
+ Configuration conf, Map<String, byte[]> metadata,
int batchSize) {
+ this.conf = conf;
orcSchema = schema;
path = new Path(file.location());
this.batchSize = batchSize;
batch = orcSchema.createRowBatch(this.batchSize);
+ OrcFile.WriterOptions options = OrcFile.writerOptions(conf);
options.setSchema(orcSchema);
writer = newOrcWriter(file, columnIds, options, metadata);
valueWriter = newOrcValueWriter(orcSchema, createWriterFunc);
@@ -114,6 +124,19 @@ class OrcFileAppender<D> implements FileAppender<D> {
}
@Override
+ public List<Long> splitOffsets() {
+ Preconditions.checkState(isClosed, "File is not yet closed");
+ Reader reader;
+ try {
+ reader = OrcFile.createReader(path, new OrcFile.ReaderOptions(conf));
+ } catch (IOException e) {
+ throw new RuntimeIOException("Cannot read file " + path, e);
+ }
+ List<StripeInformation> stripes = reader.getStripes();
+ return Collections.unmodifiableList(Lists.transform(stripes, StripeInformation::getOffset));
+ }
+
+ @Override
public void close() throws IOException {
if (!isClosed) {
try {
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java
new file mode 100644
index 0000000..0d5c951
--- /dev/null
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.data;
+
+import org.apache.iceberg.Files;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.io.FileAppender;
+import org.apache.iceberg.orc.ORC;
+import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.junit.Assert;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import java.io.File;
+import java.io.IOException;
+
+import static org.apache.iceberg.types.Types.NestedField.optional;
+
+public class TestOrcWrite {
+ @Rule
+ public TemporaryFolder temp = new TemporaryFolder();
+
+ private static final Schema SCHEMA = new Schema(
+ optional(1, "id", Types.IntegerType.get()),
+ optional(2, "data", Types.StringType.get())
+ );
+
+ @Test
+ public void splitOffsets() throws IOException {
+ File testFile = temp.newFile();
+ Assert.assertTrue("Delete should succeed", testFile.delete());
+
+ Iterable<InternalRow> rows = RandomData.generateSpark(SCHEMA, 1, 0L);
+ FileAppender<InternalRow> writer = ORC.write(Files.localOutput(testFile))
+ .createWriterFunc(SparkOrcWriter::new)
+ .schema(SCHEMA)
+ .build();
+
+ writer.addAll(rows);
+ writer.close();
+ Assert.assertNotNull("Split offsets not present", writer.splitOffsets());
+ }
+}