This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 4c840ce Store split offsets for ORC files (#192)
4c840ce is described below
commit 4c840ce9c68b7a9a4ddecae5b8208edaa8794fda
Author: Ratandeep Ratti <[email protected]>
AuthorDate: Thu Jun 6 09:31:13 2019 -0700
Store split offsets for ORC files (#192)
---
orc/src/main/java/org/apache/iceberg/orc/ORC.java | 3 +-
.../org/apache/iceberg/orc/OrcFileAppender.java | 25 ++++++++-
.../apache/iceberg/spark/data/TestOrcWrite.java | 61 ++++++++++++++++++++++
3 files changed, 86 insertions(+), 3 deletions(-)
diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORC.java b/orc/src/main/java/org/apache/iceberg/orc/ORC.java
index c6cb036..f275594 100644
--- a/orc/src/main/java/org/apache/iceberg/orc/ORC.java
+++ b/orc/src/main/java/org/apache/iceberg/orc/ORC.java
@@ -92,9 +92,8 @@ public class ORC {
public <D> FileAppender<D> build() {
Preconditions.checkNotNull(schema, "Schema is required");
- OrcFile.WriterOptions options = OrcFile.writerOptions(conf);
return new OrcFileAppender<>(TypeConversion.toOrc(schema, new ColumnIdMap()),
- this.file, createWriterFunc, options, metadata,
+ this.file, createWriterFunc, conf, metadata,
conf.getInt(VECTOR_ROW_BATCH_SIZE, DEFAULT_SIZE));
}
}
diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcFileAppender.java b/orc/src/main/java/org/apache/iceberg/orc/OrcFileAppender.java
index 60c738c..381fcf7 100644
--- a/orc/src/main/java/org/apache/iceberg/orc/OrcFileAppender.java
+++ b/orc/src/main/java/org/apache/iceberg/orc/OrcFileAppender.java
@@ -22,15 +22,22 @@ package org.apache.iceberg.orc;
import com.google.common.base.Preconditions;
import java.io.IOException;
import java.nio.ByteBuffer;
+import java.util.Collections;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.function.Function;
+import com.google.common.collect.Lists;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.Metrics;
+import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.io.OutputFile;
import org.apache.orc.ColumnStatistics;
import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.StripeInformation;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;
@@ -47,18 +54,21 @@ class OrcFileAppender<D> implements FileAppender<D> {
private final VectorizedRowBatch batch;
private final OrcValueWriter<D> valueWriter;
private boolean isClosed = false;
+ private final Configuration conf;
private static final String COLUMN_NUMBERS_ATTRIBUTE = "iceberg.column.ids";
OrcFileAppender(TypeDescription schema, OutputFile file,
Function<TypeDescription, OrcValueWriter<?>> createWriterFunc,
- OrcFile.WriterOptions options, Map<String, byte[]> metadata,
+ Configuration conf, Map<String, byte[]> metadata,
int batchSize) {
+ this.conf = conf;
orcSchema = schema;
path = new Path(file.location());
this.batchSize = batchSize;
batch = orcSchema.createRowBatch(this.batchSize);
+ OrcFile.WriterOptions options = OrcFile.writerOptions(conf);
options.setSchema(orcSchema);
writer = newOrcWriter(file, columnIds, options, metadata);
valueWriter = newOrcValueWriter(orcSchema, createWriterFunc);
@@ -114,6 +124,19 @@ class OrcFileAppender<D> implements FileAppender<D> {
}
@Override
+ public List<Long> splitOffsets() {
+ Preconditions.checkState(isClosed, "File is not yet closed");
+ Reader reader;
+ try {
+ reader = OrcFile.createReader(path, new OrcFile.ReaderOptions(conf));
+ } catch (IOException e) {
+ throw new RuntimeIOException("Cannot read file " + path, e);
+ }
+ List<StripeInformation> stripes = reader.getStripes();
+ return Collections.unmodifiableList(Lists.transform(stripes, StripeInformation::getOffset));
+ }
+
+ @Override
public void close() throws IOException {
if (!isClosed) {
try {
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java
new file mode 100644
index 0000000..0d5c951
--- /dev/null
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.data;
+
+import org.apache.iceberg.Files;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.io.FileAppender;
+import org.apache.iceberg.orc.ORC;
+import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.junit.Assert;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import java.io.File;
+import java.io.IOException;
+
+import static org.apache.iceberg.types.Types.NestedField.optional;
+
+public class TestOrcWrite {
+ @Rule
+ public TemporaryFolder temp = new TemporaryFolder();
+
+ private static final Schema SCHEMA = new Schema(
+ optional(1, "id", Types.IntegerType.get()),
+ optional(2, "data", Types.StringType.get())
+ );
+
+ @Test
+ public void splitOffsets() throws IOException {
+ File testFile = temp.newFile();
+ Assert.assertTrue("Delete should succeed", testFile.delete());
+
+ Iterable<InternalRow> rows = RandomData.generateSpark(SCHEMA, 1, 0L);
+ FileAppender<InternalRow> writer = ORC.write(Files.localOutput(testFile))
+ .createWriterFunc(SparkOrcWriter::new)
+ .schema(SCHEMA)
+ .build();
+
+ writer.addAll(rows);
+ writer.close();
+ Assert.assertNotNull("Split offsets not present", writer.splitOffsets());
+ }
+}