rdblue commented on a change in pull request #139: ORC support integration for 
Spark 2.4.0
URL: https://github.com/apache/incubator-iceberg/pull/139#discussion_r280131618
 
 

 ##########
 File path: orc/src/main/java/org/apache/iceberg/orc/OrcIterable.java
 ##########
 @@ -0,0 +1,101 @@
+package org.apache.iceberg.orc;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.function.Function;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.exceptions.RuntimeIOException;
+import org.apache.iceberg.io.CloseableGroup;
+import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.io.InputFile;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;
+
+/**
+ * @author Edgar Rodriguez-Diaz
+ * @since
+ */
+public class OrcIterable<T> extends CloseableGroup implements 
CloseableIterable<T> {
+  private final Schema schema;
+  private final Function<Schema, OrcValueReader<?>> readerFunction;
+  private final VectorizedRowBatchIterator orcIter;
+
+  public OrcIterable(InputFile file, Configuration config, Schema schema,
+                     Long start, Long length,
+                     Function<Schema, OrcValueReader<?>> readerFunction) {
+    this.schema = schema;
+    this.readerFunction = readerFunction;
+    final Reader orcFileReader = newFileReader(file, config);
+    this.orcIter = newOrcIterator(file, TypeConversion.toOrc(schema, new 
ColumnIdMap()),
+        start, length, orcFileReader);
+  }
+
+  @SuppressWarnings("unchecked")
+  @Override
+  public Iterator<T> iterator() {
+    return new OrcIterator(orcIter, (OrcValueReader<T>) 
readerFunction.apply(schema));
+  }
+
+  private static VectorizedRowBatchIterator newOrcIterator(InputFile file, 
TypeDescription orcSchema,
+                                                           Long start, Long 
length,
+                                                           Reader 
orcFileReader) {
+    final Reader.Options options = orcFileReader.options();
+    if (start != null) {
+      options.range(start, length);
+    }
+    options.schema(orcSchema);
+
+    try {
+      return new VectorizedRowBatchIterator(file.location(), orcSchema, 
orcFileReader.rows(options));
+    }
+    catch (IOException ioe) {
+      throw new RuntimeIOException(ioe, "Failed to get ORC rows for file: %s", 
file);
+    }
+  }
+
+  private static Reader newFileReader(InputFile file, Configuration config) {
+    try {
+      return OrcFile.createReader(new Path(file.location()),
+          OrcFile.readerOptions(config));
+    }
+    catch (IOException ioe) {
+      throw new RuntimeIOException(ioe, "Failed to open file: %s", file);
+    }
+  }
+
+  private class OrcIterator implements Iterator<T> {
 
 Review comment:
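   Usually, nested `Iterator` classes should be declared `static` to ensure 
that the iterator shares no state with the enclosing `Iterable` other than 
what was explicitly passed in. A non-static inner class keeps an implicit 
reference to the outer instance.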

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to