[orc] branch master updated: ORC-621. Fix reader for empty positions list in first row index entry.

omalley Tue, 21 Apr 2020 13:50:26 -0700

This is an automated email from the ASF dual-hosted git repository.

omalley pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git



The following commit(s) were added to refs/heads/master by this push:
     new 6c519eb  ORC-621. Fix reader for empty positions list in first row 
index entry.
6c519eb is described below

commit 6c519ebb33cf7c82d6f4f498b8ce65ebde4b5ccf
Author: Owen O'Malley <[email protected]>
AuthorDate: Mon Apr 20 15:55:10 2020 -0700

    ORC-621. Fix reader for empty positions list in first row index entry.
    
    Fixes #502
    
    Signed-off-by: Owen O'Malley <[email protected]>
---
 examples/TestStringDictionary.testRowIndex.orc     | Bin 0 -> 74580 bytes
 .../TestStringDictionary.testRowIndex.jsn.gz       | Bin 0 -> 85500 bytes
 java/core/src/java/org/apache/orc/Reader.java      |  10 ---
 .../java/org/apache/orc/impl/RecordReaderImpl.java |  15 +++-
 .../org/apache/orc/impl/reader/StripePlanner.java  |   2 +-
 .../test/org/apache/orc/TestStringDictionary.java  |  85 +++++++++++++++++++++
 6 files changed, 100 insertions(+), 12 deletions(-)

diff --git a/examples/TestStringDictionary.testRowIndex.orc 
b/examples/TestStringDictionary.testRowIndex.orc
new file mode 100644
index 0000000..cba483d
Binary files /dev/null and b/examples/TestStringDictionary.testRowIndex.orc 
differ
diff --git a/examples/expected/TestStringDictionary.testRowIndex.jsn.gz 
b/examples/expected/TestStringDictionary.testRowIndex.jsn.gz
new file mode 100644
index 0000000..5fb46b6
Binary files /dev/null and 
b/examples/expected/TestStringDictionary.testRowIndex.jsn.gz differ
diff --git a/java/core/src/java/org/apache/orc/Reader.java 
b/java/core/src/java/org/apache/orc/Reader.java
index 23d7a3e..007c515 100644
--- a/java/core/src/java/org/apache/orc/Reader.java
+++ b/java/core/src/java/org/apache/orc/Reader.java
@@ -408,16 +408,6 @@ public interface Reader extends Closeable {
       if (sarg != null) {
         buffer.append(", sarg: ");
         buffer.append(sarg.toString());
-        buffer.append(", columns: [");
-        for(int i=0; i < columnNames.length; ++i) {
-          if (i != 0) {
-            buffer.append(", ");
-          }
-          buffer.append("'");
-          buffer.append(columnNames[i]);
-          buffer.append("'");
-        }
-        buffer.append("]");
       }
       if (schema != null) {
         buffer.append(", schema: ");
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java 
b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index f3b2079..f5a65ba 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -267,6 +267,13 @@ public class RecordReaderImpl implements RecordReader {
     }
   }
 
+  public static final class ZeroPositionProvider implements PositionProvider {
+    @Override
+    public long getNext() {
+      return 0;
+    }
+  }
+
   public OrcProto.StripeFooter readStripeFooter(StripeInformation stripe
                                                 ) throws IOException {
     return dataReader.readStripeFooter(stripe);
@@ -1260,7 +1267,13 @@ public class RecordReaderImpl implements RecordReader {
     PositionProvider[] index = new PositionProvider[rowIndices.length];
     for (int i = 0; i < index.length; ++i) {
       if (rowIndices[i] != null) {
-        index[i] = new PositionProviderImpl(rowIndices[i].getEntry(rowEntry));
+        OrcProto.RowIndexEntry entry = rowIndices[i].getEntry(rowEntry);
+        // This is effectively a test for pre-ORC-569 files.
+        if (rowEntry == 0 && entry.getPositionsCount() == 0) {
+          index[i] = new ZeroPositionProvider();
+        } else {
+          index[i] = new PositionProviderImpl(entry);
+        }
       }
     }
     reader.seek(index);
diff --git a/java/core/src/java/org/apache/orc/impl/reader/StripePlanner.java 
b/java/core/src/java/org/apache/orc/impl/reader/StripePlanner.java
index 41a30e6..463a70c 100644
--- a/java/core/src/java/org/apache/orc/impl/reader/StripePlanner.java
+++ b/java/core/src/java/org/apache/orc/impl/reader/StripePlanner.java
@@ -472,7 +472,7 @@ public class StripePlanner {
                   encodings[stream.column].getKind(), kind, stream.kind,
                   isCompressed, hasNull[column]);
               long start = Math.max(alreadyRead,
-                  stream.offset + ri.getEntry(group).getPositions(posn));
+                  stream.offset + (group == 0 ? 0 : 
ri.getEntry(group).getPositions(posn)));
               long end = stream.offset;
               if (endGroup == includedRowGroups.length - 1) {
                 end += stream.length;
diff --git a/java/core/src/test/org/apache/orc/TestStringDictionary.java 
b/java/core/src/test/org/apache/orc/TestStringDictionary.java
index ec7d547..420c9e1 100644
--- a/java/core/src/test/org/apache/orc/TestStringDictionary.java
+++ b/java/core/src/test/org/apache/orc/TestStringDictionary.java
@@ -32,6 +32,9 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 
+import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
 import org.apache.orc.impl.OrcIndex;
 import org.apache.orc.impl.OutStream;
 import org.apache.orc.impl.RecordReaderImpl;
@@ -564,4 +567,86 @@ public class TestStringDictionary {
       }
     }
   }
+
+  /**
+   * That when we disable dictionaries, we don't get broken row indexes.
+   */
+  @Test
+  public void testRowIndex() throws Exception {
+    TypeDescription schema =
+        TypeDescription.fromString("struct<str:string>");
+    // turn off the dictionaries
+    OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.setDouble(conf, 0);
+    Writer writer = OrcFile.createWriter(
+        testFilePath,
+        OrcFile.writerOptions(conf).setSchema(schema).rowIndexStride(4 * 
1024));
+
+    VectorizedRowBatch batch = schema.createRowBatch();
+    BytesColumnVector strVector = (BytesColumnVector) batch.cols[0];
+
+    for (int i = 0; i < 32 * 1024; i++) {
+      if (batch.size == batch.getMaxSize()) {
+        writer.addRowBatch(batch);
+        batch.reset();
+      }
+      byte[] value = String.format("row %06d", 
i).getBytes(StandardCharsets.UTF_8);
+      strVector.setRef(batch.size, value, 0, value.length);
+      ++batch.size;
+    }
+    writer.addRowBatch(batch);
+    writer.close();
+
+    Reader reader = OrcFile.createReader(testFilePath, 
OrcFile.readerOptions(conf).filesystem(fs));
+    SearchArgument sarg = SearchArgumentFactory.newBuilder(conf)
+        .lessThan("str", PredicateLeaf.Type.STRING, "row 001000")
+        .build();
+    RecordReader recordReader = 
reader.rows(reader.options().searchArgument(sarg, null));
+    batch = reader.getSchema().createRowBatch();
+    strVector = (BytesColumnVector) batch.cols[0];
+    long base = 0;
+    while (recordReader.nextBatch(batch)) {
+      for(int r=0; r < batch.size; ++r) {
+        String value = String.format("row %06d", r + base);
+        assertEquals("row " + (r + base), value, strVector.toString(r));
+      }
+      base += batch.size;
+    }
+    // We should only read the first row group.
+    assertEquals(4 * 1024, base);
+  }
+
+  /**
+   * Test that files written before ORC-569 are read correctly.
+   */
+  @Test
+  public void testRowIndexPreORC569() throws Exception {
+    testFilePath = new Path(System.getProperty("example.dir"), 
"TestStringDictionary.testRowIndex.orc");
+    SearchArgument sarg = SearchArgumentFactory.newBuilder(conf)
+        .lessThan("str", PredicateLeaf.Type.STRING, "row 001000")
+        .build();
+    try (Reader reader = OrcFile.createReader(testFilePath, 
OrcFile.readerOptions(conf).filesystem(fs))) {
+      try (RecordReader recordReader = 
reader.rows(reader.options().searchArgument(sarg, null))) {
+        VectorizedRowBatch batch = reader.getSchema().createRowBatch();
+        BytesColumnVector strVector = (BytesColumnVector) batch.cols[0];
+        long base = 0;
+        while (recordReader.nextBatch(batch)) {
+          for (int r = 0; r < batch.size; ++r) {
+            String value = String.format("row %06d", r + base);
+            assertEquals("row " + (r + base), value, strVector.toString(r));
+          }
+          base += batch.size;
+        }
+        // We should only read the first row group.
+        assertEquals(4 * 1024, base);
+      }
+
+      try (RecordReader recordReader = reader.rows()) {
+        VectorizedRowBatch batch = reader.getSchema().createRowBatch();
+        recordReader.seekToRow(4 * 1024);
+        assertTrue(recordReader.nextBatch(batch));
+        recordReader.seekToRow(0);
+        assertTrue(recordReader.nextBatch(batch));
+      }
+    }
+  }
 }

[orc] branch master updated: ORC-621. Fix reader for empty positions list in first row index entry.

Reply via email to