This is an automated email from the ASF dual-hosted git repository.
omalley pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/master by this push:
new 6c519eb ORC-621. Fix reader for empty positions list in first row
index entry.
6c519eb is described below
commit 6c519ebb33cf7c82d6f4f498b8ce65ebde4b5ccf
Author: Owen O'Malley <[email protected]>
AuthorDate: Mon Apr 20 15:55:10 2020 -0700
ORC-621. Fix reader for empty positions list in first row index entry.
Fixes #502
Signed-off-by: Owen O'Malley <[email protected]>
---
examples/TestStringDictionary.testRowIndex.orc | Bin 0 -> 74580 bytes
.../TestStringDictionary.testRowIndex.jsn.gz | Bin 0 -> 85500 bytes
java/core/src/java/org/apache/orc/Reader.java | 10 ---
.../java/org/apache/orc/impl/RecordReaderImpl.java | 15 +++-
.../org/apache/orc/impl/reader/StripePlanner.java | 2 +-
.../test/org/apache/orc/TestStringDictionary.java | 85 +++++++++++++++++++++
6 files changed, 100 insertions(+), 12 deletions(-)
diff --git a/examples/TestStringDictionary.testRowIndex.orc
b/examples/TestStringDictionary.testRowIndex.orc
new file mode 100644
index 0000000..cba483d
Binary files /dev/null and b/examples/TestStringDictionary.testRowIndex.orc
differ
diff --git a/examples/expected/TestStringDictionary.testRowIndex.jsn.gz
b/examples/expected/TestStringDictionary.testRowIndex.jsn.gz
new file mode 100644
index 0000000..5fb46b6
Binary files /dev/null and
b/examples/expected/TestStringDictionary.testRowIndex.jsn.gz differ
diff --git a/java/core/src/java/org/apache/orc/Reader.java
b/java/core/src/java/org/apache/orc/Reader.java
index 23d7a3e..007c515 100644
--- a/java/core/src/java/org/apache/orc/Reader.java
+++ b/java/core/src/java/org/apache/orc/Reader.java
@@ -408,16 +408,6 @@ public interface Reader extends Closeable {
if (sarg != null) {
buffer.append(", sarg: ");
buffer.append(sarg.toString());
- buffer.append(", columns: [");
- for(int i=0; i < columnNames.length; ++i) {
- if (i != 0) {
- buffer.append(", ");
- }
- buffer.append("'");
- buffer.append(columnNames[i]);
- buffer.append("'");
- }
- buffer.append("]");
}
if (schema != null) {
buffer.append(", schema: ");
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index f3b2079..f5a65ba 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -267,6 +267,13 @@ public class RecordReaderImpl implements RecordReader {
}
}
+ public static final class ZeroPositionProvider implements PositionProvider {
+ @Override
+ public long getNext() {
+ return 0;
+ }
+ }
+
public OrcProto.StripeFooter readStripeFooter(StripeInformation stripe
) throws IOException {
return dataReader.readStripeFooter(stripe);
@@ -1260,7 +1267,13 @@ public class RecordReaderImpl implements RecordReader {
PositionProvider[] index = new PositionProvider[rowIndices.length];
for (int i = 0; i < index.length; ++i) {
if (rowIndices[i] != null) {
- index[i] = new PositionProviderImpl(rowIndices[i].getEntry(rowEntry));
+ OrcProto.RowIndexEntry entry = rowIndices[i].getEntry(rowEntry);
+ // This is effectively a test for pre-ORC-569 files.
+ if (rowEntry == 0 && entry.getPositionsCount() == 0) {
+ index[i] = new ZeroPositionProvider();
+ } else {
+ index[i] = new PositionProviderImpl(entry);
+ }
}
}
reader.seek(index);
diff --git a/java/core/src/java/org/apache/orc/impl/reader/StripePlanner.java
b/java/core/src/java/org/apache/orc/impl/reader/StripePlanner.java
index 41a30e6..463a70c 100644
--- a/java/core/src/java/org/apache/orc/impl/reader/StripePlanner.java
+++ b/java/core/src/java/org/apache/orc/impl/reader/StripePlanner.java
@@ -472,7 +472,7 @@ public class StripePlanner {
encodings[stream.column].getKind(), kind, stream.kind,
isCompressed, hasNull[column]);
long start = Math.max(alreadyRead,
- stream.offset + ri.getEntry(group).getPositions(posn));
+ stream.offset + (group == 0 ? 0 :
ri.getEntry(group).getPositions(posn)));
long end = stream.offset;
if (endGroup == includedRowGroups.length - 1) {
end += stream.length;
diff --git a/java/core/src/test/org/apache/orc/TestStringDictionary.java
b/java/core/src/test/org/apache/orc/TestStringDictionary.java
index ec7d547..420c9e1 100644
--- a/java/core/src/test/org/apache/orc/TestStringDictionary.java
+++ b/java/core/src/test/org/apache/orc/TestStringDictionary.java
@@ -32,6 +32,9 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
import org.apache.orc.impl.OrcIndex;
import org.apache.orc.impl.OutStream;
import org.apache.orc.impl.RecordReaderImpl;
@@ -564,4 +567,86 @@ public class TestStringDictionary {
}
}
}
+
+ /**
+ * That when we disable dictionaries, we don't get broken row indexes.
+ */
+ @Test
+ public void testRowIndex() throws Exception {
+ TypeDescription schema =
+ TypeDescription.fromString("struct<str:string>");
+ // turn off the dictionaries
+ OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.setDouble(conf, 0);
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).rowIndexStride(4 *
1024));
+
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector strVector = (BytesColumnVector) batch.cols[0];
+
+ for (int i = 0; i < 32 * 1024; i++) {
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ byte[] value = String.format("row %06d",
i).getBytes(StandardCharsets.UTF_8);
+ strVector.setRef(batch.size, value, 0, value.length);
+ ++batch.size;
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
OrcFile.readerOptions(conf).filesystem(fs));
+ SearchArgument sarg = SearchArgumentFactory.newBuilder(conf)
+ .lessThan("str", PredicateLeaf.Type.STRING, "row 001000")
+ .build();
+ RecordReader recordReader =
reader.rows(reader.options().searchArgument(sarg, null));
+ batch = reader.getSchema().createRowBatch();
+ strVector = (BytesColumnVector) batch.cols[0];
+ long base = 0;
+ while (recordReader.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ String value = String.format("row %06d", r + base);
+ assertEquals("row " + (r + base), value, strVector.toString(r));
+ }
+ base += batch.size;
+ }
+ // We should only read the first row group.
+ assertEquals(4 * 1024, base);
+ }
+
+ /**
+ * Test that files written before ORC-569 are read correctly.
+ */
+ @Test
+ public void testRowIndexPreORC569() throws Exception {
+ testFilePath = new Path(System.getProperty("example.dir"),
"TestStringDictionary.testRowIndex.orc");
+ SearchArgument sarg = SearchArgumentFactory.newBuilder(conf)
+ .lessThan("str", PredicateLeaf.Type.STRING, "row 001000")
+ .build();
+ try (Reader reader = OrcFile.createReader(testFilePath,
OrcFile.readerOptions(conf).filesystem(fs))) {
+ try (RecordReader recordReader =
reader.rows(reader.options().searchArgument(sarg, null))) {
+ VectorizedRowBatch batch = reader.getSchema().createRowBatch();
+ BytesColumnVector strVector = (BytesColumnVector) batch.cols[0];
+ long base = 0;
+ while (recordReader.nextBatch(batch)) {
+ for (int r = 0; r < batch.size; ++r) {
+ String value = String.format("row %06d", r + base);
+ assertEquals("row " + (r + base), value, strVector.toString(r));
+ }
+ base += batch.size;
+ }
+ // We should only read the first row group.
+ assertEquals(4 * 1024, base);
+ }
+
+ try (RecordReader recordReader = reader.rows()) {
+ VectorizedRowBatch batch = reader.getSchema().createRowBatch();
+ recordReader.seekToRow(4 * 1024);
+ assertTrue(recordReader.nextBatch(batch));
+ recordReader.seekToRow(0);
+ assertTrue(recordReader.nextBatch(batch));
+ }
+ }
+ }
}