Author: prasanthj
Date: Tue Feb 3 18:34:35 2015
New Revision: 1656881
URL: http://svn.apache.org/r1656881
Log:
HIVE-9471: Bad seek in uncompressed ORC, at row-group boundary. (Mithun
Radhakrishnan reviewed by Prasanth Jayachandran)
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/InStream.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/IntegerWriter.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriter.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriterV2.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
hive/trunk/ql/src/test/resources/orc-file-has-null.out
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/InStream.java
URL:
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/InStream.java?rev=1656881&r1=1656880&r2=1656881&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/InStream.java
(original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/InStream.java Tue
Feb 3 18:34:35 2015
@@ -98,6 +98,12 @@ abstract class InStream extends InputStr
public void seek(long desired) {
for(int i = 0; i < bytes.length; ++i) {
+ if (desired == 0 && bytes[i].remaining() == 0) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("Attempting seek into empty stream (" + name + ") Skipping stream.");
+ }
+ return;
+ }
if (offsets[i] <= desired &&
desired - offsets[i] < bytes[i].remaining()) {
currentOffset = desired;
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/IntegerWriter.java
URL:
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/IntegerWriter.java?rev=1656881&r1=1656880&r2=1656881&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/IntegerWriter.java
(original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/IntegerWriter.java
Tue Feb 3 18:34:35 2015
@@ -40,6 +40,11 @@ interface IntegerWriter {
void write(long value) throws IOException;
/**
+ * Suppress underlying stream.
+ */
+ void suppress();
+
+ /**
* Flush the buffer
* @throws IOException
*/
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
URL:
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java?rev=1656881&r1=1656880&r2=1656881&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java Tue Feb 3 18:34:35 2015
@@ -1557,32 +1557,36 @@ class RecordReaderImpl implements Record
StreamName name = new StreamName(columnId,
OrcProto.Stream.Kind.DICTIONARY_DATA);
InStream in = streams.get(name);
- if (in.available() > 0) {
- dictionaryBuffer = new DynamicByteArray(64, in.available());
- dictionaryBuffer.readAll(in);
- // Since its start of strip invalidate the cache.
- dictionaryBufferInBytesCache = null;
+ if (in != null) { // Guard against empty dictionary stream.
+ if (in.available() > 0) {
+ dictionaryBuffer = new DynamicByteArray(64, in.available());
+ dictionaryBuffer.readAll(in);
+ // Since its start of strip invalidate the cache.
+ dictionaryBufferInBytesCache = null;
+ }
+ in.close();
} else {
dictionaryBuffer = null;
}
- in.close();
// read the lengths
name = new StreamName(columnId, OrcProto.Stream.Kind.LENGTH);
in = streams.get(name);
- IntegerReader lenReader = createIntegerReader(encodings.get(columnId)
- .getKind(), in, false);
- int offset = 0;
- if (dictionaryOffsets == null ||
- dictionaryOffsets.length < dictionarySize + 1) {
- dictionaryOffsets = new int[dictionarySize + 1];
+ if (in != null) { // Guard against empty LENGTH stream.
+ IntegerReader lenReader = createIntegerReader(encodings.get(columnId)
+ .getKind(), in, false);
+ int offset = 0;
+ if (dictionaryOffsets == null ||
+ dictionaryOffsets.length < dictionarySize + 1) {
+ dictionaryOffsets = new int[dictionarySize + 1];
+ }
+ for (int i = 0; i < dictionarySize; ++i) {
+ dictionaryOffsets[i] = offset;
+ offset += (int) lenReader.next();
+ }
+ dictionaryOffsets[dictionarySize] = offset;
+ in.close();
}
- for(int i=0; i < dictionarySize; ++i) {
- dictionaryOffsets[i] = offset;
- offset += (int) lenReader.next();
- }
- dictionaryOffsets[dictionarySize] = offset;
- in.close();
// set up the row reader
name = new StreamName(columnId, OrcProto.Stream.Kind.DATA);
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriter.java
URL:
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriter.java?rev=1656881&r1=1656880&r2=1656881&view=diff
==============================================================================
---
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriter.java
(original)
+++
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriter.java
Tue Feb 3 18:34:35 2015
@@ -31,7 +31,7 @@ class RunLengthIntegerWriter implements
static final int MIN_DELTA = -128;
static final int MAX_LITERAL_SIZE = 128;
private static final int MAX_REPEAT_SIZE = 127 + MIN_REPEAT_SIZE;
- private final PositionedOutputStream output;
+ private final OutStream output;
private final boolean signed;
private final long[] literals = new long[MAX_LITERAL_SIZE];
private int numLiterals = 0;
@@ -40,7 +40,7 @@ class RunLengthIntegerWriter implements
private int tailRunLength = 0;
private SerializationUtils utils;
- RunLengthIntegerWriter(PositionedOutputStream output,
+ RunLengthIntegerWriter(OutStream output,
boolean signed) {
this.output = output;
this.signed = signed;
@@ -135,6 +135,11 @@ class RunLengthIntegerWriter implements
}
@Override
+ public void suppress() {
+ this.output.suppress();
+ }
+
+ @Override
public void getPosition(PositionRecorder recorder) throws IOException {
output.getPosition(recorder);
recorder.addPosition(numLiterals);
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriterV2.java
URL:
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriterV2.java?rev=1656881&r1=1656880&r2=1656881&view=diff
==============================================================================
---
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriterV2.java
(original)
+++
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriterV2.java
Tue Feb 3 18:34:35 2015
@@ -138,7 +138,7 @@ class RunLengthIntegerWriterV2 implement
private int fixedRunLength = 0;
private int variableRunLength = 0;
private final long[] literals = new long[MAX_SCOPE];
- private final PositionedOutputStream output;
+ private final OutStream output;
private final boolean signed;
private EncodingType encoding;
private int numLiterals;
@@ -160,11 +160,11 @@ class RunLengthIntegerWriterV2 implement
private SerializationUtils utils;
private boolean alignedBitpacking;
- RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed) {
+ RunLengthIntegerWriterV2(OutStream output, boolean signed) {
this(output, signed, true);
}
- RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed,
+ RunLengthIntegerWriterV2(OutStream output, boolean signed,
boolean alignedBitpacking) {
this.output = output;
this.signed = signed;
@@ -818,6 +818,11 @@ class RunLengthIntegerWriterV2 implement
}
}
+ @Override
+ public void suppress() {
+ this.output.suppress();
+ }
+
private void initializeLiterals(long val) {
literals[numLiterals++] = val;
fixedRunLength = 1;
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
URL:
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java?rev=1656881&r1=1656880&r2=1656881&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
(original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java Tue
Feb 3 18:34:35 2015
@@ -630,7 +630,7 @@ class WriterImpl implements Writer, Memo
return rowIndexEntry;
}
- IntegerWriter createIntegerWriter(PositionedOutputStream output,
+ IntegerWriter createIntegerWriter(OutStream output,
boolean signed, boolean isDirectV2,
StreamFactory writer) {
if (isDirectV2) {
@@ -882,7 +882,7 @@ class WriterImpl implements Writer, Memo
StreamFactory writer,
boolean nullable) throws IOException {
super(columnId, inspector, writer, nullable);
- PositionedOutputStream out = writer.createStream(id,
+ OutStream out = writer.createStream(id,
OrcProto.Stream.Kind.DATA);
this.isDirectV2 = isNewWriteFormat(writer);
this.writer = createIntegerWriter(out, true, isDirectV2, writer);
@@ -1162,6 +1162,14 @@ class WriterImpl implements Writer, Memo
// Write the dictionary by traversing the red-black tree writing out
// the bytes and lengths; and creating the map from the original order
// to the final sorted order.
+ if (dictionary.size() == 0) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("Empty dictionary. Suppressing dictionary stream.");
+ }
+ stringOutput.suppress();
+ lengthOutput.suppress();
+ }
+
dictionary.visit(new StringRedBlackTree.Visitor() {
private int currentId = 0;
@Override
@@ -1467,7 +1475,7 @@ class WriterImpl implements Writer, Memo
StreamFactory writer,
boolean nullable) throws IOException {
super(columnId, inspector, writer, nullable);
- PositionedOutputStream out = writer.createStream(id,
+ OutStream out = writer.createStream(id,
OrcProto.Stream.Kind.DATA);
this.isDirectV2 = isNewWriteFormat(writer);
this.writer = createIntegerWriter(out, true, isDirectV2, writer);
Modified: hive/trunk/ql/src/test/resources/orc-file-has-null.out
URL:
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/resources/orc-file-has-null.out?rev=1656881&r1=1656880&r2=1656881&view=diff
==============================================================================
--- hive/trunk/ql/src/test/resources/orc-file-has-null.out (original)
+++ hive/trunk/ql/src/test/resources/orc-file-has-null.out Tue Feb 3 18:34:35
2015
@@ -48,7 +48,7 @@ Stripes:
Entry 2:count: 1000 hasNull: false min: RG3 max: RG3 sum: 3000
positions: 0,2,125,0,0,66,488
Entry 3:count: 0 hasNull: true positions: 0,4,125,0,0,136,488
Entry 4:count: 0 hasNull: true positions: 0,6,125,0,0,136,488
- Stripe: offset: 424 data: 156 rows: 5000 tail: 60 index: 119
+ Stripe: offset: 424 data: 156 rows: 5000 tail: 55 index: 119
Stream: column 0 section ROW_INDEX start: 424 length 17
Stream: column 1 section ROW_INDEX start: 441 length 63
Stream: column 2 section ROW_INDEX start: 504 length 39
@@ -56,8 +56,6 @@ Stripes:
Stream: column 1 section LENGTH start: 656 length 32
Stream: column 2 section PRESENT start: 688 length 11
Stream: column 2 section DATA start: 699 length 0
- Stream: column 2 section LENGTH start: 699 length 0
- Stream: column 2 section DICTIONARY_DATA start: 699 length 0
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DICTIONARY_V2[0]
@@ -67,15 +65,15 @@ Stripes:
Entry 2:count: 0 hasNull: true positions: 0,2,120,0,0,0,0
Entry 3:count: 0 hasNull: true positions: 0,4,115,0,0,0,0
Entry 4:count: 0 hasNull: true positions: 0,6,110,0,0,0,0
- Stripe: offset: 759 data: 186 rows: 5000 tail: 60 index: 148
- Stream: column 0 section ROW_INDEX start: 759 length 17
- Stream: column 1 section ROW_INDEX start: 776 length 63
- Stream: column 2 section ROW_INDEX start: 839 length 68
- Stream: column 1 section DATA start: 907 length 113
- Stream: column 1 section LENGTH start: 1020 length 32
- Stream: column 2 section DATA start: 1052 length 24
- Stream: column 2 section LENGTH start: 1076 length 6
- Stream: column 2 section DICTIONARY_DATA start: 1082 length 11
+ Stripe: offset: 754 data: 186 rows: 5000 tail: 60 index: 148
+ Stream: column 0 section ROW_INDEX start: 754 length 17
+ Stream: column 1 section ROW_INDEX start: 771 length 63
+ Stream: column 2 section ROW_INDEX start: 834 length 68
+ Stream: column 1 section DATA start: 902 length 113
+ Stream: column 1 section LENGTH start: 1015 length 32
+ Stream: column 2 section DATA start: 1047 length 24
+ Stream: column 2 section LENGTH start: 1071 length 6
+ Stream: column 2 section DICTIONARY_DATA start: 1077 length 11
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DICTIONARY_V2[1]
@@ -85,16 +83,14 @@ Stripes:
Entry 2:count: 1000 hasNull: false min: STRIPE-3 max: STRIPE-3 sum: 8000
positions: 0,198,464
Entry 3:count: 1000 hasNull: false min: STRIPE-3 max: STRIPE-3 sum: 8000
positions: 0,330,440
Entry 4:count: 1000 hasNull: false min: STRIPE-3 max: STRIPE-3 sum: 8000
positions: 0,462,416
- Stripe: offset: 1153 data: 156 rows: 5000 tail: 60 index: 119
- Stream: column 0 section ROW_INDEX start: 1153 length 17
- Stream: column 1 section ROW_INDEX start: 1170 length 63
- Stream: column 2 section ROW_INDEX start: 1233 length 39
- Stream: column 1 section DATA start: 1272 length 113
- Stream: column 1 section LENGTH start: 1385 length 32
- Stream: column 2 section PRESENT start: 1417 length 11
- Stream: column 2 section DATA start: 1428 length 0
- Stream: column 2 section LENGTH start: 1428 length 0
- Stream: column 2 section DICTIONARY_DATA start: 1428 length 0
+ Stripe: offset: 1148 data: 156 rows: 5000 tail: 55 index: 119
+ Stream: column 0 section ROW_INDEX start: 1148 length 17
+ Stream: column 1 section ROW_INDEX start: 1165 length 63
+ Stream: column 2 section ROW_INDEX start: 1228 length 39
+ Stream: column 1 section DATA start: 1267 length 113
+ Stream: column 1 section LENGTH start: 1380 length 32
+ Stream: column 2 section PRESENT start: 1412 length 11
+ Stream: column 2 section DATA start: 1423 length 0
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DICTIONARY_V2[0]
@@ -105,6 +101,6 @@ Stripes:
Entry 3:count: 0 hasNull: true positions: 0,4,115,0,0,0,0
Entry 4:count: 0 hasNull: true positions: 0,6,110,0,0,0,0
-File length: 1736 bytes
+File length: 1728 bytes
Padding length: 0 bytes
Padding ratio: 0%