[
https://issues.apache.org/jira/browse/ORC-101?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15495150#comment-15495150
]
ASF GitHub Bot commented on ORC-101:
------------------------------------
Github user prasanthj commented on a diff in the pull request:
https://github.com/apache/orc/pull/60#discussion_r79095035
--- Diff: java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java ---
@@ -106,49 +198,58 @@ public OrcIndex readRowIndex(StripeInformation stripe,
if (indexes == null) {
indexes = new OrcProto.RowIndex[typeCount];
}
+ if (bloomFilterKinds == null) {
+ bloomFilterKinds = new OrcProto.Stream.Kind[typeCount];
+ }
if (bloomFilterIndices == null) {
bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
}
- long offset = stripe.getOffset();
- List<OrcProto.Stream> streams = footer.getStreamsList();
- for (int i = 0; i < streams.size(); i++) {
- OrcProto.Stream stream = streams.get(i);
- OrcProto.Stream nextStream = null;
- if (i < streams.size() - 1) {
- nextStream = streams.get(i+1);
+      DiskRangeList ranges = planIndexReading(fileSchema, footer,
+          ignoreNonUtf8BloomFilter, included, sargColumns, bloomFilterKinds);
+      ranges = readDiskRanges(file, zcr, stripe.getOffset(), ranges, false);
+      long offset = 0;
+      DiskRangeList range = ranges;
+      for(OrcProto.Stream stream: footer.getStreamsList()) {
+        // advance to find the next range
+        while (range != null && range.getEnd() <= offset) {
+          range = range.next;
         }
-        int col = stream.getColumn();
-        int len = (int) stream.getLength();
-        // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
-        // filter and combine the io to read row index and bloom filters for that column together
-        if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.ROW_INDEX)) {
-          boolean readBloomFilter = false;
-          if (sargColumns != null && sargColumns[col] &&
-              nextStream.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER) {
-            len += nextStream.getLength();
-            i += 1;
-            readBloomFilter = true;
-          }
-          if ((included == null || included[col]) && indexes[col] == null) {
-            byte[] buffer = new byte[len];
-            file.readFully(offset, buffer, 0, buffer.length);
-            ByteBuffer bb = ByteBuffer.wrap(buffer);
-            indexes[col] = OrcProto.RowIndex.parseFrom(InStream.create("index",
-                ReaderImpl.singleton(new BufferChunk(bb, 0)), stream.getLength(),
-                codec, bufferSize));
-            if (readBloomFilter) {
-              bb.position((int) stream.getLength());
-              bloomFilterIndices[col] = OrcProto.BloomFilterIndex.parseFrom(InStream.create(
-                  "bloom_filter", ReaderImpl.singleton(new BufferChunk(bb, 0)),
-                  nextStream.getLength(), codec, bufferSize));
-            }
+        // no more ranges, so we are done
+        if (range == null) {
+          break;
+        }
+        int column = stream.getColumn();
+        if (stream.hasKind() && range.getOffset() <= offset) {
+          switch (stream.getKind()) {
+          case ROW_INDEX:
+            if (included == null || included[column]) {
+              ByteBuffer bb = range.getData().duplicate();
+              bb.position((int) (offset - range.getOffset()));
+              bb.limit((int) (bb.position() + stream.getLength()));
+              indexes[column] = OrcProto.RowIndex.parseFrom(InStream.create("index",
+                  ReaderImpl.singleton(new BufferChunk(bb, 0)), stream.getLength(),
+                  codec, bufferSize));
+            }
+            break;
+          case BLOOM_FILTER:
+          case BLOOM_FILTER_UTF8:
+            if (sargColumns != null && sargColumns[column]) {
+              ByteBuffer bb = range.getData().duplicate();
+              bb.position((int) (offset - range.getOffset()));
+              bb.limit((int) (bb.position() + stream.getLength()));
+              bloomFilterIndices[column] = OrcProto.BloomFilterIndex.parseFrom
+                  (InStream.create("bloom_filter",
--- End diff --
same here.
> Correct the use of the default charset in the bloomfilter
> ---------------------------------------------------------
>
> Key: ORC-101
> URL: https://issues.apache.org/jira/browse/ORC-101
> Project: Orc
> Issue Type: Improvement
> Reporter: Owen O'Malley
> Assignee: Owen O'Malley
>
> Currently ORC's bloom filter depends on the default character set, which
> isn't constant between computers.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)