[ https://issues.apache.org/jira/browse/ORC-101?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15495150#comment-15495150 ]
ASF GitHub Bot commented on ORC-101:
------------------------------------

GitHub user prasanthj commented on a diff in the pull request:

    https://github.com/apache/orc/pull/60#discussion_r79095035

    --- Diff: java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java ---
    @@ -106,49 +198,58 @@ public OrcIndex readRowIndex(StripeInformation stripe,
           if (indexes == null) {
             indexes = new OrcProto.RowIndex[typeCount];
           }
    +      if (bloomFilterKinds == null) {
    +        bloomFilterKinds = new OrcProto.Stream.Kind[typeCount];
    +      }
           if (bloomFilterIndices == null) {
             bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
           }
    -      long offset = stripe.getOffset();
    -      List<OrcProto.Stream> streams = footer.getStreamsList();
    -      for (int i = 0; i < streams.size(); i++) {
    -        OrcProto.Stream stream = streams.get(i);
    -        OrcProto.Stream nextStream = null;
    -        if (i < streams.size() - 1) {
    -          nextStream = streams.get(i+1);
    +      DiskRangeList ranges = planIndexReading(fileSchema, footer,
    +          ignoreNonUtf8BloomFilter, included, sargColumns, bloomFilterKinds);
    +      ranges = readDiskRanges(file, zcr, stripe.getOffset(), ranges, false);
    +      long offset = 0;
    +      DiskRangeList range = ranges;
    +      for(OrcProto.Stream stream: footer.getStreamsList()) {
    +        // advance to find the next range
    +        while (range != null && range.getEnd() <= offset) {
    +          range = range.next;
             }
    -        int col = stream.getColumn();
    -        int len = (int) stream.getLength();
    -        // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
    -        // filter and combine the io to read row index and bloom filters for that column together
    -        if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.ROW_INDEX)) {
    -          boolean readBloomFilter = false;
    -          if (sargColumns != null && sargColumns[col] &&
    -              nextStream.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER) {
    -            len += nextStream.getLength();
    -            i += 1;
    -            readBloomFilter = true;
    -          }
    -          if ((included == null || included[col]) && indexes[col] == null) {
    -            byte[] buffer = new byte[len];
    -            file.readFully(offset, buffer, 0, buffer.length);
    -            ByteBuffer bb = ByteBuffer.wrap(buffer);
    -            indexes[col] = OrcProto.RowIndex.parseFrom(InStream.create("index",
    -                ReaderImpl.singleton(new BufferChunk(bb, 0)), stream.getLength(),
    -                codec, bufferSize));
    -            if (readBloomFilter) {
    -              bb.position((int) stream.getLength());
    -              bloomFilterIndices[col] = OrcProto.BloomFilterIndex.parseFrom(InStream.create(
    -                  "bloom_filter", ReaderImpl.singleton(new BufferChunk(bb, 0)),
    -                  nextStream.getLength(), codec, bufferSize));
    -            }
    +        // no more ranges, so we are done
    +        if (range == null) {
    +          break;
    +        }
    +        int column = stream.getColumn();
    +        if (stream.hasKind() && range.getOffset() <= offset) {
    +          switch (stream.getKind()) {
    +            case ROW_INDEX:
    +              if (included == null || included[column]) {
    +                ByteBuffer bb = range.getData().duplicate();
    +                bb.position((int) (offset - range.getOffset()));
    +                bb.limit((int) (bb.position() + stream.getLength()));
    +                indexes[column] = OrcProto.RowIndex.parseFrom(InStream.create("index",
    +                    ReaderImpl.singleton(new BufferChunk(bb, 0)), stream.getLength(),
    +                    codec, bufferSize));
    +              }
    +              break;
    +            case BLOOM_FILTER:
    +            case BLOOM_FILTER_UTF8:
    +              if (sargColumns != null && sargColumns[column]) {
    +                ByteBuffer bb = range.getData().duplicate();
    +                bb.position((int) (offset - range.getOffset()));
    +                bb.limit((int) (bb.position() + stream.getLength()));
    +                bloomFilterIndices[column] = OrcProto.BloomFilterIndex.parseFrom
    +                    (InStream.create("bloom_filter",
    --- End diff --

    same here.

> Correct the use of the default charset in the bloomfilter
> ---------------------------------------------------------
>
> Key: ORC-101
> URL: https://issues.apache.org/jira/browse/ORC-101
> Project: Orc
> Issue Type: Improvement
> Reporter: Owen O'Malley
> Assignee: Owen O'Malley
>
> Currently ORC's bloom filter depends on the default character set, which
> isn't constant between computers.

--
This message was sent by Atlassian JIRA (v6.3.4#6332)