jpountz commented on a change in pull request #1234: Add compression for Binary doc value fields URL: https://github.com/apache/lucene-solr/pull/1234#discussion_r376528169
########## File path: lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java ########## @@ -742,6 +755,131 @@ public BytesRef binaryValue() throws IOException { }; } } + } + + // Decompresses blocks of binary values to retrieve content + class BinaryDecoder { + + private final LongValues addresses; + private final IndexInput compressedData; + // Cache of last uncompressed block + private long lastBlockId = -1; + private int []uncompressedDocEnds = new int[Lucene80DocValuesFormat.BINARY_DOCS_PER_COMPRESSED_BLOCK]; + private int uncompressedBlockLength = 0; + private int numDocsInBlock = 0; + private final byte[] uncompressedBlock; + private final BytesRef uncompressedBytesRef; + + public BinaryDecoder(LongValues addresses, IndexInput compressedData, int biggestUncompressedBlockSize) { + super(); + this.addresses = addresses; + this.compressedData = compressedData; + // pre-allocate a byte array large enough for the biggest uncompressed block needed. + this.uncompressedBlock = new byte[biggestUncompressedBlockSize]; + uncompressedBytesRef = new BytesRef(uncompressedBlock); + + } + + BytesRef decode(int docNumber) throws IOException { + int blockId = docNumber >> Lucene80DocValuesFormat.BINARY_BLOCK_SHIFT; + int docInBlockId = docNumber % Lucene80DocValuesFormat.BINARY_DOCS_PER_COMPRESSED_BLOCK; + assert docInBlockId < Lucene80DocValuesFormat.BINARY_DOCS_PER_COMPRESSED_BLOCK; + + + // already read and uncompressed? + if (blockId != lastBlockId) { + lastBlockId = blockId; + long blockStartOffset = addresses.get(blockId); + compressedData.seek(blockStartOffset); + + numDocsInBlock = compressedData.readVInt(); Review comment: Do we really need to record the number of documents in the block? It should be 32 for all blocks except the last one. Maybe at index time we could append dummy values to the last block so that it has 32 values too, and then we wouldn't need this vInt anymore. 
---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org