[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r245993566 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,601 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: A lookup table for block offset and index, and a rank structure + * for DENSE block index lookups. + * + * The lookup table is an array of {@code int}-pairs, with a pair for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each int-pair entry consists of 2 logical parts: + * + * The first 32 bit int holds the index (number of set bits in the blocks) up to just before the + * wanted block. The maximum number of set bits is the maximum number of documents, which is < 2^31. + * + * The next int holds the offset in bytes into the underlying slice. 
As there is a maximum of 2^16 + * blocks, it follows that the maximum size of any block must not exceed 2^15 bytes to avoid + * overflow (2^16 bytes if the int is treated as unsigned). This is currently the case, with the + * largest block being DENSE and using 2^13 + 36 bytes. + * + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of byte-pairs with an entry for each + * sub-block (default 512 bits) out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-2 = 65634 and thus will always fit into an byte-pair of 16 bits. + *
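For illustration, a minimal sketch in Java of the O(1) block jump described above. The names and the exact on-disk layout are assumptions for this sketch; the description only fixes that each entry is an index int followed by an offset int, read here as one big-endian long.

import java.io.IOException;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RandomAccessInput;

// Sketch only, not the patch's code: decode one int-pair jump-table entry and
// seek the slice directly to the target block, skipping per-block iteration.
final class JumpTableSketch {
  static int jumpToBlock(RandomAccessInput jumpTable, IndexInput slice,
                         int targetBlock) throws IOException {
    long entry = jumpTable.readLong((long) targetBlock * Long.BYTES);
    int index  = (int) (entry >>> 32);         // first int: set bits before the block
    int offset = (int) (entry & 0xFFFFFFFFL);  // second int: byte offset into the slice
    slice.seek(offset);                        // O(1) jump, no block-by-block walk
    return index;                              // caller resumes decoding from here
  }
}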
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r245364034 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,601 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: A lookup table for block offset and index, and a rank structure + * for DENSE block index lookups. + * + * The lookup table is an array of {@code int}-pairs, with a pair for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each int-pair entry consists of 2 logical parts: + * + * The first 32 bit int holds the index (number of set bits in the blocks) up to just before the + * wanted block. The maximum number of set bits is the maximum number of documents, which is < 2^31. + * + * The next int holds the offset in bytes into the underlying slice. 
As there is a maximum of 2^16 + * blocks, it follows that the maximum size of any block must not exceed 2^15 bytes to avoid + * overflow (2^16 bytes if the int is treated as unsigned). This is currently the case, with the + * largest block being DENSE and using 2^13 + 36 bytes. + * + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of byte-pairs with an entry for each + * sub-block (default 512 bits) out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-2 = 65634 and thus will always fit into an byte-pair of 16 bits. + *
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r245040952 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits hold the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset in bytes into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 288 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure
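A rough sketch of the DENSE rank lookup this excerpt describes, under the stated layout (one unsigned 16-bit rank entry per 512-bit sub-block). The names and the in-memory char[]/long[] copies are assumptions for illustration; the real code reads from the underlying slice.

// Sketch: the rank entry counts the set bits before the sub-block, so only the
// longs between the sub-block start and the target position need a popcount.
final class DenseRankSketch {
  static int indexWithinDense(char[] rank, long[] bits, int targetInBlock) {
    int subBlock = targetInBlock >>> 9;        // 512 bits per sub-block
    int index = rank[subBlock];                // set bits before the sub-block
    int targetWord = targetInBlock >>> 6;      // 64 bits per long
    for (int w = subBlock << 3; w < targetWord; w++) { // 512/64 = 8 longs per sub-block
      index += Long.bitCount(bits[w]);
    }
    // finally add the set bits below the target position inside its own long
    index += Long.bitCount(bits[targetWord] & ((1L << (targetInBlock & 63)) - 1));
    return index;
  }
}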
[GitHub] lucene-solr pull request #:
Github user dsmiley commented on the pull request: https://github.com/apache/lucene-solr/commit/c04a3ac306b017054173d986e32ea217e2fc6595#commitcomment-31829970 In solr/core/src/java/org/apache/solr/response/transform/ChildDocTransformerFactory.java on line 162: I think this illustrates a problem with this syntax that you invented. The code here says the absence of any '/' means it's a "regular filter, not hierarchy based". But how then do you articulate a path filter for all elements named "foo" regardless of parentage? So I think "childFilter" is trying to do double duty here, leading to some edge cases as we try to guess what was intended. I propose instead that we have a different local param, "pathFilter". Both filters will be AND'ed together. WDYT? --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
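To make the proposal concrete, a hypothetical request combining the two params (the field names and paths are placeholders, and this syntax is only proposed above, not committed):

fl=id,[child childFilter='comment_stars_i:5' pathFilter='/posts/comments']

Here childFilter would keep its plain-query meaning with no path semantics, pathFilter alone would constrain the nest path, and the transformer would AND the two together.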
[GitHub] lucene-solr pull request #305: SOLR-11853: Make check for used tool "service...
Github user Mandalka closed the pull request at: https://github.com/apache/lucene-solr/pull/305 --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #531: SOLR-12768
GitHub user moshebla opened a pull request: https://github.com/apache/lucene-solr/pull/531 SOLR-12768 You can merge this pull request into a Git repository by running: $ git pull https://github.com/moshebla/lucene-solr SOLR-12768 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/lucene-solr/pull/531.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #531 commit 6b147506ab7e8c7eda52dde9748fdebef8133194 Author: Moshe Date: 2019-01-01T10:29:15Z SOLR-12678: applied David's patch commit 83560ecf38f52f31d48b133d0c1a925fa7df8c28 Author: Moshe Date: 2019-01-01T12:37:47Z SOLR-12768: remove last / from _NEST_PATH_FIELD filter in ChildDocTransformerFactory#processPathHierarchyQueryString commit 75a1ad79eb5255a4b2e6807042140bbe3d09bb58 Author: Moshe Date: 2019-01-01T15:06:38Z SOLR-12768: fix partial and full path support in ChildDocTransformer --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #530: Branch 7 4 : IndexOutOfBoundsException random...
Github user IntraCherche closed the pull request at: https://github.com/apache/lucene-solr/pull/530 --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r244336948 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits hold the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset in bytes into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 288 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r244332659 --- Diff: lucene/core/src/test/org/apache/lucene/index/TestDocValues.java --- @@ -123,8 +124,72 @@ public void testNumericField() throws Exception { iw.close(); dir.close(); } - - /** + + // The LUCENE-8585 jump-tables enables O(1) skipping of IndexedDISI blocks, DENSE block lookup + // and numeric multi blocks. This test focuses on testing these jumps. + @Slow + public void testNumericFieldJumpTables() throws Exception { +Directory dir = newDirectory(); +IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null)); +final int maxDoc = atLeast(3*65536); // Must be above 3*65536 to trigger IndexedDISI block skips --- End diff -- Moved to `TestLucene80DocValuesFormat` and adjusted as you suggested. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
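The access pattern that test exercises, sketched in Java (the field name and docID are placeholders, and the sketch assumes the target doc actually has a value):

import java.io.IOException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;

// Sketch: with maxDoc above 3*65536, advancing straight to a high docID forces
// IndexedDISI to cross whole 65536-doc blocks via the jump table instead of
// iterating one block at a time.
final class BlockSkipSketch {
  static long valueAt(LeafReader reader, int docID) throws IOException {
    NumericDocValues dv = DocValues.getNumeric(reader, "dv"); // "dv" is a placeholder
    return dv.advanceExact(docID) ? dv.longValue() : -1;      // -1: no value for docID
  }
}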
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r244331405 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits hold the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset in bytes into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 288 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r244329158 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits hold the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset in bytes into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 288 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r244328885 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits hold the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset in bytes into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 288 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. --- End diff -- Nice catch. That was me mixing bits & bytes and arriving at too harsh a requirement, making the representation needlessly complicated: All block types are < 2^17 *bits*, but the offset is in *bytes*, so we do not require 2^16 * 2^17 bits to hold it; only 2^16 * 2^17 / 2^3 = 2^30 bytes, which fits in 30 bits. With the offset-requirement lowered from 33 to 30 bits, it is much more natural to represent offset & index as two ints. This will be in the next commit. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
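Restating that corrected bound as a quick self-check (a sketch of the arithmetic only):

// 2^16 blocks, each under 2^17 bits = 2^14 bytes, bound the slice at 2^30
// bytes, so a byte offset always fits in 30 bits, comfortably inside an int.
final class OffsetBoundSketch {
  public static void main(String[] args) {
    long maxBlockBytes = 1L << 14;                    // 2^17 bits / 8
    long maxSliceBytes = (1L << 16) * maxBlockBytes;  // across all 2^16 blocks
    System.out.println(maxSliceBytes == 1L << 30);    // prints true
  }
}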
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r244318898 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: A lookup table for block blockCache and index, and a rank structure --- End diff -- Indeed I did. I have replaced it with `offset` in the next push. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r244007693 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- @@ -42,15 +53,36 @@ public DistributedStandaloneUpdateProcessor(SolrQueryRequest req, } @Override - String getCollectionName(CloudDescriptor cloudDesc) { + String computeCollectionName(CloudDescriptor cloudDesc) { return null; } @Override - Replica.Type getReplicaType(CloudDescriptor cloudDesc) { + Replica.Type computeReplicaType(CloudDescriptor cloudDesc) { return Replica.Type.NRT; } + @Override + public void processCommit(CommitUpdateCommand cmd) throws IOException { + +assert TestInjection.injectFailUpdateRequests(); + +updateCommand = cmd; + +CompletionService completionService = new ExecutorCompletionService<>(req.getCore().getCoreContainer().getUpdateShardHandler().getUpdateExecutor()); --- End diff -- Hopefully I addressed your concerns with the latest commit. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #530: Branch 7 4 : IndexOutOfBoundsException random...
GitHub user IntraCherche opened a pull request: https://github.com/apache/lucene-solr/pull/530 Branch 7 4 : IndexOutOfBoundsException randomly appearing while indexing Hi, First of all, sorry if this is not the right place to write the issue I am encountering. I saw other people were having the same problem on StackOverflow (https://stackoverflow.com/questions/52783491/solr-indexing-error-possible-analysis-error), although no solution was provided so far. Anyway, feel free to reroute my pull request. While indexing documents with Solr 7.4, the following exception appears randomly. It is very annoying because I cannot reproduce it on my machine and I don't have access to the customer's data. All I know is that it can appear on pdf, xls, or eml documents. It looks like the issue stems from the FlattenGraphFilter class, specifically `restoreState(inputNode.tokens.get(inputNode.nextOut))`. The stack trace shows:

Caused by: java.lang.IndexOutOfBoundsException: Index: 1, Size: 1
  at java.util.ArrayList.rangeCheck(ArrayList.java:657)
  at java.util.ArrayList.get(ArrayList.java:433)
  at org.apache.lucene.analysis.core.FlattenGraphFilter.releaseBufferedToken(FlattenGraphFilter.java:204)
  at org.apache.lucene.analysis.core.FlattenGraphFilter.incrementToken(FlattenGraphFilter.java:258)
  at org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:738)
  at org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:428)
  at org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:392)
  at org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:251)
  at org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:494)
  at org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1602)
  at org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1594)
  at org.apache.solr.update.DirectUpdateHandler2.updateDocument(DirectUpdateHandler2.java:982)
  at org.apache.solr.update.DirectUpdateHandler2.updateDocOrDocValues(DirectUpdateHandler2.java:971)
  at org.apache.solr.update.DirectUpdateHandler2.doNormalUpdate(DirectUpdateHandler2.java:348)
  at org.apache.solr.update.DirectUpdateHandler2.addDoc0(DirectUpdateHandler2.java:284)
  at org.apache.solr.update.DirectUpdateHandler2.addDoc(DirectUpdateHandler2.java:234)

If more info is needed I would be delighted to provide it, as long as I have it (e.g. the documents are customer-confidential so I don't have access to them). Kind regards

You can merge this pull request into a Git repository by running: $ git pull https://github.com/apache/lucene-solr branch_7_4 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/lucene-solr/pull/530.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #530 commit 1ed95c097b82ee5f175e93f3fe62572abe064da6 Author: Jim Ferenczi Date: 2018-05-03T14:11:52Z LUCENE-8231: Add missing part of speech filter in the SPI META-INF file commit 0e45b39476f287fe56ec4734c5c8695a99f8cb5d Author: Christine Poerschke Date: 2018-05-03T17:42:56Z json-facet-api.adoc typo fix. commit dad48603aec715063fdcb71e11fe73599d63c3a2 Author: Simon Willnauer Date: 2018-05-03T07:29:12Z LUCENE-8293: Ensure only hard deletes are carried over in a merge Today we carry over hard deletes based on the SegmentReaders liveDocs. This is not correct if soft-deletes are used especially with retention policies.
If a soft delete is added while a segment is merged the document might end up hard deleted in the target segment. This isn't necessarily a correctness issue but causes unnecessary writes of hard-deletes. The biggest issue here is that we assert that previously deleted documents are still deleted in the live-docs we apply and that might be violated by the retention policy. commit 27992e3386f7ba9521329ec8a1957103d73da2e4 Author: Adrien Grand Date: 2018-05-04T12:38:42Z LUCENE-8295: Remove useless liveDocsSharedPending flag. commit 6ec1198a5144e73b47bc88cd79f534878cbcbef4 Author: Anshum Gupta Date: 2018-05-03T22:00:47Z SOLR-11277: Add auto hard commit setting based on tlog size (this closes #358) commit 2558a06f3c8040d47de6606da2539532e4c67854 Author: Erick Erickson Date: 2018-05-05T05:30:18Z SOLR-12028: BadApple and AwaitsFix annotations usage (cherry picked from commit 89fc02a) commit 96f079b4b47eaadff65c7aaf0e5bafe68e30ec3b Author: Uwe Schindler Date: 2018-05-06T12:21:34Z SOLR-12316: Do not allow to use absolute URIs for including other files in solrconfig.xml and schema
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243999855 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.update.processor; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.CompletionService; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.Future; + +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.update.AddUpdateCommand; +import org.apache.solr.update.CommitUpdateCommand; +import org.apache.solr.update.DeleteUpdateCommand; +import org.apache.solr.update.UpdateCommand; +import org.apache.solr.util.TestInjection; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class DistributedStandaloneUpdateProcessor extends DistributedUpdateProcessor { + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + public DistributedStandaloneUpdateProcessor(SolrQueryRequest req, + SolrQueryResponse rsp, UpdateRequestProcessor next) { +super(req, rsp, next); + } + + public DistributedStandaloneUpdateProcessor(SolrQueryRequest req, + SolrQueryResponse rsp, AtomicUpdateDocumentMerger docMerger, + UpdateRequestProcessor next) { +super(req, rsp, docMerger, next); + } + + @Override + Replica.Type computeReplicaType() { +return Replica.Type.NRT; + } + + @Override + public void processCommit(CommitUpdateCommand cmd) throws IOException { + +assert TestInjection.injectFailUpdateRequests(); + +updateCommand = cmd; + +CompletionService completionService = new ExecutorCompletionService<>(req.getCore().getCoreContainer().getUpdateShardHandler().getUpdateExecutor()); +Set> pending = new HashSet<>(); +if (replicaType == Replica.Type.TLOG) { + if (isLeader) { +super.processCommit(cmd); + } +} else if (replicaType == Replica.Type.PULL) { + log.warn("Commit not supported on replicas of type " + Replica.Type.PULL); +} else { + // NRT replicas will always commit + super.processCommit(cmd); +} + } + + @Override + public void processAdd(AddUpdateCommand cmd) throws IOException { +assert TestInjection.injectFailUpdateRequests(); + +updateCommand = cmd; +isLeader = getNonZkLeaderAssumption(req); + +super.processAdd(cmd); + } + + @Override + protected void doDeleteById(DeleteUpdateCommand cmd) throws IOException { +isLeader = getNonZkLeaderAssumption(req); +super.doDeleteById(cmd); + } + + @Override + protected void 
doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { +// even in non zk mode, tests simulate updates from a leader +isLeader = getNonZkLeaderAssumption(req); +super.doDeleteByQuery(cmd, null, null); + } + + @Override + public void finish() throws IOException { --- End diff -- Never mind; just leave this be then. Let's keep our changes to clearer, more obvious ones. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243995067 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.update.processor; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.CompletionService; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.Future; + +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.update.AddUpdateCommand; +import org.apache.solr.update.CommitUpdateCommand; +import org.apache.solr.update.DeleteUpdateCommand; +import org.apache.solr.update.UpdateCommand; +import org.apache.solr.util.TestInjection; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class DistributedStandaloneUpdateProcessor extends DistributedUpdateProcessor { + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + public DistributedStandaloneUpdateProcessor(SolrQueryRequest req, + SolrQueryResponse rsp, UpdateRequestProcessor next) { +super(req, rsp, next); + } + + public DistributedStandaloneUpdateProcessor(SolrQueryRequest req, + SolrQueryResponse rsp, AtomicUpdateDocumentMerger docMerger, + UpdateRequestProcessor next) { +super(req, rsp, docMerger, next); + } + + @Override + Replica.Type computeReplicaType() { +return Replica.Type.NRT; + } + + @Override + public void processCommit(CommitUpdateCommand cmd) throws IOException { + +assert TestInjection.injectFailUpdateRequests(); + +updateCommand = cmd; + +CompletionService completionService = new ExecutorCompletionService<>(req.getCore().getCoreContainer().getUpdateShardHandler().getUpdateExecutor()); +Set> pending = new HashSet<>(); +if (replicaType == Replica.Type.TLOG) { + if (isLeader) { +super.processCommit(cmd); + } +} else if (replicaType == Replica.Type.PULL) { + log.warn("Commit not supported on replicas of type " + Replica.Type.PULL); +} else { + // NRT replicas will always commit + super.processCommit(cmd); +} + } + + @Override + public void processAdd(AddUpdateCommand cmd) throws IOException { +assert TestInjection.injectFailUpdateRequests(); + +updateCommand = cmd; +isLeader = getNonZkLeaderAssumption(req); + +super.processAdd(cmd); + } + + @Override + protected void doDeleteById(DeleteUpdateCommand cmd) throws IOException { +isLeader = getNonZkLeaderAssumption(req); +super.doDeleteById(cmd); + } + + @Override + protected void 
doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { +// even in non zk mode, tests simulate updates from a leader +isLeader = getNonZkLeaderAssumption(req); +super.doDeleteByQuery(cmd, null, null); + } + + @Override + public void finish() throws IOException { --- End diff -- I'm still trying to dig in further and see why the last line (`if (next != null && nodes == null) next.finish();`) was put in an if and not an else if statement. `nodes` is supposed to be the replicas of the current nodes, and when zkEnabled was false it was set to null. I am still not sure why this was in a completely separate if statement. @dsmiley, WDYT? --- -
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243850805 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.update.processor; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.CompletionService; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.Future; + +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.update.AddUpdateCommand; +import org.apache.solr.update.CommitUpdateCommand; +import org.apache.solr.update.DeleteUpdateCommand; +import org.apache.solr.update.UpdateCommand; +import org.apache.solr.util.TestInjection; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class DistributedStandaloneUpdateProcessor extends DistributedUpdateProcessor { + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + public DistributedStandaloneUpdateProcessor(SolrQueryRequest req, + SolrQueryResponse rsp, UpdateRequestProcessor next) { +super(req, rsp, next); + } + + public DistributedStandaloneUpdateProcessor(SolrQueryRequest req, + SolrQueryResponse rsp, AtomicUpdateDocumentMerger docMerger, + UpdateRequestProcessor next) { +super(req, rsp, docMerger, next); + } + + @Override + Replica.Type computeReplicaType() { +return Replica.Type.NRT; + } + + @Override + public void processCommit(CommitUpdateCommand cmd) throws IOException { + +assert TestInjection.injectFailUpdateRequests(); + +updateCommand = cmd; + +CompletionService completionService = new ExecutorCompletionService<>(req.getCore().getCoreContainer().getUpdateShardHandler().getUpdateExecutor()); +Set> pending = new HashSet<>(); +if (replicaType == Replica.Type.TLOG) { + if (isLeader) { +super.processCommit(cmd); + } +} else if (replicaType == Replica.Type.PULL) { + log.warn("Commit not supported on replicas of type " + Replica.Type.PULL); +} else { + // NRT replicas will always commit + super.processCommit(cmd); +} + } + + @Override + public void processAdd(AddUpdateCommand cmd) throws IOException { +assert TestInjection.injectFailUpdateRequests(); + +updateCommand = cmd; +isLeader = getNonZkLeaderAssumption(req); + +super.processAdd(cmd); + } + + @Override + protected void doDeleteById(DeleteUpdateCommand cmd) throws IOException { +isLeader = getNonZkLeaderAssumption(req); +super.doDeleteById(cmd); + } + + @Override + protected void 
doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { +// even in non zk mode, tests simulate updates from a leader +isLeader = getNonZkLeaderAssumption(req); --- End diff -- I think it can also be used in the zkProcessor, I'll investigate further.
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243850226 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- (same quoted hunk as in the previous comment, ending at the finish() override) --- End diff -- Well, previously, the code was:
```
@Override
public void finish() throws IOException {
  assert ! finished : "lifecycle sanity check";
  finished = true;
  if (zkEnabled) doFinish();
  if (next != null && nodes == null) next.finish();
}
```
So nodes was always set to null when zkEnabled was false (see DistributedUpdateProcessor#setupRequest on master). So, I just split
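Given the invariants cited above (zkEnabled is always false and nodes is always null in standalone mode), the quoted finish() logically reduces to the following for the standalone processor. This is an illustrative reduction, not the code actually committed in the PR:
```
// What the original finish() collapses to under the standalone invariants:
// doFinish() is unreachable and the nodes == null check is always true.
@Override
public void finish() throws IOException {
  assert ! finished : "lifecycle sanity check";
  finished = true;
  if (next != null) {
    next.finish();
  }
}
```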
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243847609 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java --- @@ -1250,23 +1250,10 @@ private UpdateCommand fetchFullUpdateFromLeader(AddUpdateCommand inplaceAdd, lon params.set("onlyIfActive", true); SolrRequest ur = new GenericSolrRequest(METHOD.GET, "/get", params); -String leaderUrl = req.getParams().get(DISTRIB_FROM); - -if (leaderUrl == null) { - // An update we're dependent upon didn't arrive! This is unexpected. Perhaps likely our leader is - // down or partitioned from us for some reason. Lets force refresh cluster state, and request the - // leader for the update. - if (zkController == null) { // we should be in cloud mode, but wtf? could be a unit test -throw new SolrException(ErrorCode.SERVER_ERROR, "Can't find document with id=" + id + ", but fetching from leader " -+ "failed since we're not in cloud mode."); - } - Replica leader; - try { -leader = zkController.getZkStateReader().getLeaderRetry(collection, cloudDesc.getShardId()); - } catch (InterruptedException e) { -throw new SolrException(ErrorCode.SERVER_ERROR, "Exception during fetching from leader.", e); - } - leaderUrl = leader.getCoreUrl(); +String leaderUrl = getLeaderUrl(id); --- End diff -- I looked again and it seems fine. It could be that the diff I was looking at confused the matter.
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243847326 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- (same quoted hunk as above, ending at the finish() override) --- End diff -- isn't this what a base finish() will do? so why subclass. If there really is a reason then a comment here would help
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243847270 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- (same quoted hunk as above, ending at doDeleteByQuery) --- End diff -- I think just call setupRequest instead of getNonZkLeaderAssumption? setupRequest calls that and sets isLeader. This comment goes for all/most of the methods here.
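For illustration, the suggestion would look roughly like the sketch below, assuming a setupRequest overload on DistributedUpdateProcessor that takes the command, calls getNonZkLeaderAssumption(req) internally, and sets isLeader as a side effect; the real signature in the PR may differ:
```
// Hypothetical sketch of the suggestion: centralize the leader
// bookkeeping in setupRequest() instead of repeating it per method.
@Override
protected void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException {
  setupRequest(cmd); // assumed overload; sets isLeader internally
  super.doDeleteByQuery(cmd, null, null);
}
```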
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243844969 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- @@ -42,15 +53,36 @@ public DistributedStandaloneUpdateProcessor(SolrQueryRequest req, } @Override - String getCollectionName(CloudDescriptor cloudDesc) { + String computeCollectionName(CloudDescriptor cloudDesc) { return null; } @Override - Replica.Type getReplicaType(CloudDescriptor cloudDesc) { + Replica.Type computeReplicaType(CloudDescriptor cloudDesc) { return Replica.Type.NRT; } + @Override + public void processCommit(CommitUpdateCommand cmd) throws IOException { + +assert TestInjection.injectFailUpdateRequests(); + +updateCommand = cmd; + +CompletionService completionService = new ExecutorCompletionService<>(req.getCore().getCoreContainer().getUpdateShardHandler().getUpdateExecutor()); --- End diff -- I guess I missed the fact that replicas in standalone mode are always NRT, right?
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243844755 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java --- (same quoted hunk from fetchFullUpdateFromLeader as above) --- End diff -- Hmmm, if it is crucial to keep the exception message identical, I could throw the exceptions inside getLeaderUrl (the abstract method).
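Concretely, that would look roughly like the sketch below in the zk-aware subclass, reassembling the lookup and error handling that the diff deletes from fetchFullUpdateFromLeader; the placement and override signature are assumptions, not the PR's actual code:
```
// Hypothetical: the cloud subclass throws from getLeaderUrl itself,
// preserving the exception messages from the deleted block.
@Override
String getLeaderUrl(String id) {
  String leaderUrl = req.getParams().get(DISTRIB_FROM);
  if (leaderUrl != null) {
    return leaderUrl;
  }
  // An update we depend on didn't arrive; force a leader lookup.
  if (zkController == null) { // we should be in cloud mode, but could be a unit test
    throw new SolrException(ErrorCode.SERVER_ERROR, "Can't find document with id=" + id
        + ", but fetching from leader failed since we're not in cloud mode.");
  }
  try {
    Replica leader = zkController.getZkStateReader()
        .getLeaderRetry(collection, cloudDesc.getShardId());
    return leader.getCoreUrl();
  } catch (InterruptedException e) {
    throw new SolrException(ErrorCode.SERVER_ERROR, "Exception during fetching from leader.", e);
  }
}
```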
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243844185 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java --- (same quoted hunk from fetchFullUpdateFromLeader as above) --- End diff -- I'm not sure how this diff is a refactoring (keeps semantics identical). For example... the exception message isn't the same.
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243843852 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- (same quoted hunk as above, ending at the unused CompletionService in processCommit) --- End diff -- I didn't see any use for the completionService in the previous processor. I meant to ask whether there might be a particular reason it is needed, since I didn't see any uses for this service. Should I remove it from both zk and standalone processors, or just this one?
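If the CompletionService really is unused, and standalone replicas are always NRT (per computeReplicaType in the quoted hunk), the override would shrink to something like this sketch; illustrative only, not necessarily what the PR ends up with:
```
// Hypothetical cleanup: drop the unused CompletionService/pending set;
// the TLOG/PULL branches are unreachable when replicaType is always NRT.
@Override
public void processCommit(CommitUpdateCommand cmd) throws IOException {
  assert TestInjection.injectFailUpdateRequests();
  updateCommand = cmd;
  super.processCommit(cmd); // NRT replicas always commit
}
```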
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243843542 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- (same quoted hunk as above, ending at the CompletionService in processCommit) --- End diff -- Why is all this stuff here in standalone mode?
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243646386 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits hold the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset in bytes into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 288 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure
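As a reading aid for the quoted javadoc: one packing consistent with the "first 31 bits hold the index, next 33 bits hold the offset" description is sketched below. The names and exact layout are inferred from the prose above, not taken from the PR's code:
```
// Illustrative packing of a single jump-table entry as described:
// upper 31 bits = set-bit count before the block, lower 33 bits = offset.
int index = 1234;          // number of set bits in all preceding blocks
long offset = 1L << 20;    // position of the block in the underlying slice
long entry = ((long) index << 33) | offset;        // pack
int unpackedIndex = (int) (entry >>> 33);          // recover the index
long unpackedOffset = entry & ((1L << 33) - 1);    // recover the offset
```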
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243649022 --- Diff: lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java --- @@ -1204,6 +1204,12 @@ public void testRandomSortedBytes() throws IOException { } private void doTestNumericsVsStoredFields(double density, LongSupplier longs) throws Exception { +doTestNumericsVsStoredFields(density, longs, 256); +// TODO: 20 docs are needed to test jumps properly (see LUCENE-8585), but that is quite slow (few minutes). +// Maybe it can be nightly? +//doTestNumericsVsStoredFields(density, longs, 20); --- End diff -- By the way, if we want to test such large numbers of documents, we should avoid RandomIndexWriter (use IndexWriter instead) and set reasonable values of maxBufferedDocs+RAMBufferSizeMB+mergePolicy instead of relying on random values that are set by LuceneTestCase#newIndexWriterConfig and make indexing slow (but are useful to increase coverage of flushing+merging).
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243647937 --- Diff: lucene/core/src/test/org/apache/lucene/index/TestDocValues.java --- @@ -123,8 +124,72 @@ public void testNumericField() throws Exception { iw.close(); dir.close(); } - - /** + + // The LUCENE-8585 jump-tables enables O(1) skipping of IndexedDISI blocks, DENSE block lookup + // and numeric multi blocks. This test focuses on testing these jumps. + @Slow + public void testNumericFieldJumpTables() throws Exception { +Directory dir = newDirectory(); +IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null)); +final int maxDoc = atLeast(3*65536); // Must be above 3*65536 to trigger IndexedDISI block skips --- End diff -- Why do we have this test here as opposed to TestLucene80DocValuesFormat? Given that you are indexing so many documents and don't care about testing indexing as much as you care about checking iterators, you should make sure that the index writer config isn't too crazy and enforce sane values of maxBufferedDocs, RAMBufferSizeMB, and a sane merge policy (eg. not the alcoholic one). See for instance TestLucene70DocValuesFormat#doTestSortedNumericBlocksOfVariousBitsPerValue.
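The kind of sane, non-randomized configuration being suggested (cf. TestLucene70DocValuesFormat#doTestSortedNumericBlocksOfVariousBitsPerValue) looks roughly like the sketch below; the specific values are illustrative, and dir is assumed to be the test's Directory:
```
// Fixed, predictable IndexWriter settings for large-doc-count tests,
// instead of the randomized config from LuceneTestCase#newIndexWriterConfig.
IndexWriterConfig conf = new IndexWriterConfig(null);  // null analyzer, as in the quoted test
conf.setMaxBufferedDocs(10000);                        // flush on a fixed doc count
conf.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH); // no RAM-based flushing
conf.setMergePolicy(new LogDocMergePolicy());          // deterministic merges
IndexWriter iw = new IndexWriter(dir, conf);
```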
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243644347 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java --- @@ -0,0 +1,1430 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.Closeable; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.LongValues; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.DirectMonotonicReader; +import org.apache.lucene.util.packed.DirectReader; + +/** reader for {@link Lucene80DocValuesFormat} */ +final class Lucene80DocValuesProducer extends DocValuesProducer implements Closeable { + private final Map numerics = new HashMap<>(); + private final Map binaries = new HashMap<>(); + private final Map sorted = new HashMap<>(); + private final Map sortedSets = new HashMap<>(); + private final Map sortedNumerics = new HashMap<>(); + private long ramBytesUsed; + private final IndexInput data; + private final int maxDoc; + + /** expert: instantiates a new reader */ + Lucene80DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { +String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); +this.maxDoc = state.segmentInfo.maxDoc(); +ramBytesUsed = RamUsageEstimator.shallowSizeOfInstance(getClass()); + +int version = -1; + +// read in the entries from the metadata file. 
+try (ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context)) { + Throwable priorE = null; + try { +version = CodecUtil.checkIndexHeader(in, metaCodec, + Lucene80DocValuesFormat.VERSION_START, + Lucene80DocValuesFormat.VERSION_CURRENT, +state.segmentInfo.getId(), +state.segmentSuffix); +readFields(in, state.fieldInfos); + } catch (Throwable exception) { +priorE = exception; + } finally { +CodecUtil.checkFooter(in, priorE); + } +} + +String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); +this.data = state.directory.openInput(dataName, state.context); +boolean success = false; +try { + final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, + Lucene80DocValuesFormat.VERSION_START, +
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243644248 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java --- (same quoted hunk as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243642604 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesConsumer.java --- @@ -0,0 +1,663 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + + +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.EmptyDocValuesProducer; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.SortedSetSelector; +import org.apache.lucene.store.GrowableByteArrayDataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMOutputStream; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.MathUtil; +import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.packed.DirectMonotonicWriter; +import org.apache.lucene.util.packed.DirectWriter; + +import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; +import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.NUMERIC_BLOCK_SHIFT; +import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE; + +/** writer for {@link Lucene80DocValuesFormat} */ +final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Closeable { + + IndexOutput data, meta; + final int maxDoc; + + /** expert: Creates a new writer */ + public Lucene80DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { +boolean success = false; +try { + String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); + data = state.directory.createOutput(dataName, state.context); + CodecUtil.writeIndexHeader(data, dataCodec, Lucene80DocValuesFormat.VERSION_CURRENT, 
state.segmentInfo.getId(), state.segmentSuffix); + String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + meta = state.directory.createOutput(metaName, state.context); + CodecUtil.writeIndexHeader(meta, metaCodec, Lucene80DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + maxDoc = state.segmentInfo.maxDoc(); + success = true; +} finally { + if (!success) { +IOUtils.closeWhileHandlingException(this); + } +} + } + + @Override + public void close() throws IOException { +boolean success = false; +try { + if (meta != null) { +meta.writeInt(-1); // write EOF marker +CodecUtil.writeFooter(meta); // write checksum + } + if (data != null) { +CodecUtil.writeFooter(data); // write checksum + } + success = true; +} finally { + if (success) { +IOUtils.close(data, meta); + } else {
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243643855 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java --- (same quoted hunk as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243638367 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (same quoted javadoc hunk as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243641644 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (same quoted javadoc hunk as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243648602 --- Diff: lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java --- (same quoted hunk as above) --- End diff -- Let's not do that in the base test case as this will be too slow with some codecs such as the SimpleTextCodec. What we have done before for formats that have branches that only get triggered with many documents is that we added tests to their own test file, ie. TestLucene80DocValuesFormat.
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243630309 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quoted diff identical to the IndexedDISI.java javadoc above; comment body truncated in the archive)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243640446 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quoted diff identical to the IndexedDISI.java javadoc above; comment body truncated in the archive)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243630434 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: * A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits holds the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 32 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure for a given DENSE block is stored at the beginning of the
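The direct jump that the 31/33-bit entries enable can be sketched as follows. Illustrative names only: it assumes `jumpTable` is a RandomAccessInput over the table, `slice` is the block data, and that the index occupies the high bits — not necessarily how the final patch wires it up:

    void jumpToBlock(int targetDoc) throws IOException {
      int targetBlock = targetDoc >>> 16;              // 65536 docs per block
      long entry = jumpTable.readLong((long) targetBlock << 3);
      int indexBefore = (int) (entry >>> 33);          // first 31 bits: set bits before the block
      long offset = entry & ((1L << 33) - 1);          // next 33 bits: offset into the slice
      slice.seek(offset);                              // O(1) jump instead of per-block iteration
      this.index = indexBefore;
    }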
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243629404 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quoted diff identical to the IndexedDISI.java javadoc above; comment body truncated in the archive)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243636154 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quoted diff identical to the IndexedDISI.java javadoc above; comment body truncated in the archive)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243634870 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quoted diff identical to the IndexedDISI.java javadoc above; comment body truncated in the archive)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243625997 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quoted diff identical to the IndexedDISI.java javadoc above; the lines the comment refers to are:) + * The first 31 bits hold the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset in bytes into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. This is currently the case, with the largest + * block being DENSE and using 2^16 + 288 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. --- End diff -- I think you mean "not exceed 2^17 _bytes_"? Then maybe we should also express the maximum size of a block in bytes to make it easy to compare, i.e. "2^13 + 36 bytes"? Should we simplify the encoding so that the long is actually two 32-bit ints rather than one of 31 bits and another one of 33 bits? --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
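For reference, the two encodings being weighed against each other in that comment; a hedged sketch where the helper names are made up for illustration:

    // Encoding in the diff: 31 bits of index plus 33 bits of offset in one long.
    static long pack31and33(int index, long offset) {
      return ((long) index << 33) | offset;            // assumes 0 <= offset < 2^33
    }
    // Simplification suggested above: two plain 32-bit ints, trivially extracted.
    static long packTwoInts(int index, int offset) {
      return ((long) index << 32) | (offset & 0xFFFFFFFFL);
    }
    static int indexOf(long entry)  { return (int) (entry >>> 32); }
    static int offsetOf(long entry) { return (int) entry; }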
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243624209 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quoted diff identical to the IndexedDISI.java javadoc above; the line the comment refers to is:) + * tables are used: A lookup table for block blockCache and index, and a rank structure --- End diff -- I'm not sure I understand this sentence, did you mean to write another word than "blockCache" here? --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243631028 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quoted diff identical to the IndexedDISI.java javadoc above; comment body truncated in the archive)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243628701 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quoted diff identical to the IndexedDISI.java javadoc above; comment body truncated in the archive)
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243628041 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java --- @@ -425,4 +448,10 @@ void setupRequest(UpdateCommand cmd) { nodes = setupRequest(dcmd.getId(), null); } } + + @Override + protected boolean shouldCloneCmdDoc() { +boolean willDistrib = isLeader && nodes != null && nodes.size() > 0; --- End diff -- This could probably be optimized, since cloneRequiredOnLeader is already set during construction of the instance:

    if (!cloneRequiredOnLeader) {
      return false;
    }
    // will distrib command
    return isLeader && nodes != null && nodes.size() > 0;

--- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
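Spelled out as the complete override, the suggestion amounts to the following sketch (it assumes the `cloneRequiredOnLeader`, `isLeader`, and `nodes` fields from the diff above):

    @Override
    protected boolean shouldCloneCmdDoc() {
      if (!cloneRequiredOnLeader) {
        return false; // ruled out once, at construction time
      }
      // will distrib command
      return isLeader && nodes != null && nodes.size() > 0;
    }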
[GitHub] lucene-solr pull request #343: SOLR-12121: JWT Token authentication plugin
Github user janhoy commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/343#discussion_r243600574 --- Diff: solr/core/src/test/org/apache/solr/security/JWTAuthPluginIntegrationTest.java --- @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.security; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.net.HttpURLConnection; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +import org.apache.http.Header; +import org.apache.http.HttpException; +import org.apache.http.HttpHeaders; +import org.apache.http.HttpRequest; +import org.apache.http.HttpRequestInterceptor; +import org.apache.http.entity.ContentType; +import org.apache.http.protocol.HttpContext; +import org.apache.solr.client.solrj.impl.HttpClientUtil; +import org.apache.solr.cloud.SolrCloudAuthTestCase; +import org.apache.solr.common.util.Pair; +import org.jose4j.jwk.PublicJsonWebKey; +import org.jose4j.jwk.RsaJsonWebKey; +import org.jose4j.jws.AlgorithmIdentifiers; +import org.jose4j.jws.JsonWebSignature; +import org.jose4j.jwt.JwtClaims; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Validate that JWT token authentication works in a real cluster. + * TODO: Test also using SolrJ as client. 
But that requires a way to set Authorization header on request, see SOLR-13070 + */ +public class JWTAuthPluginIntegrationTest extends SolrCloudAuthTestCase { + protected static final int NUM_SERVERS = 2; + protected static final int NUM_SHARDS = 2; + protected static final int REPLICATION_FACTOR = 1; + private static final String COLLECTION = "jwtColl"; + private static String jwtTestToken; + private static String baseUrl; + private static AtomicInteger jwtInterceptCount = new AtomicInteger(); + private static AtomicInteger pkiInterceptCount = new AtomicInteger(); + private static final CountInterceptor interceptor = new CountInterceptor(); + + @BeforeClass + public static void setupClass() throws Exception { +configureCluster(NUM_SERVERS)// nodes + .withSecurityJson(TEST_PATH().resolve("security").resolve("jwt_plugin_jwk_security.json")) +.addConfig("conf1", TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf")) +.configure(); +baseUrl = cluster.getRandomJetty(random()).getBaseUrl().toString(); + +String jwkJSON = "{\n" + +" \"kty\": \"RSA\",\n" + +" \"d\": \"i6pyv2z3o-MlYytWsOr3IE1olu2RXZBzjPRBNgWAP1TlLNaphHEvH5aHhe_CtBAastgFFMuP29CFhaL3_tGczkvWJkSveZQN2AHWHgRShKgoSVMspkhOt3Ghha4CvpnZ9BnQzVHnaBnHDTTTfVgXz7P1ZNBhQY4URG61DKIF-JSSClyh1xKuMoJX0lILXDYGGcjVTZL_hci4IXPPTpOJHV51-pxuO7WU5M9252UYoiYyCJ56ai8N49aKIMsqhdGuO4aWUwsGIW4oQpjtce5eEojCprYl-9rDhTwLAFoBtjy6LvkqlR2Ae5dKZYpStljBjK8PJrBvWZjXAEMDdQ8PuQ\",\n" + +" \"e\": \"AQAB\",\n" + +" \"use\": \"sig\",\n" + +" \"kid\": \"test\",\n" + +" \"alg\": \"RS256\",\n" + +" \"n\": \"jeyrvOaZrmKWjyNXt0myAc_pJ1hNt3aRupExJEx1ewPaL9J9HFgSCjMrYxCB1ETO1NDyZ3nSgjZis-jHHDqBxBjRdq_t1E2rkGFaYbxAyKt220Pwgme_SFTB9MXVrFQGkKyjmQeVmOmV6zM3KK8uMdKQJ4aoKmwBcF5Zg7EZdDcKOFgpgva1Jq-FlEsaJ2xrYDYo3KnGcOHIt9_0NQeLsqZbeWYLxYni7uROFncXYV5FhSJCeR4A_rrbwlaCydGxE0ToC_9HNYibUHlkJjqyUhAgORCbNS8JLCJH8NUi5sDdIawK9GTSyvsJXZ-QHqo4cMUuxWV5AJtaRGghuMUfqQ\"\n" + +"}"; + +PublicJsonWebKey jwk = RsaJsonWebKey.Factory.newPublicJwk(jwkJSON); +JwtClaims claims = JWTAuthPluginTest.generateClaims(); +JsonWebSignature jws = new JsonWebSignature(); +jws.setPayload(claims.toJson()); +
[GitHub] lucene-solr pull request #342: SOLR-12120: New AuditLoggerPlugin type allowi...
Github user janhoy commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/342#discussion_r243532741 --- Diff: solr/core/src/java/org/apache/solr/security/AuditEvent.java --- @@ -0,0 +1,388 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.security; + +import javax.servlet.http.HttpServletRequest; +import java.lang.invoke.MethodHandles; +import java.security.Principal; +import java.util.Date; +import java.util.Enumeration; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.solr.common.SolrException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static org.apache.solr.security.AuditEvent.EventType.ANONYMOUS; + +/** + * Audit event that takes request and auth context as input to be able to audit log custom things + */ +public class AuditEvent { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private String message; + private Level level; + private Date date; + private String username; + private String session; + private String clientIp; + private List collections; + private Map context; + private HashMap headers; + private Map solrParams; + private String solrHost; + private int solrPort; + private String solrIp; + private String resource; + private String httpMethod; + private String queryString; + private EventType eventType; + private AuthorizationResponse autResponse; + private String requestType; + private double QTime = -1; + private int status = 0; + private Throwable exception; + + /* Predefined event types. Custom types can be made through constructor */ + public enum EventType { +AUTHENTICATED("Authenticated", "User successfully authenticated", Level.INFO), +REJECTED("Rejected", "Authentication request rejected", Level.WARN), +ANONYMOUS("Anonymous", "Request proceeds with unknown user", Level.INFO), +ANONYMOUS_REJECTED("AnonymousRejected", "Request from unknown user rejected", Level.WARN), +AUTHORIZED("Authorized", "Authorization succeeded", Level.INFO), +UNAUTHORIZED("Unauthorized", "Authorization failed", Level.WARN), +COMPLETED("Completed", "Request completed", Level.INFO), --- End diff -- In the latest commits I simplified this further. Now the default is to ever only generate events when the request is finished, not any intermediate ones, meaning that AUTHENTICATED, ANONYMOUS and AUTHORIZED are not logged by default. This saves some unnecessary object generation. Further, we now do a lightweight check on `shouldLog(eventType)` before even creating the AuditEvent object, which should further speed up things. 
--- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
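The guard described above — a cheap predicate checked before the event object is even built — can be sketched like this; the call site and constructor arguments are illustrative, not the committed code:

    // Request-completion path: the AuditEvent is only constructed when some
    // registered plugin actually wants this event type.
    if (auditPlugin.shouldLog(AuditEvent.EventType.COMPLETED)) {
      AuditEvent event = new AuditEvent(AuditEvent.EventType.COMPLETED, servletRequest);
      auditPlugin.doAudit(event);
    }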
[GitHub] lucene-solr pull request #529: LUCENE-8617: Use SimpleFSDirectory on non-def...
GitHub user marschall opened a pull request: https://github.com/apache/lucene-solr/pull/529 LUCENE-8617: Use SimpleFSDirectory on non-default FS Currently we use MMapDirectory on a non-default file system on Unix. This will almost always fail as only the default file system can provide mmap(). Switch to SimpleFSDirectory instead for non-default file systems independent of the operating system. You can merge this pull request into a Git repository by running: $ git pull https://github.com/marschall/lucene-solr LUCENE-8617 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/lucene-solr/pull/529.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #529 commit 611e4e05206737ffd8f63e2d9c659266b068ca55 Author: Philippe Marschall Date: 2018-12-20T15:19:08Z LUCENE-8617: Use SimpleFSDirectory on non-default FS Currently we use MMapDirectory on a non-default file system on Unix. This will almost always fail as only the default file system can provide mmap(). Switch to SimpleFSDirectory instead for non-default file systems independent of the operating system. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
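The decision logic being proposed boils down to checking which filesystem the path lives on before considering mmap; a hedged sketch of the idea, not the exact patch:

    import java.nio.file.FileSystems;
    import java.nio.file.Path;

    static FSDirectory openBest(Path path, LockFactory lockFactory) throws IOException {
      if (path.getFileSystem() != FileSystems.getDefault()) {
        // only the default filesystem can back mmap(); play it safe elsewhere
        return new SimpleFSDirectory(path, lockFactory);
      }
      return Constants.JRE_IS_64BIT && MMapDirectory.UNMAP_SUPPORTED
          ? new MMapDirectory(path, lockFactory)
          : new SimpleFSDirectory(path, lockFactory);
    }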
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242875578 --- Diff: lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java --- @@ -1204,6 +1204,12 @@ public void testRandomSortedBytes() throws IOException { } private void doTestNumericsVsStoredFields(double density, LongSupplier longs) throws Exception { +doTestNumericsVsStoredFields(density, longs, 256); +// TODO: 20 docs are needed to test jumps properly (see LUCENE-8585), but that is quite slow (few minutes). +// Maybe it can be nightly? +//doTestNumericsVsStoredFields(density, longs, 20); --- End diff -- Should we add an `if (TEST_NIGHTLY) ...` switch here to enable the 200K test case? It would increase coverage for jumps a great deal. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
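Spelled out, that switch in doTestNumericsVsStoredFields would look roughly like the following sketch; TEST_NIGHTLY is the standard LuceneTestCase flag, and the commented-out call is taken from the diff above:

    doTestNumericsVsStoredFields(density, longs, 256);
    if (TEST_NIGHTLY) {
      // jumps need longer document runs to be exercised; see LUCENE-8585
      doTestNumericsVsStoredFields(density, longs, 20);
    }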
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242857603 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired by + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst case, which happens + * when all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: a lookup table for block offset and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iterating forward from the current position + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits hold the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits hold the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. This is currently the case, with the largest + * block being DENSE and using 2^16 + 32 bits, and it is likely to continue to hold, as using + * more than double that amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure for a given DENSE block is stored at the beginning of the
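For readers following the bit layout in the quoted javadoc, here is a minimal Java sketch (my own illustration, not code from the patch; the helper names are invented) of packing and unpacking a 31+33 bit jump-table entry, and of a rank lookup inside one DENSE block using 512-bit sub-blocks:

// Sketch only: a jump-table entry as described above -- 31 bits of index in
// the upper half of the long, 33 bits of offset in the lower half.
static long packJumpEntry(int index, long offset) {
  assert index >= 0;                         // any non-negative int fits in 31 bits
  assert offset >= 0 && offset < (1L << 33); // offset must fit in 33 bits
  return ((long) index << 33) | offset;
}
static int jumpIndex(long entry)   { return (int) (entry >>> 33); }
static long jumpOffset(long entry) { return entry & ((1L << 33) - 1); }

// Sketch of rank inside one DENSE block: 65536 bits = 1024 longs; rank[] holds
// one unsigned short (a char here) per 512-bit sub-block, with the number of
// set bits before that sub-block.
static int denseRank(long[] bits, char[] rank, int bitInBlock) {
  int subBlock = bitInBlock >>> 9;           // 512 bits per sub-block
  int idx = rank[subBlock];                  // set bits before the sub-block
  for (int w = subBlock << 3; w < (bitInBlock >>> 6); w++) {
    idx += Long.bitCount(bits[w]);           // whole 64-bit words inside the sub-block
  }
  idx += Long.bitCount(bits[bitInBlock >>> 6] & ((1L << (bitInBlock & 63)) - 1));
  return idx;                                // set bits strictly before bitInBlock
}

This mirrors the description only; the actual on-disk layout and access code live in IndexedDISI.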
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242175311 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242172918 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242159267 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242158001 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242148673 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242092902 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242073523 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java --- @@ -0,0 +1,1413 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.Closeable; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.LongValues; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.DirectMonotonicReader; +import org.apache.lucene.util.packed.DirectReader; + +/** reader for {@link Lucene80DocValuesFormat} */ +final class Lucene80DocValuesProducer extends DocValuesProducer implements Closeable { + private final Map<String,NumericEntry> numerics = new HashMap<>(); + private final Map<String,BinaryEntry> binaries = new HashMap<>(); + private final Map<String,SortedEntry> sorted = new HashMap<>(); + private final Map<String,SortedSetEntry> sortedSets = new HashMap<>(); + private final Map<String,SortedNumericEntry> sortedNumerics = new HashMap<>(); + private long ramBytesUsed; + private final IndexInput data; + private final int maxDoc; + + /** expert: instantiates a new reader */ + Lucene80DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { +String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); +this.maxDoc = state.segmentInfo.maxDoc(); +ramBytesUsed = RamUsageEstimator.shallowSizeOfInstance(getClass()); + +int version = -1; + +// read in the entries from the metadata file.
+try (ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context)) { + Throwable priorE = null; + try { +version = CodecUtil.checkIndexHeader(in, metaCodec, + Lucene80DocValuesFormat.VERSION_START, + Lucene80DocValuesFormat.VERSION_CURRENT, +state.segmentInfo.getId(), +state.segmentSuffix); +readFields(in, state.fieldInfos); + } catch (Throwable exception) { +priorE = exception; + } finally { +CodecUtil.checkFooter(in, priorE); + } +} + +String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); +this.data = state.directory.openInput(dataName, state.context); +boolean success = false; +try { + final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, + Lucene80DocValuesFormat.VERSION_START, +
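The quoted constructor is cut off while the data file's header is being checked. For context, a hedged sketch of the usual shape of the remaining validation in Lucene's doc-values producers (an assumption based on the established codec pattern, not a verbatim quote of the truncated lines): the data header is checked with the same version bounds, and the meta and data versions must agree.

final int version2 = CodecUtil.checkIndexHeader(data, dataCodec,
    Lucene80DocValuesFormat.VERSION_START,
    Lucene80DocValuesFormat.VERSION_CURRENT,
    state.segmentInfo.getId(), state.segmentSuffix);
if (version != version2) {
  // meta and data were written together; differing versions indicate corruption
  throw new CorruptIndexException("Format versions mismatch: meta=" + version
      + ", data=" + version2, data);
}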
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242073366 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java --- (quotes the same Lucene80DocValuesProducer.java constructor diff as the message above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242072816 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242070202 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242069261 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242068739 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242062202 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242060979 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: * A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits holds the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 32 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure for a given DENSE block is stored at the beginning of the
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242060186 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: * A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits holds the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 32 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure for a given DENSE block is stored at the beginning of the
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242055494 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java --- @@ -284,12 +284,12 @@ * Stores additional per-position metadata information such as character offsets and user payloads * * - * {@link org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Norms} + * org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Norms * .nvd, .nvm * Encodes length and boost factors for docs and fields * * - * {@link org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat Per-Document Values} + * org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat Per-Document Values * .dvd, .dvm * Encodes additional scoring factors or other per-document information. * --- End diff -- Seems reasonable. I have mimicked the phrasing from the lucene60 `package-info.java` in the upcoming commit and also adjusted the references from lucene50, lucene60 to point to lucene80. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r241967695 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- @@ -56,8 +59,15 @@ public void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { super.doDeleteByQuery(cmd, Collections.emptyList(), null); } + @Override + protected void postProcessDeleteByQuery(DeleteUpdateCommand cmd, + List replicas, DocCollection coll) throws IOException { +// no op + } + @Override boolean isLeader(UpdateCommand cmd) { +updateCommand = cmd; --- End diff -- Perhaps we should have another isLeader method that returns a boolean, since the current method is used by other classes in the package to set up the request and determine whether the core is the leader for the specified update command. I'll work on a patch ASAP. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
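A minimal sketch of the split being discussed in this thread — a side-effecting setup step separated from a pure isLeader query — might look like the following. All class and field names here are illustrative, not the actual Solr code:

```java
import org.apache.solr.update.UpdateCommand;

// Hypothetical sketch: setupRequest() records the command and computes
// leadership as a side effect; isLeader() becomes a pure accessor that
// other classes in the package can call without surprises.
abstract class DistributedProcessorSketch {
  protected UpdateCommand updateCommand;
  private boolean leader;

  /** Side effects live here: record the command and compute leadership. */
  void setupRequest(UpdateCommand cmd) {
    this.updateCommand = cmd;
    this.leader = computeLeadership(cmd);
  }

  /** Pure query: no side effects. */
  boolean isLeader() {
    return leader;
  }

  /** Cloud vs. standalone subclasses decide what leadership means. */
  abstract boolean computeLeadership(UpdateCommand cmd);
}
```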
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r241967674 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java --- @@ -1417,45 +1407,7 @@ private void doDeleteById(DeleteUpdateCommand cmd) throws IOException { return; } -if (zkEnabled && isLeader && !isSubShardLeader) { - DocCollection coll = zkController.getClusterState().getCollection(collection); - List subShardLeaders = getSubShardLeaders(coll, cloudDesc.getShardId(), cmd.getId(), null); - // the list will actually have only one element for an add request - if (subShardLeaders != null && !subShardLeaders.isEmpty()) { -ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams())); -params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString()); -params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl( -zkController.getBaseUrl(), req.getCore().getName())); -params.set(DISTRIB_FROM_PARENT, cloudDesc.getShardId()); -cmdDistrib.distribDelete(cmd, subShardLeaders, params, true, null, null); - } - - final List nodesByRoutingRules = getNodesByRoutingRules(zkController.getClusterState(), coll, cmd.getId(), null); - if (nodesByRoutingRules != null && !nodesByRoutingRules.isEmpty()) { -ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams())); -params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString()); -params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl( -zkController.getBaseUrl(), req.getCore().getName())); -params.set(DISTRIB_FROM_COLLECTION, collection); -params.set(DISTRIB_FROM_SHARD, cloudDesc.getShardId()); -cmdDistrib.distribDelete(cmd, nodesByRoutingRules, params, true, null, null); - } -} - -if (nodes != null) { - ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams())); - params.set(DISTRIB_UPDATE_PARAM, - (isLeader || isSubShardLeader ? DistribPhase.FROMLEADER.toString() : DistribPhase.TOLEADER.toString())); - params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl( - zkController.getBaseUrl(), req.getCore().getName())); - - if (req.getParams().get(UpdateRequest.MIN_REPFACT) != null) { -// TODO: Kept for rolling upgrades only. Remove in Solr 9 -params.add(UpdateRequest.MIN_REPFACT, req.getParams().get(UpdateRequest.MIN_REPFACT)); - } - cmdDistrib.distribDelete(cmd, nodes, params, false, rollupReplicationTracker, leaderReplicationTracker); -} +postProcessDeleteById(cmd); --- End diff -- I can understand why you took the current approach. Let's put this off until near the end to see if it's straightforward or not. Maybe it won't be and we shouldn't do it. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r241967637 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- @@ -56,8 +59,15 @@ public void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { super.doDeleteByQuery(cmd, Collections.emptyList(), null); } + @Override + protected void postProcessDeleteByQuery(DeleteUpdateCommand cmd, + List replicas, DocCollection coll) throws IOException { +// no op + } + @Override boolean isLeader(UpdateCommand cmd) { +updateCommand = cmd; --- End diff -- +1 to setupRequest name. I don't think it needs to return "isLeader"; it should set this and return void. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r241954714 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- @@ -56,8 +59,15 @@ public void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { super.doDeleteByQuery(cmd, Collections.emptyList(), null); } + @Override + protected void postProcessDeleteByQuery(DeleteUpdateCommand cmd, + List replicas, DocCollection coll) throws IOException { +// no op + } + @Override boolean isLeader(UpdateCommand cmd) { +updateCommand = cmd; --- End diff -- Since this is a package-private method, perhaps it should be renamed to setupRequest (like the protected methods), except that this one takes an update command (Distributed#setupRequest(UpdateCommand cmd)). It should be stated in the Javadoc that the method returns whether this node is the leader for the supplied update command (cmd). WDYT? --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r241949194 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java --- @@ -1417,45 +1407,7 @@ private void doDeleteById(DeleteUpdateCommand cmd) throws IOException { return; } -if (zkEnabled && isLeader && !isSubShardLeader) { - DocCollection coll = zkController.getClusterState().getCollection(collection); - List subShardLeaders = getSubShardLeaders(coll, cloudDesc.getShardId(), cmd.getId(), null); - // the list will actually have only one element for an add request - if (subShardLeaders != null && !subShardLeaders.isEmpty()) { -ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams())); -params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString()); -params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl( -zkController.getBaseUrl(), req.getCore().getName())); -params.set(DISTRIB_FROM_PARENT, cloudDesc.getShardId()); -cmdDistrib.distribDelete(cmd, subShardLeaders, params, true, null, null); - } - - final List nodesByRoutingRules = getNodesByRoutingRules(zkController.getClusterState(), coll, cmd.getId(), null); - if (nodesByRoutingRules != null && !nodesByRoutingRules.isEmpty()) { -ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams())); -params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString()); -params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl( -zkController.getBaseUrl(), req.getCore().getName())); -params.set(DISTRIB_FROM_COLLECTION, collection); -params.set(DISTRIB_FROM_SHARD, cloudDesc.getShardId()); -cmdDistrib.distribDelete(cmd, nodesByRoutingRules, params, true, null, null); - } -} - -if (nodes != null) { - ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams())); - params.set(DISTRIB_UPDATE_PARAM, - (isLeader || isSubShardLeader ? DistribPhase.FROMLEADER.toString() - : DistribPhase.TOLEADER.toString())); - params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl( - zkController.getBaseUrl(), req.getCore().getName())); - - if (req.getParams().get(UpdateRequest.MIN_REPFACT) != null) { -// TODO: Kept for rolling upgrades only. Remove in Solr 9 -params.add(UpdateRequest.MIN_REPFACT, req.getParams().get(UpdateRequest.MIN_REPFACT)); - } - cmdDistrib.distribDelete(cmd, nodes, params, false, rollupReplicationTracker, leaderReplicationTracker); -} +postProcessDeleteById(cmd); --- End diff -- Well, I could try; I didn't want to introduce too many changes to the logic. Would changing this and letting the full Solr test suite run give assurance that the change is safe? --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r241945926 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java --- @@ -1417,45 +1407,7 @@ private void doDeleteById(DeleteUpdateCommand cmd) throws IOException { return; } -if (zkEnabled && isLeader && !isSubShardLeader) { - DocCollection coll = zkController.getClusterState().getCollection(collection); - List subShardLeaders = getSubShardLeaders(coll, cloudDesc.getShardId(), cmd.getId(), null); - // the list will actually have only one element for an add request - if (subShardLeaders != null && !subShardLeaders.isEmpty()) { -ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams())); -params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString()); -params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl( -zkController.getBaseUrl(), req.getCore().getName())); -params.set(DISTRIB_FROM_PARENT, cloudDesc.getShardId()); -cmdDistrib.distribDelete(cmd, subShardLeaders, params, true, null, null); - } - - final List nodesByRoutingRules = getNodesByRoutingRules(zkController.getClusterState(), coll, cmd.getId(), null); - if (nodesByRoutingRules != null && !nodesByRoutingRules.isEmpty()) { -ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams())); -params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString()); -params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl( -zkController.getBaseUrl(), req.getCore().getName())); -params.set(DISTRIB_FROM_COLLECTION, collection); -params.set(DISTRIB_FROM_SHARD, cloudDesc.getShardId()); -cmdDistrib.distribDelete(cmd, nodesByRoutingRules, params, true, null, null); - } -} - -if (nodes != null) { - ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams())); - params.set(DISTRIB_UPDATE_PARAM, - (isLeader || isSubShardLeader ? DistribPhase.FROMLEADER.toString() - : DistribPhase.TOLEADER.toString())); - params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl( - zkController.getBaseUrl(), req.getCore().getName())); - - if (req.getParams().get(UpdateRequest.MIN_REPFACT) != null) { -// TODO: Kept for rolling upgrades only. Remove in Solr 9 -params.add(UpdateRequest.MIN_REPFACT, req.getParams().get(UpdateRequest.MIN_REPFACT)); - } - cmdDistrib.distribDelete(cmd, nodes, params, false, rollupReplicationTracker, leaderReplicationTracker); -} +postProcessDeleteById(cmd); --- End diff -- I wonder if we actually need this postProcessDeleteById(cmd) method... could the subclass put its logic after calling super.doDeleteById()? --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
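As a rough illustration of the alternative raised here (hypothetical class and method names, not the actual Solr processors):

```java
import java.io.IOException;

// Sketch of the alternative: the ZK subclass overrides doDeleteById(),
// delegates the shared logic to super, and then runs its cloud-only
// distribution step, so the base class needs no postProcessDeleteById() hook.
class BaseDeleteProcessor {
  void doDeleteById(Object cmd) throws IOException {
    // shared local-delete logic
  }
}

class ZkDeleteProcessor extends BaseDeleteProcessor {
  @Override
  void doDeleteById(Object cmd) throws IOException {
    super.doDeleteById(cmd);   // shared logic first
    distributeDelete(cmd);     // what the hook body would have done
  }

  private void distributeDelete(Object cmd) {
    // forward the delete to sub-shard leaders / nodes by routing rules
  }
}
```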
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r241945549 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java --- @@ -223,6 +289,7 @@ private String getLeaderUrlZk(String id) { @Override boolean isLeader(UpdateCommand cmd) { +updateCommand = cmd; --- End diff -- again; this line makes me frown --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r241945502 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- @@ -56,8 +59,15 @@ public void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { super.doDeleteByQuery(cmd, Collections.emptyList(), null); } + @Override + protected void postProcessDeleteByQuery(DeleteUpdateCommand cmd, + List replicas, DocCollection coll) throws IOException { +// no op + } + @Override boolean isLeader(UpdateCommand cmd) { +updateCommand = cmd; --- End diff -- Ouch, this piece is ugly. A method like "is..." (or "get...") should not have side effects. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r241945460 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java --- @@ -58,6 +75,127 @@ String getCollectionName(CloudDescriptor cloudDesc) { return cloudDesc.getReplicaType(); } + @Override + public void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { +// even in non zk mode, tests simulate updates from a leader --- End diff -- This particular comment is, I think, intended for the standalone mode. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #527: LUCENE-8609: Allow getting consistent docstat...
Github user asfgit closed the pull request at: https://github.com/apache/lucene-solr/pull/527 --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #527: LUCENE-8609: Allow getting consistent docstat...
Github user dnhatn commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/527#discussion_r241832076 --- Diff: lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java --- @@ -3300,7 +3315,7 @@ public int numDeletesToMerge(SegmentCommitInfo info, int delCount, IOSupplier
[GitHub] lucene-solr pull request #527: LUCENE-8609: Allow getting consistent docstat...
Github user dnhatn commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/527#discussion_r241832034 --- Diff: lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java --- @@ -3147,7 +3162,7 @@ public void testSoftUpdateDocuments() throws IOException { for (SegmentCommitInfo info : writer.cloneSegmentInfos()) { numSoftDeleted += info.getSoftDelCount(); } -assertEquals(writer.maxDoc() - writer.numDocs(), numSoftDeleted); +assertEquals(writer.getDocStats().maxDoc - writer.getDocStats().numDocs, numSoftDeleted); --- End diff -- maybe use a single docStats? --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
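The suggestion amounts to snapshotting the stats once so both values come from the same point in time — roughly, the quoted assertion would become:

```java
// Sketch of the suggested change: one DocStats snapshot instead of two
// getDocStats() calls, so maxDoc and numDocs are guaranteed consistent.
IndexWriter.DocStats docStats = writer.getDocStats();
assertEquals(docStats.maxDoc - docStats.numDocs, numSoftDeleted);
```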
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241757056 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: * A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits holds the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. This is currently the case, with the largest --- End diff -- do you mean bytes rather than bits? Offsets seem to measure bytes? 
--- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
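For reference, the 31/33 split described in the quoted javadoc amounts to packing each jump-table entry like the following. This is an illustrative sketch only — the actual entry layout in the patch may differ — and whether `offset` counts bits or bytes is exactly the ambiguity raised above:

```java
// Illustrative packing of one jump-table entry: upper 31 bits = index
// (set bits before the block), lower 33 bits = offset into the slice.
static long encodeEntry(int index, long offset) {
  assert index >= 0;                         // any non-negative int fits in 31 bits
  assert offset >= 0 && offset < (1L << 33); // must fit in 33 bits
  return ((long) index << 33) | offset;
}

static int entryIndex(long entry) {
  return (int) (entry >>> 33);
}

static long entryOffset(long entry) {
  return entry & ((1L << 33) - 1);
}
```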
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241765829 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: * A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits holds the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 32 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure for a given DENSE block is stored at the beginning of the
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241784900 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: * A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits holds the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 32 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure for a given DENSE block is stored at the beginning of the
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241781712 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: * A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits holds the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 32 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure for a given DENSE block is stored at the beginning of the
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241791577 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java --- @@ -0,0 +1,1413 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.Closeable; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.LongValues; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.DirectMonotonicReader; +import org.apache.lucene.util.packed.DirectReader; + +/** reader for {@link Lucene80DocValuesFormat} */ +final class Lucene80DocValuesProducer extends DocValuesProducer implements Closeable { + private final Map numerics = new HashMap<>(); + private final Map binaries = new HashMap<>(); + private final Map sorted = new HashMap<>(); + private final Map sortedSets = new HashMap<>(); + private final Map sortedNumerics = new HashMap<>(); + private long ramBytesUsed; + private final IndexInput data; + private final int maxDoc; + + /** expert: instantiates a new reader */ + Lucene80DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { +String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); +this.maxDoc = state.segmentInfo.maxDoc(); +ramBytesUsed = RamUsageEstimator.shallowSizeOfInstance(getClass()); + +int version = -1; + +// read in the entries from the metadata file. 
+try (ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context)) { + Throwable priorE = null; + try { +version = CodecUtil.checkIndexHeader(in, metaCodec, + Lucene80DocValuesFormat.VERSION_START, + Lucene80DocValuesFormat.VERSION_CURRENT, +state.segmentInfo.getId(), +state.segmentSuffix); +readFields(in, state.fieldInfos); + } catch (Throwable exception) { +priorE = exception; + } finally { +CodecUtil.checkFooter(in, priorE); + } +} + +String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); +this.data = state.directory.openInput(dataName, state.context); +boolean success = false; +try { + final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, + Lucene80DocValuesFormat.VERSION_START, +
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241759363 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: * A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits holds the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 32 bits, and is likely to continue to hold as using --- End diff -- it seems to be ignoring the rank structure? --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
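For scale, assuming the 512-bit sub-blocks and 16-bit rank entries described elsewhere in this javadoc: the DENSE bitmap alone is 2^16 bits, and the rank structure would add (65536 / 512) * 16 = 2048 bits, giving 2^16 + 2^11 bits in total — still well under the 2^17-bit overflow limit, but noticeably more than the "2^16 + 32 bits" stated here.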
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241763714 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: * A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits holds the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 32 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure for a given DENSE block is stored at the beginning of the
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241762452

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above. The trailing review comment was truncated in the archive.]
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241766560

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above. The trailing review comment was truncated in the archive.]
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241763587

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above. The trailing review comment was truncated in the archive.]
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241764037

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above. The trailing review comment was truncated in the archive.]
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241766876

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above. The trailing review comment was truncated in the archive.]
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241761075

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above. The trailing review comment was truncated in the archive.]
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241768318

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above. The trailing review comment was truncated in the archive.]
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241754841

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above, down to the commented line:]
+ * tables are used: * A lookup table for block blockCache and index, and a rank structure

--- End diff --

remove "*"?
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241759598

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above. The trailing review comment was truncated in the archive.]
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241756515

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above, down to the commented line:]
+ * The first 31 bits holds the index (number of set bits in the blocks) up to just before the

--- End diff --

s/holds/hold/ ?
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241754530

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java ---

@@ -284,12 +284,12 @@
  * Stores additional per-position metadata information such as character offsets and user payloads
  *
  *
- * {@link org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Norms}
+ * org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Norms
  * .nvd, .nvm
  * Encodes length and boost factors for docs and fields
  *
  *
- * {@link org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat Per-Document Values}
+ * org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat Per-Document Values
  * .dvd, .dvm
  * Encodes additional scoring factors or other per-document information.
  *

--- End diff --

Actually we could remove most of this to simplify. Only the current codec needs to have a good description of its file formats. See e.g. other oal.codecs.luceneXX packages which have minimal javadocs.
[GitHub] lucene-solr pull request #343: SOLR-12121: JWT Token authentication plugin
Github user janhoy commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/343#discussion_r241698617

--- Diff: solr/core/src/test/org/apache/solr/security/JWTAuthPluginIntegrationTest.java ---

@@ -0,0 +1,214 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.security;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
+
+import org.apache.http.Header;
+import org.apache.http.HttpException;
+import org.apache.http.HttpHeaders;
+import org.apache.http.HttpRequest;
+import org.apache.http.HttpRequestInterceptor;
+import org.apache.http.entity.ContentType;
+import org.apache.http.protocol.HttpContext;
+import org.apache.solr.client.solrj.impl.HttpClientUtil;
+import org.apache.solr.cloud.SolrCloudAuthTestCase;
+import org.apache.solr.common.util.Pair;
+import org.jose4j.jwk.PublicJsonWebKey;
+import org.jose4j.jwk.RsaJsonWebKey;
+import org.jose4j.jws.AlgorithmIdentifiers;
+import org.jose4j.jws.JsonWebSignature;
+import org.jose4j.jwt.JwtClaims;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Validate that JWT token authentication works in a real cluster.
+ * TODO: Test also using SolrJ as client. But that requires a way to set
+ * Authorization header on request, see SOLR-13070
+ */
+public class JWTAuthPluginIntegrationTest extends SolrCloudAuthTestCase {
+  protected static final int NUM_SERVERS = 2;
+  protected static final int NUM_SHARDS = 2;
+  protected static final int REPLICATION_FACTOR = 1;
+  private static final String COLLECTION = "jwtColl";
+  private static String jwtTestToken;
+  private static String baseUrl;
+  private static AtomicInteger jwtInterceptCount = new AtomicInteger();
+  private static AtomicInteger pkiInterceptCount = new AtomicInteger();
+  private static final CountInterceptor interceptor = new CountInterceptor();
+
+  @BeforeClass
+  public static void setupClass() throws Exception {
+    configureCluster(NUM_SERVERS)// nodes
+        .withSecurityJson(TEST_PATH().resolve("security").resolve("jwt_plugin_jwk_security.json"))
+        .addConfig("conf1", TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf"))
+        .configure();
+    baseUrl = cluster.getRandomJetty(random()).getBaseUrl().toString();
+
+    String jwkJSON = "{\n" +
+        " \"kty\": \"RSA\",\n" +
+        " \"d\": \"i6pyv2z3o-MlYytWsOr3IE1olu2RXZBzjPRBNgWAP1TlLNaphHEvH5aHhe_CtBAastgFFMuP29CFhaL3_tGczkvWJkSveZQN2AHWHgRShKgoSVMspkhOt3Ghha4CvpnZ9BnQzVHnaBnHDTTTfVgXz7P1ZNBhQY4URG61DKIF-JSSClyh1xKuMoJX0lILXDYGGcjVTZL_hci4IXPPTpOJHV51-pxuO7WU5M9252UYoiYyCJ56ai8N49aKIMsqhdGuO4aWUwsGIW4oQpjtce5eEojCprYl-9rDhTwLAFoBtjy6LvkqlR2Ae5dKZYpStljBjK8PJrBvWZjXAEMDdQ8PuQ\",\n" +
+        " \"e\": \"AQAB\",\n" +
+        " \"use\": \"sig\",\n" +
+        " \"kid\": \"test\",\n" +
+        " \"alg\": \"RS256\",\n" +
+        " \"n\": \"jeyrvOaZrmKWjyNXt0myAc_pJ1hNt3aRupExJEx1ewPaL9J9HFgSCjMrYxCB1ETO1NDyZ3nSgjZis-jHHDqBxBjRdq_t1E2rkGFaYbxAyKt220Pwgme_SFTB9MXVrFQGkKyjmQeVmOmV6zM3KK8uMdKQJ4aoKmwBcF5Zg7EZdDcKOFgpgva1Jq-FlEsaJ2xrYDYo3KnGcOHIt9_0NQeLsqZbeWYLxYni7uROFncXYV5FhSJCeR4A_rrbwlaCydGxE0ToC_9HNYibUHlkJjqyUhAgORCbNS8JLCJH8NUi5sDdIawK9GTSyvsJXZ-QHqo4cMUuxWV5AJtaRGghuMUfqQ\"\n" +
+        "}";
+
+    PublicJsonWebKey jwk = RsaJsonWebKey.Factory.newPublicJwk(jwkJSON);
+    JwtClaims claims = JWTAuthPluginTest.generateClaims();
+    JsonWebSignature jws = new JsonWebSignature();
+    jws.setPayload(claims.toJson());
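The TODO in the quoted test points at the missing piece: getting the signed compact JWS onto a request's Authorization header, which SolrJ could not do at the time (SOLR-13070) and which presumably explains the HttpURLConnection imports. As a point of reference, a minimal sketch of that raw-HTTP path follows; baseUrl and jwtTestToken are fields from the quoted test, while the endpoint path is a hypothetical choice for illustration.

// Minimal sketch, assuming plain HttpURLConnection as the test's imports suggest;
// the endpoint path is hypothetical.
URL url = new URL(baseUrl + "/admin/info/system");
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestMethod("GET");
// Attach the compact JWS produced by jose4j as a bearer token.
conn.setRequestProperty("Authorization", "Bearer " + jwtTestToken);
int status = conn.getResponseCode(); // expect 200 when the plugin accepts the token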