[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r245993566 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,601 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: A lookup table for block offset and index, and a rank structure + * for DENSE block index lookups. + * + * The lookup table is an array of {@code int}-pairs, with a pair for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each int-pair entry consists of 2 logical parts: + * + * The first 32 bit int holds the index (number of set bits in the blocks) up to just before the + * wanted block. The maximum number of set bits is the maximum number of documents, which is < 2^31. + * + * The next int holds the offset in bytes into the underlying slice. 
As there is a maximum of 2^16 + * blocks, it follows that the maximum size of any block must not exceed 2^15 bytes to avoid + * overflow (2^16 bytes if the int is treated as unsigned). This is currently the case, with the + * largest block being DENSE and using 2^13 + 36 bytes. + * + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of byte-pairs with an entry for each + * sub-block (default 512 bits) out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-2 = 65634 and thus will always fit into an byte-pair of 16 bits. + *
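For illustration, a minimal sketch in Java of the O(1) block jump described above. The names and the exact on-disk layout are assumptions for this sketch; the description only fixes that each entry is an index int followed by an offset int, read here as one big-endian long.

import java.io.IOException;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RandomAccessInput;

// Sketch only, not the patch's code: decode one int-pair jump-table entry and
// seek the slice directly to the target block, skipping per-block iteration.
final class JumpTableSketch {
  static int jumpToBlock(RandomAccessInput jumpTable, IndexInput slice,
                         int targetBlock) throws IOException {
    long entry = jumpTable.readLong((long) targetBlock * Long.BYTES);
    int index  = (int) (entry >>> 32);         // first int: set bits before the block
    int offset = (int) (entry & 0xFFFFFFFFL);  // second int: byte offset into the slice
    slice.seek(offset);                        // O(1) jump, no block-by-block walk
    return index;                              // caller resumes decoding from here
  }
}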
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r245364034 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,601 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: A lookup table for block offset and index, and a rank structure + * for DENSE block index lookups. + * + * The lookup table is an array of {@code int}-pairs, with a pair for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each int-pair entry consists of 2 logical parts: + * + * The first 32 bit int holds the index (number of set bits in the blocks) up to just before the + * wanted block. The maximum number of set bits is the maximum number of documents, which is < 2^31. + * + * The next int holds the offset in bytes into the underlying slice. 
As there is a maximum of 2^16 + * blocks, it follows that the maximum size of any block must not exceed 2^15 bytes to avoid + * overflow (2^16 bytes if the int is treated as unsigned). This is currently the case, with the + * largest block being DENSE and using 2^13 + 36 bytes. + * + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of byte-pairs with an entry for each + * sub-block (default 512 bits) out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-2 = 65634 and thus will always fit into an byte-pair of 16 bits. + *
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r245040952 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits hold the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset in bytes into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 288 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure
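A rough sketch of the DENSE rank lookup this excerpt describes, under the stated layout (one unsigned 16-bit rank entry per 512-bit sub-block). The names and the in-memory char[]/long[] copies are assumptions for illustration; the real code reads from the underlying slice.

// Sketch: the rank entry counts the set bits before the sub-block, so only the
// longs between the sub-block start and the target position need a popcount.
final class DenseRankSketch {
  static int indexWithinDense(char[] rank, long[] bits, int targetInBlock) {
    int subBlock = targetInBlock >>> 9;        // 512 bits per sub-block
    int index = rank[subBlock];                // set bits before the sub-block
    int targetWord = targetInBlock >>> 6;      // 64 bits per long
    for (int w = subBlock << 3; w < targetWord; w++) { // 512/64 = 8 longs per sub-block
      index += Long.bitCount(bits[w]);
    }
    // finally add the set bits below the target position inside its own long
    index += Long.bitCount(bits[targetWord] & ((1L << (targetInBlock & 63)) - 1));
    return index;
  }
}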
[GitHub] lucene-solr pull request #:
Github user dsmiley commented on the pull request: https://github.com/apache/lucene-solr/commit/c04a3ac306b017054173d986e32ea217e2fc6595#commitcomment-31829970 In solr/core/src/java/org/apache/solr/response/transform/ChildDocTransformerFactory.java on line 162: I think this illustrates a problem with this syntax that you invented. The code here says the absence of any '/' means it's a "regular filter, not hierarchy based". But how then do you articulate a path filter for all elements named "foo" regardless of parentage? So I think "childFilter" is trying to do double duty here, leading to some edge cases as we try to guess what was intended. I propose instead that we have a different local param, "pathFilter". Both filters will be AND'ed together. WDYT? --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
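To make the proposal concrete, a hypothetical request combining the two params (the field names and paths are placeholders, and this syntax is only proposed above, not committed):

fl=id,[child childFilter='comment_stars_i:5' pathFilter='/posts/comments']

Here childFilter would keep its plain-query meaning with no path semantics, pathFilter alone would constrain the nest path, and the transformer would AND the two together.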
[GitHub] lucene-solr pull request #305: SOLR-11853: Make check for used tool "service...
Github user Mandalka closed the pull request at: https://github.com/apache/lucene-solr/pull/305 --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #531: SOLR-12768
GitHub user moshebla opened a pull request: https://github.com/apache/lucene-solr/pull/531 SOLR-12768 You can merge this pull request into a Git repository by running: $ git pull https://github.com/moshebla/lucene-solr SOLR-12768 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/lucene-solr/pull/531.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #531 commit 6b147506ab7e8c7eda52dde9748fdebef8133194 Author: Moshe Date: 2019-01-01T10:29:15Z SOLR-12678: applied David's patch commit 83560ecf38f52f31d48b133d0c1a925fa7df8c28 Author: Moshe Date: 2019-01-01T12:37:47Z SOLR-12768: remove last / from _NEST_PATH_FIELD filter in ChildDocTransformerFactory#processPathHierarchyQueryString commit 75a1ad79eb5255a4b2e6807042140bbe3d09bb58 Author: Moshe Date: 2019-01-01T15:06:38Z SOLR-12768: fix partial and full path support in ChildDocTransformer --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #530: Branch 7 4 : IndexOutOfBoundsException random...
Github user IntraCherche closed the pull request at: https://github.com/apache/lucene-solr/pull/530 --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r244336948 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits hold the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset in bytes into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 288 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r244332659 --- Diff: lucene/core/src/test/org/apache/lucene/index/TestDocValues.java --- @@ -123,8 +124,72 @@ public void testNumericField() throws Exception { iw.close(); dir.close(); } - - /** + + // The LUCENE-8585 jump-tables enables O(1) skipping of IndexedDISI blocks, DENSE block lookup + // and numeric multi blocks. This test focuses on testing these jumps. + @Slow + public void testNumericFieldJumpTables() throws Exception { +Directory dir = newDirectory(); +IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null)); +final int maxDoc = atLeast(3*65536); // Must be above 3*65536 to trigger IndexedDISI block skips --- End diff -- Moved to `TestLucene80DocValuesFormat` and adjusted as you suggested. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
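The access pattern that test exercises, sketched in Java (the field name and docID are placeholders, and the sketch assumes the target doc actually has a value):

import java.io.IOException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;

// Sketch: with maxDoc above 3*65536, advancing straight to a high docID forces
// IndexedDISI to cross whole 65536-doc blocks via the jump table instead of
// iterating one block at a time.
final class BlockSkipSketch {
  static long valueAt(LeafReader reader, int docID) throws IOException {
    NumericDocValues dv = DocValues.getNumeric(reader, "dv"); // "dv" is a placeholder
    return dv.advanceExact(docID) ? dv.longValue() : -1;      // -1: no value for docID
  }
}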
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r244331405 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits hold the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset in bytes into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 288 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r244329158 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits hold the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset in bytes into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 288 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r244328885 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits hold the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset in bytes into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 288 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. --- End diff -- Nice catch. That was me mixing bits & bytes and arriving at too harsh a requirement, making the representation needlessly complicated: All block types are < 2^17 *bits*, but the offset is in *bytes*, so we do not require 2^16 * 2^17 bits to hold it; only 2^16 * 2^17 / 2^3 = 2^30 bytes, which fits in 30 bits. With the offset-requirement lowered from 33 to 30 bits, it is much more natural to represent offset & index as two ints. This will be in the next commit. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
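Restating that corrected bound as a quick self-check (a sketch of the arithmetic only):

// 2^16 blocks, each under 2^17 bits = 2^14 bytes, bound the slice at 2^30
// bytes, so a byte offset always fits in 30 bits, comfortably inside an int.
final class OffsetBoundSketch {
  public static void main(String[] args) {
    long maxBlockBytes = 1L << 14;                    // 2^17 bits / 8
    long maxSliceBytes = (1L << 16) * maxBlockBytes;  // across all 2^16 blocks
    System.out.println(maxSliceBytes == 1L << 30);    // prints true
  }
}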
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r244318898 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: A lookup table for block blockCache and index, and a rank structure --- End diff -- Indeed I did. I have replaced it with `offset` in the next push. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r244007693 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- @@ -42,15 +53,36 @@ public DistributedStandaloneUpdateProcessor(SolrQueryRequest req, } @Override - String getCollectionName(CloudDescriptor cloudDesc) { + String computeCollectionName(CloudDescriptor cloudDesc) { return null; } @Override - Replica.Type getReplicaType(CloudDescriptor cloudDesc) { + Replica.Type computeReplicaType(CloudDescriptor cloudDesc) { return Replica.Type.NRT; } + @Override + public void processCommit(CommitUpdateCommand cmd) throws IOException { + +assert TestInjection.injectFailUpdateRequests(); + +updateCommand = cmd; + +CompletionService completionService = new ExecutorCompletionService<>(req.getCore().getCoreContainer().getUpdateShardHandler().getUpdateExecutor()); --- End diff -- Hopefully I addressed your concerns with the latest commit. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #530: Branch 7 4 : IndexOutOfBoundsException random...
GitHub user IntraCherche opened a pull request: https://github.com/apache/lucene-solr/pull/530 Branch 7 4 : IndexOutOfBoundsException randomly appearing while indexing Hi, First of all, sorry if this is not the right place to write the issue I am encountering. I saw other people were having the same problem on StackOverflow (https://stackoverflow.com/questions/52783491/solr-indexing-error-possible-analysis-error), although no solution was provided so far. Anyway, feel free to reroute my pull request. While indexing documents with Solr 7.4, the following exception appears randomly. It is very annoying because I cannot reproduce it on my machine and I don't have access to the customer's data. All I know is that it can appear on pdf, xls, or eml documents. It looks like the issue stems from the FlattenGraphFilter class, specifically `restoreState(inputNode.tokens.get(inputNode.nextOut))`. The stack trace shows:

Caused by: java.lang.IndexOutOfBoundsException: Index: 1, Size: 1
  at java.util.ArrayList.rangeCheck(ArrayList.java:657)
  at java.util.ArrayList.get(ArrayList.java:433)
  at org.apache.lucene.analysis.core.FlattenGraphFilter.releaseBufferedToken(FlattenGraphFilter.java:204)
  at org.apache.lucene.analysis.core.FlattenGraphFilter.incrementToken(FlattenGraphFilter.java:258)
  at org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:738)
  at org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:428)
  at org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:392)
  at org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:251)
  at org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:494)
  at org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1602)
  at org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1594)
  at org.apache.solr.update.DirectUpdateHandler2.updateDocument(DirectUpdateHandler2.java:982)
  at org.apache.solr.update.DirectUpdateHandler2.updateDocOrDocValues(DirectUpdateHandler2.java:971)
  at org.apache.solr.update.DirectUpdateHandler2.doNormalUpdate(DirectUpdateHandler2.java:348)
  at org.apache.solr.update.DirectUpdateHandler2.addDoc0(DirectUpdateHandler2.java:284)
  at org.apache.solr.update.DirectUpdateHandler2.addDoc(DirectUpdateHandler2.java:234)

If more info is needed I would be delighted to provide it, as long as I have it (e.g. the documents are customer-confidential so I don't have access to them). Kind regards

You can merge this pull request into a Git repository by running: $ git pull https://github.com/apache/lucene-solr branch_7_4 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/lucene-solr/pull/530.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #530 commit 1ed95c097b82ee5f175e93f3fe62572abe064da6 Author: Jim Ferenczi Date: 2018-05-03T14:11:52Z LUCENE-8231: Add missing part of speech filter in the SPI META-INF file commit 0e45b39476f287fe56ec4734c5c8695a99f8cb5d Author: Christine Poerschke Date: 2018-05-03T17:42:56Z json-facet-api.adoc typo fix. commit dad48603aec715063fdcb71e11fe73599d63c3a2 Author: Simon Willnauer Date: 2018-05-03T07:29:12Z LUCENE-8293: Ensure only hard deletes are carried over in a merge Today we carry over hard deletes based on the SegmentReaders liveDocs. This is not correct if soft-deletes are used especially with retention policies.
If a soft delete is added while a segment is merged the document might end up hard deleted in the target segment. This isn't necessarily a correctness issue but causes unnecessary writes of hard-deletes. The biggest issue here is that we assert that previously deleted documents are still deleted in the live-docs we apply and that might be violated by the retention policy. commit 27992e3386f7ba9521329ec8a1957103d73da2e4 Author: Adrien Grand Date: 2018-05-04T12:38:42Z LUCENE-8295: Remove useless liveDocsSharedPending flag. commit 6ec1198a5144e73b47bc88cd79f534878cbcbef4 Author: Anshum Gupta Date: 2018-05-03T22:00:47Z SOLR-11277: Add auto hard commit setting based on tlog size (this closes #358) commit 2558a06f3c8040d47de6606da2539532e4c67854 Author: Erick Erickson Date: 2018-05-05T05:30:18Z SOLR-12028: BadApple and AwaitsFix annotations usage (cherry picked from commit 89fc02a) commit 96f079b4b47eaadff65c7aaf0e5bafe68e30ec3b Author: Uwe Schindler Date: 2018-05-06T12:21:34Z SOLR-12316: Do not allow to use absolute URIs for including other files in solrconfig.xml and schema
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243999855 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.update.processor; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.CompletionService; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.Future; + +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.update.AddUpdateCommand; +import org.apache.solr.update.CommitUpdateCommand; +import org.apache.solr.update.DeleteUpdateCommand; +import org.apache.solr.update.UpdateCommand; +import org.apache.solr.util.TestInjection; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class DistributedStandaloneUpdateProcessor extends DistributedUpdateProcessor { + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + public DistributedStandaloneUpdateProcessor(SolrQueryRequest req, + SolrQueryResponse rsp, UpdateRequestProcessor next) { +super(req, rsp, next); + } + + public DistributedStandaloneUpdateProcessor(SolrQueryRequest req, + SolrQueryResponse rsp, AtomicUpdateDocumentMerger docMerger, + UpdateRequestProcessor next) { +super(req, rsp, docMerger, next); + } + + @Override + Replica.Type computeReplicaType() { +return Replica.Type.NRT; + } + + @Override + public void processCommit(CommitUpdateCommand cmd) throws IOException { + +assert TestInjection.injectFailUpdateRequests(); + +updateCommand = cmd; + +CompletionService completionService = new ExecutorCompletionService<>(req.getCore().getCoreContainer().getUpdateShardHandler().getUpdateExecutor()); +Set> pending = new HashSet<>(); +if (replicaType == Replica.Type.TLOG) { + if (isLeader) { +super.processCommit(cmd); + } +} else if (replicaType == Replica.Type.PULL) { + log.warn("Commit not supported on replicas of type " + Replica.Type.PULL); +} else { + // NRT replicas will always commit + super.processCommit(cmd); +} + } + + @Override + public void processAdd(AddUpdateCommand cmd) throws IOException { +assert TestInjection.injectFailUpdateRequests(); + +updateCommand = cmd; +isLeader = getNonZkLeaderAssumption(req); + +super.processAdd(cmd); + } + + @Override + protected void doDeleteById(DeleteUpdateCommand cmd) throws IOException { +isLeader = getNonZkLeaderAssumption(req); +super.doDeleteById(cmd); + } + + @Override + protected void 
doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { +// even in non zk mode, tests simulate updates from a leader +isLeader = getNonZkLeaderAssumption(req); +super.doDeleteByQuery(cmd, null, null); + } + + @Override + public void finish() throws IOException { --- End diff -- Never mind; just leave this be then. Let's keep our changes to clearer, more obvious ones. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243995067 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.update.processor; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.CompletionService; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.Future; + +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.update.AddUpdateCommand; +import org.apache.solr.update.CommitUpdateCommand; +import org.apache.solr.update.DeleteUpdateCommand; +import org.apache.solr.update.UpdateCommand; +import org.apache.solr.util.TestInjection; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class DistributedStandaloneUpdateProcessor extends DistributedUpdateProcessor { + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + public DistributedStandaloneUpdateProcessor(SolrQueryRequest req, + SolrQueryResponse rsp, UpdateRequestProcessor next) { +super(req, rsp, next); + } + + public DistributedStandaloneUpdateProcessor(SolrQueryRequest req, + SolrQueryResponse rsp, AtomicUpdateDocumentMerger docMerger, + UpdateRequestProcessor next) { +super(req, rsp, docMerger, next); + } + + @Override + Replica.Type computeReplicaType() { +return Replica.Type.NRT; + } + + @Override + public void processCommit(CommitUpdateCommand cmd) throws IOException { + +assert TestInjection.injectFailUpdateRequests(); + +updateCommand = cmd; + +CompletionService completionService = new ExecutorCompletionService<>(req.getCore().getCoreContainer().getUpdateShardHandler().getUpdateExecutor()); +Set> pending = new HashSet<>(); +if (replicaType == Replica.Type.TLOG) { + if (isLeader) { +super.processCommit(cmd); + } +} else if (replicaType == Replica.Type.PULL) { + log.warn("Commit not supported on replicas of type " + Replica.Type.PULL); +} else { + // NRT replicas will always commit + super.processCommit(cmd); +} + } + + @Override + public void processAdd(AddUpdateCommand cmd) throws IOException { +assert TestInjection.injectFailUpdateRequests(); + +updateCommand = cmd; +isLeader = getNonZkLeaderAssumption(req); + +super.processAdd(cmd); + } + + @Override + protected void doDeleteById(DeleteUpdateCommand cmd) throws IOException { +isLeader = getNonZkLeaderAssumption(req); +super.doDeleteById(cmd); + } + + @Override + protected void 
doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { +// even in non zk mode, tests simulate updates from a leader +isLeader = getNonZkLeaderAssumption(req); +super.doDeleteByQuery(cmd, null, null); + } + + @Override + public void finish() throws IOException { --- End diff -- I'm still trying to dig in further and see why the last line (`if (next != null && nodes == null) next.finish();`) was put in an if and not an else if statement. `nodes` is supposed to be the replicas of the current nodes, and when zkEnabled was false it was set to null. I am still not sure why this was in a completely separate if statement. @dsmiley, WDYT? --- -
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243850805 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.update.processor; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.CompletionService; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.Future; + +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.update.AddUpdateCommand; +import org.apache.solr.update.CommitUpdateCommand; +import org.apache.solr.update.DeleteUpdateCommand; +import org.apache.solr.update.UpdateCommand; +import org.apache.solr.util.TestInjection; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class DistributedStandaloneUpdateProcessor extends DistributedUpdateProcessor { + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + public DistributedStandaloneUpdateProcessor(SolrQueryRequest req, + SolrQueryResponse rsp, UpdateRequestProcessor next) { +super(req, rsp, next); + } + + public DistributedStandaloneUpdateProcessor(SolrQueryRequest req, + SolrQueryResponse rsp, AtomicUpdateDocumentMerger docMerger, + UpdateRequestProcessor next) { +super(req, rsp, docMerger, next); + } + + @Override + Replica.Type computeReplicaType() { +return Replica.Type.NRT; + } + + @Override + public void processCommit(CommitUpdateCommand cmd) throws IOException { + +assert TestInjection.injectFailUpdateRequests(); + +updateCommand = cmd; + +CompletionService completionService = new ExecutorCompletionService<>(req.getCore().getCoreContainer().getUpdateShardHandler().getUpdateExecutor()); +Set> pending = new HashSet<>(); +if (replicaType == Replica.Type.TLOG) { + if (isLeader) { +super.processCommit(cmd); + } +} else if (replicaType == Replica.Type.PULL) { + log.warn("Commit not supported on replicas of type " + Replica.Type.PULL); +} else { + // NRT replicas will always commit + super.processCommit(cmd); +} + } + + @Override + public void processAdd(AddUpdateCommand cmd) throws IOException { +assert TestInjection.injectFailUpdateRequests(); + +updateCommand = cmd; +isLeader = getNonZkLeaderAssumption(req); + +super.processAdd(cmd); + } + + @Override + protected void doDeleteById(DeleteUpdateCommand cmd) throws IOException { +isLeader = getNonZkLeaderAssumption(req); +super.doDeleteById(cmd); + } + + @Override + protected void 
doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { +// even in non zk mode, tests simulate updates from a leader +isLeader = getNonZkLeaderAssumption(req); --- End diff -- I think it can also be used in the zkProcessor, I'll investigate further.
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243850226 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- (same quoted hunk as in the previous comment, ending at the finish() override) --- End diff -- Well, previously, the code was:
```
@Override
public void finish() throws IOException {
  assert ! finished : "lifecycle sanity check";
  finished = true;
  if (zkEnabled) doFinish();
  if (next != null && nodes == null) next.finish();
}
```
So nodes was always set to null when zkEnabled was false (see DistributedUpdateProcessor#setupRequest on master). So, I just split
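Given the invariants cited above (zkEnabled is always false and nodes is always null in standalone mode), the quoted finish() logically reduces to the following for the standalone processor. This is an illustrative reduction, not the code actually committed in the PR:
```
// What the original finish() collapses to under the standalone invariants:
// doFinish() is unreachable and the nodes == null check is always true.
@Override
public void finish() throws IOException {
  assert ! finished : "lifecycle sanity check";
  finished = true;
  if (next != null) {
    next.finish();
  }
}
```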
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243847609 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java --- @@ -1250,23 +1250,10 @@ private UpdateCommand fetchFullUpdateFromLeader(AddUpdateCommand inplaceAdd, lon params.set("onlyIfActive", true); SolrRequest ur = new GenericSolrRequest(METHOD.GET, "/get", params); -String leaderUrl = req.getParams().get(DISTRIB_FROM); - -if (leaderUrl == null) { - // An update we're dependent upon didn't arrive! This is unexpected. Perhaps likely our leader is - // down or partitioned from us for some reason. Lets force refresh cluster state, and request the - // leader for the update. - if (zkController == null) { // we should be in cloud mode, but wtf? could be a unit test -throw new SolrException(ErrorCode.SERVER_ERROR, "Can't find document with id=" + id + ", but fetching from leader " -+ "failed since we're not in cloud mode."); - } - Replica leader; - try { -leader = zkController.getZkStateReader().getLeaderRetry(collection, cloudDesc.getShardId()); - } catch (InterruptedException e) { -throw new SolrException(ErrorCode.SERVER_ERROR, "Exception during fetching from leader.", e); - } - leaderUrl = leader.getCoreUrl(); +String leaderUrl = getLeaderUrl(id); --- End diff -- I looked again and it seems fine. It could be that the diff I was looking at confused the matter.
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243847326 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- (same quoted hunk as above, ending at the finish() override) --- End diff -- isn't this what a base finish() will do? so why subclass. If there really is a reason then a comment here would help
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243847270 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- (same quoted hunk as above, ending at doDeleteByQuery) --- End diff -- I think just call setupRequest instead of getNonZkLeaderAssumption? setupRequest calls that and sets isLeader. This comment goes for all/most of the methods here.
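For illustration, the suggestion would look roughly like the sketch below, assuming a setupRequest overload on DistributedUpdateProcessor that takes the command, calls getNonZkLeaderAssumption(req) internally, and sets isLeader as a side effect; the real signature in the PR may differ:
```
// Hypothetical sketch of the suggestion: centralize the leader
// bookkeeping in setupRequest() instead of repeating it per method.
@Override
protected void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException {
  setupRequest(cmd); // assumed overload; sets isLeader internally
  super.doDeleteByQuery(cmd, null, null);
}
```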
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243844969 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- @@ -42,15 +53,36 @@ public DistributedStandaloneUpdateProcessor(SolrQueryRequest req, } @Override - String getCollectionName(CloudDescriptor cloudDesc) { + String computeCollectionName(CloudDescriptor cloudDesc) { return null; } @Override - Replica.Type getReplicaType(CloudDescriptor cloudDesc) { + Replica.Type computeReplicaType(CloudDescriptor cloudDesc) { return Replica.Type.NRT; } + @Override + public void processCommit(CommitUpdateCommand cmd) throws IOException { + +assert TestInjection.injectFailUpdateRequests(); + +updateCommand = cmd; + +CompletionService completionService = new ExecutorCompletionService<>(req.getCore().getCoreContainer().getUpdateShardHandler().getUpdateExecutor()); --- End diff -- I guess I missed the fact that replicas in standalone mode are always NRT, right?
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243844755 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java --- (same quoted hunk from fetchFullUpdateFromLeader as above) --- End diff -- Hmmm, if it is crucial to keep the exception message identical, I could throw the exceptions inside getLeaderUrl (the abstract method).
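Concretely, that would look roughly like the sketch below in the zk-aware subclass, reassembling the lookup and error handling that the diff deletes from fetchFullUpdateFromLeader; the placement and override signature are assumptions, not the PR's actual code:
```
// Hypothetical: the cloud subclass throws from getLeaderUrl itself,
// preserving the exception messages from the deleted block.
@Override
String getLeaderUrl(String id) {
  String leaderUrl = req.getParams().get(DISTRIB_FROM);
  if (leaderUrl != null) {
    return leaderUrl;
  }
  // An update we depend on didn't arrive; force a leader lookup.
  if (zkController == null) { // we should be in cloud mode, but could be a unit test
    throw new SolrException(ErrorCode.SERVER_ERROR, "Can't find document with id=" + id
        + ", but fetching from leader failed since we're not in cloud mode.");
  }
  try {
    Replica leader = zkController.getZkStateReader()
        .getLeaderRetry(collection, cloudDesc.getShardId());
    return leader.getCoreUrl();
  } catch (InterruptedException e) {
    throw new SolrException(ErrorCode.SERVER_ERROR, "Exception during fetching from leader.", e);
  }
}
```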
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243844185 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java --- (same quoted hunk from fetchFullUpdateFromLeader as above) --- End diff -- I'm not sure how this diff is a refactoring (keeps semantics identical). For example... the exception message isn't the same.
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243843852 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- (same quoted hunk as above, ending at the unused CompletionService in processCommit) --- End diff -- I didn't see any use for the completionService in the previous processor. I meant to ask whether there might be a particular reason it is needed, since I didn't see any uses for this service. Should I remove it from both zk and standalone processors, or just this one?
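If the CompletionService really is unused, and standalone replicas are always NRT (per computeReplicaType in the quoted hunk), the override would shrink to something like this sketch; illustrative only, not necessarily what the PR ends up with:
```
// Hypothetical cleanup: drop the unused CompletionService/pending set;
// the TLOG/PULL branches are unreachable when replicaType is always NRT.
@Override
public void processCommit(CommitUpdateCommand cmd) throws IOException {
  assert TestInjection.injectFailUpdateRequests();
  updateCommand = cmd;
  super.processCommit(cmd); // NRT replicas always commit
}
```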
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243843542 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- (same quoted hunk as above, ending at the CompletionService in processCommit) --- End diff -- Why is all this stuff here in standalone mode?
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243646386 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits hold the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset in bytes into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 288 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure
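As a reading aid for the quoted javadoc: one packing consistent with the "first 31 bits hold the index, next 33 bits hold the offset" description is sketched below. The names and exact layout are inferred from the prose above, not taken from the PR's code:
```
// Illustrative packing of a single jump-table entry as described:
// upper 31 bits = set-bit count before the block, lower 33 bits = offset.
int index = 1234;          // number of set bits in all preceding blocks
long offset = 1L << 20;    // position of the block in the underlying slice
long entry = ((long) index << 33) | offset;        // pack
int unpackedIndex = (int) (entry >>> 33);          // recover the index
long unpackedOffset = entry & ((1L << 33) - 1);    // recover the offset
```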
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243649022 --- Diff: lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java --- @@ -1204,6 +1204,12 @@ public void testRandomSortedBytes() throws IOException { } private void doTestNumericsVsStoredFields(double density, LongSupplier longs) throws Exception { +doTestNumericsVsStoredFields(density, longs, 256); +// TODO: 20 docs are needed to test jumps properly (see LUCENE-8585), but that is quite slow (few minutes). +// Maybe it can be nightly? +//doTestNumericsVsStoredFields(density, longs, 20); --- End diff -- By the way, if we want to test such large numbers of documents, we should avoid RandomIndexWriter (use IndexWriter instead) and set reasonable values of maxBufferedDocs+RAMBufferSizeMB+mergePolicy instead of relying on random values that are set by LuceneTestCase#newIndexWriterConfig and make indexing slow (but are useful to increase coverage of flushing+merging).
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243647937 --- Diff: lucene/core/src/test/org/apache/lucene/index/TestDocValues.java --- @@ -123,8 +124,72 @@ public void testNumericField() throws Exception { iw.close(); dir.close(); } - - /** + + // The LUCENE-8585 jump-tables enables O(1) skipping of IndexedDISI blocks, DENSE block lookup + // and numeric multi blocks. This test focuses on testing these jumps. + @Slow + public void testNumericFieldJumpTables() throws Exception { +Directory dir = newDirectory(); +IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null)); +final int maxDoc = atLeast(3*65536); // Must be above 3*65536 to trigger IndexedDISI block skips --- End diff -- Why do we have this test here as opposed to TestLucene80DocValuesFormat? Given that you are indexing so many documents and don't care about testing indexing as much as you care about checking iterators, you should make sure that the index writer config isn't too crazy and enforce sane values of maxBufferedDocs, RAMBufferSizeMB, and a sane merge policy (eg. not the alcoholic one). See for instance TestLucene70DocValuesFormat#doTestSortedNumericBlocksOfVariousBitsPerValue.
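The kind of sane, non-randomized configuration being suggested (cf. TestLucene70DocValuesFormat#doTestSortedNumericBlocksOfVariousBitsPerValue) looks roughly like the sketch below; the specific values are illustrative, and dir is assumed to be the test's Directory:
```
// Fixed, predictable IndexWriter settings for large-doc-count tests,
// instead of the randomized config from LuceneTestCase#newIndexWriterConfig.
IndexWriterConfig conf = new IndexWriterConfig(null);  // null analyzer, as in the quoted test
conf.setMaxBufferedDocs(10000);                        // flush on a fixed doc count
conf.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH); // no RAM-based flushing
conf.setMergePolicy(new LogDocMergePolicy());          // deterministic merges
IndexWriter iw = new IndexWriter(dir, conf);
```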
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243644347 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java --- @@ -0,0 +1,1430 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.Closeable; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.LongValues; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.DirectMonotonicReader; +import org.apache.lucene.util.packed.DirectReader; + +/** reader for {@link Lucene80DocValuesFormat} */ +final class Lucene80DocValuesProducer extends DocValuesProducer implements Closeable { + private final Map numerics = new HashMap<>(); + private final Map binaries = new HashMap<>(); + private final Map sorted = new HashMap<>(); + private final Map sortedSets = new HashMap<>(); + private final Map sortedNumerics = new HashMap<>(); + private long ramBytesUsed; + private final IndexInput data; + private final int maxDoc; + + /** expert: instantiates a new reader */ + Lucene80DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { +String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); +this.maxDoc = state.segmentInfo.maxDoc(); +ramBytesUsed = RamUsageEstimator.shallowSizeOfInstance(getClass()); + +int version = -1; + +// read in the entries from the metadata file. 
+try (ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context)) { + Throwable priorE = null; + try { +version = CodecUtil.checkIndexHeader(in, metaCodec, + Lucene80DocValuesFormat.VERSION_START, + Lucene80DocValuesFormat.VERSION_CURRENT, +state.segmentInfo.getId(), +state.segmentSuffix); +readFields(in, state.fieldInfos); + } catch (Throwable exception) { +priorE = exception; + } finally { +CodecUtil.checkFooter(in, priorE); + } +} + +String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); +this.data = state.directory.openInput(dataName, state.context); +boolean success = false; +try { + final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, + Lucene80DocValuesFormat.VERSION_START, +
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243644248 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java --- (same quoted hunk as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243642604 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesConsumer.java --- @@ -0,0 +1,663 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + + +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.EmptyDocValuesProducer; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.SortedSetSelector; +import org.apache.lucene.store.GrowableByteArrayDataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMOutputStream; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.MathUtil; +import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.packed.DirectMonotonicWriter; +import org.apache.lucene.util.packed.DirectWriter; + +import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; +import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.NUMERIC_BLOCK_SHIFT; +import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE; + +/** writer for {@link Lucene80DocValuesFormat} */ +final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Closeable { + + IndexOutput data, meta; + final int maxDoc; + + /** expert: Creates a new writer */ + public Lucene80DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { +boolean success = false; +try { + String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); + data = state.directory.createOutput(dataName, state.context); + CodecUtil.writeIndexHeader(data, dataCodec, Lucene80DocValuesFormat.VERSION_CURRENT, 
state.segmentInfo.getId(), state.segmentSuffix); + String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + meta = state.directory.createOutput(metaName, state.context); + CodecUtil.writeIndexHeader(meta, metaCodec, Lucene80DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + maxDoc = state.segmentInfo.maxDoc(); + success = true; +} finally { + if (!success) { +IOUtils.closeWhileHandlingException(this); + } +} + } + + @Override + public void close() throws IOException { +boolean success = false; +try { + if (meta != null) { +meta.writeInt(-1); // write EOF marker +CodecUtil.writeFooter(meta); // write checksum + } + if (data != null) { +CodecUtil.writeFooter(data); // write checksum + } + success = true; +} finally { + if (success) { +IOUtils.close(data, meta); + } else {
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243643855 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java --- (same quoted hunk as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243638367 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (same quoted javadoc hunk as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243641644 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (same quoted javadoc hunk as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243648602 --- Diff: lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java --- (same quoted hunk as above) --- End diff -- Let's not do that in the base test case as this will be too slow with some codecs such as the SimpleTextCodec. What we have done before for formats that have branches that only get triggered with many documents is that we added tests to their own test file, ie. TestLucene80DocValuesFormat.
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243630309 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quoted diff identical to the IndexedDISI.java javadoc above; comment body truncated in the archive)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243640446 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quoted diff identical to the IndexedDISI.java javadoc above; comment body truncated in the archive)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243630434 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: * A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits holds the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 32 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure for a given DENSE block is stored at the beginning of the
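The direct jump that the 31/33-bit entries enable can be sketched as follows. Illustrative names only: it assumes `jumpTable` is a RandomAccessInput over the table, `slice` is the block data, and that the index occupies the high bits — not necessarily how the final patch wires it up:

    void jumpToBlock(int targetDoc) throws IOException {
      int targetBlock = targetDoc >>> 16;              // 65536 docs per block
      long entry = jumpTable.readLong((long) targetBlock << 3);
      int indexBefore = (int) (entry >>> 33);          // first 31 bits: set bits before the block
      long offset = entry & ((1L << 33) - 1);          // next 33 bits: offset into the slice
      slice.seek(offset);                              // O(1) jump instead of per-block iteration
      this.index = indexBefore;
    }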
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243629404 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quoted diff identical to the IndexedDISI.java javadoc above; comment body truncated in the archive)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243636154 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quoted diff identical to the IndexedDISI.java javadoc above; comment body truncated in the archive)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243634870 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quoted diff identical to the IndexedDISI.java javadoc above; comment body truncated in the archive)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243625997 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quoted diff identical to the IndexedDISI.java javadoc above; the lines the comment refers to are:) + * The first 31 bits hold the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset in bytes into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. This is currently the case, with the largest + * block being DENSE and using 2^16 + 288 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. --- End diff -- I think you mean "not exceed 2^17 _bytes_"? Then maybe we should also express the maximum size of a block in bytes to make it easy to compare, i.e. "2^13 + 36 bytes"? Should we simplify the encoding so that the long is actually two 32-bit ints rather than one of 31 bits and another one of 33 bits? --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
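For reference, the two encodings being weighed against each other in that comment; a hedged sketch where the helper names are made up for illustration:

    // Encoding in the diff: 31 bits of index plus 33 bits of offset in one long.
    static long pack31and33(int index, long offset) {
      return ((long) index << 33) | offset;            // assumes 0 <= offset < 2^33
    }
    // Simplification suggested above: two plain 32-bit ints, trivially extracted.
    static long packTwoInts(int index, int offset) {
      return ((long) index << 32) | (offset & 0xFFFFFFFFL);
    }
    static int indexOf(long entry)  { return (int) (entry >>> 32); }
    static int offsetOf(long entry) { return (int) entry; }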
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243624209 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quoted diff identical to the IndexedDISI.java javadoc above; the line the comment refers to is:) + * tables are used: A lookup table for block blockCache and index, and a rank structure --- End diff -- I'm not sure I understand this sentence, did you mean to write another word than "blockCache" here? --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243631028 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quoted diff identical to the IndexedDISI.java javadoc above; comment body truncated in the archive)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r243628701 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quoted diff identical to the IndexedDISI.java javadoc above; comment body truncated in the archive)
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r243628041 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java --- @@ -425,4 +448,10 @@ void setupRequest(UpdateCommand cmd) { nodes = setupRequest(dcmd.getId(), null); } } + + @Override + protected boolean shouldCloneCmdDoc() { +boolean willDistrib = isLeader && nodes != null && nodes.size() > 0; --- End diff -- This could probably be optimized, since cloneRequiredOnLeader is already set during construction of the instance:

    if (!cloneRequiredOnLeader) {
      return false;
    }
    // will distrib command
    return isLeader && nodes != null && nodes.size() > 0;

--- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
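Spelled out as the complete override, the suggestion amounts to the following sketch (it assumes the `cloneRequiredOnLeader`, `isLeader`, and `nodes` fields from the diff above):

    @Override
    protected boolean shouldCloneCmdDoc() {
      if (!cloneRequiredOnLeader) {
        return false; // ruled out once, at construction time
      }
      // will distrib command
      return isLeader && nodes != null && nodes.size() > 0;
    }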
[GitHub] lucene-solr pull request #343: SOLR-12121: JWT Token authentication plugin
Github user janhoy commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/343#discussion_r243600574 --- Diff: solr/core/src/test/org/apache/solr/security/JWTAuthPluginIntegrationTest.java --- @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.security; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.net.HttpURLConnection; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +import org.apache.http.Header; +import org.apache.http.HttpException; +import org.apache.http.HttpHeaders; +import org.apache.http.HttpRequest; +import org.apache.http.HttpRequestInterceptor; +import org.apache.http.entity.ContentType; +import org.apache.http.protocol.HttpContext; +import org.apache.solr.client.solrj.impl.HttpClientUtil; +import org.apache.solr.cloud.SolrCloudAuthTestCase; +import org.apache.solr.common.util.Pair; +import org.jose4j.jwk.PublicJsonWebKey; +import org.jose4j.jwk.RsaJsonWebKey; +import org.jose4j.jws.AlgorithmIdentifiers; +import org.jose4j.jws.JsonWebSignature; +import org.jose4j.jwt.JwtClaims; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Validate that JWT token authentication works in a real cluster. + * TODO: Test also using SolrJ as client. 
But that requires a way to set Authorization header on request, see SOLR-13070 + */ +public class JWTAuthPluginIntegrationTest extends SolrCloudAuthTestCase { + protected static final int NUM_SERVERS = 2; + protected static final int NUM_SHARDS = 2; + protected static final int REPLICATION_FACTOR = 1; + private static final String COLLECTION = "jwtColl"; + private static String jwtTestToken; + private static String baseUrl; + private static AtomicInteger jwtInterceptCount = new AtomicInteger(); + private static AtomicInteger pkiInterceptCount = new AtomicInteger(); + private static final CountInterceptor interceptor = new CountInterceptor(); + + @BeforeClass + public static void setupClass() throws Exception { +configureCluster(NUM_SERVERS)// nodes + .withSecurityJson(TEST_PATH().resolve("security").resolve("jwt_plugin_jwk_security.json")) +.addConfig("conf1", TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf")) +.configure(); +baseUrl = cluster.getRandomJetty(random()).getBaseUrl().toString(); + +String jwkJSON = "{\n" + +" \"kty\": \"RSA\",\n" + +" \"d\": \"i6pyv2z3o-MlYytWsOr3IE1olu2RXZBzjPRBNgWAP1TlLNaphHEvH5aHhe_CtBAastgFFMuP29CFhaL3_tGczkvWJkSveZQN2AHWHgRShKgoSVMspkhOt3Ghha4CvpnZ9BnQzVHnaBnHDTTTfVgXz7P1ZNBhQY4URG61DKIF-JSSClyh1xKuMoJX0lILXDYGGcjVTZL_hci4IXPPTpOJHV51-pxuO7WU5M9252UYoiYyCJ56ai8N49aKIMsqhdGuO4aWUwsGIW4oQpjtce5eEojCprYl-9rDhTwLAFoBtjy6LvkqlR2Ae5dKZYpStljBjK8PJrBvWZjXAEMDdQ8PuQ\",\n" + +" \"e\": \"AQAB\",\n" + +" \"use\": \"sig\",\n" + +" \"kid\": \"test\",\n" + +" \"alg\": \"RS256\",\n" + +" \"n\": \"jeyrvOaZrmKWjyNXt0myAc_pJ1hNt3aRupExJEx1ewPaL9J9HFgSCjMrYxCB1ETO1NDyZ3nSgjZis-jHHDqBxBjRdq_t1E2rkGFaYbxAyKt220Pwgme_SFTB9MXVrFQGkKyjmQeVmOmV6zM3KK8uMdKQJ4aoKmwBcF5Zg7EZdDcKOFgpgva1Jq-FlEsaJ2xrYDYo3KnGcOHIt9_0NQeLsqZbeWYLxYni7uROFncXYV5FhSJCeR4A_rrbwlaCydGxE0ToC_9HNYibUHlkJjqyUhAgORCbNS8JLCJH8NUi5sDdIawK9GTSyvsJXZ-QHqo4cMUuxWV5AJtaRGghuMUfqQ\"\n" + +"}"; + +PublicJsonWebKey jwk = RsaJsonWebKey.Factory.newPublicJwk(jwkJSON); +JwtClaims claims = JWTAuthPluginTest.generateClaims(); +JsonWebSignature jws = new JsonWebSignature(); +jws.setPayload(claims.toJson()); +
[GitHub] lucene-solr pull request #342: SOLR-12120: New AuditLoggerPlugin type allowi...
Github user janhoy commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/342#discussion_r243532741 --- Diff: solr/core/src/java/org/apache/solr/security/AuditEvent.java --- @@ -0,0 +1,388 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.security; + +import javax.servlet.http.HttpServletRequest; +import java.lang.invoke.MethodHandles; +import java.security.Principal; +import java.util.Date; +import java.util.Enumeration; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.solr.common.SolrException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static org.apache.solr.security.AuditEvent.EventType.ANONYMOUS; + +/** + * Audit event that takes request and auth context as input to be able to audit log custom things + */ +public class AuditEvent { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private String message; + private Level level; + private Date date; + private String username; + private String session; + private String clientIp; + private List collections; + private Map context; + private HashMap headers; + private Map solrParams; + private String solrHost; + private int solrPort; + private String solrIp; + private String resource; + private String httpMethod; + private String queryString; + private EventType eventType; + private AuthorizationResponse autResponse; + private String requestType; + private double QTime = -1; + private int status = 0; + private Throwable exception; + + /* Predefined event types. Custom types can be made through constructor */ + public enum EventType { +AUTHENTICATED("Authenticated", "User successfully authenticated", Level.INFO), +REJECTED("Rejected", "Authentication request rejected", Level.WARN), +ANONYMOUS("Anonymous", "Request proceeds with unknown user", Level.INFO), +ANONYMOUS_REJECTED("AnonymousRejected", "Request from unknown user rejected", Level.WARN), +AUTHORIZED("Authorized", "Authorization succeeded", Level.INFO), +UNAUTHORIZED("Unauthorized", "Authorization failed", Level.WARN), +COMPLETED("Completed", "Request completed", Level.INFO), --- End diff -- In the latest commits I simplified this further. Now the default is to ever only generate events when the request is finished, not any intermediate ones, meaning that AUTHENTICATED, ANONYMOUS and AUTHORIZED are not logged by default. This saves some unnecessary object generation. Further, we now do a lightweight check on `shouldLog(eventType)` before even creating the AuditEvent object, which should further speed up things. 
--- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
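The guard described above — a cheap predicate checked before the event object is even built — can be sketched like this; the call site and constructor arguments are illustrative, not the committed code:

    // Request-completion path: the AuditEvent is only constructed when some
    // registered plugin actually wants this event type.
    if (auditPlugin.shouldLog(AuditEvent.EventType.COMPLETED)) {
      AuditEvent event = new AuditEvent(AuditEvent.EventType.COMPLETED, servletRequest);
      auditPlugin.doAudit(event);
    }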
[GitHub] lucene-solr pull request #529: LUCENE-8617: Use SimpleFSDirectory on non-def...
GitHub user marschall opened a pull request: https://github.com/apache/lucene-solr/pull/529 LUCENE-8617: Use SimpleFSDirectory on non-default FS Currently we use MMapDirectory on a non-default file system on Unix. This will almost always fail as only the default file system can provide mmap(). Switch to SimpleFSDirectory instead for non-default file systems independent of the operating system. You can merge this pull request into a Git repository by running: $ git pull https://github.com/marschall/lucene-solr LUCENE-8617 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/lucene-solr/pull/529.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #529 commit 611e4e05206737ffd8f63e2d9c659266b068ca55 Author: Philippe Marschall Date: 2018-12-20T15:19:08Z LUCENE-8617: Use SimpleFSDirectory on non-default FS Currently we use MMapDirectory on a non-default file system on Unix. This will almost always fail as only the default file system can provide mmap(). Switch to SimpleFSDirectory instead for non-default file systems independent of the operating system. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
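The decision logic being proposed boils down to checking which filesystem the path lives on before considering mmap; a hedged sketch of the idea, not the exact patch:

    import java.nio.file.FileSystems;
    import java.nio.file.Path;

    static FSDirectory openBest(Path path, LockFactory lockFactory) throws IOException {
      if (path.getFileSystem() != FileSystems.getDefault()) {
        // only the default filesystem can back mmap(); play it safe elsewhere
        return new SimpleFSDirectory(path, lockFactory);
      }
      return Constants.JRE_IS_64BIT && MMapDirectory.UNMAP_SUPPORTED
          ? new MMapDirectory(path, lockFactory)
          : new SimpleFSDirectory(path, lockFactory);
    }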
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242875578 --- Diff: lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java --- @@ -1204,6 +1204,12 @@ public void testRandomSortedBytes() throws IOException { } private void doTestNumericsVsStoredFields(double density, LongSupplier longs) throws Exception { +doTestNumericsVsStoredFields(density, longs, 256); +// TODO: 20 docs are needed to test jumps properly (see LUCENE-8585), but that is quite slow (few minutes). +// Maybe it can be nightly? +//doTestNumericsVsStoredFields(density, longs, 20); --- End diff -- Should we add an `if (TEST_NIGHTLY) ...` switch here to enable the 200K test case? It would increase coverage for jumps a great deal. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
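Spelled out, that switch in doTestNumericsVsStoredFields would look roughly like the following sketch; TEST_NIGHTLY is the standard LuceneTestCase flag, and the commented-out call is taken from the diff above:

    doTestNumericsVsStoredFields(density, longs, 256);
    if (TEST_NIGHTLY) {
      // jumps need longer document runs to be exercised; see LUCENE-8585
      doTestNumericsVsStoredFields(density, longs, 20);
    }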
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242857603 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired by + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst case, which happens + * when all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: a lookup table for block offset and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iterating forward from the current position + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits hold the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits hold the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. This is currently the case, with the largest + * block being DENSE and using 2^16 + 32 bits, and it is likely to continue to hold, as using + * more than double that amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure for a given DENSE block is stored at the beginning of the
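For readers following the bit layout in the quoted javadoc, here is a minimal Java sketch (my own illustration, not code from the patch; the helper names are invented) of packing and unpacking a 31+33 bit jump-table entry, and of a rank lookup inside one DENSE block using 512-bit sub-blocks:

// Sketch only: a jump-table entry as described above -- 31 bits of index in
// the upper half of the long, 33 bits of offset in the lower half.
static long packJumpEntry(int index, long offset) {
  assert index >= 0;                         // any non-negative int fits in 31 bits
  assert offset >= 0 && offset < (1L << 33); // offset must fit in 33 bits
  return ((long) index << 33) | offset;
}
static int jumpIndex(long entry)   { return (int) (entry >>> 33); }
static long jumpOffset(long entry) { return entry & ((1L << 33) - 1); }

// Sketch of rank inside one DENSE block: 65536 bits = 1024 longs; rank[] holds
// one unsigned short (a char here) per 512-bit sub-block, with the number of
// set bits before that sub-block.
static int denseRank(long[] bits, char[] rank, int bitInBlock) {
  int subBlock = bitInBlock >>> 9;           // 512 bits per sub-block
  int idx = rank[subBlock];                  // set bits before the sub-block
  for (int w = subBlock << 3; w < (bitInBlock >>> 6); w++) {
    idx += Long.bitCount(bits[w]);           // whole 64-bit words inside the sub-block
  }
  idx += Long.bitCount(bits[bitInBlock >>> 6] & ((1L << (bitInBlock & 63)) - 1));
  return idx;                                // set bits strictly before bitInBlock
}

This mirrors the description only; the actual on-disk layout and access code live in IndexedDISI.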
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242175311 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242172918 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242159267 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242158001 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242148673 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242092902 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242073523 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java --- @@ -0,0 +1,1413 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.Closeable; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.LongValues; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.DirectMonotonicReader; +import org.apache.lucene.util.packed.DirectReader; + +/** reader for {@link Lucene80DocValuesFormat} */ +final class Lucene80DocValuesProducer extends DocValuesProducer implements Closeable { + private final Map<String,NumericEntry> numerics = new HashMap<>(); + private final Map<String,BinaryEntry> binaries = new HashMap<>(); + private final Map<String,SortedEntry> sorted = new HashMap<>(); + private final Map<String,SortedSetEntry> sortedSets = new HashMap<>(); + private final Map<String,SortedNumericEntry> sortedNumerics = new HashMap<>(); + private long ramBytesUsed; + private final IndexInput data; + private final int maxDoc; + + /** expert: instantiates a new reader */ + Lucene80DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { +String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); +this.maxDoc = state.segmentInfo.maxDoc(); +ramBytesUsed = RamUsageEstimator.shallowSizeOfInstance(getClass()); + +int version = -1; + +// read in the entries from the metadata file.
+try (ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context)) { + Throwable priorE = null; + try { +version = CodecUtil.checkIndexHeader(in, metaCodec, + Lucene80DocValuesFormat.VERSION_START, + Lucene80DocValuesFormat.VERSION_CURRENT, +state.segmentInfo.getId(), +state.segmentSuffix); +readFields(in, state.fieldInfos); + } catch (Throwable exception) { +priorE = exception; + } finally { +CodecUtil.checkFooter(in, priorE); + } +} + +String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); +this.data = state.directory.openInput(dataName, state.context); +boolean success = false; +try { + final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, + Lucene80DocValuesFormat.VERSION_START, +
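The quoted constructor is cut off while the data file's header is being checked. For context, a hedged sketch of the usual shape of the remaining validation in Lucene's doc-values producers (an assumption based on the established codec pattern, not a verbatim quote of the truncated lines): the data header is checked with the same version bounds, and the meta and data versions must agree.

final int version2 = CodecUtil.checkIndexHeader(data, dataCodec,
    Lucene80DocValuesFormat.VERSION_START,
    Lucene80DocValuesFormat.VERSION_CURRENT,
    state.segmentInfo.getId(), state.segmentSuffix);
if (version != version2) {
  // meta and data were written together; differing versions indicate corruption
  throw new CorruptIndexException("Format versions mismatch: meta=" + version
      + ", data=" + version2, data);
}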
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242073366 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java --- (quotes the same Lucene80DocValuesProducer.java constructor diff as the message above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242072816 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242070202 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242069261 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242068739 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242062202 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- (quotes the same IndexedDISI.java javadoc diff as above)
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242060979 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: * A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits holds the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 32 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure for a given DENSE block is stored at the beginning of the
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242060186 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: * A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits holds the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 32 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure for a given DENSE block is stored at the beginning of the
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user tokee commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r242055494 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java --- @@ -284,12 +284,12 @@ * Stores additional per-position metadata information such as character offsets and user payloads * * - * {@link org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Norms} + * org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Norms * .nvd, .nvm * Encodes length and boost factors for docs and fields * * - * {@link org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat Per-Document Values} + * org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat Per-Document Values * .dvd, .dvm * Encodes additional scoring factors or other per-document information. * --- End diff -- Seems reasonable. I have mimicked the phrasing from the lucene60 `package-info.java` in the upcoming commit and also adjusted the references from lucene50, lucene60 to point to lucene80. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r241967695 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- @@ -56,8 +59,15 @@ public void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { super.doDeleteByQuery(cmd, Collections.emptyList(), null); } + @Override + protected void postProcessDeleteByQuery(DeleteUpdateCommand cmd, + List replicas, DocCollection coll) throws IOException { +// no op + } + @Override boolean isLeader(UpdateCommand cmd) { +updateCommand = cmd; --- End diff -- Perhaps we should have another isLeader method that returns a boolean, since the current method is used by other classes in the package to set up the request and determine whether the core is the leader for the specified update command. I'll work on a patch ASAP. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
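A minimal sketch of the split being discussed in this thread — a side-effecting setup step separated from a pure isLeader query — might look like the following. All class and field names here are illustrative, not the actual Solr code:

```java
import org.apache.solr.update.UpdateCommand;

// Hypothetical sketch: setupRequest() records the command and computes
// leadership as a side effect; isLeader() becomes a pure accessor that
// other classes in the package can call without surprises.
abstract class DistributedProcessorSketch {
  protected UpdateCommand updateCommand;
  private boolean leader;

  /** Side effects live here: record the command and compute leadership. */
  void setupRequest(UpdateCommand cmd) {
    this.updateCommand = cmd;
    this.leader = computeLeadership(cmd);
  }

  /** Pure query: no side effects. */
  boolean isLeader() {
    return leader;
  }

  /** Cloud vs. standalone subclasses decide what leadership means. */
  abstract boolean computeLeadership(UpdateCommand cmd);
}
```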
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r241967674 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java --- @@ -1417,45 +1407,7 @@ private void doDeleteById(DeleteUpdateCommand cmd) throws IOException { return; } -if (zkEnabled && isLeader && !isSubShardLeader) { - DocCollection coll = zkController.getClusterState().getCollection(collection); - List subShardLeaders = getSubShardLeaders(coll, cloudDesc.getShardId(), cmd.getId(), null); - // the list will actually have only one element for an add request - if (subShardLeaders != null && !subShardLeaders.isEmpty()) { -ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams())); -params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString()); -params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl( -zkController.getBaseUrl(), req.getCore().getName())); -params.set(DISTRIB_FROM_PARENT, cloudDesc.getShardId()); -cmdDistrib.distribDelete(cmd, subShardLeaders, params, true, null, null); - } - - final List nodesByRoutingRules = getNodesByRoutingRules(zkController.getClusterState(), coll, cmd.getId(), null); - if (nodesByRoutingRules != null && !nodesByRoutingRules.isEmpty()) { -ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams())); -params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString()); -params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl( -zkController.getBaseUrl(), req.getCore().getName())); -params.set(DISTRIB_FROM_COLLECTION, collection); -params.set(DISTRIB_FROM_SHARD, cloudDesc.getShardId()); -cmdDistrib.distribDelete(cmd, nodesByRoutingRules, params, true, null, null); - } -} - -if (nodes != null) { - ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams())); - params.set(DISTRIB_UPDATE_PARAM, - (isLeader || isSubShardLeader ? DistribPhase.FROMLEADER.toString() : DistribPhase.TOLEADER.toString())); - params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl( - zkController.getBaseUrl(), req.getCore().getName())); - - if (req.getParams().get(UpdateRequest.MIN_REPFACT) != null) { -// TODO: Kept for rolling upgrades only. Remove in Solr 9 -params.add(UpdateRequest.MIN_REPFACT, req.getParams().get(UpdateRequest.MIN_REPFACT)); - } - cmdDistrib.distribDelete(cmd, nodes, params, false, rollupReplicationTracker, leaderReplicationTracker); -} +postProcessDeleteById(cmd); --- End diff -- I can understand why you took the current approach. Let's put this off until near the end to see if it's straightforward or not. Maybe it won't be and we shouldn't do it. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r241967637 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- @@ -56,8 +59,15 @@ public void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { super.doDeleteByQuery(cmd, Collections.emptyList(), null); } + @Override + protected void postProcessDeleteByQuery(DeleteUpdateCommand cmd, + List replicas, DocCollection coll) throws IOException { +// no op + } + @Override boolean isLeader(UpdateCommand cmd) { +updateCommand = cmd; --- End diff -- +1 to setupRequest name. I don't think it needs to return "isLeader"; it should set this and return void. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r241954714 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- @@ -56,8 +59,15 @@ public void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { super.doDeleteByQuery(cmd, Collections.emptyList(), null); } + @Override + protected void postProcessDeleteByQuery(DeleteUpdateCommand cmd, + List replicas, DocCollection coll) throws IOException { +// no op + } + @Override boolean isLeader(UpdateCommand cmd) { +updateCommand = cmd; --- End diff -- Since this is a package-private method, perhaps it should be renamed to setupRequest (like the protected methods), except that this one takes an update command (Distributed#setupRequest(UpdateCommand cmd)). It should be stated in the Javadoc that the method returns whether this node is the leader for the supplied update command (cmd). WDYT? --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user barrotsteindev commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r241949194 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java --- @@ -1417,45 +1407,7 @@ private void doDeleteById(DeleteUpdateCommand cmd) throws IOException { return; } -if (zkEnabled && isLeader && !isSubShardLeader) { - DocCollection coll = zkController.getClusterState().getCollection(collection); - List subShardLeaders = getSubShardLeaders(coll, cloudDesc.getShardId(), cmd.getId(), null); - // the list will actually have only one element for an add request - if (subShardLeaders != null && !subShardLeaders.isEmpty()) { -ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams())); -params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString()); -params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl( -zkController.getBaseUrl(), req.getCore().getName())); -params.set(DISTRIB_FROM_PARENT, cloudDesc.getShardId()); -cmdDistrib.distribDelete(cmd, subShardLeaders, params, true, null, null); - } - - final List nodesByRoutingRules = getNodesByRoutingRules(zkController.getClusterState(), coll, cmd.getId(), null); - if (nodesByRoutingRules != null && !nodesByRoutingRules.isEmpty()) { -ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams())); -params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString()); -params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl( -zkController.getBaseUrl(), req.getCore().getName())); -params.set(DISTRIB_FROM_COLLECTION, collection); -params.set(DISTRIB_FROM_SHARD, cloudDesc.getShardId()); -cmdDistrib.distribDelete(cmd, nodesByRoutingRules, params, true, null, null); - } -} - -if (nodes != null) { - ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams())); - params.set(DISTRIB_UPDATE_PARAM, - (isLeader || isSubShardLeader ? DistribPhase.FROMLEADER.toString() - : DistribPhase.TOLEADER.toString())); - params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl( - zkController.getBaseUrl(), req.getCore().getName())); - - if (req.getParams().get(UpdateRequest.MIN_REPFACT) != null) { -// TODO: Kept for rolling upgrades only. Remove in Solr 9 -params.add(UpdateRequest.MIN_REPFACT, req.getParams().get(UpdateRequest.MIN_REPFACT)); - } - cmdDistrib.distribDelete(cmd, nodes, params, false, rollupReplicationTracker, leaderReplicationTracker); -} +postProcessDeleteById(cmd); --- End diff -- Well, I could try; I didn't want to introduce too many changes to the logic. Would changing this and letting the full Solr test suite run give assurance that the change is safe? --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r241945926 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java --- @@ -1417,45 +1407,7 @@ private void doDeleteById(DeleteUpdateCommand cmd) throws IOException { return; } -if (zkEnabled && isLeader && !isSubShardLeader) { - DocCollection coll = zkController.getClusterState().getCollection(collection); - List subShardLeaders = getSubShardLeaders(coll, cloudDesc.getShardId(), cmd.getId(), null); - // the list will actually have only one element for an add request - if (subShardLeaders != null && !subShardLeaders.isEmpty()) { -ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams())); -params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString()); -params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl( -zkController.getBaseUrl(), req.getCore().getName())); -params.set(DISTRIB_FROM_PARENT, cloudDesc.getShardId()); -cmdDistrib.distribDelete(cmd, subShardLeaders, params, true, null, null); - } - - final List nodesByRoutingRules = getNodesByRoutingRules(zkController.getClusterState(), coll, cmd.getId(), null); - if (nodesByRoutingRules != null && !nodesByRoutingRules.isEmpty()) { -ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams())); -params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString()); -params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl( -zkController.getBaseUrl(), req.getCore().getName())); -params.set(DISTRIB_FROM_COLLECTION, collection); -params.set(DISTRIB_FROM_SHARD, cloudDesc.getShardId()); -cmdDistrib.distribDelete(cmd, nodesByRoutingRules, params, true, null, null); - } -} - -if (nodes != null) { - ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams())); - params.set(DISTRIB_UPDATE_PARAM, - (isLeader || isSubShardLeader ? DistribPhase.FROMLEADER.toString() - : DistribPhase.TOLEADER.toString())); - params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl( - zkController.getBaseUrl(), req.getCore().getName())); - - if (req.getParams().get(UpdateRequest.MIN_REPFACT) != null) { -// TODO: Kept for rolling upgrades only. Remove in Solr 9 -params.add(UpdateRequest.MIN_REPFACT, req.getParams().get(UpdateRequest.MIN_REPFACT)); - } - cmdDistrib.distribDelete(cmd, nodes, params, false, rollupReplicationTracker, leaderReplicationTracker); -} +postProcessDeleteById(cmd); --- End diff -- I wonder if we actually need this postProcessDeleteById(cmd) method... could the subclass put its logic after calling super.doDeleteById()? --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
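As a rough illustration of the alternative raised here (hypothetical class and method names, not the actual Solr processors):

```java
import java.io.IOException;

// Sketch of the alternative: the ZK subclass overrides doDeleteById(),
// delegates the shared logic to super, and then runs its cloud-only
// distribution step, so the base class needs no postProcessDeleteById() hook.
class BaseDeleteProcessor {
  void doDeleteById(Object cmd) throws IOException {
    // shared local-delete logic
  }
}

class ZkDeleteProcessor extends BaseDeleteProcessor {
  @Override
  void doDeleteById(Object cmd) throws IOException {
    super.doDeleteById(cmd);   // shared logic first
    distributeDelete(cmd);     // what the hook body would have done
  }

  private void distributeDelete(Object cmd) {
    // forward the delete to sub-shard leaders / nodes by routing rules
  }
}
```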
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r241945549 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java --- @@ -223,6 +289,7 @@ private String getLeaderUrlZk(String id) { @Override boolean isLeader(UpdateCommand cmd) { +updateCommand = cmd; --- End diff -- again; this line makes me frown --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r241945502 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedStandaloneUpdateProcessor.java --- @@ -56,8 +59,15 @@ public void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { super.doDeleteByQuery(cmd, Collections.emptyList(), null); } + @Override + protected void postProcessDeleteByQuery(DeleteUpdateCommand cmd, + List replicas, DocCollection coll) throws IOException { +// no op + } + @Override boolean isLeader(UpdateCommand cmd) { +updateCommand = cmd; --- End diff -- Ouch, this piece is ugly. A method like "is..." (or "get...") should not have side effects. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #528: SOLR-12955 2
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/528#discussion_r241945460 --- Diff: solr/core/src/java/org/apache/solr/update/processor/DistributedZkUpdateProcessor.java --- @@ -58,6 +75,127 @@ String getCollectionName(CloudDescriptor cloudDesc) { return cloudDesc.getReplicaType(); } + @Override + public void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { +// even in non zk mode, tests simulate updates from a leader --- End diff -- This particular comment is, I think, intended for the standalone mode. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #527: LUCENE-8609: Allow getting consistent docstat...
Github user asfgit closed the pull request at: https://github.com/apache/lucene-solr/pull/527 --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #527: LUCENE-8609: Allow getting consistent docstat...
Github user dnhatn commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/527#discussion_r241832076 --- Diff: lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java --- @@ -3300,7 +3315,7 @@ public int numDeletesToMerge(SegmentCommitInfo info, int delCount, IOSupplier
[GitHub] lucene-solr pull request #527: LUCENE-8609: Allow getting consistent docstat...
Github user dnhatn commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/527#discussion_r241832034 --- Diff: lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java --- @@ -3147,7 +3162,7 @@ public void testSoftUpdateDocuments() throws IOException { for (SegmentCommitInfo info : writer.cloneSegmentInfos()) { numSoftDeleted += info.getSoftDelCount(); } -assertEquals(writer.maxDoc() - writer.numDocs(), numSoftDeleted); +assertEquals(writer.getDocStats().maxDoc - writer.getDocStats().numDocs, numSoftDeleted); --- End diff -- maybe use a single docStats? --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
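The suggestion amounts to snapshotting the stats once so both values come from the same point in time — roughly, the quoted assertion would become:

```java
// Sketch of the suggested change: one DocStats snapshot instead of two
// getDocStats() calls, so maxDoc and numDocs are guaranteed consistent.
IndexWriter.DocStats docStats = writer.getDocStats();
assertEquals(docStats.maxDoc - docStats.numDocs, numSoftDeleted);
```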
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241757056 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: * A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits holds the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. This is currently the case, with the largest --- End diff -- do you mean bytes rather than bits? Offsets seem to measure bytes? 
--- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
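For reference, the 31/33 split described in the quoted javadoc amounts to packing each jump-table entry like the following. This is an illustrative sketch only — the actual entry layout in the patch may differ — and whether `offset` counts bits or bytes is exactly the ambiguity raised above:

```java
// Illustrative packing of one jump-table entry: upper 31 bits = index
// (set bits before the block), lower 33 bits = offset into the slice.
static long encodeEntry(int index, long offset) {
  assert index >= 0;                         // any non-negative int fits in 31 bits
  assert offset >= 0 && offset < (1L << 33); // must fit in 33 bits
  return ((long) index << 33) | offset;
}

static int entryIndex(long entry) {
  return (int) (entry >>> 33);
}

static long entryOffset(long entry) {
  return entry & ((1L << 33) - 1);
}
```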
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241765829 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: * A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits holds the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 32 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure for a given DENSE block is stored at the beginning of the
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241784900 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: * A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits holds the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 32 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure for a given DENSE block is stored at the beginning of the
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241781712 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: * A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits holds the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 32 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure for a given DENSE block is stored at the beginning of the
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241791577 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java --- @@ -0,0 +1,1413 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.Closeable; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.LongValues; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.DirectMonotonicReader; +import org.apache.lucene.util.packed.DirectReader; + +/** reader for {@link Lucene80DocValuesFormat} */ +final class Lucene80DocValuesProducer extends DocValuesProducer implements Closeable { + private final Map numerics = new HashMap<>(); + private final Map binaries = new HashMap<>(); + private final Map sorted = new HashMap<>(); + private final Map sortedSets = new HashMap<>(); + private final Map sortedNumerics = new HashMap<>(); + private long ramBytesUsed; + private final IndexInput data; + private final int maxDoc; + + /** expert: instantiates a new reader */ + Lucene80DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { +String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); +this.maxDoc = state.segmentInfo.maxDoc(); +ramBytesUsed = RamUsageEstimator.shallowSizeOfInstance(getClass()); + +int version = -1; + +// read in the entries from the metadata file. 
+try (ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context)) { + Throwable priorE = null; + try { +version = CodecUtil.checkIndexHeader(in, metaCodec, + Lucene80DocValuesFormat.VERSION_START, + Lucene80DocValuesFormat.VERSION_CURRENT, +state.segmentInfo.getId(), +state.segmentSuffix); +readFields(in, state.fieldInfos); + } catch (Throwable exception) { +priorE = exception; + } finally { +CodecUtil.checkFooter(in, priorE); + } +} + +String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); +this.data = state.directory.openInput(dataName, state.context); +boolean success = false; +try { + final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, + Lucene80DocValuesFormat.VERSION_START, +
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241759363 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: * A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits holds the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 32 bits, and is likely to continue to hold as using --- End diff -- it seems to be ignoring the rank structure? --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
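For scale, assuming the 512-bit sub-blocks and 16-bit rank entries described elsewhere in this javadoc: the DENSE bitmap alone is 2^16 bits, and the rank structure would add (65536 / 512) * 16 = 2048 bits, giving 2^16 + 2^11 bits in total — still well under the 2^17-bit overflow limit, but noticeably more than the "2^16 + 32 bits" stated here.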
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241763714 --- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java --- @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RoaringDocIdSet; + +/** + * Disk-based implementation of a {@link DocIdSetIterator} which can return + * the index of the current document, i.e. the ordinal of the current document + * among the list of documents that this iterator can return. This is useful + * to implement sparse doc values by only having to encode values for documents + * that actually have a value. + * Implementation-wise, this {@link DocIdSetIterator} is inspired of + * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} + * documents independently and picks between 3 encodings depending on the + * density of the range: + * {@code ALL} if the range contains 65536 documents exactly, + * {@code DENSE} if the range contains 4096 documents or more; in that + * case documents are stored in a bit set, + * {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are + * stored in a {@link DataInput#readShort() short}. + * + * Only ranges that contain at least one value are encoded. + * This implementation uses 6 bytes per document in the worst-case, which happens + * in the case that all ranges contain exactly one document. + * + * + * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup + * tables are used: * A lookup table for block blockCache and index, and a rank structure + * for DENSE block lookups. + * + * The lookup table is an array of {@code long}s with an entry for each block. It allows for + * direct jumping to the block, as opposed to iteration from the current position and forward + * one block at a time. + * + * Each long entry consists of 2 logical parts: + * + * The first 31 bits holds the index (number of set bits in the blocks) up to just before the + * wanted block. The next 33 bits holds the offset into the underlying slice. + * As there is a maximum of 2^16 blocks, it follows that the maximum size of any block must + * not exceed 2^17 bits to avoid overflow. 
This is currently the case, with the largest + * block being DENSE and using 2^16 + 32 bits, and is likely to continue to hold as using + * more than double the amount of bits is unlikely to be an efficient representation. + * The cache overhead is numDocs/1024 bytes. + * + * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). + * In the case of non-existing blocks, the entry in the lookup table has index equal to the + * previous entry and offset equal to the next non-empty block. + * + * The block lookup table is stored at the end of the total block structure. + * + * + * The rank structure for DENSE blocks is an array of unsigned {@code short}s with an entry + * for each sub-block of 512 bits out of the 65536 bits in the outer DENSE block. + * + * Each rank-entry states the number of set bits within the block up to the bit before the + * bit positioned at the start of the sub-block. + * Note that that the rank entry of the first sub-block is always 0 and that the last entry can + * at most be 65536-512 = 65024 and thus will always fit into an unsigned short. + * + * The rank structure for a given DENSE block is stored at the beginning of the
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241762452

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above. The trailing review comment was truncated in the archive.]
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241766560

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above. The trailing review comment was truncated in the archive.]
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241763587

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above. The trailing review comment was truncated in the archive.]
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241764037

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above. The trailing review comment was truncated in the archive.]
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241766876

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above. The trailing review comment was truncated in the archive.]
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241761075

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above. The trailing review comment was truncated in the archive.]
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241768318

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above. The trailing review comment was truncated in the archive.]
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241754841

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above, down to the commented line:]
+ * tables are used: * A lookup table for block blockCache and index, and a rank structure

--- End diff --

remove "*"?
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241759598

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above. The trailing review comment was truncated in the archive.]
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241756515

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---

[Quoted diff context omitted; identical to the excerpt above, down to the commented line:]
+ * The first 31 bits holds the index (number of set bits in the blocks) up to just before the

--- End diff --

s/holds/hold/ ?
[GitHub] lucene-solr pull request #525: LUCENE-8585: Index-time jump-tables for DocVa...
Github user jpountz commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/525#discussion_r241754530

--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java ---

@@ -284,12 +284,12 @@
  * Stores additional per-position metadata information such as character offsets and user payloads
  *
  *
- * {@link org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Norms}
+ * org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Norms
  * .nvd, .nvm
  * Encodes length and boost factors for docs and fields
  *
  *
- * {@link org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat Per-Document Values}
+ * org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat Per-Document Values
  * .dvd, .dvm
  * Encodes additional scoring factors or other per-document information.
  *

--- End diff --

Actually we could remove most of this to simplify. Only the current codec needs to have a good description of its file formats. See e.g. other oal.codecs.luceneXX packages which have minimal javadocs.
[GitHub] lucene-solr pull request #343: SOLR-12121: JWT Token authentication plugin
Github user janhoy commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/343#discussion_r241698617

--- Diff: solr/core/src/test/org/apache/solr/security/JWTAuthPluginIntegrationTest.java ---

@@ -0,0 +1,214 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.security;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
+
+import org.apache.http.Header;
+import org.apache.http.HttpException;
+import org.apache.http.HttpHeaders;
+import org.apache.http.HttpRequest;
+import org.apache.http.HttpRequestInterceptor;
+import org.apache.http.entity.ContentType;
+import org.apache.http.protocol.HttpContext;
+import org.apache.solr.client.solrj.impl.HttpClientUtil;
+import org.apache.solr.cloud.SolrCloudAuthTestCase;
+import org.apache.solr.common.util.Pair;
+import org.jose4j.jwk.PublicJsonWebKey;
+import org.jose4j.jwk.RsaJsonWebKey;
+import org.jose4j.jws.AlgorithmIdentifiers;
+import org.jose4j.jws.JsonWebSignature;
+import org.jose4j.jwt.JwtClaims;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Validate that JWT token authentication works in a real cluster.
+ * TODO: Test also using SolrJ as client. But that requires a way to set
+ * Authorization header on request, see SOLR-13070
+ */
+public class JWTAuthPluginIntegrationTest extends SolrCloudAuthTestCase {
+  protected static final int NUM_SERVERS = 2;
+  protected static final int NUM_SHARDS = 2;
+  protected static final int REPLICATION_FACTOR = 1;
+  private static final String COLLECTION = "jwtColl";
+  private static String jwtTestToken;
+  private static String baseUrl;
+  private static AtomicInteger jwtInterceptCount = new AtomicInteger();
+  private static AtomicInteger pkiInterceptCount = new AtomicInteger();
+  private static final CountInterceptor interceptor = new CountInterceptor();
+
+  @BeforeClass
+  public static void setupClass() throws Exception {
+    configureCluster(NUM_SERVERS)// nodes
+        .withSecurityJson(TEST_PATH().resolve("security").resolve("jwt_plugin_jwk_security.json"))
+        .addConfig("conf1", TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf"))
+        .configure();
+    baseUrl = cluster.getRandomJetty(random()).getBaseUrl().toString();
+
+    String jwkJSON = "{\n" +
+        " \"kty\": \"RSA\",\n" +
+        " \"d\": \"i6pyv2z3o-MlYytWsOr3IE1olu2RXZBzjPRBNgWAP1TlLNaphHEvH5aHhe_CtBAastgFFMuP29CFhaL3_tGczkvWJkSveZQN2AHWHgRShKgoSVMspkhOt3Ghha4CvpnZ9BnQzVHnaBnHDTTTfVgXz7P1ZNBhQY4URG61DKIF-JSSClyh1xKuMoJX0lILXDYGGcjVTZL_hci4IXPPTpOJHV51-pxuO7WU5M9252UYoiYyCJ56ai8N49aKIMsqhdGuO4aWUwsGIW4oQpjtce5eEojCprYl-9rDhTwLAFoBtjy6LvkqlR2Ae5dKZYpStljBjK8PJrBvWZjXAEMDdQ8PuQ\",\n" +
+        " \"e\": \"AQAB\",\n" +
+        " \"use\": \"sig\",\n" +
+        " \"kid\": \"test\",\n" +
+        " \"alg\": \"RS256\",\n" +
+        " \"n\": \"jeyrvOaZrmKWjyNXt0myAc_pJ1hNt3aRupExJEx1ewPaL9J9HFgSCjMrYxCB1ETO1NDyZ3nSgjZis-jHHDqBxBjRdq_t1E2rkGFaYbxAyKt220Pwgme_SFTB9MXVrFQGkKyjmQeVmOmV6zM3KK8uMdKQJ4aoKmwBcF5Zg7EZdDcKOFgpgva1Jq-FlEsaJ2xrYDYo3KnGcOHIt9_0NQeLsqZbeWYLxYni7uROFncXYV5FhSJCeR4A_rrbwlaCydGxE0ToC_9HNYibUHlkJjqyUhAgORCbNS8JLCJH8NUi5sDdIawK9GTSyvsJXZ-QHqo4cMUuxWV5AJtaRGghuMUfqQ\"\n" +
+        "}";
+
+    PublicJsonWebKey jwk = RsaJsonWebKey.Factory.newPublicJwk(jwkJSON);
+    JwtClaims claims = JWTAuthPluginTest.generateClaims();
+    JsonWebSignature jws = new JsonWebSignature();
+    jws.setPayload(claims.toJson());
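The TODO in the quoted test points at the missing piece: getting the signed compact JWS onto a request's Authorization header, which SolrJ could not do at the time (SOLR-13070) and which presumably explains the HttpURLConnection imports. As a point of reference, a minimal sketch of that raw-HTTP path follows; baseUrl and jwtTestToken are fields from the quoted test, while the endpoint path is a hypothetical choice for illustration.

// Minimal sketch, assuming plain HttpURLConnection as the test's imports suggest;
// the endpoint path is hypothetical.
URL url = new URL(baseUrl + "/admin/info/system");
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestMethod("GET");
// Attach the compact JWS produced by jose4j as a bearer token.
conn.setRequestProperty("Authorization", "Bearer " + jwtTestToken);
int status = conn.getResponseCode(); // expect 200 when the plugin accepts the token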