On Fri, Sep 3, 2010 at 2:24 PM, Robert Muir <[email protected]> wrote: > Does SimpleFacetsTest fail for you? > I svn up'ed and i see a few failures
Ack - it does! I'll fix. I'm wondering how that happened... I think I may have previously done a ant test -Dtestcase=TestSimpleFacets (notice the misspelling) -Yonik http://lucenerevolution.org Lucene/Solr Conference, Boston Oct 7-8 > On Fri, Sep 3, 2010 at 1:12 PM, <[email protected]> wrote: >> >> Author: yonik >> Date: Fri Sep 3 17:12:26 2010 >> New Revision: 992382 >> >> URL: http://svn.apache.org/viewvc?rev=992382&view=rev >> Log: >> SOLR-2092: use native long PQ to order facet results >> >> Added: >> >> lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java >> (with props) >> Modified: >> lucene/dev/trunk/solr/CHANGES.txt >> >> lucene/dev/trunk/solr/src/java/org/apache/solr/request/SimpleFacets.java >> >> lucene/dev/trunk/solr/src/java/org/apache/solr/request/UnInvertedField.java >> lucene/dev/trunk/solr/src/test/org/apache/solr/util/PrimUtilsTest.java >> >> Modified: lucene/dev/trunk/solr/CHANGES.txt >> URL: >> http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=992382&r1=992381&r2=992382&view=diff >> >> ============================================================================== >> --- lucene/dev/trunk/solr/CHANGES.txt (original) >> +++ lucene/dev/trunk/solr/CHANGES.txt Fri Sep 3 17:12:26 2010 >> @@ -288,6 +288,12 @@ Optimizations >> * SOLR-2046: Simplify legacy replication scripts by adding common >> functions >> to scripts-util. (koji) >> >> +* SOLR-2092: Speed up single-valued and multi-valued "fc" faceting. >> Typical >> + improvement is 5%, but can be much greater (up to 10x faster) when >> facet.offset >> + is very large (deep paging). (yonik) >> + >> + >> + >> Bug Fixes >> ---------------------- >> >> >> Modified: >> lucene/dev/trunk/solr/src/java/org/apache/solr/request/SimpleFacets.java >> URL: >> http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/request/SimpleFacets.java?rev=992382&r1=992381&r2=992382&view=diff >> >> ============================================================================== >> --- >> lucene/dev/trunk/solr/src/java/org/apache/solr/request/SimpleFacets.java >> (original) >> +++ >> lucene/dev/trunk/solr/src/java/org/apache/solr/request/SimpleFacets.java Fri >> Sep 3 17:12:26 2010 >> @@ -44,6 +44,7 @@ import org.apache.solr.util.BoundedTreeS >> import org.apache.solr.util.ByteUtils; >> import org.apache.solr.util.DateMathParser; >> import org.apache.solr.handler.component.ResponseBuilder; >> +import org.apache.solr.util.LongPriorityQueue; >> >> import java.io.IOException; >> import java.util.*; >> @@ -416,9 +417,9 @@ public class SimpleFacets { >> } >> >> final int nTerms=endTermIndex-startTermIndex; >> + int missingCount = -1; >> >> CharArr spare = new CharArr(); >> - >> if (nTerms>0 && docs.size() >= mincount) { >> >> // count collection array only needs to be as big as the number of >> terms we are >> @@ -475,6 +476,10 @@ public class SimpleFacets { >> } >> } >> >> + if (startTermIndex == 0) { >> + missingCount = counts[0]; >> + } >> + >> // IDEA: we could also maintain a count of "other"... everything >> that fell outside >> // of the top 'N' >> >> @@ -484,7 +489,8 @@ public class SimpleFacets { >> if (sort.equals(FacetParams.FACET_SORT_COUNT) || >> sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) { >> int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1; >> maxsize = Math.min(maxsize, nTerms); >> - final BoundedTreeSet<CountPair<BytesRef,Integer>> queue = new >> BoundedTreeSet<CountPair<BytesRef,Integer>>(maxsize); >> + LongPriorityQueue queue = new >> LongPriorityQueue(Math.min(maxsize,1000), maxsize, Long.MIN_VALUE); >> + >> int min=mincount-1; // the smallest value in the top 'N' values >> for (int i=(startTermIndex==0)?1:0; i<nTerms; i++) { >> int c = counts[i]; >> @@ -492,18 +498,33 @@ public class SimpleFacets { >> // NOTE: we use c>min rather than c>=min as an optimization >> because we are going in >> // index order, so we already know that the keys are ordered. >> This can be very >> // important if a lot of the counts are repeated (like zero >> counts would be). >> - queue.add(new >> CountPair<BytesRef,Integer>(si.lookup(startTermIndex+i, new BytesRef()), >> c)); >> - if (queue.size()>=maxsize) min=queue.last().val; >> + >> + // smaller term numbers sort higher, so subtract the term >> number instead >> + long pair = (((long)c)<<32) + (Integer.MAX_VALUE - i); >> + boolean displaced = queue.insert(pair); >> + if (displaced) min=(int)(queue.top() >>> 32); >> } >> } >> - // now select the right page from the results >> - for (CountPair<BytesRef,Integer> p : queue) { >> - if (--off>=0) continue; >> - if (--lim<0) break; >> + >> + // if we are deep paging, we don't have to order the highest >> "offset" counts. >> + int collectCount = Math.max(0, queue.size() - off); >> + assert collectCount < lim; >> + >> + // the start and end indexes of our list "sorted" (starting with >> the highest value) >> + int sortedIdxStart = queue.size() - (collectCount - 1); >> + int sortedIdxEnd = queue.size() + 1; >> + final long[] sorted = queue.sort(collectCount); >> + >> + for (int i=sortedIdxStart; i<sortedIdxEnd; i++) { >> + long pair = sorted[i]; >> + int c = (int)(pair >>> 32); >> + int tnum = Integer.MAX_VALUE - (int)pair; >> + >> spare.reset(); >> - ft.indexedToReadable(p.key, spare); >> - res.add(spare.toString(), p.val); >> + ft.indexedToReadable(si.lookup(startTermIndex+tnum, br), >> spare); >> + res.add(spare.toString(), c); >> } >> + >> } else { >> // add results in index order >> int i=(startTermIndex==0)?1:0; >> @@ -526,7 +547,10 @@ public class SimpleFacets { >> } >> >> if (missing) { >> - res.add(null, getFieldMissingCount(searcher,docs,fieldName)); >> + if (missingCount < 0) { >> + missingCount = getFieldMissingCount(searcher,docs,fieldName); >> + } >> + res.add(null, missingCount); >> } >> >> return res; >> >> Modified: >> lucene/dev/trunk/solr/src/java/org/apache/solr/request/UnInvertedField.java >> URL: >> http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/request/UnInvertedField.java?rev=992382&r1=992381&r2=992382&view=diff >> >> ============================================================================== >> --- >> lucene/dev/trunk/solr/src/java/org/apache/solr/request/UnInvertedField.java >> (original) >> +++ >> lucene/dev/trunk/solr/src/java/org/apache/solr/request/UnInvertedField.java >> Fri Sep 3 17:12:26 2010 >> @@ -37,6 +37,7 @@ import org.apache.solr.core.SolrCore; >> import org.apache.solr.schema.FieldType; >> import org.apache.solr.schema.TrieField; >> import org.apache.solr.search.*; >> +import org.apache.solr.util.LongPriorityQueue; >> import org.apache.solr.util.PrimUtils; >> import org.apache.solr.util.BoundedTreeSet; >> import org.apache.solr.handler.component.StatsValues; >> @@ -470,7 +471,9 @@ public class UnInvertedField { >> if (baseSize >= mincount) { >> >> final int[] index = this.index; >> - final int[] counts = new int[numTermsInField]; >> + // tricky: we add more more element than we need because we will >> reuse this array later >> + // for ordering term ords before converting to term labels. >> + final int[] counts = new int[numTermsInField + 1]; >> >> // >> // If there is prefix, find it's start and end term numbers >> @@ -575,7 +578,8 @@ public class UnInvertedField { >> if (sort.equals(FacetParams.FACET_SORT_COUNT) || >> sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) { >> int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1; >> maxsize = Math.min(maxsize, numTermsInField); >> - final BoundedTreeSet<Long> queue = new >> BoundedTreeSet<Long>(maxsize); >> + LongPriorityQueue queue = new >> LongPriorityQueue(Math.min(maxsize,1000), maxsize, Long.MIN_VALUE); >> + >> int min=mincount-1; // the smallest value in the top 'N' values >> for (int i=startTerm; i<endTerm; i++) { >> int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i]; >> @@ -584,55 +588,63 @@ public class UnInvertedField { >> // index order, so we already know that the keys are ordered. >> This can be very >> // important if a lot of the counts are repeated (like zero >> counts would be). >> >> - // minimize object creation and speed comparison by creating >> a long that >> - // encompasses both count and term number. >> - // Since smaller values are kept in the TreeSet, make higher >> counts smaller. >> - // >> - // for equal counts, lower term numbers >> - // should come first and hence be "greater" >> - >> - //long pair = (((long)c)<<32) | (0x7fffffff-i) ; // use if >> priority queue >> - long pair = (((long)-c)<<32) | i; >> - queue.add(new Long(pair)); >> - if (queue.size()>=maxsize) >> min=-(int)(queue.last().longValue() >>> 32); >> + // smaller term numbers sort higher, so subtract the term >> number instead >> + long pair = (((long)c)<<32) + (Integer.MAX_VALUE - i); >> + boolean displaced = queue.insert(pair); >> + if (displaced) min=(int)(queue.top() >>> 32); >> } >> } >> + >> // now select the right page from the results >> >> + // if we are deep paging, we don't have to order the highest >> "offset" counts. >> + int collectCount = Math.max(0, queue.size() - off); >> + assert collectCount < lim; >> + >> + // the start and end indexes of our list "sorted" (starting with >> the highest value) >> + int sortedIdxStart = queue.size() - (collectCount - 1); >> + int sortedIdxEnd = queue.size() + 1; >> + final long[] sorted = queue.sort(collectCount); >> >> - final int[] tnums = new int[Math.min(Math.max(0, >> queue.size()-off), lim)]; >> final int[] indirect = counts; // reuse the counts array for the >> index into the tnums array >> - assert indirect.length >= tnums.length; >> - >> - int tnumCount = 0; >> + assert indirect.length >= sortedIdxEnd; >> + >> + for (int i=sortedIdxStart; i<sortedIdxEnd; i++) { >> + long pair = sorted[i]; >> + int c = (int)(pair >>> 32); >> + int tnum = Integer.MAX_VALUE - (int)pair; >> + >> + indirect[i] = i; // store the index for indirect sorting >> + sorted[i] = tnum; // reuse the "sorted" array to store the >> term numbers for indirect sorting >> >> - for (Long p : queue) { >> - if (--off>=0) continue; >> - if (--lim<0) break; >> - int c = -(int)(p.longValue() >>> 32); >> - //int tnum = 0x7fffffff - (int)p.longValue(); // use if >> priority queue >> - int tnum = (int)p.longValue(); >> - indirect[tnumCount] = tnumCount; >> - tnums[tnumCount++] = tnum; >> - // String label = ft.indexedToReadable(getTermText(te, tnum)); >> // add a null label for now... we'll fill it in later. >> res.add(null, c); >> } >> >> // now sort the indexes by the term numbers >> - PrimUtils.sort(0, tnumCount, indirect, new >> PrimUtils.IntComparator() { >> + PrimUtils.sort(sortedIdxStart, sortedIdxEnd, indirect, new >> PrimUtils.IntComparator() { >> @Override >> public int compare(int a, int b) { >> - return tnums[a] - tnums[b]; >> + return (int)sorted[a] - (int)sorted[b]; >> + } >> + >> + �...@override >> + public boolean lessThan(int a, int b) { >> + return sorted[a] < sorted[b]; >> + } >> + >> + �...@override >> + public boolean equals(int a, int b) { >> + return sorted[a] == sorted[b]; >> } >> }); >> >> // convert the term numbers to term values and set as the label >> - for (int i=0; i<tnumCount; i++) { >> + for (int i=sortedIdxStart; i<sortedIdxEnd; i++) { >> int idx = indirect[i]; >> - int tnum = tnums[idx]; >> - String label = getReadableValue(getTermValue(te, tnum), ft, >> spare); >> - res.setName(idx, label); >> + int tnum = (int)sorted[idx]; >> + String label = getReadableValue(getTermValue(te, tnum), ft, >> spare); >> + res.setName(idx - sortedIdxStart, label); >> } >> >> } else { >> >> Added: >> lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java >> URL: >> http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java?rev=992382&view=auto >> >> ============================================================================== >> --- >> lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java >> (added) >> +++ >> lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java >> Fri Sep 3 17:12:26 2010 >> @@ -0,0 +1,235 @@ >> +package org.apache.solr.util; >> + >> +import java.util.Arrays; >> + >> +/** >> + * Licensed to the Apache Software Foundation (ASF) under one or more >> + * contributor license agreements. See the NOTICE file distributed with >> + * this work for additional information regarding copyright ownership. >> + * The ASF licenses this file to You under the Apache License, Version >> 2.0 >> + * (the "License"); you may not use this file except in compliance with >> + * the License. You may obtain a copy of the License at >> + * >> + * http://www.apache.org/licenses/LICENSE-2.0 >> + * >> + * Unless required by applicable law or agreed to in writing, software >> + * distributed under the License is distributed on an "AS IS" BASIS, >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or >> implied. >> + * See the License for the specific language governing permissions and >> + * limitations under the License. >> + */ >> + >> +/** A native long priority queue. >> + * >> + * @lucene.internal >> +*/ >> +public class LongPriorityQueue { >> + protected int size; // number of elements currently in the >> queue >> + protected int currentCapacity; // number of elements the queue can >> hold w/o expanding >> + protected int maxSize; // max number of elements allowed in >> the queue >> + protected long[] heap; >> + protected final long sentinel; // represents a null return value >> + >> + public LongPriorityQueue(int initialSize, int maxSize, long sentinel) { >> + this.maxSize = maxSize; >> + this.sentinel = sentinel; >> + initialize(initialSize); >> + } >> + >> + >> + protected void initialize(int sz) { >> + int heapSize; >> + if (0 == sz) >> + // We allocate 1 extra to avoid if statement in top() >> + heapSize = 2; >> + else { >> + // NOTE: we add +1 because all access to heap is >> + // 1-based not 0-based. heap[0] is unused. >> + heapSize = Math.max(sz, sz + 1); // handle overflow >> + } >> + heap = new long[heapSize]; >> + currentCapacity = sz; >> + } >> + >> + public int getCurrentCapacity() { >> + return currentCapacity; >> + } >> + >> + public void resize(int sz) { >> + int heapSize; >> + if (sz > maxSize) { >> + maxSize = sz; >> + } >> + if (0 == sz) >> + // We allocate 1 extra to avoid if statement in top() >> + heapSize = 2; >> + else { >> + heapSize = Math.max(sz, sz + 1); // handle overflow >> + } >> + heap = Arrays.copyOf(heap, heapSize); >> + currentCapacity = sz; >> + } >> + >> + /** >> + * Adds an object to a PriorityQueue in log(size) time. If one tries to >> add >> + * more objects than maxSize from initialize an >> + * {...@link ArrayIndexOutOfBoundsException} is thrown. >> + * >> + * @return the new 'top' element in the queue. >> + */ >> + public long add(long element) { >> + if (size >= currentCapacity) { >> + int newSize = Math.min(currentCapacity <<1, maxSize); >> + if (newSize < currentCapacity) newSize = Integer.MAX_VALUE; // >> handle overflow >> + resize(newSize); >> + } >> + size++; >> + heap[size] = element; >> + upHeap(); >> + return heap[1]; >> + } >> + >> + /** >> + * Adds an object to a PriorityQueue in log(size) time. If one tries to >> add >> + * more objects than the current capacity, an >> + * {...@link ArrayIndexOutOfBoundsException} is thrown. >> + */ >> + public void addNoCheck(long element) { >> + ++size; >> + heap[size] = element; >> + upHeap(); >> + } >> + >> + /** >> + * Adds an object to a PriorityQueue in log(size) time. >> + * It returns the smallest object (if any) that was >> + * dropped off the heap because it was full, or >> + * the sentinel value. >> + * >> + * This can be >> + * the given parameter (in case it is smaller than the >> + * full heap's minimum, and couldn't be added), or another >> + * object that was previously the smallest value in the >> + * heap and now has been replaced by a larger one, or null >> + * if the queue wasn't yet full with maxSize elements. >> + */ >> + public long insertWithOverflow(long element) { >> + if (size < maxSize) { >> + add(element); >> + return sentinel; >> + } else if (element > heap[1]) { >> + long ret = heap[1]; >> + heap[1] = element; >> + updateTop(); >> + return ret; >> + } else { >> + return element; >> + } >> + } >> + >> + /** inserts the element and returns true if this element caused another >> element >> + * to be dropped from the queue. */ >> + public boolean insert(long element) { >> + if (size < maxSize) { >> + add(element); >> + return false; >> + } else if (element > heap[1]) { >> + // long ret = heap[1]; >> + heap[1] = element; >> + updateTop(); >> + return true; >> + } else { >> + return false; >> + } >> + } >> + >> + /** Returns the least element of the PriorityQueue in constant time. */ >> + public long top() { >> + return heap[1]; >> + } >> + >> + /** Removes and returns the least element of the PriorityQueue in >> log(size) >> + time. Only valid if size() > 0. >> + */ >> + public long pop() { >> + long result = heap[1]; // save first value >> + heap[1] = heap[size]; // move last to first >> + size--; >> + downHeap(); // adjust heap >> + return result; >> + } >> + >> + /** >> + * Should be called when the Object at top changes values. >> + * @return the new 'top' element. >> + */ >> + public long updateTop() { >> + downHeap(); >> + return heap[1]; >> + } >> + >> + /** Returns the number of elements currently stored in the >> PriorityQueue. */ >> + public int size() { >> + return size; >> + } >> + >> + /** Returns the array used to hold the heap, with the smallest item at >> array[1] >> + * and the last (but not necessarily largest) at array[size()]. This >> is *not* >> + * fully sorted. >> + */ >> + public long[] getInternalArray() { >> + return heap; >> + } >> + >> + /** Pops the smallest n items from the heap, placing them in the >> internal array at >> + * arr[size] through arr[size-(n-1)] with the smallest (first element >> popped) >> + * being at arr[size]. The internal array is returned. >> + */ >> + public long[] sort(int n) { >> + while (--n >= 0) { >> + long result = heap[1]; // save first value >> + heap[1] = heap[size]; // move last to first >> + heap[size] = result; // place it last >> + size--; >> + downHeap(); // adjust heap >> + } >> + return heap; >> + } >> + >> + /** Removes all entries from the PriorityQueue. */ >> + public void clear() { >> + size = 0; >> + } >> + >> + private void upHeap() { >> + int i = size; >> + long node = heap[i]; // save bottom node >> + int j = i >>> 1; >> + while (j > 0 && node < heap[j]) { >> + heap[i] = heap[j]; // shift parents down >> + i = j; >> + j = j >>> 1; >> + } >> + heap[i] = node; // install saved node >> + } >> + >> + private void downHeap() { >> + int i = 1; >> + long node = heap[i]; // save top node >> + int j = i << 1; // find smaller child >> + int k = j + 1; >> + if (k <= size && heap[k] < heap[j]) { >> + j = k; >> + } >> + while (j <= size && heap[j] < node) { >> + heap[i] = heap[j]; // shift up child >> + i = j; >> + j = i << 1; >> + k = j + 1; >> + if (k <= size && heap[k] < heap[j]) { >> + j = k; >> + } >> + } >> + heap[i] = node; // install saved node >> + } >> +} >> >> Propchange: >> lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java >> >> ------------------------------------------------------------------------------ >> svn:eol-style = native >> >> Propchange: >> lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java >> >> ------------------------------------------------------------------------------ >> svn:executable = * >> >> Propchange: >> lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java >> >> ------------------------------------------------------------------------------ >> svn:keywords = Date Author Id Revision HeadURL >> >> Modified: >> lucene/dev/trunk/solr/src/test/org/apache/solr/util/PrimUtilsTest.java >> URL: >> http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/util/PrimUtilsTest.java?rev=992382&r1=992381&r2=992382&view=diff >> >> ============================================================================== >> --- lucene/dev/trunk/solr/src/test/org/apache/solr/util/PrimUtilsTest.java >> (original) >> +++ lucene/dev/trunk/solr/src/test/org/apache/solr/util/PrimUtilsTest.java >> Fri Sep 3 17:12:26 2010 >> @@ -51,4 +51,44 @@ public class PrimUtilsTest extends Lucen >> } >> } >> >> + public void testLongPriorityQueue() { >> + int maxSize = 100; >> + long[] a = new long[maxSize]; >> + long[] discards = new long[maxSize]; >> + >> + for (int iter=0; iter<100; iter++) { >> + int discardCount = 0; >> + int startSize = r.nextInt(maxSize) + 1; >> + int endSize = startSize==maxSize ? maxSize : startSize + >> r.nextInt(maxSize-startSize); >> + int adds = r.nextInt(maxSize+1); >> + // System.out.println("startSize=" + startSize + " endSize=" + >> endSize + " adds="+adds); >> + LongPriorityQueue pq = new LongPriorityQueue(startSize, endSize, >> Long.MIN_VALUE); >> + >> + for (int i=0; i<adds; i++) { >> + long v = r.nextLong(); >> + a[i] = v; >> + long out = pq.insertWithOverflow(v); >> + if (i < endSize) { >> + assertEquals(out, Long.MIN_VALUE); >> + } else { >> + discards[discardCount++] = out; >> + } >> + } >> + assertEquals(Math.min(adds,endSize), pq.size()); >> + assertEquals(adds, pq.size() + discardCount); >> + >> + Arrays.sort(a, 0, adds); >> + Arrays.sort(discards, 0, discardCount); >> + for (int i=0; i<discardCount; i++) { >> + assertEquals(a[i], discards[i]); >> + } >> + >> + for (int i=discardCount; i<adds; i++) { >> + assertEquals(a[i], pq.pop()); >> + } >> + >> + assertEquals(0, pq.size()); >> + } >> + } >> + >> } >> \ No newline at end of file >> >> > > > > -- > Robert Muir > [email protected] > --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
