[
https://issues.apache.org/jira/browse/LUCENE-7482?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
AMIRAULT Martin updated LUCENE-7482:
------------------------------------
Description:
We are currently using Lucene here in my company for our main product.
Our search functionnality is quite basic and the results are always sorted
given a predefined field. The user is only able to choose the sort order
(Asc/Desc).
I am currently investigating using the index sort feature with
EarlyTerminationSortingCollector.
This is quite a shame searching on a sorted index in reverse order do not have
any optimization and was wondering if it would be possible to make it faster by
creating a special "ReverseSortingCollector" for this purpose.
I am aware the posting list is designed to be always iterated in the same
order, so it is not about early-terminating the search but more about
filtering-out unneeded documents more efficiently.
If a segment is sorted in reverse order, we just have to delegate collection of
the last matched documents.
Here is a sample quick code:
{code:title=ReverseSortingCollector.java|borderStyle=solid}
public class ReverseSortingCollector extends FilterCollector {
/** Sort used to sort the search results */
protected final Sort sort;
/** Number of documents to collect in each segment */
protected final int numDocsToCollect;
[...]
private List<FlushData> flushList = new ArrayList<>();
private static final class FlushData {
// ring buffer
int[] buffer;
// index of the first element in the buffer
int index;
LeafCollector leafCollector;
FlushData(int[] buffer, LeafCollector leafCollector) {
super();
this.buffer = buffer;
this.leafCollector = leafCollector;
}
}
@Override
public LeafCollector getLeafCollector(LeafReaderContext context) throws
IOException {
//flush previous data if any
flush();
LeafReader reader = context.reader();
Sort segmentSort = reader.getIndexSort();
if (isReverseOrder(sort, segmentSort)) {//segment is sorted in reverse
order than the search sort
int[] buffer = new int[numDocsToCollect];
Arrays.fill(buffer, -1);
FlushData flushData = new FlushData(buffer,
in.getLeafCollector(context));
flushList.add(flushData);
return new LeafCollector() {
@Override
public void setScorer(Scorer scorer) throws IOException {
}
@Override
public void collect(int doc) throws IOException {
//we remember the last `numDocsToCollect`
documents that matched
buffer[flushData.index % buffer.length] = doc;
flushData.index++;
}
};
} else {
return in.getLeafCollector(context);
}
}
//flush the last `numDocsToCollect` collected documents do the
delegated Collector
public void flush() throws IOException {
for (FlushData flushData : flushList) {
for (int i = 0; i < flushData.buffer.length; i++) {
int doc = flushData.buffer[(flushData.index + i) %
flushData.buffer.length];
if (doc != -1) {
flushData.leafCollector.collect(doc);
}
}
}
flushList.clear();
}
}
{code}
This is specially efficient when used along with TopFieldCollector as a lot of
docValue lookup would not take place.
In my experiment it reduced search time up to 90%.
Note 1: Does not support paging.
Note 2: Current implementation probably not thread safe
was:
We are currently using Lucene here in my company for our main product.
Our search functionnality is quite basic and the results are always sorted
given a predefined field. The user is only able to choose the sort order
(Asc/Desc).
I am currently investigating using the index sort feature with
EarlyTerminationSortingCollector.
This is quite a shame searching on a sorted index in reverse order do not have
any optimization and was wondering if it would be possible to make it faster by
creating a special "ReverseSortingCollector" for this purpose.
I am aware the posting list is designed to be always iterated in the same
order, so it is not about early-terminating the search but more about
filtering-out unneeded documents more efficiently.
If a segment is sorted in reverse order, we just have to delegate collection of
the last matched documents.
Here is a sample quick code:
{code:title=ReverseSortingCollector.java|borderStyle=solid}
public class ReverseSortingCollector extends FilterCollector {
/** Sort used to sort the search results */
protected final Sort sort;
/** Number of documents to collect in each segment */
protected final int numDocsToCollect;
[...]
private List<FlushData> flushList = new ArrayList<>();
private static final class FlushData {
// ring buffer
int[] buffer;
// index of the first element in the buffer
int index;
LeafCollector leafCollector;
FlushData(int[] buffer, LeafCollector leafCollector) {
super();
this.buffer = buffer;
this.leafCollector = leafCollector;
}
}
@Override
public LeafCollector getLeafCollector(LeafReaderContext context) throws
IOException {
//flush previous data if any
flush();
LeafReader reader = context.reader();
Sort segmentSort = reader.getIndexSort();
if (isReverseOrder(sort, segmentSort)) {//segment is sorted in reverse
order than the search sort
int[] buffer = new int[numDocsToCollect];
Arrays.fill(buffer, -1);
FlushData flushData = new FlushData(buffer,
in.getLeafCollector(context));
flushList.add(flushData);
return new LeafCollector() {
@Override
public void setScorer(Scorer scorer) throws IOException {
}
@Override
public void collect(int doc) throws IOException {
//we remember the last `numDocsToCollect`
documents that matched
buffer[flushData.index % buffer.length] = doc;
flushData.index++;
}
};
}else{
return in.getLeafCollector(context);
}
}
//flush the last `numDocsToCollect` collected documents do the
delegated Collector
public void flush() throws IOException {
for (FlushData flushData : flushList) {
for (int i = 0; i < flushData.buffer.length; i++) {
int doc = flushData.buffer[(flushData.index + i) %
flushData.buffer.length];
if (doc != -1) {
flushData.leafCollector.collect(doc);
}
}
}
flushList.clear();
}
}
{code}
This is specially efficient when used along with TopFieldCollector as a lot of
docValue lookup would not take place.
In my experiment it reduced search time up to 90%.
Note 1: Does not support paging.
Note 2: Current implementation probably not thread safe
> Faster sorted index search for reverse order search
> ---------------------------------------------------
>
> Key: LUCENE-7482
> URL: https://issues.apache.org/jira/browse/LUCENE-7482
> Project: Lucene - Core
> Issue Type: New Feature
> Reporter: AMIRAULT Martin
> Priority: Minor
>
> We are currently using Lucene here in my company for our main product.
> Our search functionnality is quite basic and the results are always sorted
> given a predefined field. The user is only able to choose the sort order
> (Asc/Desc).
> I am currently investigating using the index sort feature with
> EarlyTerminationSortingCollector.
> This is quite a shame searching on a sorted index in reverse order do not
> have any optimization and was wondering if it would be possible to make it
> faster by creating a special "ReverseSortingCollector" for this purpose.
> I am aware the posting list is designed to be always iterated in the same
> order, so it is not about early-terminating the search but more about
> filtering-out unneeded documents more efficiently.
> If a segment is sorted in reverse order, we just have to delegate collection
> of the last matched documents.
> Here is a sample quick code:
> {code:title=ReverseSortingCollector.java|borderStyle=solid}
> public class ReverseSortingCollector extends FilterCollector {
> /** Sort used to sort the search results */
> protected final Sort sort;
> /** Number of documents to collect in each segment */
> protected final int numDocsToCollect;
>
> [...]
> private List<FlushData> flushList = new ArrayList<>();
> private static final class FlushData {
> // ring buffer
> int[] buffer;
>
> // index of the first element in the buffer
> int index;
>
> LeafCollector leafCollector;
> FlushData(int[] buffer, LeafCollector leafCollector) {
> super();
> this.buffer = buffer;
> this.leafCollector = leafCollector;
> }
> }
> @Override
> public LeafCollector getLeafCollector(LeafReaderContext context) throws
> IOException {
>
> //flush previous data if any
> flush();
>
> LeafReader reader = context.reader();
> Sort segmentSort = reader.getIndexSort();
> if (isReverseOrder(sort, segmentSort)) {//segment is sorted in
> reverse order than the search sort
> int[] buffer = new int[numDocsToCollect];
> Arrays.fill(buffer, -1);
> FlushData flushData = new FlushData(buffer,
> in.getLeafCollector(context));
> flushList.add(flushData);
> return new LeafCollector() {
> @Override
> public void setScorer(Scorer scorer) throws IOException {
> }
>
> @Override
> public void collect(int doc) throws IOException {
> //we remember the last `numDocsToCollect`
> documents that matched
> buffer[flushData.index % buffer.length] = doc;
> flushData.index++;
> }
> };
> } else {
> return in.getLeafCollector(context);
> }
> }
> //flush the last `numDocsToCollect` collected documents do the
> delegated Collector
> public void flush() throws IOException {
> for (FlushData flushData : flushList) {
> for (int i = 0; i < flushData.buffer.length; i++) {
> int doc = flushData.buffer[(flushData.index + i) %
> flushData.buffer.length];
> if (doc != -1) {
> flushData.leafCollector.collect(doc);
> }
> }
> }
> flushList.clear();
> }
>
> }
> {code}
> This is specially efficient when used along with TopFieldCollector as a lot
> of docValue lookup would not take place.
> In my experiment it reduced search time up to 90%.
> Note 1: Does not support paging.
> Note 2: Current implementation probably not thread safe
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]