[GitHub] lucene-solr pull request #317: LUCENE-8145: OffsetsEnum is now unitary
Github user asfgit closed the pull request at: https://github.com/apache/lucene-solr/pull/317 --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #317: LUCENE-8145: OffsetsEnum is now unitary
Github user jimczi commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/317#discussion_r165576425 --- Diff: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java --- @@ -124,34 +123,122 @@ public PostingsEnum getPostingsEnum() { @Override public boolean nextPosition() throws IOException { - if (posCounter < postingsEnum.freq()) { -posCounter++; + if (posCounter > 0) { +posCounter--; postingsEnum.nextPosition(); // note: we don't need to save the position return true; } else { return false; } } +@Override +public BytesRef getTerm() throws IOException { + return term; +} + +@Override +public int startOffset() throws IOException { + return postingsEnum.startOffset(); +} + +@Override +public int endOffset() throws IOException { + return postingsEnum.endOffset(); +} + @Override public int freq() throws IOException { - return postingsEnum.freq(); + return freq; +} + } + + public static final OffsetsEnum EMPTY = new OffsetsEnum() { +@Override +public boolean nextPosition() throws IOException { + return false; } @Override public BytesRef getTerm() throws IOException { - return term; + throw new UnsupportedOperationException(); } @Override public int startOffset() throws IOException { - return postingsEnum.startOffset(); + throw new UnsupportedOperationException(); } @Override public int endOffset() throws IOException { - return postingsEnum.endOffset(); + throw new UnsupportedOperationException(); +} + +@Override +public int freq() throws IOException { + return 0; +} + + }; + + public static class MultiOffsetsEnum extends OffsetsEnum { + +private final PriorityQueue queue; +private boolean started = false; + +public MultiOffsetsEnum(List inner) throws IOException { + this.queue = new PriorityQueue<>(); + for (OffsetsEnum oe : inner) { +if (oe.nextPosition()) + this.queue.add(oe); + } +} + +@Override +public boolean nextPosition() throws IOException { + if (started == false) { +started = true; +return this.queue.size() > 0; + } + if (this.queue.size() > 0) { +OffsetsEnum top = this.queue.poll(); +if (top.nextPosition()) { + this.queue.add(top); + return true; +} +else { + top.close(); +} +return this.queue.size() > 0; + } + return false; +} + +@Override +public BytesRef getTerm() throws IOException { + return this.queue.peek().getTerm(); } +@Override +public int startOffset() throws IOException { + return this.queue.peek().startOffset(); +} + +@Override +public int endOffset() throws IOException { + return this.queue.peek().endOffset(); +} + +@Override +public int freq() throws IOException { + return this.queue.peek().freq(); +} + +@Override +public void close() throws IOException { + // most child enums will have been closed in .nextPosition() + // here all remaining non-exhausted enums are closed + IOUtils.close(queue); +} --- End diff -- ++ --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #317: LUCENE-8145: OffsetsEnum is now unitary
Github user romseygeek commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/317#discussion_r165431325 --- Diff: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java --- @@ -124,34 +123,122 @@ public PostingsEnum getPostingsEnum() { @Override public boolean nextPosition() throws IOException { - if (posCounter < postingsEnum.freq()) { -posCounter++; + if (posCounter > 0) { +posCounter--; postingsEnum.nextPosition(); // note: we don't need to save the position return true; } else { return false; } } +@Override +public BytesRef getTerm() throws IOException { + return term; +} + +@Override +public int startOffset() throws IOException { + return postingsEnum.startOffset(); +} + +@Override +public int endOffset() throws IOException { + return postingsEnum.endOffset(); +} + @Override public int freq() throws IOException { - return postingsEnum.freq(); + return freq; +} + } + + public static final OffsetsEnum EMPTY = new OffsetsEnum() { +@Override +public boolean nextPosition() throws IOException { + return false; } @Override public BytesRef getTerm() throws IOException { - return term; + throw new UnsupportedOperationException(); } @Override public int startOffset() throws IOException { - return postingsEnum.startOffset(); + throw new UnsupportedOperationException(); } @Override public int endOffset() throws IOException { - return postingsEnum.endOffset(); + throw new UnsupportedOperationException(); +} + +@Override +public int freq() throws IOException { + return 0; +} + + }; + + public static class MultiOffsetsEnum extends OffsetsEnum { + +private final PriorityQueue queue; +private boolean started = false; + +public MultiOffsetsEnum(List inner) throws IOException { + this.queue = new PriorityQueue<>(); + for (OffsetsEnum oe : inner) { +if (oe.nextPosition()) + this.queue.add(oe); + } +} + +@Override +public boolean nextPosition() throws IOException { + if (started == false) { +started = true; +return this.queue.size() > 0; + } + if (this.queue.size() > 0) { +OffsetsEnum top = this.queue.poll(); +if (top.nextPosition()) { + this.queue.add(top); + return true; +} +else { + top.close(); +} +return this.queue.size() > 0; + } + return false; +} + +@Override +public BytesRef getTerm() throws IOException { + return this.queue.peek().getTerm(); } +@Override +public int startOffset() throws IOException { + return this.queue.peek().startOffset(); +} + +@Override +public int endOffset() throws IOException { + return this.queue.peek().endOffset(); +} + +@Override +public int freq() throws IOException { + return this.queue.peek().freq(); +} + +@Override +public void close() throws IOException { + // most child enums will have been closed in .nextPosition() + // here all remaining non-exhausted enums are closed + IOUtils.close(queue); +} --- End diff -- I can see this would be useful, but let's wait until OffsetsEnum is exposed to the outside world? It's still all internal to UnifiedHighlighter at the moment. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #317: LUCENE-8145: OffsetsEnum is now unitary
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/317#discussion_r165375993 --- Diff: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java --- @@ -60,42 +65,35 @@ public void addMatch(int startOffset, int endOffset, BytesRef term) { matchEnds[numMatches] = endOffset; matchTerms[numMatches] = term; numMatches++; - } - - /** @lucene.internal */ - public void sort() { -final int starts[] = matchStarts; -final int ends[] = matchEnds; -final BytesRef terms[] = matchTerms; -new InPlaceMergeSorter() { - @Override - protected void swap(int i, int j) { -int temp = starts[i]; -starts[i] = starts[j]; -starts[j] = temp; - -temp = ends[i]; -ends[i] = ends[j]; -ends[j] = temp; - -BytesRef tempTerm = terms[i]; -terms[i] = terms[j]; -terms[j] = tempTerm; - } - - @Override - protected int compare(int i, int j) { -return Integer.compare(starts[i], starts[j]); - } - -}.sort(0, numMatches); +int termIndex = termsHash.add(term); --- End diff -- Perhaps we could push as much of this as possible to `setScore(scorer,contentLength)`? So Imagine the only extra field we add to Passage here is termFreqsInDoc[] that is sized alone with the other arrays. No BytesRefHash, no termFreqsInPassage). Instead, in setScore, that's where it computes the distinct set and termFreqsInPassage, only for the score computation within the scope of that method. It keeps the score related stuff as isolated as possible. And IMO as I stated in JIRA, we could add a method to PassageScorer to do all that stuff there. So a simple PassageScorer that only cared about first-occurrence would simply use the inverse of startOffset. Or we could leave that PassageScorer modification to a separate issue if you prefer. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #317: LUCENE-8145: OffsetsEnum is now unitary
Github user jimczi commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/317#discussion_r165372379 --- Diff: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java --- @@ -124,34 +123,122 @@ public PostingsEnum getPostingsEnum() { @Override public boolean nextPosition() throws IOException { - if (posCounter < postingsEnum.freq()) { -posCounter++; + if (posCounter > 0) { +posCounter--; postingsEnum.nextPosition(); // note: we don't need to save the position return true; } else { return false; } } +@Override +public BytesRef getTerm() throws IOException { + return term; +} + +@Override +public int startOffset() throws IOException { + return postingsEnum.startOffset(); +} + +@Override +public int endOffset() throws IOException { + return postingsEnum.endOffset(); +} + @Override public int freq() throws IOException { - return postingsEnum.freq(); + return freq; +} + } + + public static final OffsetsEnum EMPTY = new OffsetsEnum() { +@Override +public boolean nextPosition() throws IOException { + return false; } @Override public BytesRef getTerm() throws IOException { - return term; + throw new UnsupportedOperationException(); } @Override public int startOffset() throws IOException { - return postingsEnum.startOffset(); + throw new UnsupportedOperationException(); } @Override public int endOffset() throws IOException { - return postingsEnum.endOffset(); + throw new UnsupportedOperationException(); +} + +@Override +public int freq() throws IOException { + return 0; +} + + }; + + public static class MultiOffsetsEnum extends OffsetsEnum { + +private final PriorityQueue queue; +private boolean started = false; + +public MultiOffsetsEnum(List inner) throws IOException { + this.queue = new PriorityQueue<>(); + for (OffsetsEnum oe : inner) { +if (oe.nextPosition()) + this.queue.add(oe); + } +} + +@Override +public boolean nextPosition() throws IOException { + if (started == false) { +started = true; +return this.queue.size() > 0; + } + if (this.queue.size() > 0) { +OffsetsEnum top = this.queue.poll(); +if (top.nextPosition()) { + this.queue.add(top); + return true; +} +else { + top.close(); +} +return this.queue.size() > 0; + } + return false; +} + +@Override +public BytesRef getTerm() throws IOException { + return this.queue.peek().getTerm(); } +@Override +public int startOffset() throws IOException { + return this.queue.peek().startOffset(); +} + +@Override +public int endOffset() throws IOException { + return this.queue.peek().endOffset(); +} + +@Override +public int freq() throws IOException { + return this.queue.peek().freq(); +} + +@Override +public void close() throws IOException { + // most child enums will have been closed in .nextPosition() + // here all remaining non-exhausted enums are closed + IOUtils.close(queue); +} --- End diff -- Would be great to have a way to mark `OffsetsEnum` with an id so that we know which enum matches when `MultiOffsetsEnum` is used. Maybe a `getId` in `OffsetsEnum` is enough ? This way we could link multi term queries with the same id and use this information for scoring. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #317: LUCENE-8145: OffsetsEnum is now unitary
Github user jimczi commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/317#discussion_r165370877 --- Diff: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java --- @@ -158,121 +143,58 @@ public Object highlightFieldForDoc(IndexReader reader, int docId, String content }); Passage passage = new Passage(); // the current passage in-progress. Will either get reset or added to queue. -OffsetsEnum off; -while ((off = offsetsEnumQueue.poll()) != null) { +do { int start = off.startOffset(); if (start == -1) { throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); } int end = off.endOffset(); - // LUCENE-5166: this hit would span the content limit... however more valid - // hits may exist (they are sorted by start). so we pretend like we never - // saw this term, it won't cause a passage to be added to passageQueue or anything. - assert EMPTY.startOffset() == Integer.MAX_VALUE; if (start < contentLength && end > contentLength) { continue; } // See if this term should be part of a new passage. if (start >= passage.getEndOffset()) { -if (passage.getStartOffset() >= 0) { // true if this passage has terms; otherwise couldn't find any (yet) - // finalize passage - passage.setScore(passage.getScore() * scorer.norm(passage.getStartOffset())); - // new sentence: first add 'passage' to queue - if (passageQueue.size() == maxPassages && passage.getScore() < passageQueue.peek().getScore()) { -passage.reset(); // can't compete, just reset it - } else { -passageQueue.offer(passage); -if (passageQueue.size() > maxPassages) { - passage = passageQueue.poll(); - passage.reset(); -} else { - passage = new Passage(); -} - } -} +passage = maybeAddPassage(passageQueue, passageScorer, passage, contentLength); // if we exceed limit, we are done if (start >= contentLength) { break; } // advance breakIterator -passage.setStartOffset(Math.max(breakIterator.preceding(start + 1), 0)); -passage.setEndOffset(Math.min(breakIterator.following(start), contentLength)); +passage.setStartOffset(Math.max(this.breakIterator.preceding(start + 1), 0)); +passage.setEndOffset(Math.min(this.breakIterator.following(start), contentLength)); } // Add this term to the passage. - int tf = 0; - while (true) { -tf++; -BytesRef term = off.getTerm();// a reference; safe to refer to -assert term != null; -passage.addMatch(start, end, term); -// see if there are multiple occurrences of this term in this passage. If so, add them. -if (!off.nextPosition()) { - break; // No more in the entire text. Already removed from pq; move on -} -start = off.startOffset(); -end = off.endOffset(); -if (start >= passage.getEndOffset() || end > contentLength) { // it's beyond this passage - offsetsEnumQueue.offer(off); - break; -} - } - passage.setScore(passage.getScore() + off.getWeight() * scorer.tf(tf, passage.getEndOffset() - passage.getStartOffset())); -} + BytesRef term = off.getTerm();// a reference; safe to refer to + assert term != null; + passage.addMatch(start, end, term, off.freq()); +} while (off.nextPosition()); +maybeAddPassage(passageQueue, passageScorer, passage, contentLength); Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]); -for (Passage p : passages) { - p.sort(); -} // sort in ascending order Arrays.sort(passages, Comparator.comparingInt(Passage::getStartOffset)); return passages; } - protected static final PostingsEnum EMPTY = new PostingsEnum() { - -@Override -public int nextPosition() throws IOException { - return 0; -} - -@Override -public int startOffset() throws IOException { - return Integer.MAX_VALUE; -} - -@Override -public int endOffset() throws IOException { - return Integer.MAX_VALUE; -} - -@Override -public BytesRef getPayload() throws IOException { - return null; -} - -@Override -public int freq()
[GitHub] lucene-solr pull request #317: LUCENE-8145: OffsetsEnum is now unitary
Github user dsmiley commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/317#discussion_r165372488 --- Diff: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java --- @@ -193,8 +203,4 @@ public void setEndOffset(int endOffset) { this.endOffset = endOffset; } - /** @lucene.internal */ - public void setScore(float score) { --- End diff -- We should probably keep setScore(float) for users who want to build scores their own way. --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org
[GitHub] lucene-solr pull request #317: LUCENE-8145: OffsetsEnum is now unitary
GitHub user romseygeek opened a pull request: https://github.com/apache/lucene-solr/pull/317 LUCENE-8145: OffsetsEnum is now unitary You can merge this pull request into a Git repository by running: $ git pull https://github.com/romseygeek/lucene-solr offsetsenum Alternatively you can review and apply these changes as the patch at: https://github.com/apache/lucene-solr/pull/317.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #317 commit 612199ef545f27fe8d934d8ce73afb48f9995f9a Author: Alan WoodwardDate: 2018-01-30T17:50:53Z FieldOffsetStragety.getOffsetEnum() now returns a single MultiOffsetsEnum commit 134cae3478f1c48ed72ebc606f1e58ca27503f16 Author: Alan Woodward Date: 2018-02-01T14:00:47Z Address review comments --- - To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org