[GitHub] lucene-solr pull request #317: LUCENE-8145: OffsetsEnum is now unitary

jimczi Thu, 01 Feb 2018 06:39:43 -0800

Github user jimczi commented on a diff in the pull request:

    https://github.com/apache/lucene-solr/pull/317#discussion_r165370877
  
    --- Diff: 
lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java
 ---
    @@ -158,121 +143,58 @@ public Object highlightFieldForDoc(IndexReader 
reader, int docId, String content
         });
         Passage passage = new Passage(); // the current passage in-progress.  
Will either get reset or added to queue.
     
    -    OffsetsEnum off;
    -    while ((off = offsetsEnumQueue.poll()) != null) {
    +    do {
           int start = off.startOffset();
           if (start == -1) {
             throw new IllegalArgumentException("field '" + field + "' was 
indexed without offsets, cannot highlight");
           }
           int end = off.endOffset();
    -      // LUCENE-5166: this hit would span the content limit... however 
more valid
    -      // hits may exist (they are sorted by start). so we pretend like we 
never
    -      // saw this term, it won't cause a passage to be added to 
passageQueue or anything.
    -      assert EMPTY.startOffset() == Integer.MAX_VALUE;
           if (start < contentLength && end > contentLength) {
             continue;
           }
           // See if this term should be part of a new passage.
           if (start >= passage.getEndOffset()) {
    -        if (passage.getStartOffset() >= 0) { // true if this passage has 
terms; otherwise couldn't find any (yet)
    -          // finalize passage
    -          passage.setScore(passage.getScore() * 
scorer.norm(passage.getStartOffset()));
    -          // new sentence: first add 'passage' to queue
    -          if (passageQueue.size() == maxPassages && passage.getScore() < 
passageQueue.peek().getScore()) {
    -            passage.reset(); // can't compete, just reset it
    -          } else {
    -            passageQueue.offer(passage);
    -            if (passageQueue.size() > maxPassages) {
    -              passage = passageQueue.poll();
    -              passage.reset();
    -            } else {
    -              passage = new Passage();
    -            }
    -          }
    -        }
    +        passage = maybeAddPassage(passageQueue, passageScorer, passage, 
contentLength);
             // if we exceed limit, we are done
             if (start >= contentLength) {
               break;
             }
             // advance breakIterator
    -        passage.setStartOffset(Math.max(breakIterator.preceding(start + 
1), 0));
    -        passage.setEndOffset(Math.min(breakIterator.following(start), 
contentLength));
    +        passage.setStartOffset(Math.max(this.breakIterator.preceding(start 
+ 1), 0));
    +        passage.setEndOffset(Math.min(this.breakIterator.following(start), 
contentLength));
           }
           // Add this term to the passage.
    -      int tf = 0;
    -      while (true) {
    -        tf++;
    -        BytesRef term = off.getTerm();// a reference; safe to refer to
    -        assert term != null;
    -        passage.addMatch(start, end, term);
    -        // see if there are multiple occurrences of this term in this 
passage. If so, add them.
    -        if (!off.nextPosition()) {
    -          break; // No more in the entire text. Already removed from pq; 
move on
    -        }
    -        start = off.startOffset();
    -        end = off.endOffset();
    -        if (start >= passage.getEndOffset() || end > contentLength) { // 
it's beyond this passage
    -          offsetsEnumQueue.offer(off);
    -          break;
    -        }
    -      }
    -      passage.setScore(passage.getScore() + off.getWeight() * 
scorer.tf(tf, passage.getEndOffset() - passage.getStartOffset()));
    -    }
    +      BytesRef term = off.getTerm();// a reference; safe to refer to
    +      assert term != null;
    +      passage.addMatch(start, end, term, off.freq());
    +    } while (off.nextPosition());
    +    maybeAddPassage(passageQueue, passageScorer, passage, contentLength);
     
         Passage[] passages = passageQueue.toArray(new 
Passage[passageQueue.size()]);
    -    for (Passage p : passages) {
    -      p.sort();
    -    }
         // sort in ascending order
         Arrays.sort(passages, 
Comparator.comparingInt(Passage::getStartOffset));
         return passages;
       }
     
    -  protected static final PostingsEnum EMPTY = new PostingsEnum() {
    -
    -    @Override
    -    public int nextPosition() throws IOException {
    -      return 0;
    -    }
    -
    -    @Override
    -    public int startOffset() throws IOException {
    -      return Integer.MAX_VALUE;
    -    }
    -
    -    @Override
    -    public int endOffset() throws IOException {
    -      return Integer.MAX_VALUE;
    -    }
    -
    -    @Override
    -    public BytesRef getPayload() throws IOException {
    -      return null;
    -    }
    -
    -    @Override
    -    public int freq() throws IOException {
    -      return 0;
    -    }
    -
    -    @Override
    -    public int docID() {
    -      return NO_MORE_DOCS;
    +  private Passage maybeAddPassage(PriorityQueue<Passage> passageQueue, 
PassageScorer scorer, Passage passage, int contentLength) {
    +    if (passage.getStartOffset() == -1) {
    +      // empty passage, we can ignore it
    +      return passage;
         }
    -
    -    @Override
    -    public int nextDoc() throws IOException {
    -      return NO_MORE_DOCS;
    -    }
    -
    -    @Override
    -    public int advance(int target) throws IOException {
    -      return NO_MORE_DOCS;
    +    passage.setScore(scorer, contentLength);
    --- End diff --
    
    We could maybe do the opposite and pass the passage to the scorer with an 
API for the PassageScorer like:
    
    ```
    float score(Passage passage);
    ```
    We would still need to have the total term freq encoded in the passage but 
the passage frequency and the BM25 computation could be done entirely in the 
scorer. This would also allow more flexibility to score a passage since the 
scoring could be done in one place (instead of on a term basis like we do 
today).



---

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org
For additional commands, e-mail: dev-h...@lucene.apache.org

[GitHub] lucene-solr pull request #317: LUCENE-8145: OffsetsEnum is now unitary

Reply via email to