[ https://issues.apache.org/jira/browse/LUCENE-7526?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15616778#comment-15616778 ]
ASF GitHub Bot commented on LUCENE-7526: ---------------------------------------- Github user Timothy055 commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/105#discussion_r85619786 --- Diff: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java --- @@ -65,58 +65,88 @@ public String getField() { */ public abstract List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException; - protected List<OffsetsEnum> createOffsetsEnums(LeafReader leafReader, int doc, TokenStream tokenStream) throws IOException { - List<OffsetsEnum> offsetsEnums = createOffsetsEnumsFromReader(leafReader, doc); - if (automata.length > 0) { - offsetsEnums.add(createOffsetsEnumFromTokenStream(doc, tokenStream)); + protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader leafReader, int doc) throws IOException { + final Terms termsIndex = leafReader.terms(field); + if (termsIndex == null) { + return Collections.emptyList(); } - return offsetsEnums; - } - protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader atomicReader, int doc) throws IOException { // For strict positions, get a Map of term to Spans: // note: ScriptPhraseHelper.NONE does the right thing for these method calls final Map<BytesRef, Spans> strictPhrasesTermToSpans = - strictPhrases.getTermToSpans(atomicReader, doc); + phraseHelper.getTermToSpans(leafReader, doc); // Usually simply wraps terms in a List; but if willRewrite() then can be expanded final List<BytesRef> sourceTerms = - strictPhrases.expandTermsIfRewrite(terms, strictPhrasesTermToSpans); + phraseHelper.expandTermsIfRewrite(terms, strictPhrasesTermToSpans); - final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + 1); + final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + automata.length); - Terms termsIndex = atomicReader == null || sourceTerms.isEmpty() ? 
null : atomicReader.terms(field); - if (termsIndex != null) { + // Handle sourceTerms: + if (!sourceTerms.isEmpty()) { TermsEnum termsEnum = termsIndex.iterator();//does not return null for (BytesRef term : sourceTerms) { - if (!termsEnum.seekExact(term)) { - continue; // term not found - } - PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS); - if (postingsEnum == null) { - // no offsets or positions available - throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); - } - if (doc != postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted - continue; + if (termsEnum.seekExact(term)) { + PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS); + + if (postingsEnum == null) { + // no offsets or positions available + throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); + } + + if (doc == postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted + postingsEnum = phraseHelper.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term)); + if (postingsEnum != null) { + offsetsEnums.add(new OffsetsEnum(term, postingsEnum)); + } + } } - postingsEnum = strictPhrases.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term)); - if (postingsEnum == null) { - continue;// completely filtered out + } + } + + // Handle automata + if (automata.length > 0) { + offsetsEnums.addAll(createAutomataOffsetsFromTerms(termsIndex, doc)); + } + + return offsetsEnums; + } + + protected List<OffsetsEnum> createAutomataOffsetsFromTerms(Terms termsIndex, int doc) throws IOException { + Map<CharacterRunAutomaton, List<PostingsEnum>> automataPostings = new IdentityHashMap<>(automata.length); --- End diff -- Pushed that for now to see what you think. 
> Improvements to UnifiedHighlighter OffsetStrategies > --------------------------------------------------- > > Key: LUCENE-7526 > URL: https://issues.apache.org/jira/browse/LUCENE-7526 > Project: Lucene - Core > Issue Type: Improvement > Components: modules/highlighter > Reporter: Timothy M. Rodriguez > Assignee: David Smiley > Priority: Minor > Fix For: 6.4 > > > This ticket improves several of the UnifiedHighlighter FieldOffsetStrategies > by reducing reliance on creating or re-creating TokenStreams. > The primary changes are as follows: > * AnalysisOffsetStrategy - split into two offset strategies > ** MemoryIndexOffsetStrategy - the primary analysis mode that utilizes a > MemoryIndex for producing Offsets > ** TokenStreamOffsetStrategy - an offset strategy that avoids creating a > MemoryIndex. Can only be used if the query distills down to terms and > automata. > * TokenStream removal > ** MemoryIndexOffsetStrategy - previously a TokenStream was created to fill > the memory index and then once consumed a new one was generated by > uninverting the MemoryIndex back into a TokenStream if there were automata > (wildcard/mtq queries) involved. Now this is avoided, which should save > memory and avoid a second pass over the data. > ** TermVectorOffsetStrategy - this was refactored in a similar way to avoid > generating a TokenStream if automata are involved. > ** PostingsWithTermVectorsOffsetStrategy - similar refactoring > * CompositePostingsEnum - aggregates several underlying PostingsEnums for > wildcard/mtq queries. 
This should improve relevancy by providing unified > metrics for a wildcard across all its term matches > * Added a HighlightFlag for enabling the newly separated > TokenStreamOffsetStrategy since it can adversely affect passage relevancy -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org