dweiss commented on a change in pull request #1721:
URL: https://github.com/apache/lucene-solr/pull/1721#discussion_r466352634



##########
File path: lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java
##########
@@ -0,0 +1,503 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.matchhighlight;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Matches;
+import org.apache.lucene.search.MatchesIterator;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryVisitor;
+import org.apache.lucene.search.ScoreMode;
+import org.apache.lucene.search.Weight;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.PrimitiveIterator;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.function.Predicate;
+
+/**
+ * Utility class to compute a list of "hit regions" for a given query, searcher and
+ * document(s) using {@link Matches} API.
+ */
+public class MatchRegionRetriever {
+  private final List<LeafReaderContext> leaves;
+  private final Weight weight;
+  private final TreeSet<String> affectedFields;
+  private final Map<String, OffsetsFromMatchesStrategy> offsetStrategies;
+  private final Set<String> preloadFields;
+
+  public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer)
+      throws IOException {
+    leaves = searcher.getIndexReader().leaves();
+    assert checkOrderConsistency(leaves);
+
+    weight = searcher.createWeight(query, ScoreMode.COMPLETE_NO_SCORES, 0);
+
+    // Compute the subset of fields affected by this query so that we don't load or scan
+    // fields that are irrelevant.
+    affectedFields = new TreeSet<>();
+    query.visit(
+        new QueryVisitor() {
+          @Override
+          public boolean acceptField(String field) {
+            affectedFields.add(field);
+            return false;
+          }
+        });
+
+    // Compute value offset retrieval strategy for all affected fields.
+    offsetStrategies =
+        computeOffsetStrategies(affectedFields, searcher.getIndexReader(), analyzer);
+
+    // Ask offset strategies if they'll need field values.
+    preloadFields = new HashSet<>();
+    offsetStrategies.forEach(
+        (field, strategy) -> {
+          if (strategy.requiresDocument()) {
+            preloadFields.add(field);
+          }
+        });
+
+    // Only preload those field values that can be affected by the query and are required
+    // by strategies.
+    preloadFields.retainAll(affectedFields);
+  }
+
+  public void highlightDocuments(PrimitiveIterator.OfInt docIds, HitRegionConsumer consumer)
+      throws IOException {
+    if (leaves.isEmpty() || affectedFields.isEmpty()) {
+      return;
+    }
+
+    Iterator<LeafReaderContext> ctx = leaves.iterator();
+    LeafReaderContext currentContext = ctx.next();
+    int previousDocId = -1;
+    Map<String, List<OffsetRange>> highlights = new TreeMap<>();
+    while (docIds.hasNext()) {
+      int docId = docIds.nextInt();
+
+      if (docId < previousDocId) {
+        throw new RuntimeException("Input document IDs must be sorted (increasing).");
+      }
+      previousDocId = docId;
+
+      while (docId >= currentContext.docBase + currentContext.reader().maxDoc()) {
+        currentContext = ctx.next();
+      }
+
+      int contextRelativeDocId = docId - currentContext.docBase;
+
+      // Only preload fields we may potentially need.
+      FieldValueProvider documentSupplier;
+      if (preloadFields.isEmpty()) {
+        documentSupplier = null;
+      } else {
+        Document doc = currentContext.reader().document(contextRelativeDocId, preloadFields);
+        documentSupplier = new DocumentFieldValueProvider(doc);
+      }
+
+      highlightDocument(
+          currentContext, contextRelativeDocId, documentSupplier, highlights, (field) -> true);
+
+      consumer.accept(currentContext.reader(), contextRelativeDocId, highlights);
+      highlights.clear();
+    }
+  }
+
+  public void highlightDocument(
+      LeafReaderContext currentContext,
+      int contextDocId,
+      FieldValueProvider doc,
+      Map<String, List<OffsetRange>> highlights,
+      Predicate<String> acceptField)
+      throws IOException {
+    Matches matches = weight.matches(currentContext, contextDocId);
+    if (matches == null) {
+      return;
+    }
+
+    for (String field : affectedFields) {
+      if (acceptField.test(field)) {
+        MatchesIterator matchesIterator = matches.getMatches(field);
+        if (matchesIterator == null) {
+          // No matches on this field, even though the field was part of the query. This may be possible
+          // with complex queries that source non-text fields (have no "hit regions" in any textual
+          // representation).
+        } else {
+          OffsetsFromMatchesStrategy offsetStrategy = offsetStrategies.get(field);
+          if (offsetStrategy == null) {
+            throw new IOException(
+                "Non-empty matches but no offset retrieval strategy for field: 
" + field);
+          }
+          List<OffsetRange> ranges = offsetStrategy.get(matchesIterator, doc);
+          if (!ranges.isEmpty()) {
+            highlights.put(field, ranges);
+          }
+        }
+      }
+    }
+  }
+
+  private boolean checkOrderConsistency(List<LeafReaderContext> leaves) {
+    for (int i = 1; i < leaves.size(); i++) {
+      LeafReaderContext prev = leaves.get(i - 1);
+      LeafReaderContext next = leaves.get(i);
+      assert prev.docBase <= next.docBase;
+      assert prev.docBase + prev.reader().maxDoc() == next.docBase;
+    }
+    return true;
+  }
+
+  private static Map<String, OffsetsFromMatchesStrategy> computeOffsetStrategies(
+      Set<String> affectedFields, IndexReader reader, Analyzer analyzer) {
+    Map<String, OffsetsFromMatchesStrategy> offsetStrategies = new HashMap<>();
+    FieldInfos fieldInfos = FieldInfos.getMergedFieldInfos(reader);
+    for (String field : affectedFields) {
+      FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+
+      OffsetsFromMatchesStrategy offsetStrategy;
+      if (fieldInfo != null && fieldInfo.getIndexOptions() != null) {
+        switch (fieldInfo.getIndexOptions()) {
+          case DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS:
+            offsetStrategy = new OffsetsFromMatchIterator(field);
+            break;
+
+          case DOCS_AND_FREQS_AND_POSITIONS:
+            offsetStrategy = new OffsetsFromPositions(field, analyzer);
+            break;
+
+
+          case DOCS_AND_FREQS:
+            // offsetStrategy = new OffsetsFromTokens(field, analyzer);
+            offsetStrategy = new OffsetsFromValues(field, analyzer);
+            break;
+
+          default:
+            offsetStrategy =
+                (matchesIterator, doc) -> {
+                  throw new IOException(
+                      "Field is indexed without positions and/or offsets: "
+                          + field
+                          + ", "
+                          + fieldInfo.getIndexOptions());
+                };
+        }
+        offsetStrategies.put(field, offsetStrategy);
+      }
+    }
+    return offsetStrategies;
+  }
+
+  public interface HitRegionConsumer {
+    void accept(LeafReader leafReader, int leafDocId, Map<String, List<OffsetRange>> hits)
+        throws IOException;
+  }
+
+  /**
+   * An abstraction that provides document values for a given field. Default implementation
+   * in {@link DocumentFieldValueProvider} just reaches to a preloaded {@link Document}. It is
+   * possible to write a more efficient implementation on top of a reusable character buffer
+   * (that reuses the buffer while retrieving hit regions for documents).
+   */
+  public interface FieldValueProvider {
+    List<CharSequence> getValues(String field);
+  }
+
+  public static final class DocumentFieldValueProvider implements FieldValueProvider {
+    private final Document doc;
+
+    public DocumentFieldValueProvider(Document doc) {
+      this.doc = doc;
+    }
+
+    @Override
+    public List<CharSequence> getValues(String field) {
+      return Arrays.asList(doc.getValues(field));
+    }
+  }
+
+  /**
+   * Determines how match offset regions are computed from {@link MatchesIterator}. Several
+   * possibilities exist, ranging from retrieving offsets directly from a match instance
+   * to re-evaluating the document's field and recomputing offsets from there.
+   */
+  private interface OffsetsFromMatchesStrategy {
+    List<OffsetRange> get(MatchesIterator matchesIterator, FieldValueProvider doc)
+        throws IOException;
+
+    default boolean requiresDocument() {
+      return false;
+    }
+  }
+
+  /**
+   * This strategy retrieves offsets directly from {@link MatchesIterator}.
+   */
+  private static class OffsetsFromMatchIterator implements OffsetsFromMatchesStrategy {
+    private final String field;
+
+    OffsetsFromMatchIterator(String field) {
+      this.field = field;
+    }
+
+    @Override
+    public List<OffsetRange> get(MatchesIterator matchesIterator, FieldValueProvider doc)
+        throws IOException {
+      ArrayList<OffsetRange> ranges = new ArrayList<>();
+      while (matchesIterator.next()) {
+        int from = matchesIterator.startOffset();
+        int to = matchesIterator.endOffset();
+        if (from < 0 || to < 0) {
+          throw new IOException("Matches API returned negative offsets for 
field: " + field);
+        }
+        ranges.add(new OffsetRange(from, to));
+      }
+      return ranges;
+    }
+  }
+
+  /**
+   * This strategy works for fields where we know the match occurred but there are
+   * no known positions or offsets.
+   *
+   * We re-analyze field values and return offset ranges for entire values
+   * (not individual tokens). Re-analysis is required because analyzer may return
+   * an unknown offset gap.
+   */
+  private static class OffsetsFromValues implements OffsetsFromMatchesStrategy {
+    private final String field;
+    private final Analyzer analyzer;
+
+    public OffsetsFromValues(String field, Analyzer analyzer) {
+      this.field = field;
+      this.analyzer = analyzer;
+    }
+
+    @Override
+    public List<OffsetRange> get(MatchesIterator matchesIterator, FieldValueProvider doc) throws IOException {
+      List<CharSequence> values = doc.getValues(field);
+
+      ArrayList<OffsetRange> ranges = new ArrayList<>();
+      int valueOffset = 0;
+      for (CharSequence charSequence : values) {
+        final String value = charSequence.toString();
+
+        TokenStream ts = analyzer.tokenStream(field, value);
+        OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class);
+        ts.reset();
+        int startOffset = valueOffset;
+        while (ts.incrementToken()) {
+          // Go through all tokens to increment offset attribute properly.
+        }
+        ts.end();
+        valueOffset += offsetAttr.endOffset();
+        ranges.add(new OffsetRange(startOffset, valueOffset));
+        valueOffset += analyzer.getOffsetGap(field);
+        ts.close();
+      }
+      return ranges;
+    }
+
+    @Override
+    public boolean requiresDocument() {
+      return true;
+    }
+  }
+
+  /**
+   * This strategy works for fields where we know the match occurred but there are
+   * no known positions or offsets.
+   *
+   * We re-analyze field values and return offset ranges for all returned tokens.
+   */
+  private static class OffsetsFromTokens implements OffsetsFromMatchesStrategy {
+    private final String field;
+    private final Analyzer analyzer;
+
+    public OffsetsFromTokens(String field, Analyzer analyzer) {
+      this.field = field;
+      this.analyzer = analyzer;
+    }
+
+    @Override
+    public List<OffsetRange> get(MatchesIterator matchesIterator, FieldValueProvider doc) throws IOException {
+      List<CharSequence> values = doc.getValues(field);
+
+      ArrayList<OffsetRange> ranges = new ArrayList<>();
+      int valueOffset = 0;
+      for (int valueIndex = 0, max = values.size(); valueIndex < max; valueIndex++) {
+        final String value = values.get(valueIndex).toString();
+
+        TokenStream ts = analyzer.tokenStream(field, value);
+        OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class);
+        ts.reset();
+        while (ts.incrementToken()) {
+          int startOffset = valueOffset + offsetAttr.startOffset();
+          int endOffset = valueOffset + offsetAttr.endOffset();
+          ranges.add(new OffsetRange(startOffset, endOffset));

Review comment:
       I don't think this is needed, really. This is a corner case: the field is indexed
without offsets and positions, so if it triggers a hit, a "full field value" highlight is
all you can do. My reasoning is that it's easier to solve this higher up the design tree
(by enabling positions, for example) if you really have text fields. Most of the time,
though, these fields are single literals (possibly multivalued), so there is typically no
need for positions or offsets. I wouldn't want to go down the lane of the unified
highlighter, where there's logic for each and every possibility. Let's keep it simple here?
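
To make the "solve it higher up the design tree" option concrete, here is a minimal
index-time sketch (the class, field name and helper method are made up for illustration,
not part of this PR):

```java
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;

import java.io.IOException;

class OffsetsAtIndexTime {
  // Start from TextField's stored defaults and additionally record offsets in the
  // postings, so a highlighter can read them straight from the MatchesIterator.
  static final FieldType TEXT_WITH_OFFSETS = new FieldType(TextField.TYPE_STORED);
  static {
    TEXT_WITH_OFFSETS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    TEXT_WITH_OFFSETS.freeze();
  }

  // "writer" is assumed to be configured with the same Analyzer that is later
  // handed to MatchRegionRetriever.
  static void addDocument(IndexWriter writer, String text) throws IOException {
    Document doc = new Document();
    doc.add(new Field("body", text, TEXT_WITH_OFFSETS));
    writer.addDocument(doc);
  }
}
```

With the field indexed like this, computeOffsetStrategies picks OffsetsFromMatchIterator
and no re-analysis happens at highlight time.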
   




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org
