[GitHub] [lucene-solr] dweiss commented on a change in pull request #1721: LUCENE-9439: match region highlighter components

GitBox Thu, 06 Aug 2020 04:44:04 -0700


dweiss commented on a change in pull request #1721:
URL: https://github.com/apache/lucene-solr/pull/1721#discussion_r466353337




##########
File path: lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java
##########
@@ -0,0 +1,503 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.matchhighlight;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Matches;
+import org.apache.lucene.search.MatchesIterator;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryVisitor;
+import org.apache.lucene.search.ScoreMode;
+import org.apache.lucene.search.Weight;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.PrimitiveIterator;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.function.Predicate;
+
+/**
+ * Utility class to compute a list of "hit regions" for a given query, searcher and
+ * document(s) using {@link Matches} API.
+ */
+public class MatchRegionRetriever {
+  private final List<LeafReaderContext> leaves;
+  private final Weight weight;
+  private final TreeSet<String> affectedFields;
+  private final Map<String, OffsetsFromMatchesStrategy> offsetStrategies;
+  private final Set<String> preloadFields;
+
+  public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer 
analyzer)
+      throws IOException {
+    leaves = searcher.getIndexReader().leaves();
+    assert checkOrderConsistency(leaves);
+
+    weight = searcher.createWeight(query, ScoreMode.COMPLETE_NO_SCORES, 0);
+
+    // Compute the subset of fields affected by this query so that we don't load or scan
+    // fields that are irrelevant.
+    affectedFields = new TreeSet<>();
+    query.visit(
+        new QueryVisitor() {
+          @Override
+          public boolean acceptField(String field) {
+            affectedFields.add(field);
+            return false;
+          }
+        });
+
+    // Compute value offset retrieval strategy for all affected fields.
+    offsetStrategies =
+        computeOffsetStrategies(affectedFields, searcher.getIndexReader(), 
analyzer);
+
+    // Ask offset strategies if they'll need field values.
+    preloadFields = new HashSet<>();
+    offsetStrategies.forEach(
+        (field, strategy) -> {
+          if (strategy.requiresDocument()) {
+            preloadFields.add(field);
+          }
+        });
+
+    // Only preload those field values that can be affected by the query and are required
+    // by strategies.
+    preloadFields.retainAll(affectedFields);
+  }
+
+  public void highlightDocuments(PrimitiveIterator.OfInt docIds, 
HitRegionConsumer consumer)
+      throws IOException {
+    if (leaves.isEmpty() || affectedFields.isEmpty()) {
+      return;
+    }
+
+    Iterator<LeafReaderContext> ctx = leaves.iterator();
+    LeafReaderContext currentContext = ctx.next();
+    int previousDocId = -1;
+    Map<String, List<OffsetRange>> highlights = new TreeMap<>();
+    while (docIds.hasNext()) {
+      int docId = docIds.nextInt();
+
+      if (docId < previousDocId) {
+        throw new RuntimeException("Input document IDs must be sorted 
(increasing).");
+      }
+      previousDocId = docId;
+
+      while (docId >= currentContext.docBase + 
currentContext.reader().maxDoc()) {
+        currentContext = ctx.next();
+      }
+
+      int contextRelativeDocId = docId - currentContext.docBase;
+
+      // Only preload fields we may potentially need.
+      FieldValueProvider documentSupplier;
+      if (preloadFields.isEmpty()) {
+        documentSupplier = null;
+      } else {
+        Document doc = currentContext.reader().document(contextRelativeDocId, 
preloadFields);
+        documentSupplier = new DocumentFieldValueProvider(doc);
+      }
+
+      highlightDocument(
+          currentContext, contextRelativeDocId, documentSupplier, highlights, 
(field) -> true);
+
+      consumer.accept(currentContext.reader(), contextRelativeDocId, 
highlights);
+      highlights.clear();
+    }
+  }
+
+  public void highlightDocument(
+      LeafReaderContext currentContext,
+      int contextDocId,
+      FieldValueProvider doc,
+      Map<String, List<OffsetRange>> highlights,
+      Predicate<String> acceptField)
+      throws IOException {
+    Matches matches = weight.matches(currentContext, contextDocId);
+    if (matches == null) {
+      return;
+    }
+
+    for (String field : affectedFields) {
+      if (acceptField.test(field)) {
+        MatchesIterator matchesIterator = matches.getMatches(field);
+        if (matchesIterator == null) {
+          // No matches on this field, even though the field was part of the query. This may be possible
+          // with complex queries that source non-text fields (have no "hit regions" in any textual
+          // representation).
+        } else {
+          OffsetsFromMatchesStrategy offsetStrategy = 
offsetStrategies.get(field);
+          if (offsetStrategy == null) {
+            throw new IOException(
+                "Non-empty matches but no offset retrieval strategy for field: 
" + field);
+          }
+          List<OffsetRange> ranges = offsetStrategy.get(matchesIterator, doc);
+          if (!ranges.isEmpty()) {
+            highlights.put(field, ranges);
+          }
+        }
+      }
+    }
+  }
+
+  private boolean checkOrderConsistency(List<LeafReaderContext> leaves) {
+    for (int i = 1; i < leaves.size(); i++) {
+      LeafReaderContext prev = leaves.get(i - 1);
+      LeafReaderContext next = leaves.get(i);
+      assert prev.docBase <= next.docBase;
+      assert prev.docBase + prev.reader().maxDoc() == next.docBase;
+    }
+    return true;
+  }
+
+  private static Map<String, OffsetsFromMatchesStrategy> 
computeOffsetStrategies(
+      Set<String> affectedFields, IndexReader reader, Analyzer analyzer) {
+    Map<String, OffsetsFromMatchesStrategy> offsetStrategies = new HashMap<>();
+    FieldInfos fieldInfos = FieldInfos.getMergedFieldInfos(reader);
+    for (String field : affectedFields) {
+      FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+
+      OffsetsFromMatchesStrategy offsetStrategy;
+      if (fieldInfo != null && fieldInfo.getIndexOptions() != null) {
+        switch (fieldInfo.getIndexOptions()) {
+          case DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS:
+            offsetStrategy = new OffsetsFromMatchIterator(field);
+            break;
+
+          case DOCS_AND_FREQS_AND_POSITIONS:
+            offsetStrategy = new OffsetsFromPositions(field, analyzer);
+            break;
+
+
+          case DOCS_AND_FREQS:
+            // offsetStrategy = new OffsetsFromTokens(field, analyzer);

Review comment:
       I'd leave it in: if something changes in the future, it will serve as an early warning.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

[GitHub] [lucene-solr] dweiss commented on a change in pull request #1721: LUCENE-9439: match region highlighter components

Reply via email to