Author: otis
Date: Fri May 23 14:23:25 2008
New Revision: 659664
URL: http://svn.apache.org/viewvc?rev=659664&view=rev
Log:
SOLR-553 Use SpanScorer when highlighting phrase terms and
hl.usePhraseHighlighter=true
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/src/java/org/apache/solr/common/params/HighlightParams.java
lucene/solr/trunk/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
lucene/solr/trunk/src/test/org/apache/solr/highlight/HighlighterTest.java
Modified: lucene/solr/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=659664&r1=659663&r2=659664&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Fri May 23 14:23:25 2008
@@ -409,7 +409,14 @@
31. SOLR-514: Added explicit media-type with UTF* charset to *.xsl files that
don't already have one. (hossman)
-
+
+32. SOLR-505: Give RequestHandlers the possibility to suppress the generation
+ of HTTP caching headers. (Thomas Peuss via Otis Gospodnetic)
+
+33. SOLR-553: Handle highlighting of phrase terms better when
+ hl.usePhraseHighlighter=true URL param is used.
+ (Bojan Smid via Otis Gospodnetic)
+
Other Changes
1. SOLR-135: Moved common classes to org.apache.solr.common and altered the
build scripts to make two jars: apache-solr-1.3.jar and
Modified:
lucene/solr/trunk/src/java/org/apache/solr/common/params/HighlightParams.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/common/params/HighlightParams.java?rev=659664&r1=659663&r2=659664&view=diff
==============================================================================
---
lucene/solr/trunk/src/java/org/apache/solr/common/params/HighlightParams.java
(original)
+++
lucene/solr/trunk/src/java/org/apache/solr/common/params/HighlightParams.java
Fri May 23 14:23:25 2008
@@ -33,6 +33,8 @@
public static final String FIELD_MATCH = HIGHLIGHT+".requireFieldMatch";
public static final String ALTERNATE_FIELD = HIGHLIGHT+".alternateField";
public static final String ALTERNATE_FIELD_LENGTH =
HIGHLIGHT+".maxAlternateFieldLength";
+
+ public static final String USE_PHRASE_HIGHLIGHTER =
HIGHLIGHT+".usePhraseHighlighter";
public static final String MERGE_CONTIGUOUS_FRAGMENTS = HIGHLIGHT +
".mergeContiguous";
// Formatter
Modified:
lucene/solr/trunk/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java?rev=659664&r1=659663&r2=659664&view=diff
==============================================================================
---
lucene/solr/trunk/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
(original)
+++
lucene/solr/trunk/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
Fri May 23 14:23:25 2008
@@ -32,6 +32,7 @@
import javax.xml.xpath.XPathConstants;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -41,6 +42,7 @@
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
+import org.apache.lucene.search.highlight.SpanScorer;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.solr.common.SolrException;
@@ -55,7 +57,6 @@
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
-import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.NamedListPluginLoader;
import org.w3c.dom.NodeList;
@@ -92,6 +93,27 @@
formatters.put( null, fmt );
}
+ /**
+ * Return a phrase Highlighter appropriate for this field.
+ * @param query The current Query
+ * @param fieldName The name of the field
+ * @param request The current SolrQueryRequest
+ * @param tokenStream document text wrapped in a CachingTokenFilter
+ * @throws IOException
+ */
+ protected Highlighter getPhraseHighlighter(Query query, String fieldName,
SolrQueryRequest request, CachingTokenFilter tokenStream) throws IOException {
+ SolrParams params = request.getParams();
+ Highlighter highlighter = null;
+
+ highlighter = new Highlighter(getFormatter(fieldName, params),
getSpanQueryScorer(query, fieldName, tokenStream, request));
+
+ highlighter.setTextFragmenter(getFragmenter(fieldName, params));
+ highlighter.setMaxDocBytesToAnalyze(params.getFieldInt(
+ fieldName, HighlightParams.MAX_CHARS,
+ Highlighter.DEFAULT_MAX_DOC_BYTES_TO_ANALYZE));
+
+ return highlighter;
+ }
/**
* Return a Highlighter appropriate for this field.
@@ -112,6 +134,24 @@
}
/**
+ * Return a SpanScorer suitable for this Query and field.
+ * @param query The current query
+ * @param tokenStream document text wrapped in a CachingTokenFilter
+ * @param fieldName The name of the field
+ * @param request The SolrQueryRequest
+ * @throws IOException
+ */
+ private SpanScorer getSpanQueryScorer(Query query, String fieldName,
CachingTokenFilter tokenStream, SolrQueryRequest request) throws IOException {
+ boolean reqFieldMatch = request.getParams().getFieldBool(fieldName,
HighlightParams.FIELD_MATCH, false);
+ if (reqFieldMatch) {
+ return new SpanScorer(query, fieldName, tokenStream);
+ }
+ else {
+ return new SpanScorer(query, null, tokenStream);
+ }
+ }
+
+ /**
* Return a QueryScorer suitable for this Query and field.
* @param query The current query
* @param fieldName The name of the field
@@ -230,32 +270,59 @@
fieldName = fieldName.trim();
String[] docTexts = doc.getValues(fieldName);
if (docTexts == null) continue;
+
+ TokenStream tstream = null;
+
+ // create TokenStream
+ if (docTexts.length == 1) {
+ // single-valued field
+ try {
+ // attempt term vectors
+ tstream = TokenSources.getTokenStream(searcher.getReader(),
docId, fieldName);
+ }
+ catch (IllegalArgumentException e) {
+ // fall back to analyzer
+ tstream = new
TokenOrderingFilter(schema.getAnalyzer().tokenStream(fieldName, new
StringReader(docTexts[0])), 10);
+ }
+ }
+ else {
+ // multi-valued field
+ tstream = new MultiValueTokenStream(fieldName, docTexts,
schema.getAnalyzer(), true);
+ }
+
+ Highlighter highlighter;
+
+ if
(Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER))) {
+ // wrap CachingTokenFilter around TokenStream for reuse
+ tstream = new CachingTokenFilter(tstream);
+
+ // get highlighter
+ highlighter = getPhraseHighlighter(query, fieldName, req,
(CachingTokenFilter) tstream);
+
+ // after highlighter initialization, reset tstream since
construction of highlighter already used it
+ tstream.reset();
+ }
+ else {
+ // use "the old way"
+ highlighter = getHighlighter(query, fieldName, req);
+ }
- // get highlighter, and number of fragments for this field
- Highlighter highlighter = getHighlighter(query, fieldName, req);
int numFragments = getMaxSnippets(fieldName, params);
boolean mergeContiguousFragments =
isMergeContiguousFragments(fieldName, params);
String[] summaries = null;
TextFragment[] frag;
if (docTexts.length == 1) {
- // single-valued field
- TokenStream tstream;
- try {
- // attempt term vectors
- tstream = TokenSources.getTokenStream(searcher.getReader(),
docId, fieldName);
- }
- catch (IllegalArgumentException e) {
- // fall back to analyzer
- tstream = new
TokenOrderingFilter(schema.getAnalyzer().tokenStream(fieldName, new
StringReader(docTexts[0])), 10);
- }
frag = highlighter.getBestTextFragments(tstream, docTexts[0],
mergeContiguousFragments, numFragments);
}
else {
- // multi-valued field
- MultiValueTokenStream tstream;
- tstream = new MultiValueTokenStream(fieldName, docTexts,
schema.getAnalyzer(), true);
- frag = highlighter.getBestTextFragments(tstream,
tstream.asSingleValue(), false, numFragments);
+ StringBuilder singleValue = new StringBuilder();
+
+ for (String txt:docTexts) {
+ singleValue.append(txt);
+ }
+
+ frag = highlighter.getBestTextFragments(tstream,
singleValue.toString(), false, numFragments);
}
// convert fragments back into text
// TODO: we can include score and position information in output as
snippet attributes
@@ -303,12 +370,8 @@
}
}
-
-
-
/**
- * Helper class which creates a single TokenStream out of values from a
- * multi-valued field.
+ * Creates a single TokenStream out of the values of a multi-valued field.
*/
class MultiValueTokenStream extends TokenStream {
private String fieldName;
@@ -378,7 +441,6 @@
sb.append(str);
return sb.toString();
}
-
}
@@ -424,5 +486,3 @@
return queue.isEmpty() ? null : queue.removeFirst();
}
}
-
-
Modified:
lucene/solr/trunk/src/test/org/apache/solr/highlight/HighlighterTest.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/highlight/HighlighterTest.java?rev=659664&r1=659663&r2=659664&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/highlight/HighlighterTest.java
(original)
+++ lucene/solr/trunk/src/test/org/apache/solr/highlight/HighlighterTest.java
Fri May 23 14:23:25 2008
@@ -481,4 +481,59 @@
"//lst[@name='highlighting']/lst[@name='1']/arr[@name='t_text']/str[.='a piece of text']"
);
}
+
+ public void testPhraseHighlighter() {
+ HashMap<String,String> args = new HashMap<String,String>();
+ args.put("hl", "true");
+ args.put("hl.fl", "t_text");
+ args.put("hl.fragsize", "40");
+ args.put("hl.snippets", "10");
+
+ TestHarness.LocalRequestFactory sumLRF = h.getRequestFactory(
+ "standard", 0, 200, args);
+
+ // String borrowed from Lucene's HighlighterTest
+ String t = "This piece of text refers to Kennedy at the beginning then has
a longer piece of text that is very long in the middle and finally ends with
another reference to Kennedy";
+
+ assertU(adoc("t_text", t, "id", "1"));
+ assertU(commit());
+ assertU(optimize());
+
+ String oldHighlight1 = "//lst[@name='1']/arr[@name='t_text']/str[.='This piece of <em>text</em> <em>refers</em> to Kennedy']";
+ String oldHighlight2 = "//lst[@name='1']/arr[@name='t_text']/str[.=' at the beginning then has a longer piece of <em>text</em>']";
+ String oldHighlight3 = "//lst[@name='1']/arr[@name='t_text']/str[.=' with another <em>reference</em> to Kennedy']";
+ String newHighlight1 = "//lst[@name='1']/arr[@name='t_text']/str[.='This piece of <em>text</em> <em>refers</em> to Kennedy']";
+
+ // check if old functionality is still the same
+ assertQ("Phrase highlighting - old",
+ sumLRF.makeRequest("t_text:\"text refers\""),
+ "//lst[@name='highlighting']/lst[@name='1']",
+ oldHighlight1, oldHighlight2, oldHighlight3
+ );
+
+ assertQ("Phrase highlighting - old",
+ sumLRF.makeRequest("t_text:text refers"),
+ "//lst[@name='highlighting']/lst[@name='1']",
+ oldHighlight1, oldHighlight2, oldHighlight3
+ );
+
+ // now check if Lucene-794 highlighting works as expected
+ args.put("hl.usePhraseHighlighter", "true");
+
+ sumLRF = h.getRequestFactory("standard", 0, 200, args);
+
+ // check phrase highlighting
+ assertQ("Phrase highlighting - Lucene-794",
+ sumLRF.makeRequest("t_text:\"text refers\""),
+ "//lst[@name='highlighting']/lst[@name='1']",
+ newHighlight1
+ );
+
+ // non phrase queries should be highlighted as they were before this fix
+ assertQ("Phrase highlighting - Lucene-794",
+ sumLRF.makeRequest("t_text:text refers"),
+ "//lst[@name='highlighting']/lst[@name='1']",
+ oldHighlight1, oldHighlight2, oldHighlight3
+ );
+ }
}