Author: yonik
Date: Fri Aug 10 08:53:03 2007
New Revision: 564635
URL: http://svn.apache.org/viewvc?view=rev&rev=564635
Log:
SOLR-331: fix WordDelimiterFilter offsets for synonyms
Modified:
lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
Modified:
lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java?view=diff&rev=564635&r1=564634&r2=564635
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java Fri Aug 10 08:53:03 2007
@@ -205,9 +205,20 @@
private Token newTok(Token orig, int start, int end) {
+ int startOff = orig.startOffset();
+ int endOff = orig.endOffset();
+ String origStr = orig.termText();
+
+ // if length by start + end offsets doesn't match the term text then assume
+ // this is a synonym and don't adjust the offsets.
+ if (origStr.length() == endOff-startOff) {
+ endOff = startOff + end;
+ startOff += start;
+ }
+
return new Token(orig.termText().substring(start,end),
- orig.startOffset() + start,
- orig.startOffset() + end,
+ startOff,
+ endOff,
orig.type());
}
Modified:
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java?view=diff&rev=564635&r1=564634&r2=564635
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java Fri Aug 10 08:53:03 2007
@@ -20,6 +20,12 @@
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.util.TestHarness;
import org.apache.solr.request.SolrQueryRequest;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
+
+import java.io.IOException;
+
+import junit.framework.Assert;
/**
* New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
@@ -81,5 +87,58 @@
req("subword:(good jon)")
,"//[EMAIL PROTECTED]"
);
+ }
+
+ public void testOffsets() throws IOException {
+
+ // test that subwords and catenated subwords have
+ // the correct offsets.
+ WordDelimiterFilter wdf = new WordDelimiterFilter(
+ new TokenStream() {
+ Token t;
+ public Token next() throws IOException {
+ if (t!=null) return null;
+ t = new Token("foo-bar", 5, 12); // actual
+ return t;
+ }
+ },
+ 1,1,0,0,1,1);
+
+ int i=0;
+ for(Token t; (t=wdf.next())!=null;) {
+ if (t.termText().equals("foo")) {
+ assertEquals(5, t.startOffset());
+ assertEquals(8, t.endOffset());
+ i++;
+ }
+ if (t.termText().equals("bar")) {
+ assertEquals(9, t.startOffset());
+ assertEquals(12, t.endOffset());
+ i++;
+ }
+ if (t.termText().equals("foobar")) {
+ assertEquals(5, t.startOffset());
+ assertEquals(12, t.endOffset());
+ i++;
+ }
+ }
+ assertEquals(3,i); // make sure all 3 tokens were generated
+
+ // test that if splitting or catenating a synonym, that the offsets
+ // are not altered (they would be incorrect).
+ wdf = new WordDelimiterFilter(
+ new TokenStream() {
+ Token t;
+ public Token next() throws IOException {
+ if (t!=null) return null;
+ t = new Token("foo-bar", 5, 6); // a synonym
+ return t;
+ }
+ },
+ 1,1,0,0,1,1);
+ for(Token t; (t=wdf.next())!=null;) {
+ assertEquals(5, t.startOffset());
+ assertEquals(6, t.endOffset());
+ }
}
}