Author: yonik
Date: Sat Oct 17 19:56:01 2009
New Revision: 826299
URL: http://svn.apache.org/viewvc?rev=826299&view=rev
Log:
SOLR-1394: calculate offsets correctly for entities
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java
lucene/solr/trunk/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java
Modified: lucene/solr/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=826299&r1=826298&r2=826299&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Sat Oct 17 19:56:01 2009
@@ -628,7 +628,8 @@
72. SOLR-1504: empty char mapping can cause ArrayIndexOutOfBoundsException in
analysis.jsp and co.
(koji)
-73. SOLR-1394: HTMLStripCharFilter split tokens that contained entities.
+73. SOLR-1394: HTMLStripCharFilter split tokens that contained entities and
+ often calculated offsets incorrectly for entities.
(Anders Melchiorsen via yonik)
Other Changes
Modified:
lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java?rev=826299&r1=826298&r2=826299&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java Sat Oct 17 19:56:01 2009
@@ -175,6 +175,7 @@
private int readNumericEntity() throws IOException {
// "&#" has already been read at this point
+ int eaten = 2;
// is this decimal, hex, or nothing at all.
int ch = next();
@@ -194,6 +195,7 @@
}
}
} else if (ch=='x') {
+ eaten++;
// hex character entity
base=16;
sb.setLength(0);
@@ -215,7 +217,8 @@
// the entity.
try {
if (ch==';' || ch==-1) {
- numWhitespace = sb.length() + 2;// + 2 accounts for &, #, and ;, then, take away 1 for the fact that we do output a char
+ // do not account for the eaten ";" due to the fact that we do output a char
+ numWhitespace = sb.length() + eaten;
return Integer.parseInt(sb.toString(), base);
}
@@ -223,7 +226,7 @@
// that whitespace on the next call to read().
if (isSpace(ch)) {
push(ch);
- numWhitespace = sb.length() + 2;// + 2 accounts for &, #, and ;, then, take away 1 for the fact that we do output a char
+ numWhitespace = sb.length() + eaten;
return Integer.parseInt(sb.toString(), base);
}
} catch (NumberFormatException e) {
Modified:
lucene/solr/trunk/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java?rev=826299&r1=826298&r2=826299&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java Sat Oct 17 19:56:01 2009
@@ -236,4 +236,31 @@
assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>",
builder.toString().equals(gold) == true);
}
+
+ public void doTestOffsets(String in) throws Exception {
+ HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
+ int ch = 0;
+ int off = 0; // offset in the reader
+ int strOff = -1; // offset in the original string
+ while ((ch = reader.read()) != -1) {
+ int correctedOff = reader.correctOffset(off);
+
+ if (ch == 'X') {
+ strOff = in.indexOf('X',strOff+1);
+ assertEquals(strOff, correctedOff);
+ }
+
+ off++;
+ }
+ }
+
+ public void testOffsets() throws Exception {
+ doTestOffsets("hello X how X are you");
+ doTestOffsets("hello <p> X<p> how <p>X are you");
+ doTestOffsets("X &amp; X &#40; X &lt; &gt; X");
+
+ // test backtracking
+ doTestOffsets("X < &zz >X &# < X > < &l > &g < X");
+ }
+
}