Author: yonik
Date: Sat Jan 5 07:59:47 2008
New Revision: 609162
URL: http://svn.apache.org/viewvc?rev=609162&view=rev
Log:
SOLR-42: HTMLStripReader replaces removed content with spaces to preserve
offsets
Added:
lucene/solr/trunk/src/test/test-files/htmlStripReaderTest.html (with
props)
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripReader.java
Modified: lucene/solr/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=609162&r1=609161&r2=609162&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Sat Jan 5 07:59:47 2008
@@ -240,6 +240,9 @@
15. SOLR-449: the python and ruby response writers are now able to correctly
output NaN and Infinity in their respective languages. (klaas)
+16. SOLR-42: HTMLStripReader tokenizers now preserve correct source
+ offsets for highlighting. (Grant Ingersoll via yonik)
+
Other Changes
1. SOLR-135: Moved common classes to org.apache.solr.common and altered the
build scripts to make two jars: apache-solr-1.3.jar and
Modified:
lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripReader.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripReader.java?rev=609162&r1=609161&r2=609162&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripReader.java
(original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripReader.java
Sat Jan 5 07:59:47 2008
@@ -23,6 +23,8 @@
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
+import java.util.Set;
+import java.util.Collections;
/**
* A Reader that wraps another reader and attempts to strip out HTML constructs.
@@ -34,6 +36,9 @@
public class HTMLStripReader extends Reader {
private final Reader in;
private final int READAHEAD=4096;
+ private int numWhitespace = 0;
+ private int numRead = 0;
+ private Set<String> escapedTags = Collections.emptySet();
// pushback buffer
private final StringBuilder pushed = new StringBuilder();
@@ -58,6 +63,11 @@
this.in=source.markSupported() ? source : new BufferedReader(source);
}
+ public HTMLStripReader(Reader source, Set<String> escapedTags){
+ this(source);
+ this.escapedTags = escapedTags;
+ }
+
private int next() throws IOException {
int len = pushed.length();
@@ -66,6 +76,7 @@
pushed.setLength(len-1);
return ch;
}
+ numRead++;
return in.read();
}
@@ -364,7 +375,10 @@
break;
}
}
-
+ if (escapedTags.contains(sb.toString())){
+ //if this is a reservedTag, then keep it
+ return MISMATCH;
+ }
// After the tag id, there needs to be either whitespace or
// '>'
if ( !(ch=='>' || isSpace(ch)) ) {
@@ -445,7 +459,7 @@
push(ch);
continue;
}
- int ret = readName();
+ int ret = readName(false);
if (ret==MISMATCH) return MISMATCH;
ch=nextSkipWS();
if (ch!='>') return MISMATCH;
@@ -482,12 +496,25 @@
}
- private int readName() throws IOException {
+ private int readName(boolean checkEscaped) throws IOException {
+ StringBuilder builder = new StringBuilder();
int ch = read();
+ builder.append((char)ch);
if (!isFirstIdChar(ch)) return MISMATCH;
ch = read();
- while(isIdChar(ch)) ch=read();
- if (ch!=-1) push(ch);
+ builder.append((char)ch);
+ while(isIdChar(ch)) {
+ ch=read();
+ builder.append((char)ch);
+ }
+ if (ch!=-1) {
+ push(ch);
+
+ }
+ //strip off the trailing >
+ if (checkEscaped && escapedTags.contains(builder.substring(0, builder.length() - 1))){
+ return MISMATCH;
+ }
return MATCH;
}
@@ -645,12 +672,18 @@
}
+
public int read() throws IOException {
// TODO: Do we ever want to preserve CDATA sections?
// where do we have to worry about them?
// <![ CDATA [ unescaped markup ]]>
+ if (numWhitespace > 0){
+ numWhitespace--;
+ return ' ';
+ }
while(true) {
+ int lastNumRead = numRead;
int ch = next();
switch (ch) {
@@ -660,6 +693,7 @@
if (ch>=0) return ch;
if (ch==MISMATCH) {
restoreState();
+
return '&';
}
break;
@@ -671,7 +705,7 @@
if (ch=='!') {
ret = readBang(false);
} else if (ch=='/') {
- ret = readName();
+ ret = readName(true);
if (ret==MATCH) {
ch=nextSkipWS();
ret= ch=='>' ? MATCH : MISMATCH;
@@ -685,7 +719,12 @@
// matched something to be discarded, so break
// from this case and continue in the loop
- if (ret==MATCH) break;
+ if (ret==MATCH) {
+ //break;//was
+ //return whitespace from
+ numWhitespace = (numRead - lastNumRead) - 1;//tack on the -1 since we are returning a space right now
+ return ' ';
+ }
// didn't match any HTML constructs, so roll back
// the stream state and just return '<'
Added: lucene/solr/trunk/src/test/test-files/htmlStripReaderTest.html
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/htmlStripReaderTest.html?rev=609162&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/htmlStripReaderTest.html (added)
+++ lucene/solr/trunk/src/test/test-files/htmlStripReaderTest.html Sat Jan 5
07:59:47 2008
@@ -0,0 +1,350 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<meta content="Apache Forrest" name="Generator">
+<meta name="Forrest-version" content="0.8">
+<meta name="Forrest-skin-name" content="pelt">
+<title>Welcome to Solr</title>
+<link type="text/css" href="skin/basic.css" rel="stylesheet">
+<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
+<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
+<link type="text/css" href="skin/profile.css" rel="stylesheet">
+<script src="skin/getBlank.js" language="javascript"
type="text/javascript"></script><script src="skin/getMenu.js"
language="javascript" type="text/javascript"></script><script
src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
+<link rel="shortcut icon" href="images/favicon.ico">
+</head>
+<body onload="init()">
+<script type="text/javascript">ndeSetTextSize();</script>
+<div id="top">
+<!--+
+ |breadtrail
+ +-->
+<div class="breadtrail">
+<a href="http://www.apache.org/">apache</a> > <a
href="http://lucene.apache.org/">lucene</a><script src="skin/breadcrumbs.js"
language="JavaScript" type="text/javascript"></script>
+</div>
+<!--+
+ |header
+ +-->
+<div class="header">
+<!--+
+ |start group logo
+ +-->
+<div class="grouplogo">
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene"
src="images/lucene_green_150.gif" title="Apache Lucene"></a>
+</div>
+<!--+
+ |end group logo
+ +-->
+<!--+
+ |start Project Logo
+ +-->
+<div class="projectlogo">
+<a href="http://lucene.apache.org/solr/"><img class="logoImage" alt="Solr"
src="images/solr.png" title="Solr Description"></a>
+</div>
+<!--+
+ |end Project Logo
+ +-->
+<!--+
+ |start Search
+ +-->
+<div class="searchbox">
+<form action="http://www.google.com/search" method="get" class="roundtopsmall">
+<input value="lucene.apache.org" name="sitesearch" type="hidden"><input
onFocus="getBlank (this, 'Search the site with google');" size="25" name="q"
id="query" type="text" value="Search the site with google">
+ <input name="Search" value="Search" type="submit">
+</form>
+</div>
+<!--+
+ |end search
+ +-->
+<!--+
+ |start Tabs
+ +-->
+<ul id="tabs">
+<li class="current">
+<a class="selected" href="index.html">Main</a>
+</li>
+<li>
+<a class="unselected" href="http://wiki.apache.org/solr">Wiki</a>
+</li>
+</ul>
+<!--+
+ |end Tabs
+ +-->
+</div>
+</div>
+<div id="main">
+<div id="publishedStrip">
+<!--+
+ |start Subtabs
+ +-->
+<div id="level2tabs"></div>
+<!--+
+ |end Endtabs
+ +-->
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+// --></script>
+</div>
+<!--+
+ |breadtrail
+ +-->
+<div class="breadtrail">
+
+
+ </div>
+<!--+
+ |start Menu, mainarea
+ +-->
+<!--+
+ |start Menu
+ +-->
+<div id="menu">
+<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')"
id="menu_selected_1.1Title" class="menutitle" style="background-image:
url('skin/images/chapter_open.gif');">About</div>
+<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display:
block;">
+<div class="menupage">
+<div class="menupagetitle">Welcome</div>
+</div>
+<div class="menuitem">
+<a href="who.html" title="Solr Committers">Who We Are</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title"
class="menutitle">Documentation</div>
+<div id="menu_1.2" class="menuitemgroup">
+<div class="menuitem">
+<a href="features.html">Features</a>
+</div>
+<div class="menuitem">
+<a href="tutorial.html">Tutorial</a>
+</div>
+<div class="menuitem">
+<a href="http://wiki.apache.org/solr/">Docs (Wiki)</a>
+</div>
+<div class="menuitem">
+<a href="http://wiki.apache.org/solr/FAQ">FAQ</a>
+</div>
+<div class="menuitem">
+<a href="api/index.html">javadoc</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title"
class="menutitle">Resources</div>
+<div id="menu_1.3" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://www.apache.org/dyn/closer.cgi/lucene/solr/">Download</a>
+</div>
+<div class="menuitem">
+<a href="mailing_lists.html">Mailing Lists</a>
+</div>
+<div class="menuitem">
+<a href="issue_tracking.html">Issue Tracking</a>
+</div>
+<div class="menuitem">
+<a href="version_control.html">Version Control</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title"
class="menutitle">Related Projects</div>
+<div id="menu_1.4" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://lucene.apache.org/java/">Lucene Java</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/nutch/">Nutch</a>
+</div>
+</div>
+<div id="credit">
+<hr>
+<a href="http://forrest.apache.org/"><img border="0" title="Built with Apache
Forrest" alt="Built with Apache Forrest - logo"
src="images/built-with-forrest-button.png" style="width: 88px;height:
31px;"></a>
+</div>
+<div id="roundbottom">
+<img style="display: none" class="corner" height="15" width="15" alt=""
src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
+<!--+
+ |alternative credits
+ +-->
+<div id="credit2"></div>
+</div>
+<!--+
+ |end Menu
+ +-->
+<!--+
+ |start content
+ +-->
+<div id="content">
+<div title="Portable Document Format" class="pdflink">
+<a class="dida" href="index.pdf"><img alt="PDF -icon"
src="skin/images/pdfdoc.gif" class="skin"><br>
+ PDF</a>
+</div>
+<h1>Welcome to Solr</h1>
+<div id="minitoc-area">
+<ul class="minitoc">
+<li>
+<a href="#intro">What Is Solr?</a>
+</li>
+<li>
+<a href="#news">News</a>
+<ul class="minitoc">
+<li>
+<a href="#02+October+2007+-+Solr+at+OSSummit+Asia">02 October 2007 - Solr at
OSSummit Asia</a>
+</li>
+<li>
+<a href="#03+September+2007+-+Lucene+at+ApacheCon+Atlanta">03 September 2007 -
Lucene at ApacheCon Atlanta</a>
+</li>
+<li>
+<a href="#06+June+2007%3A+Release+1.2+available">06 June 2007: Release 1.2
available</a>
+</li>
+<li>
+<a href="#17+January+2007%3A+Solr+graduates+from+Incubator">17 January 2007:
Solr graduates from Incubator</a>
+</li>
+<li>
+<a href="#22+December+2006%3A+Release+1.1.0+available">22 December 2006:
Release 1.1.0 available</a>
+</li>
+<li>
+<a href="#15+August+2006%3A+Solr+at+ApacheCon+US">15 August 2006: Solr at
ApacheCon US</a>
+</li>
+<li>
+<a href="#21+April+2006%3A+Solr+at+ApacheCon">21 April 2006: Solr at
ApacheCon</a>
+</li>
+<li>
+<a href="#21+February+2006%3A+nightly+builds">21 February 2006: nightly
builds</a>
+</li>
+<li>
+<a href="#17+January+2006%3A+Solr+Joins+Apache+Incubator">17 January 2006:
Solr Joins Apache Incubator</a>
+</li>
+</ul>
+</li>
+</ul>
+</div>
+
+<a name="N1000D"></a><a name="intro"></a>
+<h2 class="boxed">What Is Solr?</h2>
+<div class="section">
+<p>
+ Solr is an open source enterprise search server based on the
+ <a href="http://lucene.apache.org/java/">Lucene Java</a> search
library, with XML/HTTP and JSON APIs,
+ hit highlighting, faceted search, caching, replication, and a web
administration interface.
+ It runs in a Java servlet container such as <a
href="http://tomcat.apache.org">Tomcat</a>.
+ </p>
+<p>
+ See the complete <a href="features.html">feature list</a> for more
details, then check out the <a href="tutorial.html">tutorial</a>.
+ </p>
+</div>
+
+
+<a name="N1002A"></a><a name="news"></a>
+<h2 class="boxed">News</h2>
+<div class="section">
+<a name="N10030"></a><a name="02+October+2007+-+Solr+at+OSSummit+Asia"></a>
+<h3 class="boxed">02 October 2007 - Solr at OSSummit Asia</h3>
+<p>
+<a href="http://www.ossummit.com"><img alt="OSSummit Asia logo"
class="float-right" src="http://www.ossummit.com/2007/images/logo.png"></a>
+ Lucene and Solr tutorials!
+ </p>
+<p>The following talks and trainings are scheduled for the upcoming 2008
OSSummit:</p>
+<ul>
+
+<li>
+<a href="http://www.ossummit.com/2007/program/talk/8">Lucene Boot Camp</a> by
Erik Hatcher (originally by Grant Ingersoll). An all-day training focusing on
getting started with Lucene - the core library under Solr.</li>
+
+<li>
+<a href="http://www.ossummit.com/2007/program/talk/25">Solr in a Day</a> by
Erik Hatcher. All you need to know to use Solr effectively.</li>
+
+<li>
+<a href="http://www.ossummit.com/2007/program/talk/67">Lucene Case Studies</a>
by Erik Hatcher. A rapid series of examples of many Lucene and Solr using
applications.</li>
+
+</ul>
+<a name="N10058"></a><a
name="03+September+2007+-+Lucene+at+ApacheCon+Atlanta"></a>
+<h3 class="boxed">03 September 2007 - Lucene at ApacheCon Atlanta</h3>
+<p>
+<a href="http://www.us.apachecon.com"><img alt="ApacheCon US logo"
class="float-right"
src="http://www.apache.org/ads/ApacheCon/2007-usa-125x125.png"></a>
+ Lucene will once again be well represented at ApacheCon USA in
Atlanta this November 12-16, 2007.
+ </p>
+<p>The following talks and trainings are scheduled for this year's
conference:</p>
+<ul>
+
+<li>November 12: <a
href="http://us.apachecon.com/us2007/program/talk/1859">Lucene Boot Camp</a> by
Grant Ingersoll. An all-day training focusing on getting started with
Lucene.</li>
+
+<li>November 16, 9:00 am: <a
href="http://us.apachecon.com/us2007/program/talk/1992">Apache Solr out of the
Box</a> by Chris Hostetter. Introduction to Solr.</li>
+
+<li>November 16, 10:00 am: <a
href="http://us.apachecon.com/us2007/program/talk/1943">Building a Vertical
Search Site using Apache Software</a> by Ken Krugler. Will cover many
Lucene-based projects.</li>
+
+<li>November 16, 3:00 pm: <a
href="http://us.apachecon.com/us2007/program/talk/1953">Apache Lucene
Performance</a> by Grant Ingersoll. Tips and techniques for improving Lucene
performance.</li>
+
+<li>November 16, 4:00 pm: <a
href="http://us.apachecon.com/us2007/program/talk/2017"> Advanced Indexing
Techniques with Apache Lucene</a> by Michael Busch. Information on payloads and
advanced indexing techniques.</li>
+
+</ul>
+<a name="N10091"></a><a name="06+June+2007%3A+Release+1.2+available"></a>
+<h3 class="boxed">06 June 2007: Release 1.2 available</h3>
+<p>
+ This is the first release since Solr graduated from the Incubator,
+ bringing many new features, including CSV/delimited-text data
+ loading, time based autocommit, faster faceting, negative filters,
+ a spell-check handler, sounds-like word filters, regex text filters,
+ and more flexible plugins.
+ </p>
+<p>See the <a
href="http://svn.apache.org/repos/asf/lucene/solr/tags/release-1.2.0/CHANGES.txt">release
notes</a> for more details.</p>
+<a name="N100A2"></a><a
name="17+January+2007%3A+Solr+graduates+from+Incubator"></a>
+<h3 class="boxed">17 January 2007: Solr graduates from Incubator</h3>
+<p>
+ Solr has graduated from the Apache Incubator, and is now a sub-project
of Lucene.
+ </p>
+<a name="N100AC"></a><a name="22+December+2006%3A+Release+1.1.0+available"></a>
+<h3 class="boxed">22 December 2006: Release 1.1.0 available</h3>
+<p>
+ This is the first release since Solr joined the Incubator, and brings
+ many new features and performance optimizations including highlighting,
+ faceted search, and JSON/Python/Ruby response formats.
+ </p>
+<a name="N100B6"></a><a name="15+August+2006%3A+Solr+at+ApacheCon+US"></a>
+<h3 class="boxed">15 August 2006: Solr at ApacheCon US</h3>
+<p>Chris Hostetter will be presenting
+ <strong><a
href="http://www.apachecon.com/2006/US/html/sessions.html#FR26">"Faceted
Searching With Apache Solr"</a></strong>
+ at ApacheCon US 2006, on October 13th at 4:30pm.
+ See the <a href="http://www.us.apachecon.com/">ApacheCon</a> website
for more details.
+ </p>
+<a name="N100C9"></a><a name="21+April+2006%3A+Solr+at+ApacheCon"></a>
+<h3 class="boxed">21 April 2006: Solr at ApacheCon</h3>
+<p>Yonik Seeley will be presenting
+ <strong>"Apache Solr, a Full-Text Search Server based on
Lucene"</strong>
+ at ApacheCon Europe 2006, on June 29th at 5:30pm.
+ See the <a href="http://www.eu.apachecon.com/">ApacheCon</a> website
for more details.
+ </p>
+<a name="N100DA"></a><a name="21+February+2006%3A+nightly+builds"></a>
+<h3 class="boxed">21 February 2006: nightly builds</h3>
+<p>Solr now has nightly builds. This automatically creates a
+ <a
href="http://people.apache.org/builds/lucene/solr/nightly/">downloadable
version of Solr every
+ night</a>. All unit tests must pass, or a message is sent to
+ the developers mailing list and no new version is created. This
+ also updates the <a href="api/index.html">javadoc</a>.</p>
+<a name="N100EC"></a><a
name="17+January+2006%3A+Solr+Joins+Apache+Incubator"></a>
+<h3 class="boxed">17 January 2006: Solr Joins Apache Incubator</h3>
+<p>Solr, a search server based on Lucene, has been accepted into the Apache
Incubator.
+ Solr was originally developed by CNET Networks, and is widely used
within CNET
+ to provide high relevancy search and faceted browsing capabilities.
+ </p>
+</div>
+
+
+</div>
+<!--+
+ |end content
+ +-->
+<div class="clearboth"> </div>
+</div>
+<div id="footer">
+<!--+
+ |start bottomstrip
+ +-->
+<div class="lastmodified">
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+// --></script>
+</div>
+<div class="copyright">
+ Copyright ©
+ 2007 <a href="http://www.apache.org/licenses/">The Apache Software
Foundation.</a>
+</div>
+<div id="logos"></div>
+<!--+
+ |end bottomstrip
+ +-->
+</div>
+</body>
+</html>
Propchange: lucene/solr/trunk/src/test/test-files/htmlStripReaderTest.html
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/test/test-files/htmlStripReaderTest.html
------------------------------------------------------------------------------
svn:executable = *
Propchange: lucene/solr/trunk/src/test/test-files/htmlStripReaderTest.html
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL