htmlStripReaderTest.html

yonik Sat, 05 Jan 2008 08:00:23 -0800

Author: yonik
Date: Sat Jan  5 07:59:47 2008
New Revision: 609162

URL: http://svn.apache.org/viewvc?rev=609162&view=rev
Log:
SOLR-42: HTMLStripReader replaces removed content with spaces to preserve offsets


Added:
    lucene/solr/trunk/src/test/test-files/htmlStripReaderTest.html   (with props)
Modified:
    lucene/solr/trunk/CHANGES.txt
    lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripReader.java

Modified: lucene/solr/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=609162&r1=609161&r2=609162&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Sat Jan  5 07:59:47 2008
@@ -240,6 +240,9 @@
 15. SOLR-449: the python and ruby response writers are now able to correctly 
     output NaN and Infinity in their respective languages.  (klaas)
 
+16. SOLR-42: HTMLStripReader tokenizers now preserve correct source
+    offsets for highlighting.  (Grant Ingersoll via yonik)
+
 Other Changes
  1. SOLR-135: Moved common classes to org.apache.solr.common and altered the
     build scripts to make two jars: apache-solr-1.3.jar and 

Modified: 
lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripReader.java
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripReader.java?rev=609162&r1=609161&r2=609162&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripReader.java 
(original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripReader.java 
Sat Jan  5 07:59:47 2008
@@ -23,6 +23,8 @@
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.util.HashMap;
+import java.util.Set;
+import java.util.Collections;
 
 /**
  * A Reader that wraps another reader and attempts to strip out HTML 
constructs.
@@ -34,6 +36,9 @@
 public class HTMLStripReader extends Reader {
   private final Reader in;
   private final int READAHEAD=4096;
+  private int numWhitespace = 0;
+  private int numRead = 0;
+  private Set<String> escapedTags = Collections.emptySet();
 
   // pushback buffer
   private final StringBuilder pushed = new StringBuilder();
@@ -58,6 +63,11 @@
     this.in=source.markSupported() ? source : new BufferedReader(source);
   }
 
+  public HTMLStripReader(Reader source, Set<String> escapedTags){
+    this(source);
+    this.escapedTags = escapedTags;
+  }
+
 
   private int next() throws IOException {
     int len = pushed.length();
@@ -66,6 +76,7 @@
       pushed.setLength(len-1);
       return ch;
     }
+    numRead++;
     return in.read();
   }
 
@@ -364,7 +375,10 @@
         break;
       }
     }
-
+    if (escapedTags.contains(sb.toString())){
+      //if this is a reservedTag, then keep it
+      return MISMATCH;
+    }
     // After the tag id, there needs to be either whitespace or
     // '>'
     if ( !(ch=='>' || isSpace(ch)) ) {
@@ -445,7 +459,7 @@
           push(ch);
           continue;
         }
-        int ret = readName();
+        int ret = readName(false);
         if (ret==MISMATCH) return MISMATCH;
         ch=nextSkipWS();
         if (ch!='>') return MISMATCH;
@@ -482,12 +496,25 @@
   }
 
 
-  private int readName() throws IOException {
+  private int readName(boolean checkEscaped) throws IOException {
+    StringBuilder builder = new StringBuilder();
     int ch = read();
+    builder.append((char)ch);
     if (!isFirstIdChar(ch)) return MISMATCH;
     ch = read();
-    while(isIdChar(ch)) ch=read();
-    if (ch!=-1) push(ch);
+    builder.append((char)ch);
+    while(isIdChar(ch)) {
+      ch=read();
+      builder.append((char)ch);
+    }
+    if (ch!=-1) {
+      push(ch);
+
+    }
+    //strip off the trailing >
+    if (checkEscaped && escapedTags.contains(builder.substring(0, 
builder.length() - 1))){
+      return MISMATCH;
+    }
     return MATCH;
   }
 
@@ -645,12 +672,18 @@
   }
 
 
+
   public int read() throws IOException {
     // TODO: Do we ever want to preserve CDATA sections?
     // where do we have to worry about them?
     // <![ CDATA [ unescaped markup ]]>
+    if (numWhitespace > 0){
+      numWhitespace--;
+      return ' ';
+    }
 
     while(true) {
+      int lastNumRead = numRead;
       int ch = next();
 
       switch (ch) {
@@ -660,6 +693,7 @@
           if (ch>=0) return ch;
           if (ch==MISMATCH) {
             restoreState();
+
             return '&';
           }
           break;
@@ -671,7 +705,7 @@
           if (ch=='!') {
             ret = readBang(false);
           } else if (ch=='/') {
-            ret = readName();
+            ret = readName(true);
             if (ret==MATCH) {
               ch=nextSkipWS();
               ret= ch=='>' ? MATCH : MISMATCH;
@@ -685,7 +719,12 @@
 
           // matched something to be discarded, so break
           // from this case and continue in the loop
-          if (ret==MATCH) break;
+          if (ret==MATCH) {
+            //break;//was
+            //return whitespace from
+            numWhitespace = (numRead - lastNumRead) - 1;//tack on the -1 since 
we are returning a space right now
+            return ' ';
+          }
 
           // didn't match any HTML constructs, so roll back
           // the stream state and just return '<'

Added: lucene/solr/trunk/src/test/test-files/htmlStripReaderTest.html
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/htmlStripReaderTest.html?rev=609162&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/htmlStripReaderTest.html (added)
+++ lucene/solr/trunk/src/test/test-files/htmlStripReaderTest.html Sat Jan  5 
07:59:47 2008
@@ -0,0 +1,350 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<meta content="Apache Forrest" name="Generator">
+<meta name="Forrest-version" content="0.8">
+<meta name="Forrest-skin-name" content="pelt">
+<title>Welcome to Solr</title>
+<link type="text/css" href="skin/basic.css" rel="stylesheet">
+<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
+<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
+<link type="text/css" href="skin/profile.css" rel="stylesheet">
+<script src="skin/getBlank.js" language="javascript" 
type="text/javascript"></script><script src="skin/getMenu.js" 
language="javascript" type="text/javascript"></script><script 
src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
+<link rel="shortcut icon" href="images/favicon.ico">
+</head>
+<body onload="init()">
+<script type="text/javascript">ndeSetTextSize();</script>
+<div id="top">
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+<a href="http://www.apache.org/">apache</a> &gt; <a href="http://lucene.apache.org/">lucene</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+</div>
+<!--+
+    |header
+    +-->
+<div class="header">
+<!--+
+    |start group logo
+    +-->
+<div class="grouplogo">
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="images/lucene_green_150.gif" title="Apache Lucene"></a>
+</div>
+<!--+
+    |end group logo
+    +-->
+<!--+
+    |start Project Logo
+    +-->
+<div class="projectlogo">
+<a href="http://lucene.apache.org/solr/"><img class="logoImage" alt="Solr" src="images/solr.png" title="Solr Description"></a>
+</div>
+<!--+
+    |end Project Logo
+    +-->
+<!--+
+    |start Search
+    +-->
+<div class="searchbox">
+<form action="http://www.google.com/search" method="get" class="roundtopsmall">
+<input value="lucene.apache.org" name="sitesearch" type="hidden"><input 
onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" 
id="query" type="text" value="Search the site with google">&nbsp; 
+                    <input name="Search" value="Search" type="submit">
+</form>
+</div>
+<!--+
+    |end search
+    +-->
+<!--+
+    |start Tabs
+    +-->
+<ul id="tabs">
+<li class="current">
+<a class="selected" href="index.html">Main</a>
+</li>
+<li>
+<a class="unselected" href="http://wiki.apache.org/solr">Wiki</a>
+</li>
+</ul>
+<!--+
+    |end Tabs
+    +-->
+</div>
+</div>
+<div id="main">
+<div id="publishedStrip">
+<!--+
+    |start Subtabs
+    +-->
+<div id="level2tabs"></div>
+<!--+
+    |end Endtabs
+    +-->
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+
+             &nbsp;
+           </div>
+<!--+
+    |start Menu, mainarea
+    +-->
+<!--+
+    |start Menu
+    +-->
+<div id="menu">
+<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" 
id="menu_selected_1.1Title" class="menutitle" style="background-image: 
url('skin/images/chapter_open.gif');">About</div>
+<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: 
block;">
+<div class="menupage">
+<div class="menupagetitle">Welcome</div>
+</div>
+<div class="menuitem">
+<a href="who.html" title="Solr Committers">Who We Are</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" 
class="menutitle">Documentation</div>
+<div id="menu_1.2" class="menuitemgroup">
+<div class="menuitem">
+<a href="features.html">Features</a>
+</div>
+<div class="menuitem">
+<a href="tutorial.html">Tutorial</a>
+</div>
+<div class="menuitem">
+<a href="http://wiki.apache.org/solr/">Docs (Wiki)</a>
+</div>
+<div class="menuitem">
+<a href="http://wiki.apache.org/solr/FAQ">FAQ</a>
+</div>
+<div class="menuitem">
+<a href="api/index.html">javadoc</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" 
class="menutitle">Resources</div>
+<div id="menu_1.3" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://www.apache.org/dyn/closer.cgi/lucene/solr/">Download</a>
+</div>
+<div class="menuitem">
+<a href="mailing_lists.html">Mailing Lists</a>
+</div>
+<div class="menuitem">
+<a href="issue_tracking.html">Issue Tracking</a>
+</div>
+<div class="menuitem">
+<a href="version_control.html">Version Control</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" 
class="menutitle">Related Projects</div>
+<div id="menu_1.4" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://lucene.apache.org/java/">Lucene Java</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/nutch/">Nutch</a>
+</div>
+</div>
+<div id="credit">
+<hr>
+<a href="http://forrest.apache.org/";><img border="0" title="Built with Apache 
Forrest" alt="Built with Apache Forrest - logo" 
src="images/built-with-forrest-button.png" style="width: 88px;height: 
31px;"></a>
+</div>
+<div id="roundbottom">
+<img style="display: none" class="corner" height="15" width="15" alt="" 
src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
+<!--+
+  |alternative credits
+  +-->
+<div id="credit2"></div>
+</div>
+<!--+
+    |end Menu
+    +-->
+<!--+
+    |start content
+    +-->
+<div id="content">
+<div title="Portable Document Format" class="pdflink">
+<a class="dida" href="index.pdf"><img alt="PDF -icon" 
src="skin/images/pdfdoc.gif" class="skin"><br>
+        PDF</a>
+</div>
+<h1>Welcome to Solr</h1>
+<div id="minitoc-area">
+<ul class="minitoc">
+<li>
+<a href="#intro">What Is Solr?</a>
+</li>
+<li>
+<a href="#news">News</a>
+<ul class="minitoc">
+<li>
+<a href="#02+October+2007+-+Solr+at+OSSummit+Asia">02 October 2007 - Solr at 
OSSummit Asia</a>
+</li>
+<li>
+<a href="#03+September+2007+-+Lucene+at+ApacheCon+Atlanta">03 September 2007 - 
Lucene at ApacheCon Atlanta</a>
+</li>
+<li>
+<a href="#06+June+2007%3A+Release+1.2+available">06 June 2007: Release 1.2 
available</a>
+</li>
+<li>
+<a href="#17+January+2007%3A+Solr+graduates+from+Incubator">17 January 2007: 
Solr graduates from Incubator</a>
+</li>
+<li>
+<a href="#22+December+2006%3A+Release+1.1.0+available">22 December 2006: 
Release 1.1.0 available</a>
+</li>
+<li>
+<a href="#15+August+2006%3A+Solr+at+ApacheCon+US">15 August 2006: Solr at 
ApacheCon US</a>
+</li>
+<li>
+<a href="#21+April+2006%3A+Solr+at+ApacheCon">21 April 2006: Solr at 
ApacheCon</a>
+</li>
+<li>
+<a href="#21+February+2006%3A+nightly+builds">21 February 2006: nightly 
builds</a>
+</li>
+<li>
+<a href="#17+January+2006%3A+Solr+Joins+Apache+Incubator">17 January 2006: 
Solr Joins Apache Incubator</a>
+</li>
+</ul>
+</li>
+</ul>
+</div> 
+    
+<a name="N1000D"></a><a name="intro"></a>
+<h2 class="boxed">What Is Solr?</h2>
+<div class="section">
+<p>
+        Solr is an open source enterprise search server based on the
+        <a href="http://lucene.apache.org/java/";>Lucene Java</a> search 
library, with XML/HTTP and JSON APIs,
+        hit highlighting, faceted search, caching, replication, and a web 
administration interface.
+        It runs in a Java servlet container such as <a 
href="http://tomcat.apache.org";>Tomcat</a>.
+      </p>
+<p>
+        See the complete <a href="features.html">feature list</a> for more 
details, then check out the <a href="tutorial.html">tutorial</a>.
+      </p>
+</div>
+
+    
+<a name="N1002A"></a><a name="news"></a>
+<h2 class="boxed">News</h2>
+<div class="section">
+<a name="N10030"></a><a name="02+October+2007+-+Solr+at+OSSummit+Asia"></a>
+<h3 class="boxed">02 October 2007 - Solr at OSSummit Asia</h3>
+<p>
+<a href="http://www.ossummit.com";><img alt="OSSummit Asia logo" 
class="float-right" src="http://www.ossummit.com/2007/images/logo.png";></a>
+          Lucene and Solr tutorials!
+        </p>
+<p>The following talks and trainings are scheduled for the upcoming 2008 
OSSummit:</p>
+<ul>
+            
+<li>
+<a href="http://www.ossummit.com/2007/program/talk/8";>Lucene Boot Camp</a> by 
Erik Hatcher (originally by Grant Ingersoll).  An all-day training focusing on 
getting started with Lucene - the core library under Solr.</li>
+            
+<li>
+<a href="http://www.ossummit.com/2007/program/talk/25";>Solr in a Day</a> by 
Erik Hatcher.  All you need to know to use Solr effectively.</li>
+            
+<li>
+<a href="http://www.ossummit.com/2007/program/talk/67";>Lucene Case Studies</a> 
by Erik Hatcher.  A rapid series of examples of many Lucene and Solr using 
applications.</li>
+          
+</ul>
+<a name="N10058"></a><a 
name="03+September+2007+-+Lucene+at+ApacheCon+Atlanta"></a>
+<h3 class="boxed">03 September 2007 - Lucene at ApacheCon Atlanta</h3>
+<p>
+<a href="http://www.us.apachecon.com";><img alt="ApacheCon US logo" 
class="float-right" 
src="http://www.apache.org/ads/ApacheCon/2007-usa-125x125.png";></a>
+              Lucene will once again be well represented at ApacheCon USA in 
Atlanta this November 12-16, 2007.  
+            </p>
+<p>The following talks and trainings are scheduled for this year's 
conference:</p>
+<ul>
+                
+<li>November 12: <a 
href="http://us.apachecon.com/us2007/program/talk/1859";>Lucene Boot Camp</a> by 
Grant Ingersoll.  An all-day training focusing on getting started with 
Lucene.</li>
+                
+<li>November 16, 9:00 am: <a 
href="http://us.apachecon.com/us2007/program/talk/1992";>Apache Solr out of the 
Box</a> by Chris Hostetter. Introduction to Solr.</li>
+                
+<li>November 16, 10:00 am: <a 
href="http://us.apachecon.com/us2007/program/talk/1943";>Building a Vertical 
Search Site using Apache Software</a> by Ken Krugler. Will cover many 
Lucene-based projects.</li>
+                
+<li>November 16, 3:00 pm: <a 
href="http://us.apachecon.com/us2007/program/talk/1953";>Apache Lucene 
Performance</a> by Grant Ingersoll. Tips and techniques for improving Lucene 
performance.</li>
+                
+<li>November 16, 4:00 pm: <a 
href="http://us.apachecon.com/us2007/program/talk/2017";> Advanced Indexing 
Techniques with Apache Lucene</a> by Michael Busch. Information on payloads and 
advanced indexing techniques.</li>
+              
+</ul>
+<a name="N10091"></a><a name="06+June+2007%3A+Release+1.2+available"></a>
+<h3 class="boxed">06 June 2007: Release 1.2 available</h3>
+<p>
+        This is the first release since Solr graduated from the Incubator,
+        bringing many new features, including CSV/delimited-text data
+        loading, time based autocommit, faster faceting, negative filters,
+        a spell-check handler, sounds-like word filters, regex text filters,
+        and more flexible plugins.
+      </p>
+<p>See the <a 
href="http://svn.apache.org/repos/asf/lucene/solr/tags/release-1.2.0/CHANGES.txt";>release
 notes</a> for more details.</p>
+<a name="N100A2"></a><a 
name="17+January+2007%3A+Solr+graduates+from+Incubator"></a>
+<h3 class="boxed">17 January 2007: Solr graduates from Incubator</h3>
+<p>
+        Solr has graduated from the Apache Incubator, and is now a sub-project 
of Lucene.
+      </p>
+<a name="N100AC"></a><a name="22+December+2006%3A+Release+1.1.0+available"></a>
+<h3 class="boxed">22 December 2006: Release 1.1.0 available</h3>
+<p>
+        This is the first release since Solr joined the Incubator, and brings
+        many new features and performance optimizations including highlighting,
+        faceted search, and JSON/Python/Ruby response formats.
+      </p>
+<a name="N100B6"></a><a name="15+August+2006%3A+Solr+at+ApacheCon+US"></a>
+<h3 class="boxed">15 August 2006: Solr at ApacheCon US</h3>
+<p>Chris Hostetter will be presenting
+        <strong><a 
href="http://www.apachecon.com/2006/US/html/sessions.html#FR26";>"Faceted 
Searching With Apache Solr"</a></strong>  
+        at ApacheCon US 2006, on October 13th at 4:30pm.
+        See the <a href="http://www.us.apachecon.com/";>ApacheCon</a> website 
for more details.
+      </p>
+<a name="N100C9"></a><a name="21+April+2006%3A+Solr+at+ApacheCon"></a>
+<h3 class="boxed">21 April 2006: Solr at ApacheCon</h3>
+<p>Yonik Seeley will be presenting
+        <strong>"Apache Solr, a Full-Text Search Server based on 
Lucene"</strong>  
+        at ApacheCon Europe 2006, on June 29th at 5:30pm.
+        See the <a href="http://www.eu.apachecon.com/";>ApacheCon</a> website 
for more details.
+      </p>
+<a name="N100DA"></a><a name="21+February+2006%3A+nightly+builds"></a>
+<h3 class="boxed">21 February 2006: nightly builds</h3>
+<p>Solr now has nightly builds.  This automatically creates a
+      <a 
href="http://people.apache.org/builds/lucene/solr/nightly/";>downloadable 
version of Solr every
+      night</a>.  All unit tests must pass, or a message is sent to
+      the developers mailing list and no new version is created.  This
+      also updates the <a href="api/index.html">javadoc</a>.</p>
+<a name="N100EC"></a><a 
name="17+January+2006%3A+Solr+Joins+Apache+Incubator"></a>
+<h3 class="boxed">17 January 2006: Solr Joins Apache Incubator</h3>
+<p>Solr, a search server based on Lucene, has been accepted into the Apache 
Incubator.
+            Solr was originally developed by CNET Networks, and is widely used 
within CNET
+            to provide high relevancy search and faceted browsing capabilities.
+            </p>
+</div>
+
+  
+</div>
+<!--+
+    |end content
+    +-->
+<div class="clearboth">&nbsp;</div>
+</div>
+<div id="footer">
+<!--+
+    |start bottomstrip
+    +-->
+<div class="lastmodified">
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<div class="copyright">
+        Copyright &copy;
+         2007 <a href="http://www.apache.org/licenses/";>The Apache Software 
Foundation.</a>
+</div>
+<div id="logos"></div>
+<!--+
+    |end bottomstrip
+    +-->
+</div>
+</body>
+</html>

Propchange: lucene/solr/trunk/src/test/test-files/htmlStripReaderTest.html
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/solr/trunk/src/test/test-files/htmlStripReaderTest.html
------------------------------------------------------------------------------
    svn:executable = *

Propchange: lucene/solr/trunk/src/test/test-files/htmlStripReaderTest.html
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL

svn commit: r609162 - in /lucene/solr/trunk: CHANGES.txt src/java/org/apache/solr/analysis/HTMLStripReader.java src/test/test-files/htmlStripReaderTest.html

Reply via email to