The following little program should do the job for
you.
/* HTMLTextStripper.java
* July 15, 2006
*/
import java.io.*;
import org.xml.sax.*;
import org.xml.sax.helpers.*;
import javax.xml.parsers.*;
/**
 * HTMLTextStripper removes markup from an (X)HTML string by feeding it to a
 * SAX parser and collecting everything reported through
 * {@link #characters(char[], int, int)}.
 *
 * <p>The input is parsed as XML, so markup that is not well-formed (for
 * example a missing end tag) stops the parse. In that case the text gathered
 * before the failure is still returned and the problem is available from
 * {@link #getErrorMessage()}.
 *
 * <p>Per-call state is kept in instance fields, so a single instance must not
 * be used by several threads at the same time.
 *
 * @author Charles Bell
 * @version July 15, 2006
 */
public class HTMLTextStripper extends DefaultHandler {

    /** Text gathered by characters() during the current stripText call. */
    private StringBuilder text = new StringBuilder();

    /** Problems recorded by the most recent stripText call; empty if none. */
    private String errorMessage = "";

    /**
     * Runs the built-in demonstration.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        new HTMLTextStripper().test();
    }

    /**
     * Strips a deliberately malformed sample document and prints the
     * extracted text followed by the recorded error message.
     */
    public void test() {
        System.out.println(stripText("<html><body>This is body text. "
                + "<p>This is paragraph text.</p><p>This is "
                + "malformed html because of no end p tag.</body></html>"));
        System.out.println("error: " + getErrorMessage());
    }

    /**
     * Returns the character data contained in the given markup.
     *
     * <p>Every call starts from a clean slate: neither the text nor the error
     * message of a previous call leaks into the result of this one.
     *
     * @param html markup to strip; {@code null} is treated as empty input
     * @return the text found before parsing finished or failed, never
     *         {@code null}
     */
    public String stripText(String html) {
        // Reset state so repeated calls on one instance are independent.
        text = new StringBuilder();
        errorMessage = "";

        if (html == null) {
            return "";
        }

        SAXParser parser = createParser();
        if (parser == null) {
            errorMessage = errorMessage + "XML Reader not initialized.";
            return text.toString();
        }

        try {
            parser.parse(new InputSource(new StringReader(html)), this);
        } catch (IOException ioe) {
            errorMessage = errorMessage + "IOException: " + ioe.getMessage();
        } catch (SAXException saxe) {
            // Not well-formed input ends up here; keep what was collected.
            errorMessage = errorMessage + "SAXException: " + saxe.getMessage();
        }
        return text.toString();
    }

    /**
     * Returns the problems recorded by the most recent
     * {@link #stripText(String)} call.
     *
     * @return the error text, or an empty string when the call succeeded
     */
    public String getErrorMessage() {
        return errorMessage;
    }

    /**
     * Called by the SAX parser when it encounters character data; appends
     * that data to the text being collected.
     *
     * @param ch     buffer holding the characters
     * @param start  index of the first character to use
     * @param length number of characters to use
     * @throws SAXException never thrown by this implementation
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        text.append(ch, start, length);
    }

    /**
     * Answers every external entity request with an empty source so the
     * parser never opens a DTD or entity location named by the input.
     *
     * @param publicId public identifier of the entity, may be {@code null}
     * @param systemId system identifier of the entity
     * @return an empty input source
     */
    @Override
    public InputSource resolveEntity(String publicId, String systemId) {
        return new InputSource(new StringReader(""));
    }

    /** Creates a SAX parser, or returns null (after logging to stderr) when none can be built. */
    private SAXParser createParser() {
        try {
            return SAXParserFactory.newInstance().newSAXParser();
        } catch (ParserConfigurationException pce) {
            System.err.println("ParserConfigurationException: " + pce.getMessage());
        } catch (SAXException saxe) {
            System.err.println("SAXException: " + saxe.getMessage());
        }
        return null;
    }
}
--- Ross Rankin <[EMAIL PROTECTED]> wrote:
> Since I cannot seem to access the HTMLParser mailing
> list and I saw the
> library recommended here, I thought someone here
> that has used it
> successfully can help me out.
>
> I have HTML text stored in a database field which I
> want to add to a
> Lucene document, but I want to remove the HTML tags,
> so HTMLParser
> looked like it would fit the bill.
>
>
>
> First, it does not seem to be parsing, hence my
> first problem and it
> also is throwing an exception along with this phrase
> sprinkled around
> (No such file or directory).
>
>
>
> I think I may be using it wrong, so here's what I
> have done. In my
> object where I create my document, I have the
> following code:
>
> StringExtractor extract = new StringExtractor(record.get("column14").toString().trim());
>
> try {
>
> value = extract.extractStrings(false);
>
> } catch (ParserException pe) {
>
> System.out.println("Index Long
> Description Parser
> Exception:" + pe.getMessage() );
>
> value = "";
>
> }
>
>
>
> What I get out in value is like the following:
>
> <LI><FONT size=2>Crystal Clear III and 3D combfilter
> for natural, sharp
> images with enhanced quality </FONT>
>
> <LI><FONT size=2>Compact and sleek design </FONT>
>
> <LI><FONT size=2>Incredible Surround (No such file
> or directory)
>
>
>
> So the tags are still there and oddly the (No such
> file or directory)
> phrase is added which is not in the original text.
>
>
>
> Then I get a ParserException.
>
>
>
> What am I doing wrong?
>
>
>
> Thanks,
>
> Ross
>
>
>
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]