>> While you are at it, perhaps it would be good to add support for
>> other META tags.

I posted that a while back. Here it is again. See the getMetaTags() method.
Mark Harwood

/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2001 The Apache Software Foundation. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 * if any, must include the following acknowledgment:
 * "This product includes software developed by the
 * Apache Software Foundation (http://www.apache.org/)."
 * Alternately, this acknowledgment may appear in the software itself,
 * if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
 * "Apache Lucene" must not be used to endorse or promote products
 * derived from this software without prior written permission. For
 * written permission, please contact [EMAIL PROTECTED]
 *
 * 5. Products derived from this software may not be called "Apache",
 * "Apache Lucene", nor may "Apache" appear in their name, without
 * prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.
IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation. For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

// HTMLParser.jj

options {
  STATIC = false;
  OPTIMIZE_TOKEN_MANAGER = true;
  //DEBUG_LOOKAHEAD = true;
  //DEBUG_TOKEN_MANAGER = true;
}

PARSER_BEGIN(HTMLParser)

package org.apache.lucene.HTMLParser;

import java.io.*;
import java.util.Properties;

/**
 * Parser for HTML text.  While parsing it collects the document title, a
 * summary of the leading text and the META tags, and writes the plain
 * text of the document to a pipe that callers read via
 * {@link #getReader()}.
 */
public class HTMLParser {
  /** Number of characters of text kept for the summary. */
  public static int SUMMARY_LENGTH = 200;

  /** Text collected while inside the TITLE element. */
  StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
  /** Leading non-title text, appended to until SUMMARY_LENGTH is reached. */
  StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
  /** META values keyed by the lower-cased NAME / HTTP-EQUIV attribute. */
  Properties metaTags = new Properties();
  /** Lower-cased NAME / HTTP-EQUIV of the META tag being parsed, or "". */
  String currentMetaTag = "";
  /** Number of characters written to the pipe so far. */
  int length = 0;
  /** Set once text has been seen after a non-empty title. */
  boolean titleComplete = false;
  boolean inTitle = false;
  /** True only between the tag name and the tag end of a META tag. */
  boolean inMetaTag = false;
  boolean inScript = false;
  /** True when the previous item in the document was a tag. */
  boolean afterTag = false;
  /** True when the last thing written was white space. */
  boolean afterSpace = false;
  String eol = System.getProperty("line.separator");

  PipedReader pipeIn = null;
  PipedWriter pipeOut;

  /**
   * Creates a parser that reads the given file.
   *
   * @param file the HTML file to parse
   * @throws FileNotFoundException if the file cannot be opened
   */
  public HTMLParser(File file) throws FileNotFoundException {
    this(new FileInputStream(file));
  }

  /**
   * Returns the trimmed title, blocking until the title is complete or
   * more than SUMMARY_LENGTH characters of text have been produced.
   */
  public String getTitle() throws IOException, InterruptedException {
    awaitTitle();
    return title.toString().trim();
  }

  /**
   * Returns the META tags collected so far, after blocking on the same
   * condition as {@link #getTitle()}.  The returned object is the live
   * table the parser keeps writing to, not a copy.
   */
  public Properties getMetaTags() throws IOException, InterruptedException {
    awaitTitle();
    return metaTags;
  }

  /**
   * Starts parsing if necessary and polls until the title is complete or
   * more than SUMMARY_LENGTH characters of text have been produced.
   */
  private void awaitTitle() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();                                // spawn parsing thread
    while (true) {
      synchronized(this) {
        if (titleComplete || (length > SUMMARY_LENGTH))
          break;
        wait(10);
      }
    }
  }

  /**
   * Returns up to SUMMARY_LENGTH characters of leading document text,
   * with the title removed from the front when the text starts with it.
   * Blocks until the summary buffer holds at least SUMMARY_LENGTH
   * characters.
   * NOTE(review): for a shorter document nothing in this file ends the
   * wait; presumably ParserThread does so when parsing finishes - confirm.
   */
  public String getSummary() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();                                // spawn parsing thread
    while (true) {
      synchronized(this) {
        if (summary.length() >= SUMMARY_LENGTH)
          break;
        wait(10);
      }
    }
    if (summary.length() > SUMMARY_LENGTH)
      summary.setLength(SUMMARY_LENGTH);

    String sum = summary.toString().trim();
    String tit = getTitle();
    if (sum.startsWith(tit))
      return sum.substring(tit.length());
    else
      return sum;
  }

  /**
   * Returns the reader end of the pipe the plain text is written to.
   * The first call creates the pipe and starts a ParserThread for this
   * parser; later calls return the same reader.
   */
  public Reader getReader() throws IOException {
    if (pipeIn == null) {
      pipeIn = new PipedReader();
      pipeOut = new PipedWriter(pipeIn);

      Thread thread = new ParserThread(this);
      thread.start();                             // start parsing
    }
    return pipeIn;
  }

  /**
   * Appends text to the summary while it is shorter than SUMMARY_LENGTH
   * and wakes waiting threads once that length is reached.
   */
  void addToSummary(String text) {
    if (summary.length() < SUMMARY_LENGTH) {
      summary.append(text);
      if (summary.length() >= SUMMARY_LENGTH) {
        synchronized(this) {
          notifyAll();
        }
      }
    }
  }

  /**
   * Adds a piece of document text to the title or the summary and writes
   * it to the pipe.  Text inside a SCRIPT element is dropped.
   */
  void addText(String text) throws IOException {
    if (inScript)
      return;
    if (inTitle)
      title.append(text);
    else {
      addToSummary(text);
      // StringBuffer does not override equals(), so comparing it with ""
      // can never detect an empty title; test the length instead.
      if (!titleComplete && title.length() != 0) { // finished title
        synchronized(this) {
          titleComplete = true;                   // tell waiting threads
          notifyAll();
        }
      }
    }

    length += text.length();
    pipeOut.write(text);

    afterSpace = false;
  }

  /**
   * Adds a single space for a run of white space, unless the previous
   * output was already white space.  The pipe receives a line separator
   * instead of a blank when the white space follows a tag.
   */
  void addSpace() throws IOException {
    if (inScript)
      return;
    if (!afterSpace) {
      if (inTitle)
        title.append(" ");
      else
        addToSummary(" ");

      String space = afterTag ? eol : " ";
      length += space.length();
      pipeOut.write(space);
      afterSpace = true;
    }
  }

//    void handleException(Exception e) {
//      System.out.println(e.toString());  // print the error message
//      System.out.println("Skipping...");
//      Token t;
//      do {
//        t = getNextToken();
//      } while (t.kind != TagEnd);
//    }
}

PARSER_END(HTMLParser)


// A document is any sequence of tags, declarations, comments and text.
void HTMLDocument() throws IOException :
{
  Token t;
}
{
//  try {
    ( Tag()         { afterTag = true; }
    | t=Decl()      { afterTag = true; }
    | CommentTag()  { afterTag = true; }
    | t=<Word>      { addText(t.image); afterTag = false; }
    | t=<Entity>    { addText(Entities.decode(t.image)); afterTag = false; }
    | t=<Punct>     { addText(t.image); afterTag = false; }
    | <Space>       { addSpace(); afterTag = false; }
    )* <EOF>
//  } catch (ParseException e) {
//    handleException(e);
//  }
}

// One tag with its attributes.  Tracks TITLE / SCRIPT state, emits the
// ALT text of IMG tags and records META tags.
void Tag() throws IOException :
{
  Token t1, t2;
  boolean inImg = false;
  String metaContent = null;      // CONTENT value of the current META tag
}
{
  t1=<TagName> {
    inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
    inMetaTag = t1.image.equalsIgnoreCase("<META"); // keep track if in <META>
    inImg = t1.image.equalsIgnoreCase("<img");     // keep track if in <IMG>
    if (inMetaTag)
      currentMetaTag = "";        // never reuse the name of an earlier tag
    if (inScript) {                                // keep track if in <SCRIPT>
      inScript = !t1.image.equalsIgnoreCase("</script");
    } else {
      inScript = t1.image.equalsIgnoreCase("<script");
    }
  }
  (t1=<ArgName>
   (<ArgEquals>
    (t2=ArgValue()
     {
       // save ALT text in IMG tag
       if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
         addText("[" + t2.image + "]");

       // Only remember the META attributes here; the pair is stored at
       // the end of the tag so that attribute order does not matter.
       if (inMetaTag && t2 != null) {
         if (t1.image.equalsIgnoreCase("name")
             || t1.image.equalsIgnoreCase("HTTP-EQUIV"))
           currentMetaTag = t2.image.toLowerCase();
         else if (t1.image.equalsIgnoreCase("content"))
           metaContent = t2.image;
       }
     }
    )?
   )?
  )*
  <TagEnd>
  {
    if (inMetaTag) {
      // META tags met inside a SCRIPT element are ignored, like all
      // other script content.
      if (!inScript && currentMetaTag.length() > 0 && metaContent != null)
        metaTags.setProperty(currentMetaTag, metaContent);
      // Text that follows the tag is ordinary document text again.
      inMetaTag = false;
    }
  }
}

// An attribute value: bare or quoted.  Returns the token holding the
// text, or null for an empty quoted value.
Token ArgValue() :
{
  Token t = null;
}
{
  t=<ArgValue>                              { return t; }
| LOOKAHEAD(2)
  <ArgQuote1> <CloseQuote1>                 { return t; }
| <ArgQuote1> t=<Quote1Text> <CloseQuote1>  { return t; }
| LOOKAHEAD(2)
  <ArgQuote2> <CloseQuote2>                 { return t; }
| <ArgQuote2> t=<Quote2Text> <CloseQuote2>  { return t; }
}

// A declaration such as <!DOCTYPE ...>.  Returns the name token.
Token Decl() :
{
  Token t;
}
{
  t=<DeclName> ( <ArgName> | ArgValue() | <ArgEquals> )* <TagEnd>
  { return t; }
}

// A comment in either of the two recognised forms.
void CommentTag() :
{}
{
  (<Comment1> ( <CommentText1> )* <CommentEnd1>)
 |
  (<Comment2> ( <CommentText2> )* <CommentEnd2>)
}


TOKEN :
{
  < TagName:  "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
| < DeclName: "<" "!" ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag

| < Comment1: "<!--" > : WithinComment1
| < Comment2: "<!" > : WithinComment2

| < Word:     ( <LET> | <LET> (["+","/"])+ | <NUM> ["\""] |
                <LET> ["-","'"] <LET> | ("$")? <NUM> [",","."] <NUM> )+ >
| < #LET:     ["A"-"Z","a"-"z","0"-"9"] >
| < #NUM:     ["0"-"9"] >

| < Entity:   ( "&" (["A"-"Z","a"-"z"])+ (";")? | "&" "#" (<NUM>)+ (";")? ) >

| < Space:    (<SP>)+ >
| < #SP:      [" ","\t","\r","\n"] >

| < Punct:    ~[] > // Keep this last. It is a catch-all.
}

<WithinTag> TOKEN:
{
  < ArgName:   (~[" ","\t","\r","\n","=",">","'","\""])
               (~[" ","\t","\r","\n","=",">"])* >
| < ArgEquals: "=" > : AfterEquals
| < TagEnd:    ">" | "=>" > : DEFAULT
}

<AfterEquals> TOKEN:
{
  < ArgValue:  (~[" ","\t","\r","\n","=",">","'","\""])
               (~[" ","\t","\r","\n",">"])* > : WithinTag
}

<WithinTag, AfterEquals> TOKEN:
{
  < ArgQuote1: "'" > : WithinQuote1
| < ArgQuote2: "\"" > : WithinQuote2
}

<WithinTag, AfterEquals> SKIP:
{
  < <Space> >
}

<WithinQuote1> TOKEN:
{
  < Quote1Text:  (~["'"])+ >
| < CloseQuote1: <ArgQuote1> > : WithinTag
}

<WithinQuote2> TOKEN:
{
  < Quote2Text:  (~["\""])+ >
| < CloseQuote2: <ArgQuote2> > : WithinTag
}

<WithinComment1> TOKEN :
{
  < CommentText1: (~["-"])+ | "-" >
| < CommentEnd1:  "-->" > : DEFAULT
}

<WithinComment2> TOKEN :
{
  < CommentText2: (~[">"])+ >
| < CommentEnd2:  ">" > : DEFAULT
}
