otis 2002/06/29 15:08:27 Modified: src/demo/org/apache/lucene/demo/html HTMLParser.jj Log: - Improved HTML parser that allows one to get HTML document's meta tags' values. Submitted by: Mark Harwood Reviewed by: otis Revision Changes Path 1.2 +48 -5 jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj Index: HTMLParser.jj =================================================================== RCS file: /home/cvs/jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- HTMLParser.jj 26 Jan 2002 15:01:31 -0000 1.1 +++ HTMLParser.jj 29 Jun 2002 22:08:26 -0000 1.2 @@ -66,15 +66,20 @@ package org.apache.lucene.demo.html; import java.io.*; +import java.util.Properties; public class HTMLParser { public static int SUMMARY_LENGTH = 200; - + StringBuffer title = new StringBuffer(SUMMARY_LENGTH); StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2); + Properties metaTags=new Properties(); + String currentMetaTag=""; int length = 0; boolean titleComplete = false; boolean inTitle = false; + boolean inMetaTag = false; + boolean inStyle = false; boolean inScript = false; boolean afterTag = false; boolean afterSpace = false; @@ -99,6 +104,21 @@ return title.toString().trim(); } + public Properties getMetaTags() throws IOException, +InterruptedException { + if (pipeIn == null) + getReader(); // spawn parsing thread + while (true) { + synchronized(this) { + if (titleComplete || (length > SUMMARY_LENGTH)) + break; + wait(10); + } + } + return metaTags; + } + + public String getSummary() throws IOException, InterruptedException { if (pipeIn == null) getReader(); // spawn parsing thread @@ -124,7 +144,7 @@ if (pipeIn == null) { pipeIn = new PipedReader(); pipeOut = new PipedWriter(pipeIn); - + Thread thread = new ParserThread(this); thread.start(); // start parsing } @@ -146,6 +166,13 @@ void addText(String text) throws IOException { if (inScript) return; + if (inStyle) + return; + if (inMetaTag) + { + metaTags.setProperty(currentMetaTag, text); + return; + } if (inTitle) title.append(text); else { @@ -163,7 +190,7 @@ afterSpace = false; } - + void addSpace() throws IOException { if (inScript) return; @@ -172,7 +199,7 @@ title.append(" "); else addToSummary(" "); - + String space = afterTag ? eol : " "; length += space.length(); pipeOut.write(space); @@ -220,6 +247,8 @@ { t1=<TagName> { inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE> + inMetaTag = t1.image.equalsIgnoreCase("<META"); // keep track if in <META> + inStyle = t1.image.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE> inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in <IMG> if (inScript) { // keep track if in <SCRIPT> inScript = !t1.image.equalsIgnoreCase("</script"); @@ -233,6 +262,20 @@ { if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null) addText("[" + t2.image + "]"); + + if(inMetaTag && + ( t1.image.equalsIgnoreCase("name") || + t1.image.equalsIgnoreCase("HTTP-EQUIV") + ) + && t2 != null) + { + currentMetaTag=t2.image.toLowerCase(); + } + if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 != +null) + { + addText(t2.image); + } } )? )? @@ -272,7 +315,7 @@ | (<Comment2> ( <CommentText2> )* <CommentEnd2>) } - + TOKEN : {
-- To unsubscribe, e-mail: <mailto:[EMAIL PROTECTED]> For additional commands, e-mail: <mailto:[EMAIL PROTECTED]>