While you are at it, perhaps it would be good to add support for
other META tags, such as "robots", especially since people are working
on adding a web crawler component to Lucene.
Thanks,
Otis
--- Daniel Calvo <[EMAIL PROTECTED]> wrote:
> Hi,
>
> I was playing with HTMLParser.jj and made some changes you might be
> interested in. What I did was start handling <META> tags (added
> new methods: getAuthor, getKeywords and getMetadata and changed
> getSummary to check if there's any metadata item with
> name=="description"). I'm also filtering out any text inside
> <STYLE>...</STYLE> (like <SCRIPT> is being handled).
> I've performed some tests and I believe I didn't break anything ;-)
>
> The patch is as follows
>
> Best regards,
>
> --Daniel
>
> Index: HTMLParser.jj
> ===================================================================
> RCS file: /home/cvspublic/jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj,v
> retrieving revision 1.1
> diff -u -r1.1 HTMLParser.jj
> --- HTMLParser.jj 26 Jan 2002 15:01:31 -0000 1.1
> +++ HTMLParser.jj 15 Feb 2002 20:39:49 -0000
> @@ -66,6 +66,8 @@
> package org.apache.lucene.demo.html;
>
> import java.io.*;
> +import java.util.Map;
> +import java.util.HashMap;
>
> public class HTMLParser {
> public static int SUMMARY_LENGTH = 200;
> @@ -76,11 +78,13 @@
> boolean titleComplete = false;
> boolean inTitle = false;
> boolean inScript = false;
> + boolean inStyle = false;
> boolean afterTag = false;
> boolean afterSpace = false;
> String eol = System.getProperty("line.separator");
> PipedReader pipeIn = null;
> PipedWriter pipeOut;
> + HashMap metadata = new HashMap(7);
>
> public HTMLParser(File file) throws FileNotFoundException {
> this(new FileInputStream(file));
> @@ -109,15 +113,60 @@
> wait(10);
> }
> }
> - if (summary.length() > SUMMARY_LENGTH)
> - summary.setLength(SUMMARY_LENGTH);
> + // look in metadata
> + String description = (String) metadata.get("description");
> + if (description != null)
> + return description;
> + else {
> + if (summary.length() > SUMMARY_LENGTH)
> + summary.setLength(SUMMARY_LENGTH);
> +
> + String sum = summary.toString().trim();
> + String tit = getTitle();
> + if (sum.startsWith(tit))
> + return sum.substring(tit.length());
> + else
> + return sum;
> + }
> + }
> +
> + public String getAuthor() throws IOException, InterruptedException
> {
> + if (pipeIn == null)
> + getReader(); // spawn parsing thread
> + while (true) {
> + synchronized(this) {
> + if (summary.length() > 0) // assume that all metadata
> + break; // has already been collected
> + wait(10);
> + }
> + }
> + return (String)metadata.get("author");
> + }
> +
> + public String getKeywords() throws IOException,
> InterruptedException {
> + if (pipeIn == null)
> + getReader(); // spawn parsing thread
> + while (true) {
> + synchronized(this) {
> + if (summary.length() > 0) // assume that all metadata
> + break; // has already been collected
> + wait(10);
> + }
> + }
> + return (String)metadata.get("keywords");
> + }
>
> - String sum = summary.toString().trim();
> - String tit = getTitle();
> - if (sum.startsWith(tit))
> - return sum.substring(tit.length());
> - else
> - return sum;
> + public Map getMetadata() throws IOException, InterruptedException
> {
> + if (pipeIn == null)
> + getReader(); // spawn parsing thread
> + while (true) {
> + synchronized(this) {
> + if (summary.length() > 0) // assume that all metadata
> + break; // has already been collected
> + wait(10);
> + }
> + }
> + return metadata;
> }
>
> public Reader getReader() throws IOException {
> @@ -144,7 +193,7 @@
> }
>
> void addText(String text) throws IOException {
> - if (inScript)
> + if (inScript || inStyle)
> return;
> if (inTitle)
> title.append(text);
> @@ -165,7 +214,7 @@
> }
>
> void addSpace() throws IOException {
> - if (inScript)
> + if (inScript || inStyle)
> return;
> if (!afterSpace) {
> if (inTitle)
> @@ -216,23 +265,38 @@
> {
> Token t1, t2;
> boolean inImg = false;
> + boolean inMeta = false;
> + String name = null;
> + String content = null;
> }
> {
> t1=<TagName> {
> - inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if
> in <TITLE>
> - inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in
> <IMG>
> - if (inScript) { // keep track if in <SCRIPT>
> + inTitle = t1.image.equalsIgnoreCase("<title"); // keep track
> if in <TITLE>
> + inImg = t1.image.equalsIgnoreCase("<img"); // keep track
> if in <IMG>
> + inMeta = t1.image.equalsIgnoreCase("<meta"); // keep track
> if in <META>
> + if (inScript) { // keep track
> if in <SCRIPT>
> inScript = !t1.image.equalsIgnoreCase("</script");
> } else {
> inScript = t1.image.equalsIgnoreCase("<script");
> }
> + if (inStyle) { // keep track
> if in <STYLE>
> + inStyle = !t1.image.equalsIgnoreCase("</style");
> + } else {
> + inStyle = t1.image.equalsIgnoreCase("<style");
> + }
> }
> (t1=<ArgName>
> (<ArgEquals>
> - (t2=ArgValue() // save ALT text in IMG tag
> + (t2=ArgValue()
> {
> if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
> - addText("[" + t2.image + "]");
> + addText("[" + t2.image + "]"); // save ALT text in
> IMG tag
> + if (inMeta && t1.image.equalsIgnoreCase("name") && t2 !=
> null)
> + name = t2.image.toLowerCase(); // save name in META
> tag
> + if (inMeta && t1.image.equalsIgnoreCase("content") && t2 !=
> null)
> + content = t2.image; // save content in
> META tag
> + if (inMeta && name != null && content != null)
> + metadata.put(name, content); // save metadata
> }
> )?
> )?
>
>
> --
> To unsubscribe, e-mail:
> <mailto:[EMAIL PROTECTED]>
> For additional commands, e-mail:
> <mailto:[EMAIL PROTECTED]>
>
__________________________________________________
Do You Yahoo!?
Got something to say? Say it better with Yahoo! Video Mail
http://mail.yahoo.com
--
To unsubscribe, e-mail: <mailto:[EMAIL PROTECTED]>
For additional commands, e-mail: <mailto:[EMAIL PROTECTED]>