cvs commit: jakarta-lucene/src/demo/org/apache/lucene/demo/html HTMLParser.jj

otis Sat, 29 Jun 2002 14:48:31 -0700

otis        2002/06/29 15:08:27

  Modified:    src/demo/org/apache/lucene/demo/html HTMLParser.jj
  Log:
  - Improved HTML parser that allows one to get HTML document's meta tags' values.
  Submitted by: Mark Harwood
  Reviewed by:  otis
  
  Revision  Changes    Path
  1.2       +48 -5     jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj
  
  Index: HTMLParser.jj
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- HTMLParser.jj     26 Jan 2002 15:01:31 -0000      1.1
  +++ HTMLParser.jj     29 Jun 2002 22:08:26 -0000      1.2
  @@ -66,15 +66,20 @@
   package org.apache.lucene.demo.html;
   
   import java.io.*;
  +import java.util.Properties;
   
   public class HTMLParser {
     public static int SUMMARY_LENGTH = 200;
  -  
  +
     StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
     StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
  +  Properties metaTags=new Properties();
  +  String currentMetaTag="";
     int length = 0;
     boolean titleComplete = false;
     boolean inTitle = false;
  +  boolean inMetaTag = false;
  +  boolean inStyle = false;
     boolean inScript = false;
     boolean afterTag = false;
     boolean afterSpace = false;
  @@ -99,6 +104,21 @@
       return title.toString().trim();
     }
   
  +  public Properties getMetaTags() throws IOException,
  +InterruptedException {
  +    if (pipeIn == null)
  +      getReader();                             // spawn parsing thread
  +    while (true) {
  +      synchronized(this) {
  +     if (titleComplete || (length > SUMMARY_LENGTH))
  +       break;
  +     wait(10);
  +      }
  +    }
  +    return metaTags;
  +  }
  +
  +
     public String getSummary() throws IOException, InterruptedException {
       if (pipeIn == null)
         getReader();                             // spawn parsing thread
  @@ -124,7 +144,7 @@
       if (pipeIn == null) {
         pipeIn = new PipedReader();
         pipeOut = new PipedWriter(pipeIn);
  -      
  +
         Thread thread = new ParserThread(this);
         thread.start();                                  // start parsing
       }
  @@ -146,6 +166,13 @@
     void addText(String text) throws IOException {
       if (inScript)
         return;
  +    if (inStyle)
  +      return;
  +    if (inMetaTag)
  +    {
  +     metaTags.setProperty(currentMetaTag, text);
  +             return;
  +    }
       if (inTitle)
         title.append(text);
       else {
  @@ -163,7 +190,7 @@
   
       afterSpace = false;
     }
  -  
  +
     void addSpace() throws IOException {
       if (inScript)
         return;
  @@ -172,7 +199,7 @@
        title.append(" ");
         else
        addToSummary(" ");
  -      
  +
         String space = afterTag ? eol : " ";
         length += space.length();
         pipeOut.write(space);
  @@ -220,6 +247,8 @@
   {
     t1=<TagName> {
       inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
  +    inMetaTag = t1.image.equalsIgnoreCase("<META"); // keep track if in <META>
  +    inStyle = t1.image.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
       inImg = t1.image.equalsIgnoreCase("<img");         // keep track if in <IMG>
       if (inScript) {                            // keep track if in <SCRIPT>
         inScript = !t1.image.equalsIgnoreCase("</script");
  @@ -233,6 +262,20 @@
        {
          if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
            addText("[" + t2.image + "]");
  +
  +     if(inMetaTag &&
  +                     (  t1.image.equalsIgnoreCase("name") ||
  +                        t1.image.equalsIgnoreCase("HTTP-EQUIV")
  +                     )
  +        && t2 != null)
  +     {
  +             currentMetaTag=t2.image.toLowerCase();
  +     }
  +     if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
  +null)
  +     {
  +             addText(t2.image);
  +     }
        }
       )?
      )?
  @@ -272,7 +315,7 @@
    |
     (<Comment2> ( <CommentText2> )* <CommentEnd2>)
   }
  -  
  +
   
   TOKEN :
   {


--
To unsubscribe, e-mail:   <mailto:[EMAIL PROTECTED]>
For additional commands, e-mail: <mailto:[EMAIL PROTECTED]>

cvs commit: jakarta-lucene/src/demo/org/apache/lucene/demo/html HTMLParser.jj

Reply via email to