Author: jukka
Date: Fri Oct 16 09:32:02 2009
New Revision: 825821

URL: http://svn.apache.org/viewvc?rev=825821&view=rev
Log:
TIKA-287: HtmlParser should resolve relative paths in <a href="xxx"> elements

Also move the <title/> and <meta/> tag handling to the Body/HtmlHandler class

Added:
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
      - copied, changed from r825818, 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BodyHandler.java
Removed:
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BodyHandler.java
Modified:
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java

Copied: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
 (from r825818, 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BodyHandler.java)
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?p2=lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java&p1=lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BodyHandler.java&r1=825818&r2=825821&rev=825821&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BodyHandler.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
 Fri Oct 16 09:32:02 2009
@@ -21,12 +21,14 @@
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.tika.metadata.Metadata;
 import org.apache.tika.sax.TextContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-class BodyHandler extends TextContentHandler {
+class HtmlHandler extends TextContentHandler {
 
     /**
      * Set of safe mappings from incoming HTML elements to outgoing
@@ -71,54 +73,103 @@
 
     private final XHTMLContentHandler xhtml;
 
+    private final Metadata metadata;
+
+    private int bodyLevel = 0;
+
     private int discardLevel = 0;
 
-    public BodyHandler(XHTMLContentHandler xhtml) {
+    private int titleLevel = 0;
+
+    private StringBuilder title = new StringBuilder();
+
+    private HtmlHandler(XHTMLContentHandler xhtml, Metadata metadata) {
         super(xhtml);
         this.xhtml = xhtml;
+        this.metadata = metadata;
+    }
+
+    public HtmlHandler(ContentHandler handler, Metadata metadata) {
+        this(new XHTMLContentHandler(handler, metadata), metadata);
     }
 
     @Override
     public void startElement(
             String uri, String local, String name, Attributes atts)
             throws SAXException {
-        if (discardLevel != 0) {
+        if ("TITLE".equals(name) || titleLevel > 0) {
+            titleLevel++;
+        }
+        if ("BODY".equals(name) || bodyLevel > 0) {
+            bodyLevel++;
+        }
+        if (DISCARD_ELEMENTS.contains(name) || discardLevel > 0) {
             discardLevel++;
-        } else if (DISCARD_ELEMENTS.contains(name)) {
-            discardLevel = 1;
-        } else if (SAFE_ELEMENTS.containsKey(name)) {
-            xhtml.startElement(SAFE_ELEMENTS.get(name));
-        } else if ("A".equals(name)) {
-            String href = atts.getValue("href");
-            if (href != null) {
-                xhtml.startElement("a", "href", href);
-            } else {
-                String anchor = atts.getValue("name");
-                if (anchor != null) {
-                    xhtml.startElement("a", "name", anchor);
+        }
+
+        if (bodyLevel == 0 && discardLevel == 0 && "META".equals(name)) {
+            String content = atts.getValue("content");
+            if (atts.getValue("http-equiv") != null && content != null) {
+                metadata.set(atts.getValue("http-equiv"), content);
+            }
+            if (atts.getValue("name") != null && content != null) {
+                metadata.set(atts.getValue("name"), content);
+            }
+        }
+
+        if (bodyLevel > 0 && discardLevel == 0) {
+            if (SAFE_ELEMENTS.containsKey(name)) {
+                xhtml.startElement(SAFE_ELEMENTS.get(name));
+            } else if ("A".equals(name)) {
+                String href = atts.getValue("href");
+                if (href != null) {
+                    xhtml.startElement("a", "href", href);
                 } else {
-                    xhtml.startElement("a");
+                    String anchor = atts.getValue("name");
+                    if (anchor != null) {
+                        xhtml.startElement("a", "name", anchor);
+                    } else {
+                        xhtml.startElement("a");
+                    }
                 }
             }
         }
+
+        title.setLength(0);
     }
 
     @Override
     public void endElement(
             String uri, String local, String name) throws SAXException {
-        if (discardLevel != 0) {
+        if (bodyLevel > 0 && discardLevel == 0) {
+            if (SAFE_ELEMENTS.containsKey(name)) {
+                xhtml.endElement(SAFE_ELEMENTS.get(name));
+            } else if ("A".equals(name)) {
+                xhtml.endElement("a");
+            }
+        }
+
+        if (titleLevel > 0) {
+            titleLevel--;
+            if (titleLevel == 0) {
+                metadata.set(Metadata.TITLE, title.toString().trim());
+            }
+        }
+        if (bodyLevel > 0) {
+            bodyLevel--;
+        }
+        if (discardLevel > 0) {
             discardLevel--;
-        } else if (SAFE_ELEMENTS.containsKey(name)) {
-            xhtml.endElement(SAFE_ELEMENTS.get(name));
-        } else if ("A".equals(name)) {
-            xhtml.endElement("a");
         }
     }
 
     @Override
     public void characters(char[] ch, int start, int length)
             throws SAXException {
-        if (discardLevel == 0) {
+        if (titleLevel > 0 && bodyLevel == 0) {
+            title.append(ch, start, length);
+        }
+        if (bodyLevel > 0 && discardLevel == 0) {
             super.characters(ch, start, length);
         }
     }
@@ -126,7 +177,7 @@
     @Override
     public void ignorableWhitespace(char[] ch, int start, int length)
             throws SAXException {
-        if (discardLevel == 0) {
+        if (bodyLevel > 0 && discardLevel == 0) {
             super.ignorableWhitespace(ch, start, length);
         }
     }

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=825821&r1=825820&r2=825821&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
 Fri Oct 16 09:32:02 2009
@@ -25,13 +25,7 @@
 import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.TeeContentHandler;
-import org.apache.tika.sax.WriteOutContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.tika.sax.xpath.Matcher;
-import org.apache.tika.sax.xpath.MatchingContentHandler;
-import org.apache.tika.sax.xpath.XPathParser;
-import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
@@ -59,15 +53,7 @@
 
         // Prepare the HTML content handler that generates proper
         // XHTML events and records relevant document metadata
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        XPathParser xpath = new XPathParser(null, "");
-        Matcher body = xpath.parse("/HTML/BODY//node()");
-        Matcher title = xpath.parse("/HTML/HEAD/TITLE//node()");
-        Matcher meta = xpath.parse("/HTML/HEAD/META//node()");
-        handler = new TeeContentHandler(
-                new MatchingContentHandler(new BodyHandler(xhtml), body),
-                new MatchingContentHandler(getTitleHandler(metadata), title),
-                new MatchingContentHandler(getMetaHandler(metadata), meta));
+        handler = new HtmlHandler(handler, metadata);
 
         // Parse the HTML document
         org.ccil.cowan.tagsoup.Parser parser =
@@ -86,29 +72,4 @@
         parse(stream, handler, metadata, context);
     }
 
-    private ContentHandler getTitleHandler(final Metadata metadata) {
-        return new WriteOutContentHandler() {
-            @Override
-            public void endElement(String u, String l, String n) {
-                metadata.set(Metadata.TITLE, toString());
-            }
-        };
-    }
-
-    private ContentHandler getMetaHandler(final Metadata metadata) {
-        return new WriteOutContentHandler() {
-            @Override
-            public void startElement(
-                    String uri, String local, String name, Attributes atts)
-                    throws SAXException {
-                    if (atts.getValue("http-equiv") != null) {
-                        metadata.set(atts.getValue("http-equiv"), atts.getValue("content"));
-                    }
-                    if (atts.getValue("name") != null) {
-                        metadata.set(atts.getValue("name"), atts.getValue("content"));
-                    }
-            }
-        };
-    }
-
 }


Reply via email to