Author: kkrugler
Date: Thu Aug 12 21:42:55 2010
New Revision: 984997

URL: http://svn.apache.org/viewvc?rev=984997&view=rev
Log:
TIKA-478: Fix handling of <head> elements in HTML parser, and improve 
robustness of XHTMLContentHandler.

Also tried to fix apparent bug w/indent & newline support in 
XHTMLContentHandler.

Modified:
    
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java?rev=984997&r1=984996&r2=984997&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java 
(original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java 
Thu Aug 12 21:42:55 2010
@@ -49,6 +49,19 @@ public class XHTMLContentHandler extends
     private static final char[] TAB = new char[] { '\t' };
 
     /**
+     * The elements that are in the <head> section.
+     */
+    private static final Set<String> HEAD =
+        unmodifiableSet("title", "link", "base", "meta");
+
+    /**
+     * The elements that are automatically emitted by lazyStartHead, so
+     * skip them if they get sent to startElement/endElement by mistake.
+     */
+    private static final Set<String> AUTO =
+        unmodifiableSet("html", "head", "body");
+
+    /**
      * The elements that get prepended with the {@link #TAB} character.
      */
     private static final Set<String> INDENT =
@@ -62,6 +75,8 @@ public class XHTMLContentHandler extends
             "pre", "hr", "blockquote", "address", "fieldset", "table", "form",
             "noscript", "li", "dt", "dd", "noframes", "br", "tr");
 
+    private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+
     private static Set<String> unmodifiableSet(String... elements) {
         return Collections.unmodifiableSet(
                 new HashSet<String>(Arrays.asList(elements)));
@@ -74,9 +89,10 @@ public class XHTMLContentHandler extends
     private final Metadata metadata;
 
     /**
-     * Flag to indicate whether the document element has been started.
+     * Flags to indicate whether the document head element has been started/ended.
      */
-    private boolean started = false;
+    private boolean headStarted = false;
+    private boolean headEnded = false;
 
     public XHTMLContentHandler(ContentHandler handler, Metadata metadata) {
         super(handler);
@@ -104,19 +120,59 @@ public class XHTMLContentHandler extends
      *   &lt;body&gt;
      * </pre>
      */
-    private void lazyStartDocument() throws SAXException {
-        if (!started) {
-            started = true;
-            startElement("html");
-            startElement("head");
-            startElement("title");
+    private void lazyStartHead() throws SAXException {
+        if (!headStarted) {
+            headStarted = true;
+            
+            // Call directly, so we don't go through our startElement(), which will
+            // ignore these elements.
+            super.startElement(XHTML, "html", "html", EMPTY_ATTRIBUTES);
+            super.startElement(XHTML, "head", "head", EMPTY_ATTRIBUTES);
+        }
+    }
+
+    /**
+     * Generates the following XHTML prefix when called for the first time:
+     * <pre>
+     * &lt;html&gt;
+     *   &lt;head&gt;
+     *     &lt;title&gt;...&lt;/title&gt;
+     *   &lt;/head&gt;
+     *   &lt;body&gt;
+     * </pre>
+     */
+    private void lazyEndHead() throws SAXException {
+        lazyStartHead();
+        
+        if (!headEnded) {
+            headEnded = true;
+            
+            // TIKA-478: Emit all metadata values (other than title). We have to call
+            // startElement() and characters() directly to avoid recursive problems.
+            for (String name : metadata.names()) {
+                if (name.equals("title")) {
+                    continue;
+                }
+                
+                for (String value : metadata.getValues(name)) {
+                    AttributesImpl attributes = new AttributesImpl();
+                    attributes.addAttribute("", name, name, "CDATA", value);
+                    super.startElement(XHTML, "meta", "meta", attributes);
+                    super.endElement(XHTML, "meta", "meta");
+                }
+            }
+            
+            super.startElement(XHTML, "title", "title", EMPTY_ATTRIBUTES);
             String title = metadata.get(Metadata.TITLE);
             if (title != null && title.length() > 0) {
-                characters(title);
+                char[] titleChars = title.toCharArray();
+                super.characters(titleChars, 0, titleChars.length);
             }
-            endElement("title");
-            endElement("head");
-            startElement("body");
+            
+            super.endElement(XHTML, "title", "title");
+            
+            super.endElement(XHTML, "head", "head");
+            super.startElement(XHTML, "body", "body", EMPTY_ATTRIBUTES);
         }
     }
 
@@ -130,7 +186,8 @@ public class XHTMLContentHandler extends
      */
     @Override
     public void endDocument() throws SAXException {
-        lazyStartDocument();
+        lazyEndHead();
+        
         endElement("body");
         endElement("html");
         endPrefixMapping("");
@@ -145,11 +202,20 @@ public class XHTMLContentHandler extends
     public void startElement(
             String uri, String local, String name, Attributes attributes)
             throws SAXException {
-        lazyStartDocument();
-        if (XHTML.equals(uri) && INDENT.contains(local)) {
-            ignorableWhitespace(TAB, 0, TAB.length);
+        
+        if (!AUTO.contains(name)) {
+            if (HEAD.contains(name)) {
+                lazyStartHead();
+            } else {
+                lazyEndHead();
+            }
+
+            if (XHTML.equals(uri) && INDENT.contains(name)) {
+                ignorableWhitespace(TAB, 0, TAB.length);
+            }
+            
+            super.startElement(uri, local, name, attributes);
         }
-        super.startElement(uri, local, name, attributes);
     }
 
     /**
@@ -157,11 +223,12 @@ public class XHTMLContentHandler extends
      * by a newline character.
      */
     @Override
-    public void endElement(String uri, String local, String name)
-            throws SAXException {
-        super.endElement(uri, local, name);
-        if (XHTML.equals(uri) && ENDLINE.contains(local)) {
-            newline();
+    public void endElement(String uri, String local, String name) throws 
SAXException {
+        if (!AUTO.contains(name)) {
+            super.endElement(uri, local, name);
+            if (XHTML.equals(uri) && ENDLINE.contains(name)) {
+                newline();
+            }
         }
     }
 
@@ -169,16 +236,15 @@ public class XHTMLContentHandler extends
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
      */
     @Override
-    public void characters(char[] ch, int start, int length)
-            throws SAXException {
-        lazyStartDocument();
+    public void characters(char[] ch, int start, int length) throws 
SAXException {
+        lazyEndHead();
         super.characters(ch, start, length);
     }
 
     //------------------------------------------< public convenience methods >
 
     public void startElement(String name) throws SAXException {
-        startElement(XHTML, name, name, new AttributesImpl());
+        startElement(XHTML, name, name, EMPTY_ATTRIBUTES);
     }
 
     public void startElement(String name, String attribute, String value)

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=984997&r1=984996&r2=984997&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
 Thu Aug 12 21:42:55 2010
@@ -88,18 +88,20 @@ class HtmlHandler extends TextContentHan
 
         if (bodyLevel == 0 && discardLevel == 0) {
             if ("META".equals(name) && atts.getValue("content") != null) {
+                
+                // TIKA-478: For cases where we have either a name or "http-equiv", assume
+                // that XHTMLContentHandler will emit these in the <head>, thus passing them
+                // through safely.
                 if (atts.getValue("http-equiv") != null) {
                     metadata.set(
                             atts.getValue("http-equiv"),
                             atts.getValue("content"));
-                    xhtml.startElement(uri, local, "meta", atts);
-                }
-                if (atts.getValue("name") != null) {
+                } else if (atts.getValue("name") != null) {
                     // Record the meta tag in the metadata
                     metadata.set(
                             atts.getValue("name"),
                             atts.getValue("content"));
-                    // Normalise if possible
+                    // Normalize if possible
                     if(atts.getValue("name").equalsIgnoreCase("ICBM")) {
                         Matcher m = Pattern.compile(
                               "\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*"
@@ -109,8 +111,6 @@ class HtmlHandler extends TextContentHan
                             metadata.set(Metadata.LONGITUDE, m.group(2));
                         }
                     }
-                    // Allow downstream processing
-                    xhtml.startElement(uri, local, "meta", atts);
                 }
             } else if ("BASE".equals(name) && atts.getValue("href") != null) {
                 metadata.set(
@@ -126,8 +126,9 @@ class HtmlHandler extends TextContentHan
             String safe = mapper.mapSafeElement(name);
             if (safe != null) {
                 // check if there are any attributes to process
-                if (atts.getLength()==0) xhtml.startElement(safe);
-                else {
+                if (atts.getLength() == 0) {
+                    xhtml.startElement(safe);
+                } else {
                     AttributesImpl newAttributes = new AttributesImpl(atts);
                     for (int att=0;att<newAttributes.getLength();att++){
                         String normAttrName = mapper.mapSafeAttribute(safe, 
newAttributes.getLocalName(att));
@@ -164,8 +165,6 @@ class HtmlHandler extends TextContentHan
                 xhtml.endElement("link");
             } else if ("BASE".equals(name)) {
                 xhtml.endElement("base");
-            } else if ("META".equals(name)) {
-                xhtml.endElement("meta");
             }
         }
         if (bodyLevel > 0 && discardLevel == 0) {

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=984997&r1=984996&r2=984997&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 Thu Aug 12 21:42:55 2010
@@ -22,6 +22,7 @@ import java.io.InputStream;
 import java.io.StringWriter;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.regex.Pattern;
 
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.sax.SAXTransformerFactory;
@@ -405,5 +406,45 @@ public class HtmlParserTest extends Test
     }
     
     
+    /**
+     * Test case for TIKA-478. Don't emit <head> sub-elements inside of <body>.
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-478">TIKA-478</a>
+     */
+    public void testElementOrdering() throws Exception {
+        final String test = "<html><head><title>Title</title>" +
+        "<meta http-equiv=\"content-type\" content=\"text/html\">" +
+        "<link rel=\"next\" href=\"next.html\" />" +
+        "</head><body><p>Simple Content</p></body></html>";
+
+        SAXTransformerFactory factory = 
(SAXTransformerFactory)SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, 
"utf-8");
+        StringWriter sw = new StringWriter();
+        handler.setResult(new StreamResult(sw));
+
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes("UTF-8")),
+                handler, new Metadata(), new ParseContext());
+
+        String result = sw.toString();
+        
+        // Title element in <head> section
+        
assertTrue(Pattern.matches("(?s)<html.*<head>.*<title>Title</title>.*</head>.*$",
 result));
+
+        // No meta elements in body
+        assertFalse(Pattern.matches("(?s).*<body>.*<meta. *</body>.*$", 
result));
+        
+        // meta elements should show up in <head> section
+        assertTrue(Pattern.matches("(?s)<html.*<head>.*<meta .*</head>.*$", 
result));
+        
+        // No link elements in body
+        assertFalse(Pattern.matches("(?s).*<body>.*<link .*</body>.*$", 
result));
+        
+        // link element should be in <head> section
+        assertTrue(Pattern.matches("(?s)<html.*<head>.*<link .*</head>.*$", 
result));
+    }
+
 
 }


Reply via email to