Author: kkrugler
Date: Fri Aug 13 17:09:54 2010
New Revision: 985288

URL: http://svn.apache.org/viewvc?rev=985288&view=rev
Log:
TIKA-457: Fix frameset handling (both general, and for broken HTML)

This also is part of TIKA-463, as it improves handling of <frame> elements,
such that you now get them in the output with resolved src=xxx URL attributes.

Modified:
    
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java?rev=985288&r1=985287&r2=985288&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java 
(original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java 
Fri Aug 13 17:09:54 2010
@@ -59,13 +59,13 @@ public class XHTMLContentHandler extends
      * skip them if they get sent to startElement/endElement by mistake.
      */
     private static final Set<String> AUTO =
-        unmodifiableSet("html", "head", "body");
+        unmodifiableSet("html", "head", "body", "frameset");
 
     /**
      * The elements that get prepended with the {@link #TAB} character.
      */
     private static final Set<String> INDENT =
-        unmodifiableSet("li", "dd", "dt", "td", "th");
+        unmodifiableSet("li", "dd", "dt", "td", "th", "frame");
 
     /**
      * The elements that get appended with the {@link #NL} character.
@@ -93,7 +93,8 @@ public class XHTMLContentHandler extends
      */
     private boolean headStarted = false;
     private boolean headEnded = false;
-
+    private boolean useFrameset = false;
+    
     public XHTMLContentHandler(ContentHandler handler, Metadata metadata) {
         super(handler);
         this.metadata = metadata;
@@ -138,14 +139,15 @@ public class XHTMLContentHandler extends
      *   &lt;head&gt;
      *     &lt;title&gt;...&lt;/title&gt;
      *   &lt;/head&gt;
-     *   &lt;body&gt;
+     *   &lt;body&gt; (or &lt;frameset&gt;
      * </pre>
      */
-    private void lazyEndHead() throws SAXException {
+    private void lazyEndHead(boolean isFrameset) throws SAXException {
         lazyStartHead();
         
         if (!headEnded) {
             headEnded = true;
+            useFrameset = isFrameset;
             
             // TIKA-478: Emit all metadata values (other than title). We have 
to call
             // startElement() and characters() directly to avoid recursive 
problems.
@@ -156,7 +158,8 @@ public class XHTMLContentHandler extends
                 
                 for (String value : metadata.getValues(name)) {
                     AttributesImpl attributes = new AttributesImpl();
-                    attributes.addAttribute("", name, name, "CDATA", value);
+                    attributes.addAttribute("", "name", "name", "CDATA", name);
+                    attributes.addAttribute("", "content", "content", "CDATA", 
value);
                     super.startElement(XHTML, "meta", "meta", attributes);
                     super.endElement(XHTML, "meta", "meta");
                 }
@@ -172,7 +175,12 @@ public class XHTMLContentHandler extends
             super.endElement(XHTML, "title", "title");
             
             super.endElement(XHTML, "head", "head");
-            super.startElement(XHTML, "body", "body", EMPTY_ATTRIBUTES);
+            
+            if (useFrameset) {
+                super.startElement(XHTML, "frameset", "frameset", 
EMPTY_ATTRIBUTES);
+            } else {
+                super.startElement(XHTML, "body", "body", EMPTY_ATTRIBUTES);
+            }
         }
     }
 
@@ -186,9 +194,14 @@ public class XHTMLContentHandler extends
      */
     @Override
     public void endDocument() throws SAXException {
-        lazyEndHead();
+        lazyEndHead(useFrameset);
+        
+        if (useFrameset) {
+            super.endElement(XHTML, "frameset", "frameset");
+        } else {
+            super.endElement(XHTML, "body", "body");
+        }
         
-        super.endElement(XHTML, "body", "body");
         super.endElement(XHTML, "html", "html");
         
         endPrefixMapping("");
@@ -204,11 +217,13 @@ public class XHTMLContentHandler extends
             String uri, String local, String name, Attributes attributes)
             throws SAXException {
         
-        if (!AUTO.contains(name)) {
+        if (name.equals("frameset")) {
+            lazyEndHead(true);
+        } else if (!AUTO.contains(name)) {
             if (HEAD.contains(name)) {
                 lazyStartHead();
             } else {
-                lazyEndHead();
+                lazyEndHead(false);
             }
 
             if (XHTML.equals(uri) && INDENT.contains(name)) {
@@ -238,7 +253,7 @@ public class XHTMLContentHandler extends
      */
     @Override
     public void characters(char[] ch, int start, int length) throws 
SAXException {
-        lazyEndHead();
+        lazyEndHead(useFrameset);
         super.characters(ch, start, length);
     }
 

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java?rev=985288&r1=985287&r2=985288&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
 Fri Aug 13 17:09:54 2010
@@ -62,6 +62,8 @@ public class DefaultHtmlMapper implement
         // TIKA-463 - add additional elements that contain URLs
         put("AREA", "area");
         put("IMG", "img");
+        put("FRAMESET", "frameset");
+        put("FRAME", "frame");
 
     }};
     
@@ -73,6 +75,7 @@ public class DefaultHtmlMapper implement
     private static final Map<String, Set<String>> SAFE_ATTRIBUTES = new 
HashMap<String, Set<String>>() {{
         put("a", attrSet("rel", "name"));
         put("img", attrSet("src"));
+        put("frame", attrSet("src"));
         // TODO KKr - fill out this set.
     }};
     

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=985288&r1=985287&r2=985288&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
 Fri Aug 13 17:09:54 2010
@@ -79,7 +79,7 @@ class HtmlHandler extends TextContentHan
         if ("TITLE".equals(name) || titleLevel > 0) {
             titleLevel++;
         }
-        if ("BODY".equals(name) || bodyLevel > 0) {
+        if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0) 
{
             bodyLevel++;
         }
         if (mapper.isDiscardElement(name) || discardLevel > 0) {

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=985288&r1=985287&r2=985288&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 Fri Aug 13 17:09:54 2010
@@ -477,5 +477,88 @@ public class HtmlParserTest extends Test
         assertTrue(Pattern.matches("(?s).*<img 
src=\"http://domain.com/image.jpg\"/>.*$", result));
     }
 
+    /**
+     * Test case for TIKA-463. Don't skip elements that have URLs.
+     * @see <a 
href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+     */
+    public void testFrameSrcExtraction() throws Exception {
+        final String test = "<html><head><title>Title</title>" +
+        "<base href=\"http://domain.com\" />" +
+        "</head><frameset><frame src=\"frame.html\" /></frameset></html>";
+
+        SAXTransformerFactory factory = 
(SAXTransformerFactory)SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, 
"utf-8");
+        StringWriter sw = new StringWriter();
+        handler.setResult(new StreamResult(sw));
+
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes("UTF-8")),
+                handler, new Metadata(), new ParseContext());
+
+        String result = sw.toString();
+        
+        // <frame> tag should exist, with fully resolved URL
+        assertTrue(Pattern.matches("(?s).*<frame 
src=\"http://domain.com/frame.html\"/>.*$", result));
+    }
+
+    /**
+     * Test case for TIKA-457. Better handling for broken HTML that has 
<frameset> inside of <body>.
+     * @see <a 
href="https://issues.apache.org/jira/browse/TIKA-457">TIKA-457</a>
+     */
+    public void testFBrokenrameset() throws Exception {
+        final String test2 = "<html><head><title> my title 
</title></head><body>" +
+            "<frameset rows=\"20,*\"><frame src=\"top.html\"></frame>" +
+            "<frameset cols=\"20,*\"><frame src=\"left.html\"></frame>" +
+            "<frame src=\"invalid.html\"/></frame>" +
+            "<frame src=\"right.html\"></frame>" +
+            "</frameset></frameset></body></html>";
+
+        SAXTransformerFactory factory = 
(SAXTransformerFactory)SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, 
"utf-8");
+        
+        final String test1 = "<html><head><title>Title</title>" +
+        "<base href=\"http://domain.com\" />" +
+        "</head><body><frameset><frame src=\"frame.html\" 
/></frameset></body></html>";
+
+        StringWriter sw1 = new StringWriter();
+        handler.setResult(new StreamResult(sw1));
+
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test1.getBytes("UTF-8")),
+                handler, new Metadata(), new ParseContext());
+
+        String result = sw1.toString();
+        
+        // <frame> tag should exist, with fully resolved URL
+        assertTrue(Pattern.matches("(?s).*<frame 
src=\"http://domain.com/frame.html\"/>.*$", result));
+        
+        // <body> tag should not exist.
+        assertFalse(Pattern.matches("(?s).*<body>.*$", result));
+
+        StringWriter sw2 = new StringWriter();
+        handler.setResult(new StreamResult(sw2));
+
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test2.getBytes("UTF-8")),
+                handler, new Metadata(), new ParseContext());
+
+        result = sw2.toString();
+        
+        // <frame> tags should exist, with relative URL (no base element 
specified)
+        assertTrue(Pattern.matches("(?s).*<frame src=\"top.html\"/>.*$", 
result));
+        assertTrue(Pattern.matches("(?s).*<frame src=\"left.html\"/>.*$", 
result));
+        assertTrue(Pattern.matches("(?s).*<frame src=\"invalid.html\"/>.*$", 
result));
+        assertTrue(Pattern.matches("(?s).*<frame src=\"right.html\"/>.*$", 
result));
+
+        // <body> tag should not exist.
+        assertFalse(Pattern.matches("(?s).*<body>.*$", result));
+    }
+
 
 }


Reply via email to