Author: kkrugler
Date: Fri Aug 13 00:25:12 2010
New Revision: 985052

URL: http://svn.apache.org/viewvc?rev=985052&view=rev
Log:
TIKA-463: emit <img> tags with resolved URLs for src attribute.

Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java?rev=985052&r1=985051&r2=985052&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java Fri Aug 13 00:25:12 2010
@@ -16,60 +16,97 @@
  */
 package org.apache.tika.parser.html;
 
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
 /**
  * The default HTML mapping rules in Tika.
  *
  * @since Apache Tika 0.6
  */
+@SuppressWarnings("serial")
 public class DefaultHtmlMapper implements HtmlMapper {
 
+    // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
+    private static final Map<String, String> SAFE_ELEMENTS = new HashMap<String, String>() {{
+        put("H1", "h1");
+        put("H2", "h2");
+        put("H3", "h3");
+        put("H4", "h4");
+        put("H5", "h5");
+        put("H6", "h6");
+
+        put("P", "p");
+        put("PRE", "pre");
+        put("BLOCKQUOTE", "blockquote");
+
+        put("UL", "ul");
+        put("OL", "ol");
+        put("MENU", "ul");
+        put("LI", "li");
+        put("DL", "dl");
+        put("DT", "dt");
+        put("DD", "dd");
+
+        put("TABLE", "table");
+        put("THEAD", "thead");
+        put("TBODY", "tbody");
+        put("TR", "tr");
+        put("TH", "th");
+        put("TD", "td");
+
+        put("ADDRESS", "address");
+        
+        // TIKA-463 - add additional elements that contain URLs
+        put("AREA", "area");
+        put("IMG", "img");
+
+    }};
+    
+    private static final Set<String> DISCARDABLE_ELEMENTS = new HashSet<String>() {{
+        add("STYLE");
+        add("SCRIPT");
+    }};
+
+    private static final Map<String, Set<String>> SAFE_ATTRIBUTES = new HashMap<String, Set<String>>() {{
+        put("a", attrSet("rel", "name"));
+        put("img", attrSet("src"));
+        // TODO KKr - fill out this set.
+    }};
+    
+    private static Set<String> attrSet(String... attrs) {
+        Set<String> result = new HashSet<String>();
+        for (String attr : attrs) {
+            result.add(attr);
+        }
+        return result;
+    }
+    
     /**
      * @since Apache Tika 0.8
      */
     public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
 
     public String mapSafeElement(String name) {
-        // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
-
-        if ("H1".equals(name)) return "h1";
-        if ("H2".equals(name)) return "h2";
-        if ("H3".equals(name)) return "h3";
-        if ("H4".equals(name)) return "h4";
-        if ("H5".equals(name)) return "h5";
-        if ("H6".equals(name)) return "h6";
-
-        if ("P".equals(name)) return "p";
-        if ("PRE".equals(name)) return "pre";
-        if ("BLOCKQUOTE".equals(name)) return "blockquote";
-
-        if ("UL".equals(name)) return "ul";
-        if ("OL".equals(name)) return "ol";
-        if ("MENU".equals(name)) return "ul";
-        if ("LI".equals(name)) return "li";
-        if ("DL".equals(name)) return "dl";
-        if ("DT".equals(name)) return "dt";
-        if ("DD".equals(name)) return "dd";
-
-        if ("TABLE".equals(name)) return "table";
-        if ("THEAD".equals(name)) return "thead";
-        if ("TBODY".equals(name)) return "tbody";
-        if ("TR".equals(name)) return "tr";
-        if ("TH".equals(name)) return "th";
-        if ("TD".equals(name)) return "td";
-
-        if ("ADDRESS".equals(name)) return "address";
-
-        return null;
+        return SAFE_ELEMENTS.get(name);
     }
 
-    /** Normalises an attribute name. Assumes that the element name 
-     * is valid and normalised **/
+    /** Normalizes an attribute name. Assumes that the element name 
+     * is valid and normalized 
+     */
     public String mapSafeAttribute(String elementName, String attributeName) {
-        return null;
-    }    
+        Set<String> safeAttrs = SAFE_ATTRIBUTES.get(elementName);
+        if ((safeAttrs != null) && safeAttrs.contains(attributeName)) {
+            return attributeName;
+        } else {
+            return null;
+        }
+    }
     
     public boolean isDiscardElement(String name) {
-        return "STYLE".equals(name) || "SCRIPT".equals(name);
+        return DISCARDABLE_ELEMENTS.contains(name);
     }
 
 }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=985052&r1=985051&r2=985052&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Fri Aug 13 00:25:12 2010
@@ -130,11 +130,19 @@ class HtmlHandler extends TextContentHan
                     xhtml.startElement(safe);
                 } else {
                     AttributesImpl newAttributes = new AttributesImpl(atts);
-                    for (int att=0;att<newAttributes.getLength();att++){
+                    for (int att = 0; att < newAttributes.getLength(); att++) {
                         String normAttrName = mapper.mapSafeAttribute(safe, newAttributes.getLocalName(att));
-                        if (normAttrName==null){
+                        if (normAttrName == null) {
                             newAttributes.removeAttribute(att);
                             att--;
+                        } else {
+                            // We have a remapped attribute name, so set it as it might have changed.
+                            newAttributes.setLocalName(att, normAttrName);
+                            
+                            // And resolve relative links for the src attribute.
+                            if (normAttrName.equals("src")) {
+                                newAttributes.setValue(att, resolve(newAttributes.getValue(att).trim()));
+                            }
                         }
                     }
                     xhtml.startElement(safe, newAttributes);

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=985052&r1=985051&r2=985052&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 Fri Aug 13 00:25:12 2010
@@ -450,6 +450,32 @@ public class HtmlParserTest extends Test
 
     }
 
+    /**
+     * Test case for TIKA-463. Don't skip elements that have URLs.
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+     */
+    public void testImgUrlExtraction() throws Exception {
+        final String test = "<html><head><title>Title</title>" +
+        "<base href=\"http://domain.com\" />" +
+        "</head><body><img src=\"image.jpg\" /></body></html>";
+
+        SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");
+        StringWriter sw = new StringWriter();
+        handler.setResult(new StreamResult(sw));
+
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes("UTF-8")),
+                handler, new Metadata(), new ParseContext());
+
+        String result = sw.toString();
+        
+        // <img> tag should exist, with fully resolved URL
+        assertTrue(Pattern.matches("(?s).*<img src=\"http://domain.com/image.jpg\"/>.*$", result));
+    }
 
 
 }


Reply via email to