Author: kkrugler
Date: Fri Aug 13 00:25:12 2010
New Revision: 985052
URL: http://svn.apache.org/viewvc?rev=985052&view=rev
Log:
TIKA-463: emit <img> tags with resolved URLs for src attribute.
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java?rev=985052&r1=985051&r2=985052&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java Fri Aug 13 00:25:12 2010
@@ -16,60 +16,97 @@
*/
package org.apache.tika.parser.html;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
/**
* The default HTML mapping rules in Tika.
*
* @since Apache Tika 0.6
*/
+@SuppressWarnings("serial")
public class DefaultHtmlMapper implements HtmlMapper {
+ // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
+ private static final Map<String, String> SAFE_ELEMENTS = new HashMap<String, String>() {{
+ put("H1", "h1");
+ put("H2", "h2");
+ put("H3", "h3");
+ put("H4", "h4");
+ put("H5", "h5");
+ put("H6", "h6");
+
+ put("P", "p");
+ put("PRE", "pre");
+ put("BLOCKQUOTE", "blockquote");
+
+ put("UL", "ul");
+ put("OL", "ol");
+ put("MENU", "ul");
+ put("LI", "li");
+ put("DL", "dl");
+ put("DT", "dt");
+ put("DD", "dd");
+
+ put("TABLE", "table");
+ put("THEAD", "thead");
+ put("TBODY", "tbody");
+ put("TR", "tr");
+ put("TH", "th");
+ put("TD", "td");
+
+ put("ADDRESS", "address");
+
+ // TIKA-463 - add additional elements that contain URLs
+ put("AREA", "area");
+ put("IMG", "img");
+
+ }};
+
+ private static final Set<String> DISCARDABLE_ELEMENTS = new HashSet<String>() {{
+ add("STYLE");
+ add("SCRIPT");
+ }};
+
+ private static final Map<String, Set<String>> SAFE_ATTRIBUTES = new HashMap<String, Set<String>>() {{
+ put("a", attrSet("rel", "name"));
+ put("img", attrSet("src"));
+ // TODO KKr - fill out this set.
+ }};
+
+ private static Set<String> attrSet(String... attrs) {
+ Set<String> result = new HashSet<String>();
+ for (String attr : attrs) {
+ result.add(attr);
+ }
+ return result;
+ }
+
/**
* @since Apache Tika 0.8
*/
public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
public String mapSafeElement(String name) {
- // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
-
- if ("H1".equals(name)) return "h1";
- if ("H2".equals(name)) return "h2";
- if ("H3".equals(name)) return "h3";
- if ("H4".equals(name)) return "h4";
- if ("H5".equals(name)) return "h5";
- if ("H6".equals(name)) return "h6";
-
- if ("P".equals(name)) return "p";
- if ("PRE".equals(name)) return "pre";
- if ("BLOCKQUOTE".equals(name)) return "blockquote";
-
- if ("UL".equals(name)) return "ul";
- if ("OL".equals(name)) return "ol";
- if ("MENU".equals(name)) return "ul";
- if ("LI".equals(name)) return "li";
- if ("DL".equals(name)) return "dl";
- if ("DT".equals(name)) return "dt";
- if ("DD".equals(name)) return "dd";
-
- if ("TABLE".equals(name)) return "table";
- if ("THEAD".equals(name)) return "thead";
- if ("TBODY".equals(name)) return "tbody";
- if ("TR".equals(name)) return "tr";
- if ("TH".equals(name)) return "th";
- if ("TD".equals(name)) return "td";
-
- if ("ADDRESS".equals(name)) return "address";
-
- return null;
+ return SAFE_ELEMENTS.get(name);
}
- /** Normalises an attribute name. Assumes that the element name
- * is valid and normalised **/
+ /** Normalizes an attribute name. Assumes that the element name
+ * is valid and normalized
+ */
public String mapSafeAttribute(String elementName, String attributeName) {
- return null;
- }
+ Set<String> safeAttrs = SAFE_ATTRIBUTES.get(elementName);
+ if ((safeAttrs != null) && safeAttrs.contains(attributeName)) {
+ return attributeName;
+ } else {
+ return null;
+ }
+ }
public boolean isDiscardElement(String name) {
- return "STYLE".equals(name) || "SCRIPT".equals(name);
+ return DISCARDABLE_ELEMENTS.contains(name);
}
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=985052&r1=985051&r2=985052&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Fri Aug 13 00:25:12 2010
@@ -130,11 +130,19 @@ class HtmlHandler extends TextContentHan
xhtml.startElement(safe);
} else {
AttributesImpl newAttributes = new AttributesImpl(atts);
- for (int att=0;att<newAttributes.getLength();att++){
+ for (int att = 0; att < newAttributes.getLength(); att++) {
String normAttrName = mapper.mapSafeAttribute(safe,
newAttributes.getLocalName(att));
- if (normAttrName==null){
+ if (normAttrName == null) {
newAttributes.removeAttribute(att);
att--;
+ } else {
+ // We have a remapped attribute name, so set it as it might have changed.
+ newAttributes.setLocalName(att, normAttrName);
+
+ // And resolve relative links for the src attribute.
+ if (normAttrName.equals("src")) {
+ newAttributes.setValue(att, resolve(newAttributes.getValue(att).trim()));
+ }
}
}
xhtml.startElement(safe, newAttributes);
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=985052&r1=985051&r2=985052&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Fri Aug 13 00:25:12 2010
@@ -450,6 +450,32 @@ public class HtmlParserTest extends Test
}
+ /**
+ * Test case for TIKA-463. Don't skip elements that have URLs.
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+ */
+ public void testImgUrlExtraction() throws Exception {
+ final String test = "<html><head><title>Title</title>" +
+ "<base href=\"http://domain.com\" />" +
+ "</head><body><img src=\"image.jpg\" /></body></html>";
+
+ SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+ handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");
+ StringWriter sw = new StringWriter();
+ handler.setResult(new StreamResult(sw));
+
+ new HtmlParser().parse(
+ new ByteArrayInputStream(test.getBytes("UTF-8")),
+ handler, new Metadata(), new ParseContext());
+
+ String result = sw.toString();
+
+ // <img> tag should exist, with fully resolved URL
+ assertTrue(Pattern.matches("(?s).*<img src=\"http://domain.com/image.jpg\"/>.*$", result));
+ }
}