TIKA-1980: add "script" to the XHTMLContentHandler HEAD element set and "link"/"script" to the ENDLINE set (via Joseph Naegele)
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/27bc383e Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/27bc383e Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/27bc383e Branch: refs/heads/2.x Commit: 27bc383ebb4b1c7162ec37f93a93001f01ed3f2a Parents: 09bd22f Author: tballison <talli...@mitre.org> Authored: Fri Aug 12 12:45:36 2016 -0400 Committer: tballison <talli...@mitre.org> Committed: Fri Aug 12 12:45:36 2016 -0400 ---------------------------------------------------------------------- .../apache/tika/sax/XHTMLContentHandler.java | 6 ++-- .../apache/tika/parser/html/HtmlParserTest.java | 31 +++++++++++++++++++ .../resources/test-documents/testHTML_head.html | 32 ++++++++++++++++++++ 3 files changed, 66 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/27bc383e/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java index ada3367..9f73076 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java @@ -53,8 +53,7 @@ public class XHTMLContentHandler extends SafeContentHandler { * The elements that are in the <head> section. */ private static final Set<String> HEAD = - unmodifiableSet("title", "link", "base", "meta"); - + unmodifiableSet("title", "link", "base", "meta", "script"); /** * The elements that are automatically emitted by lazyStartHead, so * skip them if they get sent to startElement/endElement by mistake. 
@@ -74,7 +73,8 @@ public class XHTMLContentHandler extends SafeContentHandler { public static final Set<String> ENDLINE = unmodifiableSet( "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl", "pre", "hr", "blockquote", "address", "fieldset", "table", "form", - "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option"); + "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", + "option", "link", "script"); private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl(); http://git-wip-us.apache.org/repos/asf/tika/blob/27bc383e/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index a61d7f8..6b4788e 100644 --- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -1167,6 +1167,37 @@ public class HtmlParserTest extends TikaTest { assertEquals(url, links.get(0)); } + @Test + public void testAllHeadElements() throws Exception { + //TIKA-1980 + // IdentityHtmlMapper is needed to extract <script> tags + ParseContext context = new ParseContext(); + context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE); + Metadata metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "text/html"); + + final Map<String, Integer> tagFrequencies = new HashMap<>(); + + String path = "/test-documents/testHTML_head.html"; + try (InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) { + ContentHandler tagCounter = new DefaultHandler() { + @Override + public void startElement( + String uri, String local, String 
name, Attributes attributes) + throws SAXException { + + int count = tagFrequencies.containsKey(name) ? tagFrequencies.get(name) : 0; + tagFrequencies.put(name, count + 1); + } + }; + new HtmlParser().parse(stream, tagCounter, metadata, context); + } + + assertEquals(1, (int)tagFrequencies.get("title")); + assertEquals(9, (int)tagFrequencies.get("meta")); + assertEquals(12, (int)tagFrequencies.get("link")); + assertEquals(6, (int)tagFrequencies.get("script")); + } @Test public void testSkippingCommentsInEncodingDetection() throws Exception { http://git-wip-us.apache.org/repos/asf/tika/blob/27bc383e/tika-test-resources/src/test/resources/test-documents/testHTML_head.html ---------------------------------------------------------------------- diff --git a/tika-test-resources/src/test/resources/test-documents/testHTML_head.html b/tika-test-resources/src/test/resources/test-documents/testHTML_head.html new file mode 100644 index 0000000..d098ad2 --- /dev/null +++ b/tika-test-resources/src/test/resources/test-documents/testHTML_head.html @@ -0,0 +1,32 @@ +<!DOCTYPE html> +<head> + <meta charset="utf-8"> + <meta http-equiv="x-dns-prefetch-control" content="on"> + <meta name="viewport" id="viewport" content="width=device-width, initial-scale=1, user-scalable=no"> + <script src="https://tika.apache.org/logic0.js" async defer></script> + <script src="https://tika.apache.org/logic1.js" defer></script> + <script src="https://tika.apache.org/logic2.js" defer></script> + <link rel="stylesheet" href="https://tika.apache.org/style0.css"> + <link rel="stylesheet" href="https://tika.apache.org/style1.css"> + <link rel="stylesheet" href="https://tika.apache.org/style2.css"> + <link rel="stylesheet" href="https://tika.apache.org/style3.css"> + <link rel="stylesheet" href="https://tika.apache.org/style4.css"> + <link rel="canonical" href="https://tika.apache.org/"> + <link rel="mask-icon" sizes="any" href="https://tika.apache.org/images/tika.svg" color="#aaaaaa"> + <link rel="icon" 
href="https://tika.apache.org/favicon.png"> + <link rel="manifest" href="/manifest.json"> + <meta name="application-name" content="Tika"> + <meta name="referrer" content="origin"> + <meta property="fb:app_id" content="111111111111"> + <meta property="og:site_name" content="Tika"> + <script src="https://tika.apache.org/logic3.js" defer></script> + <script src="https://tika.apache.org/logic4.js" defer></script> + <script src="https://tika.apache.org/logic5.js" defer></script> + <title>Apache Tika</title> + <link rel="alternate" hreflang="en" href="https://tika.apache.org/" /> + <link rel="alternate" href="ios-app://111111111/tika/home" /> + <link rel="alternate" href="android-app://org.apache/tika/" /> +</head> +<body> +</body> +</html>