TIKA-1980 via Joseph Naegele

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/27bc383e
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/27bc383e
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/27bc383e

Branch: refs/heads/2.x
Commit: 27bc383ebb4b1c7162ec37f93a93001f01ed3f2a
Parents: 09bd22f
Author: tballison <talli...@mitre.org>
Authored: Fri Aug 12 12:45:36 2016 -0400
Committer: tballison <talli...@mitre.org>
Committed: Fri Aug 12 12:45:36 2016 -0400

----------------------------------------------------------------------
 .../apache/tika/sax/XHTMLContentHandler.java    |  6 ++--
 .../apache/tika/parser/html/HtmlParserTest.java | 31 +++++++++++++++++++
 .../resources/test-documents/testHTML_head.html | 32 ++++++++++++++++++++
 3 files changed, 66 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/27bc383e/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
index ada3367..9f73076 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
@@ -53,8 +53,7 @@ public class XHTMLContentHandler extends SafeContentHandler {
      * The elements that are in the <head> section.
      */
     private static final Set<String> HEAD =
-        unmodifiableSet("title", "link", "base", "meta");
-
+            unmodifiableSet("title", "link", "base", "meta", "script");
     /**
      * The elements that are automatically emitted by lazyStartHead, so
      * skip them if they get sent to startElement/endElement by mistake.
@@ -74,7 +73,8 @@ public class XHTMLContentHandler extends SafeContentHandler {
     public static final Set<String> ENDLINE = unmodifiableSet(
             "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
             "pre", "hr", "blockquote", "address", "fieldset", "table", "form",
-            "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option");
+            "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select",
+            "option", "link", "script");
 
     private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
 

http://git-wip-us.apache.org/repos/asf/tika/blob/27bc383e/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index a61d7f8..6b4788e 100644
--- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -1167,6 +1167,37 @@ public class HtmlParserTest extends TikaTest {
         assertEquals(url, links.get(0));
     }
 
+    @Test
+    public void testAllHeadElements() throws Exception {
+        //TIKA-1980
+        // IdentityHtmlMapper is needed to extract <script> tags
+        ParseContext context = new ParseContext();
+        context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "text/html");
+
+        final Map<String, Integer> tagFrequencies = new HashMap<>();
+
+        String path = "/test-documents/testHTML_head.html";
+        try (InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
+            ContentHandler tagCounter = new DefaultHandler() {
+                @Override
+                public void startElement(
+                        String uri, String local, String name, Attributes attributes)
+                        throws SAXException {
+
+                    int count = tagFrequencies.containsKey(name) ? tagFrequencies.get(name) : 0;
+                    tagFrequencies.put(name, count + 1);
+                }
+            };
+            new HtmlParser().parse(stream, tagCounter, metadata, context);
+        }
+
+        assertEquals(1, (int)tagFrequencies.get("title"));
+        assertEquals(9, (int)tagFrequencies.get("meta"));
+        assertEquals(12, (int)tagFrequencies.get("link"));
+        assertEquals(6, (int)tagFrequencies.get("script"));
+    }
 
     @Test
     public void testSkippingCommentsInEncodingDetection() throws Exception {

http://git-wip-us.apache.org/repos/asf/tika/blob/27bc383e/tika-test-resources/src/test/resources/test-documents/testHTML_head.html
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testHTML_head.html b/tika-test-resources/src/test/resources/test-documents/testHTML_head.html
new file mode 100644
index 0000000..d098ad2
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/testHTML_head.html
@@ -0,0 +1,32 @@
+<!DOCTYPE html>
+<head>
+  <meta charset="utf-8">
+  <meta http-equiv="x-dns-prefetch-control" content="on">
+  <meta name="viewport" id="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
+  <script src="https://tika.apache.org/logic0.js" async defer></script>
+  <script src="https://tika.apache.org/logic1.js" defer></script>
+  <script src="https://tika.apache.org/logic2.js" defer></script>
+  <link rel="stylesheet" href="https://tika.apache.org/style0.css">
+  <link rel="stylesheet" href="https://tika.apache.org/style1.css">
+  <link rel="stylesheet" href="https://tika.apache.org/style2.css">
+  <link rel="stylesheet" href="https://tika.apache.org/style3.css">
+  <link rel="stylesheet" href="https://tika.apache.org/style4.css">
+  <link rel="canonical" href="https://tika.apache.org/">
+  <link rel="mask-icon" sizes="any" href="https://tika.apache.org/images/tika.svg" color="#aaaaaa">
+  <link rel="icon" href="https://tika.apache.org/favicon.png">
+  <link rel="manifest" href="/manifest.json">
+  <meta name="application-name" content="Tika">
+  <meta name="referrer" content="origin">
+  <meta property="fb:app_id" content="111111111111">
+  <meta property="og:site_name" content="Tika">
+  <script src="https://tika.apache.org/logic3.js" defer></script>
+  <script src="https://tika.apache.org/logic4.js" defer></script>
+  <script src="https://tika.apache.org/logic5.js" defer></script>
+  <title>Apache Tika</title>
+  <link rel="alternate" hreflang="en" href="https://tika.apache.org/" />
+  <link rel="alternate" href="ios-app://111111111/tika/home" />
+  <link rel="alternate" href="android-app://org.apache/tika/" />
+</head>
+<body>
+</body>
+</html>

Reply via email to