Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,1114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.html; + +import static java.nio.charset.StandardCharsets.ISO_8859_1; +import static java.nio.charset.StandardCharsets.US_ASCII; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.tika.TikaTest.assertContains; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +import javax.xml.transform.OutputKeys; +import javax.xml.transform.sax.SAXTransformerFactory; +import javax.xml.transform.sax.TransformerHandler; +import javax.xml.transform.stream.StreamResult; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.StringWriter; +import java.io.Writer; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Geographic; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.LinkContentHandler; +import org.apache.tika.sax.TeeContentHandler; +import org.ccil.cowan.tagsoup.HTMLSchema; +import org.ccil.cowan.tagsoup.Schema; +import org.junit.Ignore; +import org.junit.Test; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +public class HtmlParserTest { + + @Test + public void testParseAscii() throws Exception { + String path = "/test-documents/testHTML.html"; + final StringWriter href = new StringWriter(); + final StringWriter name = new StringWriter(); + ContentHandler body = new BodyContentHandler(); + Metadata metadata = new Metadata(); + try (InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) { + ContentHandler link = new DefaultHandler() { + @Override + public void startElement( + String u, String l, String n, Attributes a) + throws SAXException { + if ("a".equals(l)) { + if (a.getValue("href") != null) { + href.append(a.getValue("href")); + } else if (a.getValue("name") != null) { + name.append(a.getValue("name")); + } + } + } + }; + new HtmlParser().parse( + stream, new TeeContentHandler(body, link), + metadata, new ParseContext()); + } + + assertEquals( + "Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Tika Developers", metadata.get("Author")); + assertEquals("5", metadata.get("refresh")); + + assertEquals("51.2312", metadata.get(Geographic.LATITUDE)); + assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE)); + + assertEquals("http://www.apache.org/", href.toString()); + assertEquals("test-anchor", name.toString()); + + String content = body.toString(); + assertTrue( + "Did not contain expected text:" + "Test Indexation Html", + content.contains("Test Indexation Html")); + assertTrue( + "Did not contain expected text:" + "Indexation du fichier", + content.contains("Indexation du fichier")); + } + + @Test + @Ignore("The file 'testXHTML_utf8.html' is not available fo testing") + public void XtestParseUTF8() throws IOException, SAXException, TikaException { + String path = "/test-documents/testXHTML_utf8.html"; + Metadata metadata = new Metadata(); + String content = new Tika().parseToString( + HtmlParserTest.class.getResourceAsStream(path), metadata); + + assertTrue("Did not contain expected text:" + + "Title : Tilte with UTF-8 chars âââ§ââ¢", content + .contains("Title : Tilte with UTF-8 chars âââ§ââ¢")); + + assertTrue("Did not contain expected text:" + + "Content with UTF-8 chars", content + .contains("Content with UTF-8 chars")); + + assertTrue("Did not contain expected text:" + "ââ¢â§ââ", content + .contains("ââ¢â§ââ")); + } + + @Test + public void testXhtmlParsing() throws Exception { + String path = "/test-documents/testXHTML.html"; + Metadata metadata = new Metadata(); + String content = new Tika().parseToString( + HtmlParserTest.class.getResourceAsStream(path), metadata); + + //can't specify charset because default differs between OS's + assertTrue(metadata.get(Metadata.CONTENT_TYPE).startsWith("application/xhtml+xml; charset=")); + assertEquals("XHTML test document", metadata.get(TikaCoreProperties.TITLE)); + + assertEquals("Tika Developers", metadata.get("Author")); + assertEquals("5", metadata.get("refresh")); + assertContains("ability of Apache Tika", content); + assertContains("extract content", content); + assertContains("an XHTML document", content); + } + + @Test + public void testParseEmpty() throws Exception { + ContentHandler handler = new BodyContentHandler(); + new HtmlParser().parse( + new ByteArrayInputStream(new byte[0]), + handler, new Metadata(), new ParseContext()); + assertEquals("", handler.toString()); + } + + /** + * Test case for TIKA-210 + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a> + */ + @Test + public void testCharactersDirectlyUnderBodyElement() throws Exception { + String test = "<html><body>test</body></html>"; + String content = new Tika().parseToString( + new ByteArrayInputStream(test.getBytes(UTF_8))); + assertEquals("test", content); + } + + /** + * Test case for TIKA-287 + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-287">TIKA-287</a> + */ + @Test + public void testBaseHref() throws Exception { + assertRelativeLink( + "http://lucene.apache.org/tika/", + "http://lucene.apache.org/", "tika/"); + + assertRelativeLink( + "http://domain.com/?pid=1", + "http://domain.com", "?pid=1"); + assertRelativeLink( + "http://domain.com/?pid=2", + "http://domain.com?pid=1", "?pid=2"); + + assertRelativeLink( + "http://domain.com/file.html", + "http://domain.com/path/", "/file.html"); + assertRelativeLink( + "http://domain.com/path/file.html", + "http://domain.com/path/", "./file.html"); + assertRelativeLink( + "http://domain.com/path/file.html", + "http://domain.com/path/", "file.html"); + + assertRelativeLink( + "http://domain2.com/newpath", + "http://domain.com/path/to/file", "http://domain2.com/newpath"); + + // See http://www.communities.hp.com/securitysoftware/blogs/jeff/archive/2007/12/19/RFC-1808-vs-2396-vs-3986_3A00_-Browsers-vs.-programing-languages.aspx + // Also http://www.ietf.org/rfc/rfc3986.txt + // Also http://issues.apache.org/jira/browse/NUTCH-566 + // Also http://issues.apache.org/jira/browse/NUTCH-436 + assertRelativeLink( + "http://domain.com/path/?pid=1", + "http://domain.com/path/", "?pid=1"); + assertRelativeLink( + "http://domain.com/file?pid=1", + "http://domain.com/file", "?pid=1"); + assertRelativeLink( + "http://domain.com/path/d;p?pid=1", + "http://domain.com/path/d;p?q#f", "?pid=1"); + } + + private void assertRelativeLink(String url, String base, String relative) + throws Exception { + String test = + "<html><head><base href=\"" + base + "\"></head>" + + "<body><a href=\"" + relative + "\">test</a></body></html>"; + final List<String> links = new ArrayList<String>(); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + new DefaultHandler() { + @Override + public void startElement( + String u, String l, String name, Attributes atts) { + if (name.equals("a") && atts.getValue("", "href") != null) { + links.add(atts.getValue("", "href")); + } + } + }, + new Metadata(), + new ParseContext()); + assertEquals(1, links.size()); + assertEquals(url, links.get(0)); + } + + /** + * Test case for TIKA-268 + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-268">TIKA-268</a> + */ + @Test + public void testWhitespaceBetweenTableCells() throws Exception { + String test = + "<html><body><table><tr><td>a</td><td>b</td></table></body></html>"; + String content = new Tika().parseToString( + new ByteArrayInputStream(test.getBytes(UTF_8))); + assertContains("a", content); + assertContains("b", content); + assertFalse(content.contains("ab")); + } + + /** + * Test case for TIKA-332 + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-332">TIKA-332</a> + */ + @Test + public void testHttpEquivCharset() throws Exception { + String test = + "<html><head><meta http-equiv=\"content-type\"" + + " content=\"text/html; charset=ISO-8859-1\" />" + + "<title>the name is \u00e1ndre</title>" + + "</head><body></body></html>"; + Metadata metadata = new Metadata(); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(ISO_8859_1)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); + } + + /** + * Test case for TIKA-892 + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-892">TIKA-892</a> + */ + @Test + public void testHtml5Charset() throws Exception { + String test = + "<html><head><meta charset=\"ISO-8859-15\" />" + + "<title>the name is \u00e1ndre</title>" + + "</head><body></body></html>"; + Metadata metadata = new Metadata(); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(ISO_8859_1)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); + } + + /** + * Test case for TIKA-334 + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a> + */ + @Test + public void testDetectOfCharset() throws Exception { + String test = + "<html><head><title>\u017d</title></head><body></body></html>"; + Metadata metadata = new Metadata(); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("\u017d", metadata.get(TikaCoreProperties.TITLE)); + } + + /** + * Test case for TIKA-341 + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a> + */ + @Test + public void testUsingCharsetInContentTypeHeader() throws Exception { + final String test = + "<html><head><title>the name is \u00e1ndre</title></head>" + + "<body></body></html>"; + + Metadata metadata = new Metadata(); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); + + metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1"); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(ISO_8859_1)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); + } + + /** + * Test case for HTML content like + * ">div<foo>br<bar>/div>" that should result + * in three whitespace-separated tokens "foo", "bar" and "baz" instead + * of a single token "foobarbaz". + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-343">TIKA-343</a> + */ + @Test + public void testLineBreak() throws Exception { + String test = "<html><body><div>foo<br>bar</div>baz</body></html>"; + String text = new Tika().parseToString( + new ByteArrayInputStream(test.getBytes(US_ASCII))); + String[] parts = text.trim().split("\\s+"); + assertEquals(3, parts.length); + assertEquals("foo", parts[0]); + assertEquals("bar", parts[1]); + assertEquals("baz", parts[2]); + } + + /** + * Test case for TIKA-339: Don't use language returned by CharsetDetector + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a> + */ + @Test + public void testIgnoreCharsetDetectorLanguage() throws Exception { + String test = "<html><title>Simple Content</title><body></body></html>"; + Metadata metadata = new Metadata(); + metadata.add(Metadata.CONTENT_LANGUAGE, "en"); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + new BodyContentHandler(), metadata, new ParseContext()); + + assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE)); + } + + /** + * Test case for TIKA-349 + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-349">TIKA-349</a> + */ + @Test + public void testHttpEquivCharsetFunkyAttributes() throws Exception { + String test1 = + "<html><head><meta http-equiv=\"content-type\"" + + " content=\"text/html; charset=ISO-8859-15; charset=iso-8859-15\" />" + + "<title>the name is \u00e1ndre</title>" + + "</head><body></body></html>"; + Metadata metadata = new Metadata(); + new HtmlParser().parse( + new ByteArrayInputStream(test1.getBytes(ISO_8859_1)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); + + // Some HTML pages have errors like ';;' versus '; ' as separator + String test2 = + "<html><head><meta http-equiv=\"content-type\"" + + " content=\"text/html;;charset=ISO-8859-15\" />" + + "<title>the name is \u00e1ndre</title>" + + "</head><body></body></html>"; + metadata = new Metadata(); + new HtmlParser().parse( + new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); + } + + /** + * Test case for TIKA-350 + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-350">TIKA-350</a> + */ + @Test + public void testUsingFunkyCharsetInContentTypeHeader() throws Exception { + final String test = + "<html><head><title>the name is \u00e1ndre</title></head>" + + "<body></body></html>"; + + Metadata metadata = new Metadata(); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); + + metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html"); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(ISO_8859_1)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); + } + + + /** + * Test case for TIKA-357 + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-357">TIKA-357</a> + */ + @Test + public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception { + String path = "/test-documents/big-preamble.html"; + Metadata metadata = new Metadata(); + new HtmlParser().parse( + HtmlParserTest.class.getResourceAsStream(path), + new BodyContentHandler(), metadata, new ParseContext()); + + assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING)); + } + + /** + * Test case for TIKA-420 + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-420">TIKA-420</a> + */ + @Test + public void testBoilerplateRemoval() throws Exception { + String path = "/test-documents/boilerplate.html"; + + Metadata metadata = new Metadata(); + BodyContentHandler handler = new BodyContentHandler(); + new HtmlParser().parse( + HtmlParserTest.class.getResourceAsStream(path), + new BoilerpipeContentHandler(handler), metadata, new ParseContext()); + + String content = handler.toString(); + assertTrue(content.startsWith("This is the real meat")); + assertTrue(content.endsWith("This is the end of the text.\n")); + assertFalse(content.contains("boilerplate")); + assertFalse(content.contains("footer")); + } + + /** + * Test case for TIKA-478. Don't emit <head> sub-elements inside of <body>. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-478">TIKA-478</a> + */ + @Test + public void testElementOrdering() throws Exception { + final String test = "<html><head><title>Title</title>" + + "<meta http-equiv=\"content-type\" content=\"text/html\">" + + "<link rel=\"next\" href=\"next.html\" />" + + "</head><body><p>Simple Content</p></body></html>"; + + StringWriter sw = new StringWriter(); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + makeHtmlTransformer(sw), new Metadata(), new ParseContext()); + + String result = sw.toString(); + + // Title element in <head> section + assertTrue(Pattern.matches("(?s)<html.*<head>.*<title>Title</title>.*</head>.*$", result)); + + // No meta elements in body + assertFalse(Pattern.matches("(?s).*<body>.*<meta. *</body>.*$", result)); + + // meta elements should show up in <head> section + assertTrue(Pattern.matches("(?s)<html.*<head>.*<meta .*</head>.*$", result)); + + // No link elements in body + assertFalse(Pattern.matches("(?s).*<body>.*<link .*</body>.*$", result)); + + // link element should be in <head> section + assertTrue(Pattern.matches("(?s)<html.*<head>.*<link .*</head>.*$", result)); + + // There should be ending elements. + assertTrue(Pattern.matches("(?s).*</body>.*</html>$", result)); + + } + + /** + * Test case for TIKA-463. Don't skip elements that have URLs. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a> + */ + @Test + public void testImgUrlExtraction() throws Exception { + final String test = "<html><head><title>Title</title>" + + "<base href=\"http://domain.com\" />" + + "</head><body><img src=\"image.jpg\" /></body></html>"; + + StringWriter sw = new StringWriter(); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + makeHtmlTransformer(sw), new Metadata(), new ParseContext()); + + String result = sw.toString(); + + // <img> tag should exist, with fully resolved URL + assertTrue(Pattern.matches("(?s).*src=\"http://domain.com/image.jpg\".*$", result)); + } + + /** + * Test case for TIKA-463. Don't skip elements that have URLs. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a> + */ + @Test + public void testFrameSrcExtraction() throws Exception { + final String test = "<html><head><title>Title</title>" + + "<base href=\"http://domain.com\" />" + + "</head><frameset><frame src=\"frame.html\" /></frameset></html>"; + + StringWriter sw = new StringWriter(); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + makeHtmlTransformer(sw), new Metadata(), new ParseContext()); + + String result = sw.toString(); + + // <frame> tag should exist, with fully resolved URL + assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result)); + } + + /** + * Test case for TIKA-463. Don't skip elements that have URLs. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a> + */ + @Test + public void testIFrameSrcExtraction() throws Exception { + final String test = "<html><head><title>Title</title>" + + "<base href=\"http://domain.com\" />" + + "</head><body><iframe src =\"framed.html\" width=\"100%\" height=\"300\">" + + "<p>Your browser doesn't support iframes!</p></body></html>"; + + StringWriter sw = new StringWriter(); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + makeHtmlTransformer(sw), new Metadata(), new ParseContext()); + + String result = sw.toString(); + + // <iframe> tag should exist, with fully resolved URL + assertTrue(Pattern.matches("(?s).*<iframe .* src=\"http://domain.com/framed.html\".*$", result)); + } + + /** + * Test case for TIKA-463. Don't skip elements that have URLs. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a> + */ + @Test + public void testAreaExtraction() throws Exception { + final String test = "<html><head><title>Title</title>" + + "<base href=\"http://domain.com\" />" + + "</head><body><p><map name=\"map\" id=\"map\">" + + "<area shape=\"rect\" href=\"map.html\" alt=\"\" />" + + "</map></p></body></html>"; + + StringWriter sw = new StringWriter(); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + makeHtmlTransformer(sw), new Metadata(), new ParseContext()); + + String result = sw.toString(); + + // <map> tag should exist, with <area> tag with fully resolved URL + assertTrue(Pattern.matches("(?s).*<map .*<area .* href=\"http://domain.com/map.html\".*</map>.*$", result)); + } + + /** + * Test case for TIKA-463. Don't skip elements that have URLs. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a> + */ + @Test + public void testObjectExtraction() throws Exception { + final String test = "<html><head><title>Title</title>" + + "<base href=\"http://domain.com\" />" + + "</head><body><p><object data=\"object.data\" type=\"text/html\">" + + "<param name=\"name\" value=\"value\" />" + + "</object></p></body></html>"; + + StringWriter sw = new StringWriter(); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + makeHtmlTransformer(sw), new Metadata(), new ParseContext()); + + String result = sw.toString(); + + // <object> tag should exist with fully resolved URLs + assertTrue( + "<object> tag not correctly found in:\n" + result, + Pattern.matches("(?s).*<object data=\"http://domain.com/object.data\".*<param .* name=\"name\" value=\"value\"/>.*</object>.*$", result) + ); + } + + /** + * Test case for change related to TIKA-463. Verify proper handling of <meta> tags. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a> + */ + @Test + public void testMetaTagHandling() throws Exception { + final String test = "<html><body><h1>header</h1><p>some text</p></body></html>"; + + Metadata metadata = new Metadata(); + metadata.add("Content-Type", "text/html; charset=utf-8"); + metadata.add("Language", null); + + StringWriter sw = new StringWriter(); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + makeHtmlTransformer(sw), metadata, new ParseContext()); + + String result = sw.toString(); + + // <meta> tag for Content-Type should exist, but nothing for Language + assertTrue(Pattern.matches("(?s).*<meta name=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>.*$", result)); + assertFalse(Pattern.matches("(?s).*<meta name=\"Language\".*$", result)); + } + + /** + * Test case for TIKA-457. Better handling for broken HTML that has <frameset> inside of <body>. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-457">TIKA-457</a> + */ + @Test + public void testBrokenFrameset() throws Exception { + final String test1 = "<html><head><title>Title</title>" + + "<base href=\"http://domain.com\" />" + + "</head><body><frameset><frame src=\"frame.html\" /></frameset></body></html>"; + + StringWriter sw1 = new StringWriter(); + new HtmlParser().parse( + new ByteArrayInputStream(test1.getBytes(UTF_8)), + makeHtmlTransformer(sw1), new Metadata(), new ParseContext()); + + String result = sw1.toString(); + + // <frame> tag should exist, with fully resolved URL + assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result)); + + // <body> tag should not exist. + assertFalse(Pattern.matches("(?s).*<body>.*$", result)); + + // Test the example from the Nutch project. + final String test2 = "<html><head><title> my title </title></head><body>" + + "<frameset rows=\"20,*\"><frame src=\"top.html\"></frame>" + + "<frameset cols=\"20,*\"><frame src=\"left.html\"></frame>" + + "<frame src=\"invalid.html\"/></frame>" + + "<frame src=\"right.html\"></frame>" + + "</frameset></frameset></body></html>"; + + StringWriter sw2 = new StringWriter(); + new HtmlParser().parse( + new ByteArrayInputStream(test2.getBytes(UTF_8)), + makeHtmlTransformer(sw2), new Metadata(), new ParseContext()); + + result = sw2.toString(); + + // <frame> tags should exist, with relative URL (no base element specified) + assertTrue(Pattern.matches("(?s).*<frame .* src=\"top.html\"/>.*$", result)); + assertTrue(Pattern.matches("(?s).*<frame .* src=\"left.html\"/>.*$", result)); + assertTrue(Pattern.matches("(?s).*<frame .* src=\"invalid.html\"/>.*$", result)); + assertTrue(Pattern.matches("(?s).*<frame .* src=\"right.html\"/>.*$", result)); + + // <body> tag should not exist. + assertFalse(Pattern.matches("(?s).*<body>.*$", result)); + } + + /** + * Test case for TIKA-480: fix NPE when using BodyContentHandler or HtmlTransformer + * as delegate for BoilerpipeContentHandler + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-480">TIKA-480</a> + */ + @Test + public void testBoilerplateDelegation() throws Exception { + String path = "/test-documents/boilerplate.html"; + + Metadata metadata = new Metadata(); + StringWriter sw = new StringWriter(); + new HtmlParser().parse( + HtmlParserTest.class.getResourceAsStream(path), + makeHtmlTransformer(sw), metadata, new ParseContext()); + + String content = sw.toString(); + + // Should have <html>, <head>, <title>, <body> elements + assertTrue(Pattern.matches("(?s).*<html xmlns=\"http://www.w3.org/1999/xhtml\">.*</html>.*$", content)); + assertTrue(Pattern.matches("(?s).*<head>.*</head>.*$", content)); + assertTrue(Pattern.matches("(?s).*<title>Title</title>.*$", content)); + assertTrue(Pattern.matches("(?s).*<body>.*</body>.*$", content)); + } + + /** + * Test case for TIKA-481. Verify href in <link> is resolved. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-481">TIKA-481</a> + */ + @Test + public void testLinkHrefResolution() throws Exception { + final String test = "<html><head><title>Title</title>" + + "<base href=\"http://domain.com\" />" + + "<link rel=\"next\" href=\"next.html\" />" + + "</head><body></body></html>"; + + StringWriter sw = new StringWriter(); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + makeHtmlTransformer(sw), new Metadata(), new ParseContext()); + + String result = sw.toString(); + + // <link> tag should exist in <head>, with fully resolved URL + assertTrue(Pattern.matches("(?s).*<head>.*<link rel=\"next\" href=\"http://domain.com/next.html\"/>.*</head>.*$", result)); + } + + + /** + * Create ContentHandler that transforms SAX events into textual HTML output, + * and writes it out to <writer> - typically this is a StringWriter. + * + * @param writer Where to write resulting HTML text. + * @return ContentHandler suitable for passing to parse() methods. + * @throws Exception + */ + private ContentHandler makeHtmlTransformer(Writer writer) throws Exception { + SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); + TransformerHandler handler = factory.newTransformerHandler(); + handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html"); + handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); + handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8"); + handler.setResult(new StreamResult(writer)); + return handler; + } + + /** + * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a> + */ + @Test + public void testBoilerplateWithMarkup() throws Exception { + String path = "/test-documents/boilerplate.html"; + + Metadata metadata = new Metadata(); + StringWriter sw = new StringWriter(); + ContentHandler ch = makeHtmlTransformer(sw); + BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch); + bpch.setIncludeMarkup(true); + + new HtmlParser().parse( + HtmlParserTest.class.getResourceAsStream(path), + bpch, metadata, new ParseContext()); + + String content = sw.toString(); + assertTrue("Has empty table elements", content.contains("<body><table><tr><td><table><tr><td>")); + assertTrue("Has empty a element", content.contains("<a shape=\"rect\" href=\"Main.php\"/>")); + assertTrue("Has real content", content.contains("<p>This is the real meat")); + assertTrue("Ends with appropriate HTML", content.endsWith("</p></body></html>")); + assertFalse(content.contains("boilerplate")); + assertFalse(content.contains("footer")); + } + + /** + * Test case for TIKA-434 - Pushback buffer overflow in TagSoup + */ + @Test + public void testPushback() throws IOException, TikaException { + String content = new Tika().parseToString( + HtmlParserTest.class.getResourceAsStream("/test-documents/tika434.html"), new Metadata()); + assertNotNull(content); + } + + /** + * Test case for TIKA-869 + * IdentityHtmlMapper needs to lower-case tag names. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-869">TIKA-869</a> + */ + @Test + public void testIdentityMapper() throws Exception { + final String html = "<html><head><title>Title</title></head>" + + "<body></body></html>"; + Metadata metadata = new Metadata(); + ParseContext parseContext = new ParseContext(); + parseContext.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE); + + StringWriter sw = new StringWriter(); + + new HtmlParser().parse( + new ByteArrayInputStream(html.getBytes(UTF_8)), + makeHtmlTransformer(sw), metadata, parseContext); + + String result = sw.toString(); + // Make sure we don't get <body><BODY/></body> + assertTrue(Pattern.matches("(?s).*<body/>.*$", result)); + } + + /** + * Test case for TIKA-889 + * XHTMLContentHandler wont emit newline when html element matches ENDLINE set. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-889">TIKA-889</a> + */ + @Test + public void testNewlineAndIndent() throws Exception { + final String html = "<html><head><title>Title</title></head>" + + "<body><ul><li>one</li></ul></body></html>"; + + BodyContentHandler handler = new BodyContentHandler(); + new HtmlParser().parse( + new ByteArrayInputStream(html.getBytes(UTF_8)), + handler, new Metadata(), new ParseContext()); + + // Make sure we get <tab>, "one", newline, newline + String result = handler.toString(); + + assertTrue(Pattern.matches("\tone\n\n", result)); + } + + /** + * Test case for TIKA-961 + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-961">TIKA-961</a> + */ + @Test + public void testBoilerplateWhitespace() throws Exception { + String path = "/test-documents/boilerplate-whitespace.html"; + + Metadata metadata = new Metadata(); + BodyContentHandler handler = new BodyContentHandler(); + + BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler); + bpHandler.setIncludeMarkup(true); + + new HtmlParser().parse( + HtmlParserTest.class.getResourceAsStream(path), + bpHandler, metadata, new ParseContext()); + + String content = handler.toString(); + + // Should not contain item_aitem_b + assertFalse(content.contains("item_aitem_b")); + + // Should contain the two list items with a newline in between. + assertContains("item_a\nitem_b", content); + + // Should contain æä»ä¹éè¦æå¸®ä½ ç (can i help you) without whitespace + assertContains("æä»ä¹éè¦æå¸®ä½ ç", content); + } + + /** + * Test case for TIKA-983: HTML parser should add Open Graph meta tag data to Metadata returned by parser + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-983">TIKA-983</a> + */ + @Test + public void testOpenGraphMetadata() throws Exception { + String test1 = + "<html><head><meta property=\"og:description\"" + + " content=\"some description\" />" + + "<meta property=\"og:image\" content=\"http://example.com/image1.jpg\" />" + + "<meta property=\"og:image\" content=\"http://example.com/image2.jpg\" />" + + "<title>hello</title>" + + "</head><body></body></html>"; + Metadata metadata = new Metadata(); + new HtmlParser().parse( + new ByteArrayInputStream(test1.getBytes(ISO_8859_1)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("some description", metadata.get("og:description")); + assertTrue(metadata.isMultiValued("og:image")); + } + + // TIKA-1011 + @Test + public void testUserDefinedCharset() throws Exception { + String content = new Tika().parseToString( + HtmlParserTest.class.getResourceAsStream("/test-documents/testUserDefinedCharset.mhtml"), new Metadata()); + assertNotNull(content); + } + + //TIKA-1001 + @Test + public void testNoisyMetaCharsetHeaders() throws Exception { + Tika tika = new Tika(); + String hit = "\u0623\u0639\u0631\u0628"; + + for (int i = 1; i <= 4; i++) { + String fileName = "/test-documents/testHTMLNoisyMetaEncoding_" + i + ".html"; + String content = tika.parseToString( + HtmlParserTest.class.getResourceAsStream(fileName)); + assertTrue("testing: " + fileName, content.contains(hit)); + } + } + + // TIKA-1193 + @Test + public void testCustomHtmlSchema() throws Exception { + // Default schema does not allow tables inside anchors + String test = "<html><body><a><table><tr><td>text</tr></tr></table></a></body></html>"; + + Metadata metadata = new Metadata(); + LinkContentHandler linkContentHandler = new LinkContentHandler(); + + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(ISO_8859_1)), + linkContentHandler, metadata, new ParseContext()); + + // Expect no anchor text + assertEquals("", linkContentHandler.getLinks().get(0).getText()); + + // We'll change the schema to allow tables inside anchors! + Schema schema = new HTMLSchema(); + schema.elementType("a", HTMLSchema.M_ANY, 65535, 0); + + ParseContext parseContext = new ParseContext(); + parseContext.set(Schema.class, schema); + linkContentHandler = new LinkContentHandler(); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(ISO_8859_1)), + linkContentHandler, metadata, parseContext); + + // Expect anchor text + assertEquals("\ttext\n\n", linkContentHandler.getLinks().get(0).getText()); + } + + /** + * Test case for TIKA-820: Locator is unset for HTML parser + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-820">TIKA-820</a> + */ + @Test + public void testLocator() throws Exception { + final int line = 0; + final int col = 1; + final int[] textPosition = new int[2]; + + new HtmlParser().parse(HtmlParserTest.class.getResourceAsStream("/test-documents/testHTML.html"), + new ContentHandler() { + Locator locator; + + public void setDocumentLocator(Locator locator) { + this.locator = locator; + } + + public void startDocument() throws SAXException { + } + + public void endDocument() throws SAXException { + } + + public void startPrefixMapping(String prefix, String uri) + throws SAXException { + } + + public void endPrefixMapping(String prefix) + throws SAXException { + } + + public void startElement(String uri, String localName, + String qName, Attributes atts) throws SAXException { + } + + public void endElement(String uri, String localName, + String qName) throws SAXException { + } + + public void characters(char[] ch, int start, int length) + throws SAXException { + String text = new String(ch, start, length); + if (text.equals("Test Indexation Html") && locator != null) { + textPosition[line] = locator.getLineNumber(); + textPosition[col] = locator.getColumnNumber(); + } + } + + public void ignorableWhitespace(char[] ch, int start, + int length) throws SAXException { + } + + public void processingInstruction(String target, String data) + throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + }, + new Metadata(), + new ParseContext()); + + // The text occurs at line 24 (if lines start at 0) or 25 (if lines start at 1). + assertEquals(24, textPosition[line]); + // The column reported seems fuzzy, just test it is close enough. + assertTrue(Math.abs(textPosition[col] - 47) < 10); + } + + + /** + * Test case for TIKA-1303: HTML parse should use the first title tag to set value in meta data + * and ignore any subsequent title tags found in HTML. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-1303">TIKA-1303</a> + */ + @Test + public void testFirstTitleValueisSetToMetadata() throws Exception { + String test = "<html><title>Simple Content</title><body><h1></h1>" + + "<title>TitleToIgnore</title></body></html>"; + Metadata metadata = new Metadata(); + + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + new BodyContentHandler(), metadata, new ParseContext()); + + //Expecting first title to be set in meta data and second one to be ignored. + assertEquals("Simple Content", metadata.get(TikaCoreProperties.TITLE)); + } + + @Test + public void testMisleadingMetaContentTypeTags() throws Exception { + //TIKA-1519 + + String test = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-ELEVEN\">" + + "</head><title>title</title><body>body</body></html>"; + Metadata metadata = new Metadata(); + + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("text/html; charset=UTF-ELEVEN", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT)); + assertEquals("text/html; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); + + test = "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\">" + + "</head><title>title</title><body>body</body></html>"; + metadata = new Metadata(); + + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("application/pdf", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT)); + assertEquals("text/html; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); + + //test two content values + test = "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\" content=\"application/ms-word\">" + + "</head><title>title</title><body>body</body></html>"; + metadata = new Metadata(); + + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("application/ms-word", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT)); + assertEquals("text/html; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); + } + + @Test + public void testXHTMLWithMisleading() throws Exception { + //first test an acceptable XHTML header with http-equiv tags + String test = "<?xml version=\"1.0\" ?>" + + "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + + "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n" + + "<head>\n" + + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\" />\n" + + "<title>title</title></head><body>body</body></html>"; + Metadata metadata = new Metadata(); + new AutoDetectParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + new BodyContentHandler(), metadata, new ParseContext()); + + assertEquals("text/html; charset=iso-8859-1", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT)); + assertEquals("application/xhtml+xml; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); + + test = "<?xml version=\"1.0\" ?>" + + "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + + "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n" + + "<head>\n" + + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-NUMBER_SEVEN\" />\n" + + "<title>title</title></head><body>body</body></html>"; + metadata = new Metadata(); + new AutoDetectParser().parse( + new ByteArrayInputStream(test.getBytes(UTF_8)), + new BodyContentHandler(), metadata, new ParseContext()); + + assertEquals("text/html; charset=iso-NUMBER_SEVEN", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT)); + assertEquals("application/xhtml+xml; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); + + } +}
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,376 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.mail; + +import static java.nio.charset.StandardCharsets.US_ASCII; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.junit.Assume.assumeTrue; +import static org.mockito.Matchers.any; +import static org.mockito.Matchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; + +import org.apache.james.mime4j.stream.MimeConfig; +import org.apache.tika.TikaTest; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.ContainerExtractor; +import org.apache.tika.extractor.ParserContainerExtractor; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.junit.Test; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.helpers.DefaultHandler; + +public class RFC822ParserTest extends TikaTest { + + private static InputStream getStream(String name) { + InputStream stream = Thread.currentThread().getContextClassLoader() + .getResourceAsStream(name); + assertNotNull("Test file not found " + name, stream); + return stream; + } + + @Test + public void testSimple() { + Parser parser = new RFC822Parser(); + Metadata metadata = new Metadata(); + InputStream stream = getStream("test-documents/testRFC822"); + ContentHandler handler = mock(DefaultHandler.class); + + try { + parser.parse(stream, handler, metadata, new ParseContext()); + verify(handler).startDocument(); + //just one body + verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class)); + verify(handler).endElement(XHTMLContentHandler.XHTML, "p", "p"); + //no multi-part body parts + verify(handler, never()).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class)); + verify(handler, never()).endElement(XHTMLContentHandler.XHTML, "div", "div"); + verify(handler).endDocument(); + //note no leading spaces, and no quotes + assertEquals("Julien Nioche (JIRA) <[email protected]>", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed", + metadata.get(TikaCoreProperties.TITLE)); + assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed", + metadata.get(Metadata.SUBJECT)); + } catch (Exception e) { + fail("Exception thrown: " + e.getMessage()); + } + } + + @Test + public void testMultipart() { + Parser parser = new RFC822Parser(); + Metadata metadata = new Metadata(); + InputStream stream = getStream("test-documents/testRFC822-multipart"); + ContentHandler handler = mock(XHTMLContentHandler.class); + + try { + parser.parse(stream, handler, metadata, new ParseContext()); + verify(handler).startDocument(); + int bodyExpectedTimes = 4, multipackExpectedTimes = 5; + verify(handler, times(multipackExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class)); + verify(handler, times(multipackExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "p", "p"); + verify(handler).endDocument(); + + } catch (Exception e) { + fail("Exception thrown: " + e.getMessage()); + } + + //repeat, this time looking at content + parser = new RFC822Parser(); + metadata = new Metadata(); + stream = getStream("test-documents/testRFC822-multipart"); + handler = new BodyContentHandler(); + try { + parser.parse(stream, handler, metadata, new ParseContext()); + //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode + String bodyText = handler.toString(); + assertTrue(bodyText.contains("body 1")); + assertTrue(bodyText.contains("body 2")); + assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif + } catch (Exception e) { + fail("Exception thrown: " + e.getMessage()); + } + } + + @Test + public void testQuotedPrintable() { + Parser parser = new RFC822Parser(); + Metadata metadata = new Metadata(); + InputStream stream = getStream("test-documents/testRFC822_quoted"); + ContentHandler handler = new BodyContentHandler(); + + try { + parser.parse(stream, handler, metadata, new ParseContext()); + //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode + String bodyText = handler.toString(); + assertTrue(bodyText.contains("D\u00FCsseldorf has non-ascii.")); + assertTrue(bodyText.contains("Lines can be split like this.")); + assertTrue(bodyText.contains("Spaces at the end of a line \r\nmust be encoded.\r\n")); + assertFalse(bodyText.contains("=")); //there should be no escape sequences + } catch (Exception e) { + fail("Exception thrown: " + e.getMessage()); + } + } + + @Test + public void testBase64() { + Parser parser = new RFC822Parser(); + Metadata metadata = new Metadata(); + InputStream stream = getStream("test-documents/testRFC822_base64"); + ContentHandler handler = new BodyContentHandler(); + + try { + parser.parse(stream, handler, metadata, new ParseContext()); + //tests correct decoding of base64 text, including ISO-8859-1 bytes into Unicode + assertContains("Here is some text, with international characters, voil\u00E0!", handler.toString()); + } catch (Exception e) { + fail("Exception thrown: " + e.getMessage()); + } + } + + @Test + public void testI18NHeaders() { + Parser parser = new RFC822Parser(); + Metadata metadata = new Metadata(); + InputStream stream = getStream("test-documents/testRFC822_i18nheaders"); + ContentHandler handler = mock(DefaultHandler.class); + + try { + parser.parse(stream, handler, metadata, new ParseContext()); + //tests correct decoding of internationalized headers, both + //quoted-printable (Q) and Base64 (B). + assertEquals("Keld J\u00F8rn Simonsen <[email protected]>", + metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("If you can read this you understand the example.", + metadata.get(TikaCoreProperties.TITLE)); + assertEquals("If you can read this you understand the example.", + metadata.get(Metadata.SUBJECT)); + } catch (Exception e) { + fail("Exception thrown: " + e.getMessage()); + } + } + + /** + * The from isn't in the usual form. + * See TIKA-618 + */ + @Test + public void testUnusualFromAddress() throws Exception { + Parser parser = new RFC822Parser(); + Metadata metadata = new Metadata(); + InputStream stream = getStream("test-documents/testRFC822_oddfrom"); + ContentHandler handler = mock(DefaultHandler.class); + + parser.parse(stream, handler, metadata, new ParseContext()); + assertEquals("Saved by Windows Internet Explorer 7", + metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Air Permit Programs | Air & Radiation | US EPA", + metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Air Permit Programs | Air & Radiation | US EPA", + metadata.get(Metadata.SUBJECT)); + } + + /** + * Test for TIKA-640, increase header max beyond 10k bytes + */ + @Test + public void testLongHeader() throws Exception { + StringBuilder inputBuilder = new StringBuilder(); + for (int i = 0; i < 2000; ++i) { + inputBuilder.append( //len > 50 + "really really really really really really long name "); + } + String name = inputBuilder.toString(); + byte[] data = ("From: " + name + "\r\n\r\n").getBytes(US_ASCII); + + Parser parser = new RFC822Parser(); + ContentHandler handler = new DefaultHandler(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + + try { + parser.parse( + new ByteArrayInputStream(data), handler, metadata, context); + fail(); + } catch (TikaException expected) { + } + + MimeConfig config = new MimeConfig(); + config.setMaxHeaderLen(-1); + config.setMaxLineLen(-1); + context.set(MimeConfig.class, config); + parser.parse( + new ByteArrayInputStream(data), handler, metadata, context); + assertEquals(name.trim(), metadata.get(TikaCoreProperties.CREATOR)); + } + + /** + * Test for TIKA-678 - not all headers may be present + */ + @Test + public void testSomeMissingHeaders() throws Exception { + Parser parser = new RFC822Parser(); + Metadata metadata = new Metadata(); + InputStream stream = getStream("test-documents/testRFC822-limitedheaders"); + ContentHandler handler = new BodyContentHandler(); + + parser.parse(stream, handler, metadata, new ParseContext()); + assertEquals(true, metadata.isMultiValued(TikaCoreProperties.CREATOR)); + assertEquals("xyz", metadata.getValues(TikaCoreProperties.CREATOR)[0]); + assertEquals("abc", metadata.getValues(TikaCoreProperties.CREATOR)[1]); + assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_FROM)); + assertEquals("xyz", metadata.getValues(Metadata.MESSAGE_FROM)[0]); + assertEquals("abc", metadata.getValues(Metadata.MESSAGE_FROM)[1]); + assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_TO)); + assertEquals("abc", metadata.getValues(Metadata.MESSAGE_TO)[0]); + assertEquals("def", metadata.getValues(Metadata.MESSAGE_TO)[1]); + assertEquals("abcd", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("abcd", metadata.get(Metadata.SUBJECT)); + assertContains("bar biz bat", handler.toString()); + } + + /** + * Test TIKA-1028 - If the mail contains an encrypted attachment (or + * an attachment that others triggers an error), parsing should carry + * on for the remainder regardless + */ + @Test + public void testEncryptedZipAttachment() throws Exception { + Parser parser = new RFC822Parser(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + InputStream stream = getStream("test-documents/testRFC822_encrypted_zip"); + ContentHandler handler = new BodyContentHandler(); + parser.parse(stream, handler, metadata, context); + + // Check we go the metadata + assertEquals("Juha Haaga <[email protected]>", metadata.get(Metadata.MESSAGE_FROM)); + assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE)); + + // Check we got the message text, for both Plain Text and HTML + assertContains("Includes encrypted zip file", handler.toString()); + assertContains("password is \"test\".", handler.toString()); + assertContains("This is the Plain Text part", handler.toString()); + assertContains("This is the HTML part", handler.toString()); + + // We won't get the contents of the zip file, but we will get the name + assertContains("text.txt", handler.toString()); + assertNotContained("ENCRYPTED ZIP FILES", handler.toString()); + + // Try again, this time with the password supplied + // Check that we also get the zip's contents as well + context.set(PasswordProvider.class, new PasswordProvider() { + public String getPassword(Metadata metadata) { + return "test"; + } + }); + stream = getStream("test-documents/testRFC822_encrypted_zip"); + handler = new BodyContentHandler(); + parser.parse(stream, handler, metadata, context); + + assertContains("Includes encrypted zip file", handler.toString()); + assertContains("password is \"test\".", handler.toString()); + assertContains("This is the Plain Text part", handler.toString()); + assertContains("This is the HTML part", handler.toString()); + + // We do get the name of the file in the encrypted zip file + assertContains("text.txt", handler.toString()); + + // TODO Upgrade to a version of Commons Compress with Encryption + // support, then verify we get the contents of the text file + // held within the encrypted zip + assumeTrue(false); // No Zip Encryption support yet + assertContains("TEST DATA FOR TIKA.", handler.toString()); + assertContains("ENCRYPTED ZIP FILES", handler.toString()); + assertContains("TIKA-1028", handler.toString()); + } + + /** + * Test TIKA-1028 - Ensure we can get the contents of an + * un-encrypted zip file + */ + @Test + public void testNormalZipAttachment() throws Exception { + Parser parser = new RFC822Parser(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + InputStream stream = getStream("test-documents/testRFC822_normal_zip"); + ContentHandler handler = new BodyContentHandler(); + parser.parse(stream, handler, metadata, context); + + // Check we go the metadata + assertEquals("Juha Haaga <[email protected]>", metadata.get(Metadata.MESSAGE_FROM)); + assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE)); + + // Check we got the message text, for both Plain Text and HTML + assertContains("Includes a normal, unencrypted zip file", handler.toString()); + assertContains("This is the Plain Text part", handler.toString()); + assertContains("This is the HTML part", handler.toString()); + + // We get both name and contents of the zip file's contents + assertContains("text.txt", handler.toString()); + assertContains("TEST DATA FOR TIKA.", handler.toString()); + assertContains("This is text inside an unencrypted zip file", handler.toString()); + assertContains("TIKA-1028", handler.toString()); + } + + /** + * TIKA-1222 When requested, ensure that the various attachments of + * the mail come through properly as embedded resources + */ + @Test + public void testGetAttachmentsAsEmbeddedResources() throws Exception { + TrackingHandler tracker = new TrackingHandler(); + ContainerExtractor ex = new ParserContainerExtractor(); + try (TikaInputStream tis = TikaInputStream.get(getStream("test-documents/testRFC822-multipart"))) { + assertEquals(true, ex.isSupported(tis)); + ex.extract(tis, ex, tracker); + } + + // Check we found all 3 parts + assertEquals(3, tracker.filenames.size()); + assertEquals(3, tracker.mediaTypes.size()); + + // No filenames available + assertEquals(null, tracker.filenames.get(0)); + assertEquals(null, tracker.filenames.get(1)); + assertEquals(null, tracker.filenames.get(2)); + // Types are available + assertEquals(MediaType.TEXT_PLAIN, tracker.mediaTypes.get(0)); + assertEquals(MediaType.TEXT_HTML, tracker.mediaTypes.get(1)); + assertEquals(MediaType.image("gif"), tracker.mediaTypes.get(2)); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.mbox; + +import static org.apache.tika.TikaTest.assertContains; +import static org.junit.Assert.assertEquals; + +import java.io.InputStream; +import java.util.Map; + +import org.apache.tika.detect.TypeDetector; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Before; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +public class MboxParserTest { + + protected ParseContext recursingContext; + private Parser autoDetectParser; + private TypeDetector typeDetector; + private MboxParser mboxParser; + + private static InputStream getStream(String name) { + return MboxParserTest.class.getClass().getResourceAsStream(name); + } + + @Before + public void setUp() throws Exception { + typeDetector = new TypeDetector(); + autoDetectParser = new AutoDetectParser(typeDetector); + recursingContext = new ParseContext(); + recursingContext.set(Parser.class, autoDetectParser); + + mboxParser = new MboxParser(); + mboxParser.setTracking(true); + } + + @Test + public void testSimple() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = getStream("/test-documents/simple.mbox")) { + mboxParser.parse(stream, handler, metadata, recursingContext); + } + + String content = handler.toString(); + assertContains("Test content 1", content); + assertContains("Test content 2", content); + assertEquals("application/mbox", metadata.get(Metadata.CONTENT_TYPE)); + + Map<Integer, Metadata> mailsMetadata = mboxParser.getTrackingMetadata(); + assertEquals("Nb. Of mails", 2, mailsMetadata.size()); + + Metadata mail1 = mailsMetadata.get(0); + assertEquals("message/rfc822", mail1.get(Metadata.CONTENT_TYPE)); + assertEquals("envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009", mail1.get("MboxParser-from")); + + Metadata mail2 = mailsMetadata.get(1); + assertEquals("message/rfc822", mail2.get(Metadata.CONTENT_TYPE)); + assertEquals("envelope-sender-mailbox-name Mon Jun 01 11:00:00 2010", mail2.get("MboxParser-from")); + } + + @Test + public void testHeaders() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = getStream("/test-documents/headers.mbox")) { + mboxParser.parse(stream, handler, metadata, recursingContext); + } + + assertContains("Test content", handler.toString()); + assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size()); + + Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0); + + assertEquals("2009-06-10T03:58:45Z", mailMetadata.get(TikaCoreProperties.CREATED)); + assertEquals("<[email protected]>", mailMetadata.get(TikaCoreProperties.CREATOR)); + assertEquals("subject", mailMetadata.get(Metadata.SUBJECT)); + assertEquals("<[email protected]>", mailMetadata.get(Metadata.AUTHOR)); + assertEquals("message/rfc822", mailMetadata.get(Metadata.CONTENT_TYPE)); + assertEquals("[email protected]", mailMetadata.get("Message-From")); + assertEquals("<[email protected]>", mailMetadata.get("MboxParser-return-path")); + } + + @Test + public void testMultilineHeader() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = getStream("/test-documents/multiline.mbox")) { + mboxParser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size()); + + Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0); + assertEquals("from xxx by xxx with xxx; date", mailMetadata.get("MboxParser-received")); + } + + @Test + public void testQuoted() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = getStream("/test-documents/quoted.mbox")) { + mboxParser.parse(stream, handler, metadata, recursingContext); + } + + assertContains("Test content", handler.toString()); + assertContains("> quoted stuff", handler.toString()); + } + + @Test + public void testComplex() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = getStream("/test-documents/complex.mbox")) { + mboxParser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("Nb. Of mails", 3, mboxParser.getTrackingMetadata().size()); + + Metadata firstMail = mboxParser.getTrackingMetadata().get(0); + assertEquals("Re: question about when shuffle/sort start working", firstMail.get(Metadata.SUBJECT)); + assertEquals("Re: question about when shuffle/sort start working", firstMail.get(TikaCoreProperties.TITLE)); + assertEquals("Jothi Padmanabhan <[email protected]>", firstMail.get(Metadata.AUTHOR)); + assertEquals("Jothi Padmanabhan <[email protected]>", firstMail.get(TikaCoreProperties.CREATOR)); + assertEquals("[email protected]", firstMail.get(Metadata.MESSAGE_RECIPIENT_ADDRESS)); + + assertContains("When a Mapper completes", handler.toString()); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.mbox; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; + +import org.apache.tika.TikaTest; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.ToHTMLContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +public class OutlookPSTParserTest extends TikaTest { + + private Parser parser = new OutlookPSTParser(); + + @Test + public void testAccept() throws Exception { + assertTrue((parser.getSupportedTypes(null).contains(MediaType.application("vnd.ms-outlook-pst")))); + } + + @Test + public void testParse() throws Exception { + Parser pstParser = new AutoDetectParser(); + Metadata metadata = new Metadata(); + ContentHandler handler = new ToHTMLContentHandler(); + + ParseContext context = new ParseContext(); + EmbeddedTrackingExtrator trackingExtrator = new EmbeddedTrackingExtrator(context); + context.set(EmbeddedDocumentExtractor.class, trackingExtrator); + context.set(Parser.class, new AutoDetectParser()); + + pstParser.parse(getResourceAsStream("/test-documents/testPST.pst"), handler, metadata, context); + + String output = handler.toString(); + + assertFalse(output.isEmpty()); + assertTrue(output.contains("<meta name=\"Content-Length\" content=\"271360\">")); + assertTrue(output.contains("<meta name=\"Content-Type\" content=\"application/vnd.ms-outlook-pst\">")); + + assertTrue(output.contains("<body><div class=\"email-folder\"><h1>")); + assertTrue(output.contains("<div class=\"embedded\" id=\"<[email protected]>\"><h1>Re: Feature Generators</h1>")); + assertTrue(output.contains("<div class=\"embedded\" id=\"<[email protected]>\"><h1>Re: init tokenizer fails: \"Bad type in putfield/putstatic\"</h1>")); + assertTrue(output.contains("Gary Murphy commented on TIKA-1250:")); + + assertTrue(output.contains("<div class=\"email-folder\"><h1>Racine (pour la recherche)</h1>")); + + + List<Metadata> metaList = trackingExtrator.trackingMetadata; + assertEquals(6, metaList.size()); + + Metadata firstMail = metaList.get(0); + assertEquals("Jörn Kottmann", firstMail.get(TikaCoreProperties.CREATOR)); + assertEquals("Re: Feature Generators", firstMail.get(TikaCoreProperties.TITLE)); + assertEquals("[email protected]", firstMail.get("senderEmailAddress")); + assertEquals("[email protected]", firstMail.get("displayTo")); + assertEquals("", firstMail.get("displayCC")); + assertEquals("", firstMail.get("displayBCC")); + } + + + private class EmbeddedTrackingExtrator extends ParsingEmbeddedDocumentExtractor { + List<Metadata> trackingMetadata = new ArrayList<Metadata>(); + + public EmbeddedTrackingExtrator(ParseContext context) { + super(context); + } + + @Override + public boolean shouldParseEmbedded(Metadata metadata) { + return true; + } + + @Override + public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException { + this.trackingMetadata.add(metadata); + super.parseEmbedded(stream, handler, metadata, outputHtml); + } + + } +}
