ti...

bob Tue, 05 Jan 2016 19:52:06 -0800

Added: 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,1114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Geographic;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.LinkContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class HtmlParserTest {
+
+    @Test
+    public void testParseAscii() throws Exception {
+        String path = "/test-documents/testHTML.html";
+        final StringWriter href = new StringWriter();
+        final StringWriter name = new StringWriter();
+        ContentHandler body = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        try (InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
+            ContentHandler link = new DefaultHandler() { // records href/name of each <a>
+                @Override
+                public void startElement(
+                        String u, String l, String n, Attributes a)
+                        throws SAXException {
+                    if ("a".equals(l)) {
+                        if (a.getValue("href") != null) {
+                            href.append(a.getValue("href"));
+                        } else if (a.getValue("name") != null) {
+                            name.append(a.getValue("name"));
+                        }
+                    }
+                }
+            };
+            new HtmlParser().parse(
+                    stream, new TeeContentHandler(body, link),
+                    metadata, new ParseContext());
+        }
+
+        assertEquals(
+                "Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Tika Developers", metadata.get("Author"));
+        assertEquals("5", metadata.get("refresh"));
+
+        assertEquals("51.2312", metadata.get(Geographic.LATITUDE));
+        assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE));
+
+        assertEquals("http://www.apache.org/", href.toString());
+        assertEquals("test-anchor", name.toString());
+
+        String content = body.toString();
+        assertTrue(
+                "Did not contain expected text:" + "Test Indexation Html",
+                content.contains("Test Indexation Html"));
+        assertTrue(
+                "Did not contain expected text:" + "Indexation du fichier",
+                content.contains("Indexation du fichier"));
+    }
+
+    @Test
+    @Ignore("The file 'testXHTML_utf8.html' is not available fo testing")
+    public void XtestParseUTF8() throws IOException, SAXException, TikaException {
+        String path = "/test-documents/testXHTML_utf8.html";
+        Metadata metadata = new Metadata();
+        String content = new Tika().parseToString(
+                HtmlParserTest.class.getResourceAsStream(path), metadata);
+        // NOTE(review): the glyphs below look like MacRoman-misdecoded UTF-8 ("öäå"/"åäö") - confirm against the fixture.
+        assertTrue("Did not contain expected text:"
+                + "Title : Tilte with UTF-8 chars √∂√§√•", content
+                .contains("Title : Tilte with UTF-8 chars √∂√§√•"));
+
+        assertTrue("Did not contain expected text:"
+                + "Content with UTF-8 chars", content
+                .contains("Content with UTF-8 chars"));
+
+        assertTrue("Did not contain expected text:" + "√•√§√∂", content
+                .contains("√•√§√∂"));
+    }
+
+    @Test
+    public void testXhtmlParsing() throws Exception {
+        String path = "/test-documents/testXHTML.html";
+        Metadata metadata = new Metadata();
+        String content = new Tika().parseToString(
+                HtmlParserTest.class.getResourceAsStream(path), metadata);
+
+        //can't specify charset because default differs between OS's
+        assertTrue(metadata.get(Metadata.CONTENT_TYPE).startsWith("application/xhtml+xml; charset="));
+        assertEquals("XHTML test document", metadata.get(TikaCoreProperties.TITLE));
+
+        assertEquals("Tika Developers", metadata.get("Author"));
+        assertEquals("5", metadata.get("refresh"));
+        assertContains("ability of Apache Tika", content);
+        assertContains("extract content", content);
+        assertContains("an XHTML document", content);
+    }
+
+    @Test
+    public void testParseEmpty() throws Exception {
+        // A zero-length document must produce empty body text.
+        InputStream noBytes = new ByteArrayInputStream(new byte[0]);
+        ContentHandler bodyText = new BodyContentHandler();
+        new HtmlParser().parse(noBytes, bodyText, new Metadata(), new ParseContext());
+        assertEquals("", bodyText.toString());
+    }
+
+    /**
+     * Test case for TIKA-210: text placed directly under &lt;body&gt; is extracted as-is.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
+     */
+    @Test
+    public void testCharactersDirectlyUnderBodyElement() throws Exception {
+        String test = "<html><body>test</body></html>";
+        String content = new Tika().parseToString(
+                new ByteArrayInputStream(test.getBytes(UTF_8)));
+        assertEquals("test", content);
+    }
+
+    /**
+     * Test case for TIKA-287: relative links are resolved against the &lt;base href&gt;.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-287">TIKA-287</a>
+     */
+    @Test
+    public void testBaseHref() throws Exception {
+        assertRelativeLink(
+                "http://lucene.apache.org/tika/",
+                "http://lucene.apache.org/", "tika/");
+
+        assertRelativeLink(
+                "http://domain.com/?pid=1",
+                "http://domain.com", "?pid=1");
+        assertRelativeLink(
+                "http://domain.com/?pid=2",
+                "http://domain.com?pid=1", "?pid=2");
+
+        assertRelativeLink(
+                "http://domain.com/file.html",
+                "http://domain.com/path/", "/file.html");
+        assertRelativeLink(
+                "http://domain.com/path/file.html",
+                "http://domain.com/path/", "./file.html");
+        assertRelativeLink(
+                "http://domain.com/path/file.html",
+                "http://domain.com/path/", "file.html");
+
+        assertRelativeLink(
+                "http://domain2.com/newpath",
+                "http://domain.com/path/to/file", "http://domain2.com/newpath");
+
+        // See http://www.communities.hp.com/securitysoftware/blogs/jeff/archive/2007/12/19/RFC-1808-vs-2396-vs-3986_3A00_-Browsers-vs.-programing-languages.aspx
+        // Also http://www.ietf.org/rfc/rfc3986.txt
+        // Also http://issues.apache.org/jira/browse/NUTCH-566
+        // Also http://issues.apache.org/jira/browse/NUTCH-436
+        assertRelativeLink(
+                "http://domain.com/path/?pid=1",
+                "http://domain.com/path/", "?pid=1");
+        assertRelativeLink(
+                "http://domain.com/file?pid=1",
+                "http://domain.com/file", "?pid=1");
+        assertRelativeLink(
+                "http://domain.com/path/d;p?pid=1",
+                "http://domain.com/path/d;p?q#f", "?pid=1");
+    }
+
+    private void assertRelativeLink(String url, String base, String relative)
+            throws Exception { // expects <a href=relative> under <base href=base> to be reported as url
+        String test =
+                "<html><head><base href=\"" + base + "\"></head>"
+                        + "<body><a href=\"" + relative + "\">test</a></body></html>";
+        final List<String> links = new ArrayList<>();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                new DefaultHandler() {
+                    @Override
+                    public void startElement(
+                            String u, String l, String name, Attributes atts) {
+                        if ("a".equals(name) && atts.getValue("", "href") != null) {
+                            links.add(atts.getValue("", "href"));
+                        }
+                    }
+                },
+                new Metadata(),
+                new ParseContext());
+        assertEquals(1, links.size());
+        assertEquals(url, links.get(0));
+    }
+
+    /**
+     * Test case for TIKA-268: text of adjacent table cells must not run together.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-268">TIKA-268</a>
+     */
+    @Test
+    public void testWhitespaceBetweenTableCells() throws Exception {
+        String test =
+                "<html><body><table><tr><td>a</td><td>b</td></table></body></html>";
+        String content = new Tika().parseToString(
+                new ByteArrayInputStream(test.getBytes(UTF_8)));
+        assertContains("a", content);
+        assertContains("b", content);
+        assertFalse(content.contains("ab"));
+    }
+
+    /**
+     * Test case for TIKA-332: the charset of a meta http-equiv tag is reported as the content encoding.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-332">TIKA-332</a>
+     */
+    @Test
+    public void testHttpEquivCharset() throws Exception {
+        String test =
+                "<html><head><meta http-equiv=\"content-type\""
+                        + " content=\"text/html; charset=ISO-8859-1\" />"
+                        + "<title>the name is \u00e1ndre</title>"
+                        + "</head><body></body></html>";
+        Metadata metadata = new Metadata();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+    }
+
+    /**
+     * Test case for TIKA-892: the HTML5 meta charset attribute is reported as the content encoding.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-892">TIKA-892</a>
+     */
+    @Test
+    public void testHtml5Charset() throws Exception {
+        String test =
+                "<html><head><meta charset=\"ISO-8859-15\" />"
+                        + "<title>the name is \u00e1ndre</title>"
+                        + "</head><body></body></html>";
+        Metadata metadata = new Metadata();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
+    }
+
+    /**
+     * Test case for TIKA-334: a UTF-8 title is read correctly when no charset is declared.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a>
+     */
+    @Test
+    public void testDetectOfCharset() throws Exception {
+        String test =
+                "<html><head><title>\u017d</title></head><body></body></html>";
+        Metadata metadata = new Metadata();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("\u017d", metadata.get(TikaCoreProperties.TITLE));
+    }
+
+    /**
+     * Test case for TIKA-341: a charset given in the Content-Type metadata is reported as the encoding.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
+     */
+    @Test
+    public void testUsingCharsetInContentTypeHeader() throws Exception {
+        final String test =
+                "<html><head><title>the name is \u00e1ndre</title></head>"
+                        + "<body></body></html>";
+
+        Metadata metadata = new Metadata();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+
+        metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+    }
+
+    /**
+     * Test case for HTML content like
+     * "&lt;div&gt;foo&lt;br&gt;bar&lt;/div&gt;baz" that should result
+     * in three whitespace-separated tokens "foo", "bar" and "baz" instead
+     * of a single token "foobarbaz".
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-343">TIKA-343</a>
+     */
+    @Test
+    public void testLineBreak() throws Exception {
+        String test = "<html><body><div>foo<br>bar</div>baz</body></html>";
+        String text = new Tika().parseToString(
+                new ByteArrayInputStream(test.getBytes(US_ASCII)));
+        String[] parts = text.trim().split("\\s+");
+        assertEquals(3, parts.length);
+        assertEquals("foo", parts[0]);
+        assertEquals("bar", parts[1]);
+        assertEquals("baz", parts[2]);
+    }
+
+    /**
+     * Test case for TIKA-339: Don't use language returned by CharsetDetector
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
+     */
+    @Test
+    public void testIgnoreCharsetDetectorLanguage() throws Exception {
+        String test = "<html><title>Simple Content</title><body></body></html>";
+        Metadata metadata = new Metadata();
+        metadata.add(Metadata.CONTENT_LANGUAGE, "en");
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        // The language supplied by the caller must still be there after parsing.
+        assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
+    }
+
+    /**
+     * Test case for TIKA-349: malformed charset declarations in meta http-equiv are still recognised.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-349">TIKA-349</a>
+     */
+    @Test
+    public void testHttpEquivCharsetFunkyAttributes() throws Exception {
+        String test1 =
+                "<html><head><meta http-equiv=\"content-type\""
+                        + " content=\"text/html; charset=ISO-8859-15; charset=iso-8859-15\" />"
+                        + "<title>the name is \u00e1ndre</title>"
+                        + "</head><body></body></html>";
+        Metadata metadata = new Metadata();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test1.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
+
+        // Some HTML pages have errors like ';;' versus '; ' as separator
+        String test2 =
+                "<html><head><meta http-equiv=\"content-type\""
+                        + " content=\"text/html;;charset=ISO-8859-15\" />"
+                        + "<title>the name is \u00e1ndre</title>"
+                        + "</head><body></body></html>";
+        metadata = new Metadata();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
+    }
+
+    /**
+     * Test case for TIKA-350: the charset is found even when it precedes the media type in Content-Type.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-350">TIKA-350</a>
+     */
+    @Test
+    public void testUsingFunkyCharsetInContentTypeHeader() throws Exception {
+        final String test =
+                "<html><head><title>the name is \u00e1ndre</title></head>"
+                        + "<body></body></html>";
+
+        Metadata metadata = new Metadata();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+
+        metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html");
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+    }
+
+
+    /**
+     * Test case for TIKA-357: the encoding of big-preamble.html is reported as windows-1251.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-357">TIKA-357</a>
+     */
+    @Test
+    public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception {
+        String path = "/test-documents/big-preamble.html";
+        Metadata metadata = new Metadata();
+        try (InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) { // parser does not close it
+            new HtmlParser().parse(stream, new BodyContentHandler(), metadata, new ParseContext());
+        }
+        assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
+    }
+
+    /**
+     * Test case for TIKA-420: BoilerpipeContentHandler keeps the main text and drops boilerplate.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-420">TIKA-420</a>
+     */
+    @Test
+    public void testBoilerplateRemoval() throws Exception {
+        String path = "/test-documents/boilerplate.html";
+
+        Metadata metadata = new Metadata();
+        BodyContentHandler handler = new BodyContentHandler();
+        try (InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) { // parser does not close it
+            new HtmlParser().parse(stream, new BoilerpipeContentHandler(handler), metadata, new ParseContext());
+        }
+
+        String content = handler.toString();
+        assertTrue(content.startsWith("This is the real meat"));
+        assertTrue(content.endsWith("This is the end of the text.\n"));
+        assertFalse(content.contains("boilerplate"));
+        assertFalse(content.contains("footer"));
+    }
+
+    /**
+     * Test case for TIKA-478. Don't emit {@code <head>} sub-elements inside of {@code <body>}.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-478">TIKA-478</a>
+     */
+    @Test
+    public void testElementOrdering() throws Exception {
+        final String test = "<html><head><title>Title</title>" +
+                "<meta http-equiv=\"content-type\" content=\"text/html\">" +
+                "<link rel=\"next\" href=\"next.html\" />" +
+                "</head><body><p>Simple Content</p></body></html>";
+
+        StringWriter sw = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
+
+        String result = sw.toString();
+
+        // Title element in <head> section
+        assertTrue(Pattern.matches("(?s)<html.*<head>.*<title>Title</title>.*</head>.*$", result));
+
+        // No meta elements in body ("<meta .*" - the former "<meta. *" could never match, so the check was vacuous)
+        assertFalse(Pattern.matches("(?s).*<body>.*<meta .*</body>.*$", result));
+
+        // meta elements should show up in <head> section
+        assertTrue(Pattern.matches("(?s)<html.*<head>.*<meta .*</head>.*$", result));
+
+        // No link elements in body
+        assertFalse(Pattern.matches("(?s).*<body>.*<link .*</body>.*$", result));
+
+        // link element should be in <head> section
+        assertTrue(Pattern.matches("(?s)<html.*<head>.*<link .*</head>.*$", result));
+
+        // There should be ending elements.
+        assertTrue(Pattern.matches("(?s).*</body>.*</html>$", result));
+
+    }
+
+    /**
+     * Test case for TIKA-463. Don't skip elements that have URLs (here: &lt;img src&gt;).
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+     */
+    @Test
+    public void testImgUrlExtraction() throws Exception {
+        final String test = "<html><head><title>Title</title>" +
+                "<base href=\"http://domain.com\" />" +
+                "</head><body><img src=\"image.jpg\" /></body></html>";
+
+        StringWriter sw = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
+
+        String result = sw.toString();
+
+        // <img> tag should exist, with fully resolved URL
+        assertTrue(Pattern.matches("(?s).*src=\"http://domain.com/image.jpg\".*$", result));
+    }
+
+    /**
+     * Test case for TIKA-463. Don't skip elements that have URLs (here: &lt;frame src&gt;).
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+     */
+    @Test
+    public void testFrameSrcExtraction() throws Exception {
+        final String test = "<html><head><title>Title</title>" +
+                "<base href=\"http://domain.com\" />" +
+                "</head><frameset><frame src=\"frame.html\" /></frameset></html>";
+
+        StringWriter sw = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
+
+        String result = sw.toString();
+
+        // <frame> tag should exist, with fully resolved URL
+        assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));
+    }
+
+    /**
+     * Test case for TIKA-463. Don't skip elements that have URLs (here: &lt;iframe src&gt;).
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+     */
+    @Test
+    public void testIFrameSrcExtraction() throws Exception {
+        final String test = "<html><head><title>Title</title>" +
+                "<base href=\"http://domain.com\" />" +
+                "</head><body><iframe src =\"framed.html\" width=\"100%\" height=\"300\">" +
+                "<p>Your browser doesn't support iframes!</p></body></html>";
+
+        StringWriter sw = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
+
+        String result = sw.toString();
+
+        // <iframe> tag should exist, with fully resolved URL
+        assertTrue(Pattern.matches("(?s).*<iframe .* src=\"http://domain.com/framed.html\".*$", result));
+    }
+
+    /**
+     * Test case for TIKA-463. Don't skip elements that have URLs (here: &lt;area href&gt;).
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+     */
+    @Test
+    public void testAreaExtraction() throws Exception {
+        final String test = "<html><head><title>Title</title>" +
+                "<base href=\"http://domain.com\" />" +
+                "</head><body><p><map name=\"map\" id=\"map\">" +
+                "<area shape=\"rect\" href=\"map.html\" alt=\"\" />" +
+                "</map></p></body></html>";
+
+        StringWriter sw = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
+
+        String result = sw.toString();
+
+        // <map> tag should exist, with <area> tag with fully resolved URL
+        assertTrue(Pattern.matches("(?s).*<map .*<area .* href=\"http://domain.com/map.html\".*</map>.*$", result));
+    }
+
+    /**
+     * Test case for TIKA-463. Don't skip elements that have URLs (here: &lt;object data&gt;).
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+     */
+    @Test
+    public void testObjectExtraction() throws Exception {
+        final String test = "<html><head><title>Title</title>" +
+                "<base href=\"http://domain.com\" />" +
+                "</head><body><p><object data=\"object.data\" type=\"text/html\">" +
+                "<param name=\"name\" value=\"value\" />" +
+                "</object></p></body></html>";
+
+        StringWriter sw = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
+
+        String result = sw.toString();
+
+        // <object> tag should exist with fully resolved URLs
+        assertTrue(
+                "<object> tag not correctly found in:\n" + result,
+                Pattern.matches("(?s).*<object data=\"http://domain.com/object.data\".*<param .* name=\"name\" value=\"value\"/>.*</object>.*$", result)
+        );
+    }
+
+    /**
+     * Test case for change related to TIKA-463. Verify proper handling of {@code <meta>} tags.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+     */
+    @Test
+    public void testMetaTagHandling() throws Exception {
+        final String test = "<html><body><h1>header</h1><p>some text</p></body></html>";
+
+        Metadata metadata = new Metadata();
+        metadata.add("Content-Type", "text/html; charset=utf-8");
+        metadata.add("Language", null);
+
+        StringWriter sw = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                makeHtmlTransformer(sw), metadata, new ParseContext());
+
+        String result = sw.toString();
+
+        // <meta> tag for Content-Type should exist, but nothing for Language
+        assertTrue(Pattern.matches("(?s).*<meta name=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>.*$", result));
+        assertFalse(Pattern.matches("(?s).*<meta name=\"Language\".*$", result));
+    }
+
+    /**
+     * Test case for TIKA-457. Better handling for broken HTML that has {@code <frameset>} inside of {@code <body>}.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-457">TIKA-457</a>
+     */
+    @Test
+    public void testBrokenFrameset() throws Exception {
+        final String test1 = "<html><head><title>Title</title>" +
+                "<base href=\"http://domain.com\" />" +
+                "</head><body><frameset><frame src=\"frame.html\" /></frameset></body></html>";
+
+        StringWriter sw1 = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test1.getBytes(UTF_8)),
+                makeHtmlTransformer(sw1), new Metadata(), new ParseContext());
+
+        String result = sw1.toString();
+
+        // <frame> tag should exist, with fully resolved URL
+        assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));
+
+        // <body> tag should not exist.
+        assertFalse(Pattern.matches("(?s).*<body>.*$", result));
+
+        // Test the example from the Nutch project.
+        final String test2 = "<html><head><title> my title </title></head><body>" +
+                "<frameset rows=\"20,*\"><frame src=\"top.html\"></frame>" +
+                "<frameset cols=\"20,*\"><frame src=\"left.html\"></frame>" +
+                "<frame src=\"invalid.html\"/></frame>" +
+                "<frame src=\"right.html\"></frame>" +
+                "</frameset></frameset></body></html>";
+
+        StringWriter sw2 = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test2.getBytes(UTF_8)),
+                makeHtmlTransformer(sw2), new Metadata(), new ParseContext());
+
+        result = sw2.toString();
+
+        // <frame> tags should exist, with relative URL (no base element specified)
+        assertTrue(Pattern.matches("(?s).*<frame .* src=\"top.html\"/>.*$", result));
+        assertTrue(Pattern.matches("(?s).*<frame .* src=\"left.html\"/>.*$", result));
+        assertTrue(Pattern.matches("(?s).*<frame .* src=\"invalid.html\"/>.*$", result));
+        assertTrue(Pattern.matches("(?s).*<frame .* src=\"right.html\"/>.*$", result));
+
+        // <body> tag should not exist.
+        assertFalse(Pattern.matches("(?s).*<body>.*$", result));
+    }
+
+    /**
+     * Test case for TIKA-480: fix NPE when using BodyContentHandler or HtmlTransformer
+     * as delegate for BoilerpipeContentHandler
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-480">TIKA-480</a>
+     */
+    @Test
+    public void testBoilerplateDelegation() throws Exception {
+        String path = "/test-documents/boilerplate.html";
+        // NOTE(review): no BoilerpipeContentHandler is wired in here despite the Javadoc - confirm intent.
+        Metadata metadata = new Metadata();
+        StringWriter sw = new StringWriter();
+        try (InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) { // parser does not close it
+            new HtmlParser().parse(stream, makeHtmlTransformer(sw), metadata, new ParseContext());
+        }
+
+        String content = sw.toString();
+
+        // Should have <html>, <head>, <title>, <body> elements
+        assertTrue(Pattern.matches("(?s).*<html xmlns=\"http://www.w3.org/1999/xhtml\">.*</html>.*$", content));
+        assertTrue(Pattern.matches("(?s).*<head>.*</head>.*$", content));
+        assertTrue(Pattern.matches("(?s).*<title>Title</title>.*$", content));
+        assertTrue(Pattern.matches("(?s).*<body>.*</body>.*$", content));
+    }
+
+    /**
+     * Test case for TIKA-481. Verify href in {@code <link>} is resolved.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-481">TIKA-481</a>
+     */
+    @Test
+    public void testLinkHrefResolution() throws Exception {
+        final String test = "<html><head><title>Title</title>" +
+                "<base href=\"http://domain.com\" />" +
+                "<link rel=\"next\" href=\"next.html\" />" +
+                "</head><body></body></html>";
+
+        StringWriter sw = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
+
+        String result = sw.toString();
+
+        // <link> tag should exist in <head>, with fully resolved URL
+        assertTrue(Pattern.matches("(?s).*<head>.*<link rel=\"next\" href=\"http://domain.com/next.html\"/>.*</head>.*$", result));
+    }
+
+
+    /**
+     * Builds a SAX {@link ContentHandler} that serializes the events it receives as
+     * textual HTML (output method "html", no indentation, "utf-8" encoding) and writes
+     * the text to {@code writer}.
+     *
+     * @param writer destination for the generated HTML text; typically a StringWriter
+     * @return a ContentHandler suitable for passing to parse() methods
+     * @throws Exception if the transformer handler cannot be created or configured
+     */
+    private ContentHandler makeHtmlTransformer(Writer writer) throws Exception {
+        TransformerHandler htmlSerializer =
+                ((SAXTransformerFactory) SAXTransformerFactory.newInstance())
+                        .newTransformerHandler();
+        htmlSerializer.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
+        htmlSerializer.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+        htmlSerializer.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");
+        // The result is attached last, after all output properties have been set.
+        htmlSerializer.setResult(new StreamResult(writer));
+        return htmlSerializer;
+    }
+
+    /**
+     * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
+     * <p>
+     * With markup included, the serialized output must keep the structural elements
+     * and the real content while omitting the words "boilerplate" and "footer".
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
+     */
+    @Test
+    public void testBoilerplateWithMarkup() throws Exception {
+        String path = "/test-documents/boilerplate.html";
+
+        Metadata metadata = new Metadata();
+        StringWriter sw = new StringWriter();
+        ContentHandler ch = makeHtmlTransformer(sw);
+        BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
+        bpch.setIncludeMarkup(true);
+
+        new HtmlParser().parse(
+                HtmlParserTest.class.getResourceAsStream(path),
+                bpch, metadata, new ParseContext());
+
+        String content = sw.toString();
+        assertTrue("Has empty table elements",
+                content.contains("<body><table><tr><td><table><tr><td>"));
+        assertTrue("Has empty a element",
+                content.contains("<a shape=\"rect\" href=\"Main.php\"/>"));
+        assertTrue("Has real content", content.contains("<p>This is the real meat"));
+        assertTrue("Ends with appropriate HTML", content.endsWith("</p></body></html>"));
+        assertFalse(content.contains("boilerplate"));
+        assertFalse(content.contains("footer"));
+    }
+
+    /**
+     * Test case for TIKA-434 - Pushback buffer overflow in TagSoup.
+     * Parsing the sample document must complete and return a non-null string.
+     */
+    @Test
+    public void testPushback() throws IOException, TikaException {
+        Tika tika = new Tika();
+        String extracted = tika.parseToString(
+                HtmlParserTest.class.getResourceAsStream("/test-documents/tika434.html"),
+                new Metadata());
+        assertNotNull(extracted);
+    }
+
+    /**
+     * Test case for TIKA-869
+     * IdentityHtmlMapper needs to lower-case tag names.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-869">TIKA-869</a>
+     */
+    @Test
+    public void testIdentityMapper() throws Exception {
+        final String html = "<html><head><title>Title</title></head>" +
+                "<body></body></html>";
+
+        // Route the parse through the identity mapper instead of the default one.
+        ParseContext identityContext = new ParseContext();
+        identityContext.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
+
+        StringWriter serialized = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(html.getBytes(UTF_8)),
+                makeHtmlTransformer(serialized), new Metadata(), identityContext);
+
+        // Make sure we don't get <body><BODY/></body>
+        assertTrue(serialized.toString().contains("<body/>"));
+    }
+
+    /**
+     * Test case for TIKA-889
+     * XHTMLContentHandler won't emit newline when html element matches ENDLINE set.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-889">TIKA-889</a>
+     */
+    @Test
+    public void testNewlineAndIndent() throws Exception {
+        final String html = "<html><head><title>Title</title></head>" +
+                "<body><ul><li>one</li></ul></body></html>";
+
+        BodyContentHandler handler = new BodyContentHandler();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(html.getBytes(UTF_8)),
+                handler, new Metadata(), new ParseContext());
+
+        // Make sure we get <tab>, "one", newline, newline.
+        // The expected text contains no regex metacharacters, so an exact comparison
+        // is equivalent to the full-string regex match and shows the actual value on failure.
+        assertEquals("\tone\n\n", handler.toString());
+    }
+
+    /**
+     * Test case for TIKA-961
+     * <p>
+     * Checks the text produced by BoilerpipeContentHandler (markup included) wrapped
+     * around a BodyContentHandler: adjacent list items stay separated by a newline and
+     * the Chinese sentence comes through without inserted whitespace.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-961">TIKA-961</a>
+     */
+    @Test
+    public void testBoilerplateWhitespace() throws Exception {
+        String path = "/test-documents/boilerplate-whitespace.html";
+
+        Metadata metadata = new Metadata();
+        BodyContentHandler handler = new BodyContentHandler();
+
+        BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
+        bpHandler.setIncludeMarkup(true);
+
+        new HtmlParser().parse(
+                HtmlParserTest.class.getResourceAsStream(path),
+                bpHandler, metadata, new ParseContext());
+
+        String content = handler.toString();
+
+        // Should not contain item_aitem_b
+        assertFalse(content.contains("item_aitem_b"));
+
+        // Should contain the two list items with a newline in between.
+        assertContains("item_a\nitem_b", content);
+
+        // Should contain 有什么需要我帮你的 (can i help you) without whitespace.
+        // Written with Unicode escapes so the expected text survives any re-encoding
+        // of this source file.
+        assertContains("\u6709\u4ec0\u4e48\u9700\u8981\u6211\u5e2e\u4f60\u7684", content);
+    }
+
+    /**
+     * Test case for TIKA-983:  HTML parser should add Open Graph meta tag data to
+     * Metadata returned by parser
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-983">TIKA-983</a>
+     */
+    @Test
+    public void testOpenGraphMetadata() throws Exception {
+        String test1 =
+                "<html><head><meta property=\"og:description\""
+                        + " content=\"some description\" />"
+                        + "<meta property=\"og:image\" content=\"http://example.com/image1.jpg\" />"
+                        + "<meta property=\"og:image\" content=\"http://example.com/image2.jpg\" />"
+                        + "<title>hello</title>"
+                        + "</head><body></body></html>";
+        Metadata metadata = new Metadata();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test1.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("some description", metadata.get("og:description"));
+        // Two og:image tags were supplied, so the key must hold more than one value.
+        assertTrue(metadata.isMultiValued("og:image"));
+    }
+
+    // TIKA-1011: parsing the sample .mhtml document must return a non-null string
+    @Test
+    public void testUserDefinedCharset() throws Exception {
+        Tika tika = new Tika();
+        String extracted = tika.parseToString(
+                HtmlParserTest.class.getResourceAsStream(
+                        "/test-documents/testUserDefinedCharset.mhtml"),
+                new Metadata());
+        assertNotNull(extracted);
+    }
+
+    //TIKA-1001
+    @Test
+    public void testNoisyMetaCharsetHeaders() throws Exception {
+        Tika tika = new Tika();
+        String hit = "\u0623\u0639\u0631\u0628";
+
+        for (int i = 1; i <= 4; i++) {
+            String fileName = "/test-documents/testHTMLNoisyMetaEncoding_" + i 
+ ".html";
+            String content = tika.parseToString(
+                    HtmlParserTest.class.getResourceAsStream(fileName));
+            assertTrue("testing: " + fileName, content.contains(hit));
+        }
+    }
+
+    // TIKA-1193: a Schema placed in the ParseContext changes how the markup is parsed
+    @Test
+    public void testCustomHtmlSchema() throws Exception {
+        // Default schema does not allow tables inside anchors
+        final String markup =
+                "<html><body><a><table><tr><td>text</tr></tr></table></a></body></html>";
+        final Metadata metadata = new Metadata();
+
+        LinkContentHandler defaultLinks = new LinkContentHandler();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(markup.getBytes(ISO_8859_1)),
+                defaultLinks, metadata, new ParseContext());
+
+        // Expect no anchor text
+        assertEquals("", defaultLinks.getLinks().get(0).getText());
+
+        // We'll change the schema to allow tables inside anchors!
+        Schema permissive = new HTMLSchema();
+        permissive.elementType("a", HTMLSchema.M_ANY, 65535, 0);
+        ParseContext customContext = new ParseContext();
+        customContext.set(Schema.class, permissive);
+
+        // Same markup and same Metadata instance, but a fresh handler and the custom schema.
+        LinkContentHandler customLinks = new LinkContentHandler();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(markup.getBytes(ISO_8859_1)),
+                customLinks, metadata, customContext);
+
+        // Expect anchor text
+        assertEquals("\ttext\n\n", customLinks.getLinks().get(0).getText());
+    }
+
+    /**
+     * Test case for TIKA-820:  Locator is unset for HTML parser
+     * <p>
+     * Records the line and column reported by the document Locator when the text
+     * "Test Indexation Html" is delivered to characters(), then checks them against
+     * the expected position in testHTML.html.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-820">TIKA-820</a>
+     */
+    @Test
+    public void testLocator() throws Exception {
+        // Indices into textPosition.
+        final int line = 0;
+        final int col = 1;
+        final int[] textPosition = new int[2];
+
+        new HtmlParser().parse(
+                HtmlParserTest.class.getResourceAsStream("/test-documents/testHTML.html"),
+                new ContentHandler() {
+                    Locator locator;
+
+                    @Override
+                    public void setDocumentLocator(Locator locator) {
+                        this.locator = locator;
+                    }
+
+                    @Override
+                    public void startDocument() throws SAXException {
+                    }
+
+                    @Override
+                    public void endDocument() throws SAXException {
+                    }
+
+                    @Override
+                    public void startPrefixMapping(String prefix, String uri)
+                            throws SAXException {
+                    }
+
+                    @Override
+                    public void endPrefixMapping(String prefix)
+                            throws SAXException {
+                    }
+
+                    @Override
+                    public void startElement(String uri, String localName,
+                                             String qName, Attributes atts) throws SAXException {
+                    }
+
+                    @Override
+                    public void endElement(String uri, String localName,
+                                           String qName) throws SAXException {
+                    }
+
+                    @Override
+                    public void characters(char[] ch, int start, int length)
+                            throws SAXException {
+                        // Capture the position only for the marker text, and only if
+                        // the parser actually supplied a locator.
+                        String text = new String(ch, start, length);
+                        if (text.equals("Test Indexation Html") && locator != null) {
+                            textPosition[line] = locator.getLineNumber();
+                            textPosition[col] = locator.getColumnNumber();
+                        }
+                    }
+
+                    @Override
+                    public void ignorableWhitespace(char[] ch, int start,
+                                                    int length) throws SAXException {
+                    }
+
+                    @Override
+                    public void processingInstruction(String target, String data)
+                            throws SAXException {
+                    }
+
+                    @Override
+                    public void skippedEntity(String name) throws SAXException {
+                    }
+                },
+                new Metadata(),
+                new ParseContext());
+
+        // The text occurs at line 24 (if lines start at 0) or 25 (if lines start at 1).
+        assertEquals(24, textPosition[line]);
+        // The column reported seems fuzzy, just test it is close enough.
+        assertTrue(Math.abs(textPosition[col] - 47) < 10);
+    }
+
+
+    /**
+     * Test case for TIKA-1303: HTML parse should use the first title tag to set value
+     * in meta data and ignore any subsequent title tags found in HTML.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-1303">TIKA-1303</a>
+     */
+    @Test
+    public void testFirstTitleValueisSetToMetadata() throws Exception {
+        String test = "<html><title>Simple Content</title><body><h1></h1>"
+                + "<title>TitleToIgnore</title></body></html>";
+        Metadata metadata = new Metadata();
+
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+
+        // Expecting first title to be set in meta data and second one to be ignored.
+        assertEquals("Simple Content", metadata.get(TikaCoreProperties.TITLE));
+    }
+
+    /**
+     * TIKA-1519: the content of an http-equiv content-type meta tag is expected as the
+     * content type hint, while the reported content type stays
+     * "text/html; charset=ISO-8859-1" in all three cases below.
+     */
+    @Test
+    public void testMisleadingMetaContentTypeTags() throws Exception {
+        //TIKA-1519
+
+        // Case 1: unknown charset name in the meta tag.
+        String test = "<html><head><meta http-equiv=\"content-type\" "
+                + "content=\"text/html; charset=UTF-ELEVEN\">" +
+                "</head><title>title</title><body>body</body></html>";
+        Metadata metadata = new Metadata();
+
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("text/html; charset=UTF-ELEVEN",
+                metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
+        assertEquals("text/html; charset=ISO-8859-1",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+        // Case 2: meta tag claims a different media type altogether.
+        test = "<html><head><meta http-equiv=\"content-type\" "
+                + "content=\"application/pdf\">" +
+                "</head><title>title</title><body>body</body></html>";
+        metadata = new Metadata();
+
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("application/pdf",
+                metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
+        assertEquals("text/html; charset=ISO-8859-1",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+        //test two content values; the second one is the expected hint
+        test = "<html><head><meta http-equiv=\"content-type\" "
+                + "content=\"application/pdf\" content=\"application/ms-word\">" +
+                "</head><title>title</title><body>body</body></html>";
+        metadata = new Metadata();
+
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("application/ms-word",
+                metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
+        assertEquals("text/html; charset=ISO-8859-1",
+                metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    /**
+     * Runs two XHTML documents through AutoDetectParser: one with a valid charset in
+     * its http-equiv meta tag and one with a bogus charset. In both cases the meta
+     * content is expected as the content type hint and the reported content type is
+     * expected to be "application/xhtml+xml; charset=ISO-8859-1".
+     */
+    @Test
+    public void testXHTMLWithMisleading() throws Exception {
+        //first test an acceptable XHTML header with http-equiv tags
+        String test = "<?xml version=\"1.0\" ?>" +
+                "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" " +
+                "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" +
+                "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n" +
+                "<head>\n" +
+                "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\" />\n" +
+                "<title>title</title></head><body>body</body></html>";
+        Metadata metadata = new Metadata();
+        new AutoDetectParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+
+        assertEquals("text/html; charset=iso-8859-1",
+                metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
+        assertEquals("application/xhtml+xml; charset=ISO-8859-1",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+        // Same document, but the meta tag now names a charset that does not exist.
+        test = "<?xml version=\"1.0\" ?>" +
+                "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" " +
+                "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" +
+                "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n" +
+                "<head>\n" +
+                "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-NUMBER_SEVEN\" />\n" +
+                "<title>title</title></head><body>body</body></html>";
+        metadata = new Metadata();
+        new AutoDetectParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+
+        assertEquals("text/html; charset=iso-NUMBER_SEVEN",
+                metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
+        assertEquals("application/xhtml+xml; charset=ISO-8859-1",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+    }
+}


Added: 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,376 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mail;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+import static org.junit.Assume.assumeTrue;
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+
+import org.apache.james.mime4j.stream.MimeConfig;
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.junit.Test;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class RFC822ParserTest extends TikaTest {
+
+    /**
+     * Opens a test resource through the current thread's context class loader and
+     * fails the calling test if the resource cannot be found.
+     *
+     * @param name resource name passed to the class loader
+     * @return the opened, non-null stream
+     */
+    private static InputStream getStream(String name) {
+        ClassLoader contextLoader = Thread.currentThread().getContextClassLoader();
+        InputStream resource = contextLoader.getResourceAsStream(name);
+        assertNotNull("Test file not found " + name, resource);
+        return resource;
+    }
+
+    /**
+     * Parses a single-part message and checks both the SAX events sent to the handler
+     * and the creator/title/subject metadata.
+     */
+    @Test
+    public void testSimple() throws Exception {
+        Parser parser = new RFC822Parser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/testRFC822");
+        ContentHandler handler = mock(DefaultHandler.class);
+
+        // Exceptions propagate to JUnit so the full stack trace is reported,
+        // instead of being reduced to e.getMessage() by a catch-and-fail.
+        parser.parse(stream, handler, metadata, new ParseContext());
+        verify(handler).startDocument();
+        //just one body
+        verify(handler).startElement(
+                eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
+        verify(handler).endElement(XHTMLContentHandler.XHTML, "p", "p");
+        //no multi-part body parts
+        verify(handler, never()).startElement(
+                eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
+        verify(handler, never()).endElement(XHTMLContentHandler.XHTML, "div", "div");
+        verify(handler).endDocument();
+        //note no leading spaces, and no quotes
+        // NOTE(review): the address below looks redacted by the mail archive - restore
+        // the real expected value from the repository before relying on this assertion.
+        assertEquals("Julien Nioche (JIRA) <[email protected]>",
+                metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
+                metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
+                metadata.get(Metadata.SUBJECT));
+    }
+
+    /**
+     * Parses a multipart message twice: first against a mock handler to count the
+     * paragraph events, then against a BodyContentHandler to inspect the text.
+     */
+    @Test
+    public void testMultipart() throws Exception {
+        Parser parser = new RFC822Parser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/testRFC822-multipart");
+        ContentHandler handler = mock(XHTMLContentHandler.class);
+
+        // Exceptions propagate to JUnit so the full stack trace is reported.
+        parser.parse(stream, handler, metadata, new ParseContext());
+        verify(handler).startDocument();
+        int multipackExpectedTimes = 5;
+        verify(handler, times(multipackExpectedTimes)).startElement(
+                eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
+        verify(handler, times(multipackExpectedTimes)).endElement(
+                XHTMLContentHandler.XHTML, "p", "p");
+        verify(handler).endDocument();
+
+        //repeat, this time looking at content
+        parser = new RFC822Parser();
+        metadata = new Metadata();
+        stream = getStream("test-documents/testRFC822-multipart");
+        handler = new BodyContentHandler();
+        parser.parse(stream, handler, metadata, new ParseContext());
+        // Both text bodies must be present; the encoded image data must not leak into the text.
+        String bodyText = handler.toString();
+        assertTrue(bodyText.contains("body 1"));
+        assertTrue(bodyText.contains("body 2"));
+        assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif
+    }
+
+    /**
+     * Checks decoding of a quoted-printable body, including UTF-8 bytes into Unicode.
+     */
+    @Test
+    public void testQuotedPrintable() throws Exception {
+        Parser parser = new RFC822Parser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/testRFC822_quoted");
+        ContentHandler handler = new BodyContentHandler();
+
+        // Exceptions propagate to JUnit so the full stack trace is reported.
+        parser.parse(stream, handler, metadata, new ParseContext());
+        //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
+        String bodyText = handler.toString();
+        assertTrue(bodyText.contains("D\u00FCsseldorf has non-ascii."));
+        assertTrue(bodyText.contains("Lines can be split like this."));
+        assertTrue(bodyText.contains("Spaces at the end of a line \r\nmust be encoded.\r\n"));
+        assertFalse(bodyText.contains("=")); //there should be no escape sequences
+    }
+
+    /**
+     * Checks decoding of a base64 body, including ISO-8859-1 bytes into Unicode.
+     */
+    @Test
+    public void testBase64() throws Exception {
+        Parser parser = new RFC822Parser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/testRFC822_base64");
+        ContentHandler handler = new BodyContentHandler();
+
+        // Exceptions propagate to JUnit so the full stack trace is reported.
+        parser.parse(stream, handler, metadata, new ParseContext());
+        //tests correct decoding of base64 text, including ISO-8859-1 bytes into Unicode
+        assertContains("Here is some text, with international characters, voil\u00E0!",
+                handler.toString());
+    }
+
+    /**
+     * Checks decoding of internationalized headers, both quoted-printable (Q)
+     * and Base64 (B).
+     */
+    @Test
+    public void testI18NHeaders() throws Exception {
+        Parser parser = new RFC822Parser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/testRFC822_i18nheaders");
+        ContentHandler handler = mock(DefaultHandler.class);
+
+        // Exceptions propagate to JUnit so the full stack trace is reported.
+        parser.parse(stream, handler, metadata, new ParseContext());
+        //tests correct decoding of internationalized headers, both
+        //quoted-printable (Q) and Base64 (B).
+        // NOTE(review): the address below looks redacted by the mail archive - restore
+        // the real expected value from the repository before relying on this assertion.
+        assertEquals("Keld J\u00F8rn Simonsen <[email protected]>",
+                metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("If you can read this you understand the example.",
+                metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("If you can read this you understand the example.",
+                metadata.get(Metadata.SUBJECT));
+    }
+
+    /**
+     * The From header of this sample is not in the usual form; the creator, title
+     * and subject must still be populated.
+     * See TIKA-618
+     */
+    @Test
+    public void testUnusualFromAddress() throws Exception {
+        final String expectedTitle = "Air Permit Programs | Air & Radiation | US EPA";
+
+        Metadata extracted = new Metadata();
+        ContentHandler sink = mock(DefaultHandler.class);
+        InputStream message = getStream("test-documents/testRFC822_oddfrom");
+        new RFC822Parser().parse(message, sink, extracted, new ParseContext());
+
+        assertEquals("Saved by Windows Internet Explorer 7",
+                extracted.get(TikaCoreProperties.CREATOR));
+        assertEquals(expectedTitle, extracted.get(TikaCoreProperties.TITLE));
+        assertEquals(expectedTitle, extracted.get(Metadata.SUBJECT));
+    }
+
+    /**
+     * Test for TIKA-640, increase header max beyond 10k bytes.
+     * <p>
+     * A very long From header must be rejected with a TikaException under the default
+     * configuration and accepted once a MimeConfig with both limits set to -1 is
+     * placed in the ParseContext.
+     */
+    @Test
+    public void testLongHeader() throws Exception {
+        final String chunk = "really really really really really really long name "; //len > 50
+        StringBuilder repeated = new StringBuilder();
+        int appended = 0;
+        while (appended < 2000) {
+            repeated.append(chunk);
+            appended++;
+        }
+        String name = repeated.toString();
+        byte[] data = ("From: " + name + "\r\n\r\n").getBytes(US_ASCII);
+
+        ParseContext context = new ParseContext();
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new DefaultHandler();
+        Parser parser = new RFC822Parser();
+
+        // First attempt: no MimeConfig in the context, the parse is expected to fail.
+        try {
+            parser.parse(new ByteArrayInputStream(data), handler, metadata, context);
+            fail();
+        } catch (TikaException expected) {
+        }
+
+        // Second attempt: same parser, handler and metadata, limits set to -1.
+        MimeConfig unlimited = new MimeConfig();
+        unlimited.setMaxHeaderLen(-1);
+        unlimited.setMaxLineLen(-1);
+        context.set(MimeConfig.class, unlimited);
+        parser.parse(new ByteArrayInputStream(data), handler, metadata, context);
+        assertEquals(name.trim(), metadata.get(TikaCoreProperties.CREATOR));
+    }
+
+    /**
+     * Test for TIKA-678 - not all headers may be present
+     */
+    @Test
+    public void testSomeMissingHeaders() throws Exception {
+        Parser parser = new RFC822Parser();
+        Metadata metadata = new Metadata();
+        InputStream stream = 
getStream("test-documents/testRFC822-limitedheaders");
+        ContentHandler handler = new BodyContentHandler();
+
+        parser.parse(stream, handler, metadata, new ParseContext());
+        assertEquals(true, metadata.isMultiValued(TikaCoreProperties.CREATOR));
+        assertEquals("xyz", metadata.getValues(TikaCoreProperties.CREATOR)[0]);
+        assertEquals("abc", metadata.getValues(TikaCoreProperties.CREATOR)[1]);
+        assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_FROM));
+        assertEquals("xyz", metadata.getValues(Metadata.MESSAGE_FROM)[0]);
+        assertEquals("abc", metadata.getValues(Metadata.MESSAGE_FROM)[1]);
+        assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_TO));
+        assertEquals("abc", metadata.getValues(Metadata.MESSAGE_TO)[0]);
+        assertEquals("def", metadata.getValues(Metadata.MESSAGE_TO)[1]);
+        assertEquals("abcd", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("abcd", metadata.get(Metadata.SUBJECT));
+        assertContains("bar biz bat", handler.toString());
+    }
+
+    /**
+     * Test TIKA-1028 - If the mail contains an encrypted attachment (or
+     * an attachment that otherwise triggers an error), parsing should carry
+     * on for the remainder regardless.
+     */
+    @Test
+    public void testEncryptedZipAttachment() throws Exception {
+        Parser parser = new RFC822Parser();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+
+        ContentHandler handler = new BodyContentHandler();
+        parser.parse(getStream("test-documents/testRFC822_encrypted_zip"),
+                handler, metadata, context);
+        String withoutPassword = handler.toString();
+
+        // Check we got the metadata
+        // NOTE(review): the address below looks redacted by the mail archive - restore
+        // the real expected value from the repository before relying on this assertion.
+        assertEquals("Juha Haaga <[email protected]>", metadata.get(Metadata.MESSAGE_FROM));
+        assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE));
+
+        // Check we got the message text, for both Plain Text and HTML
+        assertContains("Includes encrypted zip file", withoutPassword);
+        assertContains("password is \"test\".", withoutPassword);
+        assertContains("This is the Plain Text part", withoutPassword);
+        assertContains("This is the HTML part", withoutPassword);
+
+        // We won't get the contents of the zip file, but we will get the name
+        assertContains("text.txt", withoutPassword);
+        assertNotContained("ENCRYPTED ZIP FILES", withoutPassword);
+
+        // Try again, this time with the password supplied
+        // Check that we also get the zip's contents as well
+        context.set(PasswordProvider.class, new PasswordProvider() {
+            public String getPassword(Metadata metadata) {
+                return "test";
+            }
+        });
+        handler = new BodyContentHandler();
+        parser.parse(getStream("test-documents/testRFC822_encrypted_zip"),
+                handler, metadata, context);
+        String withPassword = handler.toString();
+
+        assertContains("Includes encrypted zip file", withPassword);
+        assertContains("password is \"test\".", withPassword);
+        assertContains("This is the Plain Text part", withPassword);
+        assertContains("This is the HTML part", withPassword);
+
+        // We do get the name of the file in the encrypted zip file
+        assertContains("text.txt", withPassword);
+
+        // TODO Upgrade to a version of Commons Compress with Encryption
+        //  support, then verify we get the contents of the text file
+        //  held within the encrypted zip
+        assumeTrue(false); // No Zip Encryption support yet
+        assertContains("TEST DATA FOR TIKA.", withPassword);
+        assertContains("ENCRYPTED ZIP FILES", withPassword);
+        assertContains("TIKA-1028", withPassword);
+    }
+
+    /**
+     * Test TIKA-1028 - Ensure we can get the contents of an
+     * un-encrypted zip file attached to a mail.
+     */
+    @Test
+    public void testNormalZipAttachment() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        new RFC822Parser().parse(getStream("test-documents/testRFC822_normal_zip"),
+                handler, metadata, new ParseContext());
+        String text = handler.toString();
+
+        // Check we got the metadata
+        // NOTE(review): the address below looks redacted by the mail archive - restore
+        // the real expected value from the repository before relying on this assertion.
+        assertEquals("Juha Haaga <[email protected]>", metadata.get(Metadata.MESSAGE_FROM));
+        assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE));
+
+        // Check we got the message text, for both Plain Text and HTML
+        assertContains("Includes a normal, unencrypted zip file", text);
+        assertContains("This is the Plain Text part", text);
+        assertContains("This is the HTML part", text);
+
+        // We get both name and contents of the zip file's contents
+        assertContains("text.txt", text);
+        assertContains("TEST DATA FOR TIKA.", text);
+        assertContains("This is text inside an unencrypted zip file", text);
+        assertContains("TIKA-1028", text);
+    }
+
+    /**
+     * TIKA-1222 When requested, ensure that the various attachments of
+     * the mail come through properly as embedded resources.
+     */
+    @Test
+    public void testGetAttachmentsAsEmbeddedResources() throws Exception {
+        TrackingHandler tracker = new TrackingHandler();
+        ContainerExtractor ex = new ParserContainerExtractor();
+        InputStream raw = getStream("test-documents/testRFC822-multipart");
+        try (TikaInputStream tis = TikaInputStream.get(raw)) {
+            assertTrue(ex.isSupported(tis));
+            ex.extract(tis, ex, tracker);
+        }
+
+        // Check we found all 3 parts
+        assertEquals(3, tracker.filenames.size());
+        assertEquals(3, tracker.mediaTypes.size());
+
+        // No filenames available
+        for (int part = 0; part < 3; part++) {
+            assertEquals(null, tracker.filenames.get(part));
+        }
+        // Types are available
+        assertEquals(MediaType.TEXT_PLAIN, tracker.mediaTypes.get(0));
+        assertEquals(MediaType.TEXT_HTML, tracker.mediaTypes.get(1));
+        assertEquals(MediaType.image("gif"), tracker.mediaTypes.get(2));
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+import java.util.Map;
+
+import org.apache.tika.detect.TypeDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests for {@link MboxParser}: each test parses one mbox resource from
+ * {@code /test-documents} and checks the extracted body text and/or the
+ * per-mail metadata exposed by {@code getTrackingMetadata()}.
+ */
+public class MboxParserTest {
+
+    /** Context handed to the mbox parser; carries the parser used for the embedded mails. */
+    protected ParseContext recursingContext;
+    private Parser autoDetectParser;
+    private TypeDetector typeDetector;
+    private MboxParser mboxParser;
+
+    /**
+     * Opens a test resource through this test class.
+     *
+     * <p>The lookup must go through {@code MboxParserTest.class} itself.
+     * {@code MboxParserTest.class.getClass()} is {@code java.lang.Class}, whose
+     * resources are resolved by the JDK's own loader/module, so the test
+     * documents are only found by accident (flat system class path) and not at
+     * all when the tests run in a separate class loader or on a modular JDK.
+     *
+     * @param name resource name; a leading {@code /} makes it absolute
+     * @return the resource stream, or {@code null} if the resource is not found
+     */
+    private static InputStream getStream(String name) {
+        return MboxParserTest.class.getResourceAsStream(name);
+    }
+
+    /**
+     * Builds the parse context (with an {@link AutoDetectParser} registered as
+     * the {@link Parser}) and a fresh {@link MboxParser} with tracking enabled.
+     */
+    @Before
+    public void setUp() throws Exception {
+        typeDetector = new TypeDetector();
+        autoDetectParser = new AutoDetectParser(typeDetector);
+        recursingContext = new ParseContext();
+        recursingContext.set(Parser.class, autoDetectParser);
+
+        mboxParser = new MboxParser();
+        // Presumably what makes getTrackingMetadata() collect per-mail metadata.
+        mboxParser.setTracking(true);
+    }
+
+    /** Two plain mails: both bodies are extracted and both are tracked. */
+    @Test
+    public void testSimple() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/simple.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        String content = handler.toString();
+        assertContains("Test content 1", content);
+        assertContains("Test content 2", content);
+        assertEquals("application/mbox", metadata.get(Metadata.CONTENT_TYPE));
+
+        Map<Integer, Metadata> mailsMetadata = mboxParser.getTrackingMetadata();
+        assertEquals("Nb. Of mails", 2, mailsMetadata.size());
+
+        Metadata mail1 = mailsMetadata.get(0);
+        assertEquals("message/rfc822", mail1.get(Metadata.CONTENT_TYPE));
+        assertEquals("envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009",
+                mail1.get("MboxParser-from"));
+
+        Metadata mail2 = mailsMetadata.get(1);
+        assertEquals("message/rfc822", mail2.get(Metadata.CONTENT_TYPE));
+        assertEquals("envelope-sender-mailbox-name Mon Jun 01 11:00:00 2010",
+                mail2.get("MboxParser-from"));
+    }
+
+    /** A single mail: its headers must show up in the tracked metadata. */
+    @Test
+    public void testHeaders() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/headers.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertContains("Test content", handler.toString());
+        assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size());
+
+        Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0);
+
+        // NOTE(review): "[email protected]" looks like address redaction by the
+        // mail archive - confirm the real expected addresses against the repository.
+        assertEquals("2009-06-10T03:58:45Z",
+                mailMetadata.get(TikaCoreProperties.CREATED));
+        assertEquals("<[email protected]>",
+                mailMetadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("subject", mailMetadata.get(Metadata.SUBJECT));
+        assertEquals("<[email protected]>", mailMetadata.get(Metadata.AUTHOR));
+        assertEquals("message/rfc822",
+                mailMetadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("[email protected]", mailMetadata.get("Message-From"));
+        assertEquals("<[email protected]>",
+                mailMetadata.get("MboxParser-return-path"));
+    }
+
+    /** A header continued over several lines must be reported as one value. */
+    @Test
+    public void testMultilineHeader() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/multiline.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size());
+
+        Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0);
+        assertEquals("from xxx by xxx with xxx; date",
+                mailMetadata.get("MboxParser-received"));
+    }
+
+    /** Quoted ("> ...") body lines must be kept in the extracted text. */
+    @Test
+    public void testQuoted() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/quoted.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertContains("Test content", handler.toString());
+        assertContains("> quoted stuff", handler.toString());
+    }
+
+    /** A three-mail mailbox: checks the first mail's metadata and body text. */
+    @Test
+    public void testComplex() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/complex.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("Nb. Of mails", 3, mboxParser.getTrackingMetadata().size());
+
+        // NOTE(review): "[email protected]" looks like address redaction by the
+        // mail archive - confirm the real expected addresses against the repository.
+        Metadata firstMail = mboxParser.getTrackingMetadata().get(0);
+        assertEquals("Re: question about when shuffle/sort start working",
+                firstMail.get(Metadata.SUBJECT));
+        assertEquals("Re: question about when shuffle/sort start working",
+                firstMail.get(TikaCoreProperties.TITLE));
+        assertEquals("Jothi Padmanabhan <[email protected]>",
+                firstMail.get(Metadata.AUTHOR));
+        assertEquals("Jothi Padmanabhan <[email protected]>",
+                firstMail.get(TikaCoreProperties.CREATOR));
+        assertEquals("[email protected]",
+                firstMail.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
+
+        assertContains("When a Mapper completes", handler.toString());
+    }
+
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.ToHTMLContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class OutlookPSTParserTest extends TikaTest {
+
+  private Parser parser = new OutlookPSTParser();
+
+  @Test
+  public void testAccept() throws Exception {
+    
assertTrue((parser.getSupportedTypes(null).contains(MediaType.application("vnd.ms-outlook-pst"))));
+  }
+
+  @Test
+  public void testParse() throws Exception {
+    Parser pstParser = new AutoDetectParser();
+    Metadata metadata = new Metadata();
+    ContentHandler handler = new ToHTMLContentHandler();
+
+    ParseContext context = new ParseContext();
+    EmbeddedTrackingExtrator trackingExtrator = new 
EmbeddedTrackingExtrator(context);
+    context.set(EmbeddedDocumentExtractor.class, trackingExtrator);
+    context.set(Parser.class, new AutoDetectParser());
+
+    pstParser.parse(getResourceAsStream("/test-documents/testPST.pst"), 
handler, metadata, context);
+
+    String output = handler.toString();
+
+    assertFalse(output.isEmpty());
+    assertTrue(output.contains("<meta name=\"Content-Length\" 
content=\"271360\">"));
+    assertTrue(output.contains("<meta name=\"Content-Type\" 
content=\"application/vnd.ms-outlook-pst\">"));
+
+    assertTrue(output.contains("<body><div class=\"email-folder\"><h1>"));
+    assertTrue(output.contains("<div class=\"embedded\" 
id=\"&lt;[email protected]&gt;\"><h1>Re: Feature Generators</h1>"));
+    assertTrue(output.contains("<div class=\"embedded\" 
id=\"&lt;[email protected]&gt;\"><h1>Re:
 init tokenizer fails: \"Bad type in putfield/putstatic\"</h1>"));
+    assertTrue(output.contains("Gary Murphy commented on TIKA-1250:"));
+
+    assertTrue(output.contains("<div class=\"email-folder\"><h1>Racine (pour 
la recherche)</h1>"));
+
+
+    List<Metadata> metaList = trackingExtrator.trackingMetadata;
+    assertEquals(6, metaList.size());
+
+    Metadata firstMail = metaList.get(0);
+    assertEquals("JÃ¶rn Kottmann", firstMail.get(TikaCoreProperties.CREATOR));
+    assertEquals("Re: Feature Generators", 
firstMail.get(TikaCoreProperties.TITLE));
+    assertEquals("[email protected]", firstMail.get("senderEmailAddress"));
+    assertEquals("[email protected]", firstMail.get("displayTo"));
+    assertEquals("", firstMail.get("displayCC"));
+    assertEquals("", firstMail.get("displayBCC"));
+  }
+
+
+  private class EmbeddedTrackingExtrator extends 
ParsingEmbeddedDocumentExtractor {
+    List<Metadata> trackingMetadata = new ArrayList<Metadata>();
+
+    public EmbeddedTrackingExtrator(ParseContext context) {
+      super(context);
+    }
+
+    @Override
+    public boolean shouldParseEmbedded(Metadata metadata) {
+      return true;
+    }
+
+    @Override
+    public void parseEmbedded(InputStream stream, ContentHandler handler, 
Metadata metadata, boolean outputHtml) throws SAXException, IOException {
+      this.trackingMetadata.add(metadata);
+      super.parseEmbedded(stream, handler, metadata, outputHtml);
+    }
+
+  }
+}

svn commit: r1723223 [31/32] - in /tika/branches/2.x: tika-core/src/test/resources/META-INF/ tika-core/src/test/resources/META-INF/services/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-module/src/ ti...

Reply via email to