http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/pom.xml b/tika-parser-modules/pom.xml index 88205ca..b95d8a2 100644 --- a/tika-parser-modules/pom.xml +++ b/tika-parser-modules/pom.xml @@ -150,32 +150,6 @@ <plugins> <plugin> <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-dependency-plugin</artifactId> - <version>2.10</version> - <executions> - <execution> - <id>unpack</id> - <phase>compile</phase> - <goals> - <goal>unpack</goal> - </goals> - <configuration> - <artifactItems> - <artifactItem> - <groupId>${project.groupId}</groupId> - <artifactId>tika-test-resources</artifactId> - <version>${project.version}</version> - <type>test-jar</type> - <overWrite>true</overWrite> - <outputDirectory>${project.build.testOutputDirectory}</outputDirectory> - </artifactItem> - </artifactItems> - </configuration> - </execution> - </executions> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-jar-plugin</artifactId> <executions> <execution>
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java b/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java index fff644a..63e75a4 100644 --- a/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java +++ b/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java @@ -16,26 +16,26 @@ */ package org.apache.tika.parser.ner; -import org.apache.tika.Tika; -import org.apache.tika.config.TikaConfig; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser; -import org.apache.tika.parser.ner.regex.RegexNERecogniser; -import org.junit.Test; +import static org.junit.Assert.assertTrue; import java.io.ByteArrayInputStream; import java.nio.charset.Charset; import java.util.Arrays; import java.util.HashSet; -import static org.junit.Assert.assertTrue; +import org.apache.tika.Tika; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser; +import org.apache.tika.parser.ner.regex.RegexNERecogniser; +import org.junit.Test; /** *Test case for {@link NamedEntityParser} */ public class NamedEntityParserTest { - public static final String CONFIG_FILE = "tika-config.xml"; + public static final String CONFIG_FILE = "tika-config-for-ner.xml"; @Test public void testParse() throws Exception { 
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java b/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java index 57c2162..257fea8 100644 --- a/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java +++ b/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java @@ -16,11 +16,7 @@ */ package org.apache.tika.parser.ner.regex; -import org.apache.tika.Tika; -import org.apache.tika.config.TikaConfig; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ner.NamedEntityParser; -import org.junit.Test; +import static org.junit.Assert.assertTrue; import java.io.ByteArrayInputStream; import java.nio.charset.StandardCharsets; @@ -28,7 +24,12 @@ import java.util.Arrays; import java.util.HashSet; import java.util.Set; -import static org.junit.Assert.assertTrue; +import org.apache.tika.Tika; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ner.NamedEntityParser; +import org.apache.tika.parser.ner.NamedEntityParserTest; +import org.junit.Test; public class RegexNERecogniserTest { @@ -38,7 +39,7 @@ public class RegexNERecogniserTest { String text = "Hey, Lets meet on this Sunday or MONDAY because i am busy on Saturday"; System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, RegexNERecogniser.class.getName()); - Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml"))); + Tika tika = new Tika(new 
TikaConfig(NamedEntityParserTest.class.getResourceAsStream("tika-config-for-ner.xml"))); Metadata md = new Metadata(); tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md); http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt b/tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt new file mode 100644 index 0000000..e6fa39e --- /dev/null +++ b/tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt @@ -0,0 +1,17 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +WEEK_DAY=(?i)((sun)|(mon)|(tues)|(thurs)|(fri)|((sat)(ur)?))(day)? 
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/tika-config-for-ner.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/tika-config-for-ner.xml b/tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/tika-config-for-ner.xml new file mode 100644 index 0000000..267c399 --- /dev/null +++ b/tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/tika-config-for-ner.xml @@ -0,0 +1,27 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.ner.NamedEntityParser"> + <mime>text/plain</mime> + <mime>text/html</mime> + <mime>application/xhtml+xml</mime> + </parser> + </parsers> + +</properties> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java b/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java index 7ea27fa..d394c61 100644 --- a/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java +++ b/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java @@ -20,10 +20,14 @@ package org.apache.tika.parser.jdbc; import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.Assert.assertEquals; +import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import java.util.ArrayList; import java.util.List; @@ -41,13 +45,27 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; -import org.apache.tika.sax.ToXMLContentHandler; +import org.junit.AfterClass; +import org.junit.BeforeClass; import org.junit.Test; import org.xml.sax.ContentHandler; public class SQLite3ParserTest extends TikaTest { private final static String TEST_FILE_NAME = "testSqlite3b.db"; - private final static 
String TEST_FILE1 = "/test-documents/" + TEST_FILE_NAME; + static Path tmp = null; + @BeforeClass + public static void createTMPFile() throws IOException { + tmp = Files.createTempFile("sqlite-", ""); + Files.copy( + TikaTest.class.getClassLoader().getResourceAsStream("test-documents/"+TEST_FILE_NAME), + tmp, StandardCopyOption.REPLACE_EXISTING); + + } + + @AfterClass + public static void deleteTMPFile() throws IOException { + Files.delete(tmp); + } @Test public void testBasic() throws Exception { @@ -56,18 +74,20 @@ public class SQLite3ParserTest extends TikaTest { //test different types of input streams //actual inputstream, memory buffered bytearray and literal file InputStream[] streams = new InputStream[3]; - streams[0] = getResourceAsStream(TEST_FILE1); + streams[0] = getTestDocumentAsStream(TEST_FILE_NAME); ByteArrayOutputStream bos = new ByteArrayOutputStream(); - IOUtils.copy(getResourceAsStream(TEST_FILE1), bos); + IOUtils.copy(getTestDocumentAsStream(TEST_FILE_NAME), bos); streams[1] = new ByteArrayInputStream(bos.toByteArray()); - streams[2] = TikaInputStream.get(getResourceAsFile(TEST_FILE1)); + streams[2] = TikaInputStream.get(tmp); int tests = 0; + ParseContext context = new ParseContext(); + context.set(Parser.class, p); for (InputStream stream : streams) { Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME); //1) getXML closes the stream //2) getXML runs recursively on the contents, so the embedded docs should show up - XMLResult result = getXML(stream, p, metadata); + XMLResult result = getXML(stream, p, metadata, context); String x = result.xml; //first table name assertContains("<table name=\"my_table1\"><thead><tr>\t<th>INT_COL</th>", x); @@ -106,7 +126,7 @@ public class SQLite3ParserTest extends TikaTest { ContentHandler handler = new BodyContentHandler(-1); ParseContext ctx = new ParseContext(); ctx.set(Parser.class, p); - try (InputStream stream = getResourceAsStream(TEST_FILE1)) { + try (InputStream 
stream = getTestDocumentAsStream(TEST_FILE_NAME)) { p.parse(stream, handler, metadata, ctx); } String s = handler.toString(); @@ -118,14 +138,11 @@ public class SQLite3ParserTest extends TikaTest { //to handle embedded documents @Test public void testNotAddingEmbeddedParserToParseContext() throws Exception { - Parser p = new AutoDetectParser(); - InputStream is = getResourceAsStream(TEST_FILE1); Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME); - ContentHandler handler = new ToXMLContentHandler(); - p.parse(is, handler, metadata, new ParseContext()); - String xml = handler.toString(); + XMLResult r = getXML(TEST_FILE_NAME, new AutoDetectParser(), new Metadata(), new ParseContext()); + String xml = r.xml; //just includes headers for embedded documents assertContains("<table name=\"my_table1\"><thead><tr>", xml); assertContains("<td><span type=\"blob\" column_name=\"BYTES_COL\" row_number=\"0\"><div class=\"package-entry\"><h1>BYTES_COL_0.doc</h1>", xml); @@ -143,7 +160,7 @@ public class SQLite3ParserTest extends TikaTest { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, new BasicContentHandlerFactory( BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1)); - InputStream is = getResourceAsStream(TEST_FILE1); + InputStream is = getTestDocumentAsStream(TEST_FILE_NAME); Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME); wrapper.parse(is, new BodyContentHandler(-1), metadata, new ParseContext()); @@ -176,7 +193,7 @@ public class SQLite3ParserTest extends TikaTest { ParserContainerExtractor ex = new ParserContainerExtractor(); ByteCopyingHandler byteCopier = new ByteCopyingHandler(); - InputStream is = getResourceAsStream(TEST_FILE1); + InputStream is = getTestDocumentAsStream(TEST_FILE_NAME); Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME); ex.extract(TikaInputStream.get(is), ex, byteCopier); @@ -217,9 +234,12 @@ public class 
SQLite3ParserTest extends TikaTest { //4x word files, two docs and two docxs //4x png files, the same image embedded in each of the doc and docx + //not clear why we get an exception on reset if we try + //to get the test file directly ParserContainerExtractor ex = new ParserContainerExtractor(); InputStreamResettingHandler byteCopier = new InputStreamResettingHandler(); - InputStream is = getResourceAsStream(TEST_FILE1); + InputStream is = new BufferedInputStream( + getResourceAsStream("/test-documents/"+TEST_FILE_NAME)); Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME); ex.extract(TikaInputStream.get(is), ex, byteCopier); http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java index 90a3c1a..5f53870 100644 --- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java +++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java @@ -20,11 +20,8 @@ import static java.nio.charset.StandardCharsets.ISO_8859_1; import static org.junit.Assert.assertTrue; import java.io.ByteArrayInputStream; -import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; -import java.net.URL; import java.util.Arrays; import java.util.HashSet; import java.util.List; @@ -34,6 +31,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.regex.Pattern; +import org.apache.tika.TikaTest; import org.apache.tika.exception.TikaException; 
import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; @@ -45,7 +43,7 @@ import org.apache.tika.sax.BodyContentHandler; import org.junit.Test; import org.xml.sax.SAXException; -public class TestChmExtraction { +public class TestChmExtraction extends TikaTest { private final Parser parser = new ChmParser(); @@ -196,12 +194,19 @@ public class TestChmExtraction { @Test public void test_TIKA_1446() throws Exception { - URL chmDir = TestChmExtraction.class.getResource("/test-documents/chm/"); - File chmFolder = new File(chmDir.toURI()); - for (String fileName : chmFolder.list()) { - File file = new File(chmFolder, fileName); - InputStream stream = new FileInputStream(file); - testingChm(stream); + String[] chemFiles = { + "admin.chm", + "cmak_ops.CHM", + "comexp.CHM", + "gpedit.CHM", + "IMJPCL.CHM", + "IMJPCLE.CHM", + "IMTCEN.CHM", + "tcpip.CHM", + "wmicontrol.CHM" + }; + for (String fileName : chemFiles) { + testingChm(getTestDocumentAsStream("chm/"+fileName)); } } } http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java index 4f5bfcd..4b92e88 100644 --- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java +++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java @@ -16,8 +16,6 @@ */ package org.apache.tika.parser.microsoft; -import static org.apache.tika.TikaTest.assertContains; -import static org.apache.tika.TikaTest.assertNotContained; import static org.junit.Assert.assertEquals; import 
static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -25,6 +23,7 @@ import static org.junit.Assert.fail; import java.io.InputStream; import java.util.Locale; +import org.apache.tika.TikaTest; import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.Detector; import org.apache.tika.exception.EncryptedDocumentException; @@ -41,155 +40,139 @@ import org.apache.tika.sax.BodyContentHandler; import org.junit.Test; import org.xml.sax.ContentHandler; -public class ExcelParserTest { +public class ExcelParserTest extends TikaTest { @Test @SuppressWarnings("deprecation") // Checks legacy Tika-1.0 style metadata keys public void testExcelParser() throws Exception { - try (InputStream input = ExcelParserTest.class.getResourceAsStream( - "/test-documents/testEXCEL.xls")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.US); - new OfficeParser().parse(input, handler, metadata, context); - - assertEquals( - "application/vnd.ms-excel", - metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR)); - - // Mon Oct 01 17:13:56 BST 2007 - assertEquals("2007-10-01T16:13:56Z", metadata.get(TikaCoreProperties.CREATED)); - assertEquals("2007-10-01T16:13:56Z", metadata.get(Metadata.CREATION_DATE)); - // Mon Oct 01 17:31:43 BST 2007 - assertEquals("2007-10-01T16:31:43Z", metadata.get(TikaCoreProperties.MODIFIED)); - assertEquals("2007-10-01T16:31:43Z", metadata.get(Metadata.DATE)); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + XMLResult r = getXML("testEXCEL.xls", new OfficeParser(), new Metadata(), context); + + assertEquals( + "application/vnd.ms-excel", + 
r.metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Simple Excel document", r.metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Keith Bennett", r.metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Keith Bennett", r.metadata.get(Metadata.AUTHOR)); + + // Mon Oct 01 17:13:56 BST 2007 + assertEquals("2007-10-01T16:13:56Z", r.metadata.get(TikaCoreProperties.CREATED)); + assertEquals("2007-10-01T16:13:56Z", r.metadata.get(Metadata.CREATION_DATE)); + + // Mon Oct 01 17:31:43 BST 2007 + assertEquals("2007-10-01T16:31:43Z", r.metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("2007-10-01T16:31:43Z", r.metadata.get(Metadata.DATE)); + + String content = r.xml; + assertContains("Sample Excel Worksheet", content); + assertContains("Numbers and their Squares", content); + assertContains("<tr>\t<td />\t<td>Number</td>\t<td>Square", content); + assertContains("9", content); + assertNotContained("9.0", content); + assertContains("196", content); + assertNotContained("196.0", content); - String content = handler.toString(); - assertContains("Sample Excel Worksheet", content); - assertContains("Numbers and their Squares", content); - assertContains("\t\tNumber\tSquare", content); - assertContains("9", content); - assertNotContained("9.0", content); - assertContains("196", content); - assertNotContained("196.0", content); - } } @Test public void testExcelParserFormatting() throws Exception { - try (InputStream input = ExcelParserTest.class.getResourceAsStream( - "/test-documents/testEXCEL-formats.xls")) { - Metadata metadata = new Metadata(); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.US); - ContentHandler handler = new BodyContentHandler(); - new OfficeParser().parse(input, handler, metadata, context); - - assertEquals( - "application/vnd.ms-excel", - metadata.get(Metadata.CONTENT_TYPE)); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + XMLResult r = getXML("testEXCEL-formats.xls", 
new OfficeParser(), new Metadata(), context); + + assertEquals( + "application/vnd.ms-excel", + r.metadata.get(Metadata.CONTENT_TYPE)); + + String content = r.xml; + + // Number #,##0.00 + assertContains("1,599.99", content); + assertContains("-1,599.99", content); + + // Currency $#,##0.00;[Red]($#,##0.00) + assertContains("$1,599.99", content); + assertContains("($1,599.99)", content); + + // Scientific 0.00E+00 + // poi <=3.8beta1 returns 1.98E08, newer versions return 1.98+E08 + assertTrue(content.contains("1.98E08") || content.contains("1.98E+08")); + assertTrue(content.contains("-1.98E08") || content.contains("-1.98E+08")); + + // Percentage. + assertContains("2.50%", content); + // Excel rounds up to 3%, but that requires Java 1.6 or later + if (System.getProperty("java.version").startsWith("1.5")) { + assertContains("2%", content); + } else { + assertContains("3%", content); + } - String content = handler.toString(); + // Time Format: h:mm + assertContains("6:15", content); + assertContains("18:15", content); - // Number #,##0.00 - assertContains("1,599.99", content); - assertContains("-1,599.99", content); - - // Currency $#,##0.00;[Red]($#,##0.00) - assertContains("$1,599.99", content); - assertContains("($1,599.99)", content); - - // Scientific 0.00E+00 - // poi <=3.8beta1 returns 1.98E08, newer versions return 1.98+E08 - assertTrue(content.contains("1.98E08") || content.contains("1.98E+08")); - assertTrue(content.contains("-1.98E08") || content.contains("-1.98E+08")); - - // Percentage. 
- assertContains("2.50%", content); - // Excel rounds up to 3%, but that requires Java 1.6 or later - if (System.getProperty("java.version").startsWith("1.5")) { - assertContains("2%", content); - } else { - assertContains("3%", content); - } + // Date Format: d-mmm-yy + assertContains("17-May-07", content); - // Time Format: h:mm - assertContains("6:15", content); - assertContains("18:15", content); + // Date Format: m/d/yy + assertContains("10/3/09", content); - // Date Format: d-mmm-yy - assertContains("17-May-07", content); + // Date/Time Format: m/d/yy h:mm + assertContains("1/19/08 4:35", content); - // Date Format: m/d/yy - assertContains("10/3/09", content); + // Fraction (2.5): # ?/? + assertContains("2 1/2", content); - // Date/Time Format: m/d/yy h:mm - assertContains("1/19/08 4:35", content); - // Fraction (2.5): # ?/? - assertContains("2 1/2", content); + // Below assertions represent outstanding formatting issues to be addressed + // they are included to allow the issues to be progressed with the Apache POI + // team - See TIKA-103. + /************************************************************************* + // Custom Number (0 "dollars and" .00 "cents") + assertContains("19 dollars and .99 cents", content); - // Below assertions represent outstanding formatting issues to be addressed - // they are included to allow the issues to be progressed with the Apache POI - // team - See TIKA-103. 
+ // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy) + assertContains("At 4:20 AM on Thursday May 17, 2007", content); + **************************************************************************/ - /************************************************************************* - // Custom Number (0 "dollars and" .00 "cents") - assertContains("19 dollars and .99 cents", content); - // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy) - assertContains("At 4:20 AM on Thursday May 17, 2007", content); - **************************************************************************/ - - } } @Test public void testExcelParserPassword() throws Exception { - try (InputStream input = ExcelParserTest.class.getResourceAsStream( - "/test-documents/testEXCEL_protected_passtika.xls")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.US); - new OfficeParser().parse(input, handler, metadata, context); + try { + XMLResult r = getXML("testEXCEL_protected_passtika.xls"); fail("Document is encrypted, shouldn't parse"); } catch (EncryptedDocumentException e) { // Good } // Try again, this time with the password - try (InputStream input = ExcelParserTest.class.getResourceAsStream( - "/test-documents/testEXCEL_protected_passtika.xls")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.US); - context.set(PasswordProvider.class, new PasswordProvider() { - @Override - public String getPassword(Metadata metadata) { - return "tika"; - } - }); - new OfficeParser().parse(input, handler, metadata, context); - - assertEquals( - "application/vnd.ms-excel", - metadata.get(Metadata.CONTENT_TYPE)); - - assertEquals(null, metadata.get(TikaCoreProperties.TITLE)); - assertEquals("Antoni", metadata.get(TikaCoreProperties.CREATOR)); - 
assertEquals("2011-11-25T09:52:48Z", metadata.get(TikaCoreProperties.CREATED)); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + context.set(PasswordProvider.class, new PasswordProvider() { + @Override + public String getPassword(Metadata metadata) { + return "tika"; + } + }); + XMLResult r = getXML("testEXCEL_protected_passtika.xls", new OfficeParser(), new Metadata(), context); + + assertEquals( + "application/vnd.ms-excel", + r.metadata.get(Metadata.CONTENT_TYPE)); + + assertEquals(null, r.metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Antoni", r.metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("2011-11-25T09:52:48Z", r.metadata.get(TikaCoreProperties.CREATED)); + + String content = r.xml; + assertContains("This is an Encrypted Excel spreadsheet", content); + assertNotContained("9.0", content); - String content = handler.toString(); - assertContains("This is an Encrypted Excel spreadsheet", content); - assertNotContained("9.0", content); - } } /** @@ -197,70 +180,48 @@ public class ExcelParserTest { */ @Test public void testExcelParserCharts() throws Exception { - try (InputStream input = ExcelParserTest.class.getResourceAsStream( - "/test-documents/testEXCEL-charts.xls")) { - Metadata metadata = new Metadata(); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.US); - ContentHandler handler = new BodyContentHandler(); - new OfficeParser().parse(input, handler, metadata, context); - assertEquals( - "application/vnd.ms-excel", - metadata.get(Metadata.CONTENT_TYPE)); + XMLResult r = getXML("testEXCEL-charts.xls", new OfficeParser()); + assertEquals( + "application/vnd.ms-excel", + r.metadata.get(Metadata.CONTENT_TYPE)); - String content = handler.toString(); + String content = r.xml; + + // The first sheet has a pie chart + assertContains("charttabyodawg", content); + assertContains("WhamPuff", content); + + // The second sheet has a bar chart and some text + 
assertContains("Sheet1", content); + assertContains("Test Excel Spreasheet", content); + assertContains("foo", content); + assertContains("bar", content); + assertContains("fizzlepuff", content); + assertContains("whyaxis", content); + assertContains("eksaxis", content); + + // The third sheet has some text + assertContains("Sheet2", content); + assertContains("dingdong", content); - // The first sheet has a pie chart - assertContains("charttabyodawg", content); - assertContains("WhamPuff", content); - - // The second sheet has a bar chart and some text - assertContains("Sheet1", content); - assertContains("Test Excel Spreasheet", content); - assertContains("foo", content); - assertContains("bar", content); - assertContains("fizzlepuff", content); - assertContains("whyaxis", content); - assertContains("eksaxis", content); - - // The third sheet has some text - assertContains("Sheet2", content); - assertContains("dingdong", content); - } } @Test public void testJXL() throws Exception { - try (InputStream input = ExcelParserTest.class.getResourceAsStream( - "/test-documents/jxl.xls")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(-1); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.US); - new OfficeParser().parse(input, handler, metadata, context); - assertEquals( - "application/vnd.ms-excel", - metadata.get(Metadata.CONTENT_TYPE)); - String content = handler.toString(); - assertContains("Number Formats", content); - } + XMLResult r = getXML("jxl.xls", new OfficeParser()); + assertEquals( + "application/vnd.ms-excel", + r.metadata.get(Metadata.CONTENT_TYPE)); + assertContains("Number Formats", r.xml); + } @Test public void testWorksSpreadsheet70() throws Exception { - try (InputStream input = ExcelParserTest.class.getResourceAsStream( - "/test-documents/testWORKSSpreadsheet7.0.xlr")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(-1); - ParseContext 
context = new ParseContext(); - context.set(Locale.class, Locale.US); - new OfficeParser().parse(input, handler, metadata, context); - - String content = handler.toString(); - assertContains("Microsoft Works", content); - } + assertContains("Microsoft Works", + getXML("testWORKSSpreadsheet7.0.xlr", new OfficeParser()).xml); } /** @@ -278,8 +239,7 @@ public class ExcelParserTest { // Should be detected correctly MediaType type; - try (InputStream input = ExcelParserTest.class.getResourceAsStream( - "/test-documents/testEXCEL.xlsb")) { + try (InputStream input = getTestDocumentAsStream("testEXCEL.xlsb")) { type = detector.detect(input, m); assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString()); } @@ -291,15 +251,8 @@ public class ExcelParserTest { assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type)); // AutoDetectParser doesn't break on it - try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) { - ContentHandler handler = new BodyContentHandler(-1); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.US); - parser.parse(input, handler, m, context); + assertContains("<body />", getXML("testEXCEL.xlsb").xml); - String content = handler.toString(); - assertEquals("", content); - } } /** @@ -315,7 +268,7 @@ public class ExcelParserTest { // First try detection of Excel 5 m = new Metadata(); m.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls"); - try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls")) { + try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) { type = detector.detect(input, m); assertEquals("application/vnd.ms-excel", type.toString()); } @@ -323,7 +276,7 @@ public class ExcelParserTest { // Now Excel 95 m = new Metadata(); m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls"); - try (InputStream input = 
ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls")) { + try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) { type = detector.detect(input, m); assertEquals("application/vnd.ms-excel", type.toString()); } @@ -337,7 +290,7 @@ public class ExcelParserTest { // Parse the Excel 5 file m = new Metadata(); - try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls")) { + try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) { ContentHandler handler = new BodyContentHandler(-1); ParseContext context = new ParseContext(); context.set(Locale.class, Locale.US); @@ -364,7 +317,7 @@ public class ExcelParserTest { // Parse the Excel 95 file m = new Metadata(); - try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls")) { + try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) { ContentHandler handler = new BodyContentHandler(-1); ParseContext context = new ParseContext(); context.set(Locale.class, Locale.US); @@ -388,16 +341,11 @@ public class ExcelParserTest { */ @Test public void testCustomProperties() throws Exception { - Metadata metadata = new Metadata(); - - try (InputStream input = ExcelParserTest.class.getResourceAsStream( - "/test-documents/testEXCEL_custom_props.xls")) { - ContentHandler handler = new BodyContentHandler(-1); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.US); - new OfficeParser().parse(input, handler, metadata, context); - } + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + XMLResult r = getXML("testEXCEL_custom_props.xls", new OfficeParser(), new Metadata(), context); + Metadata metadata = r.metadata; assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("", metadata.get(TikaCoreProperties.CREATOR)); assertEquals("", metadata.get(TikaCoreProperties.MODIFIER)); @@ -413,31 +361,30 @@ 
public class ExcelParserTest { @Test public void testHeaderAndFooterExtraction() throws Exception { - try (InputStream input = ExcelParserTest.class.getResourceAsStream( - "/test-documents/testEXCEL_headers_footers.xls")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.UK); - new OfficeParser().parse(input, handler, metadata, context); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.UK); + + XMLResult r = getXML("testEXCEL_headers_footers.xls", new OfficeParser(), + new Metadata(), context); + + Metadata metadata = r.metadata; + assertEquals( + "application/vnd.ms-excel", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Internal spreadsheet", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Aeham Abushwashi", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Aeham Abushwashi", metadata.get(Metadata.AUTHOR)); + + String content = r.xml; + assertContains("John Smith1", content); + assertContains("John Smith50", content); + assertContains("1 Corporate HQ", content); + assertContains("Header - Corporate Spreadsheet", content); + assertContains("Header - For Internal Use Only", content); + assertContains("Header - Author: John Smith", content); + assertContains("Footer - Corporate Spreadsheet", content); + assertContains("Footer - For Internal Use Only", content); + assertContains("Footer - Author: John Smith", content); - assertEquals( - "application/vnd.ms-excel", - metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("Internal spreadsheet", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("Aeham Abushwashi", metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("Aeham Abushwashi", metadata.get(Metadata.AUTHOR)); - - String content = handler.toString(); - assertContains("John Smith1", content); - assertContains("John Smith50", content); - assertContains("1 Corporate HQ", 
content); - assertContains("Header - Corporate Spreadsheet", content); - assertContains("Header - For Internal Use Only", content); - assertContains("Header - Author: John Smith", content); - assertContains("Footer - Corporate Spreadsheet", content); - assertContains("Footer - For Internal Use Only", content); - assertContains("Footer - Author: John Smith", content); - } } } http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java index 8a7c202..3cfda82 100644 --- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java +++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java @@ -17,9 +17,11 @@ package org.apache.tika.parser.odf; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import org.apache.tika.TikaTest; import org.apache.tika.io.TikaInputStream; @@ -27,7 +29,6 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; import org.apache.tika.metadata.OfficeOpenXMLCore; import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.opendocument.OpenOfficeParser; @@ -50,270 +51,235 @@ public class ODFParserTest extends TikaTest { @Test public void testOO3() throws Exception { for (Parser parser : getParsers()) { - try 
(InputStream input = ODFParserTest.class.getResourceAsStream( - "/test-documents/testODFwithOOo3.odt")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - parser.parse(input, handler, metadata, new ParseContext()); - - assertEquals( - "application/vnd.oasis.opendocument.text", - metadata.get(Metadata.CONTENT_TYPE)); - - String content = handler.toString(); - assertContains("Tika is part of the Lucene project.", content); - assertContains("Solr", content); - assertContains("one embedded", content); - assertContains("Rectangle Title", content); - assertContains("a blue background and dark border", content); - } + XMLResult r = getXML("testODFwithOOo3.odt", parser); + assertEquals( + "application/vnd.oasis.opendocument.text", + r.metadata.get(Metadata.CONTENT_TYPE)); + + String content = r.xml; + assertContains("Tika is part of the Lucene project.", content); + assertContains("Solr", content); + assertContains("one embedded", content); + assertContains("Rectangle Title", content); + assertContains("a blue background and dark border", content); + } } @Test public void testOO2() throws Exception { - for (Parser parser : getParsers()) { - try (InputStream input = ODFParserTest.class.getResourceAsStream( - "/test-documents/testOpenOffice2.odt")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - parser.parse(input, handler, metadata, new ParseContext()); - - assertEquals( - "application/vnd.oasis.opendocument.text", - metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("en-US", metadata.get(Metadata.LANGUAGE)); - assertEquals("PT1M7S", metadata.get(Metadata.EDIT_TIME)); - assertEquals( - "NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161", - metadata.get("generator")); - - // Check date metadata, both old-style and new-style - assertEquals("2007-09-14T11:07:10", metadata.get(TikaCoreProperties.MODIFIED)); - assertEquals("2007-09-14T11:07:10", 
metadata.get(Metadata.MODIFIED)); - assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.DATE)); - assertEquals("2007-09-14T11:06:08", metadata.get(TikaCoreProperties.CREATED)); - assertEquals("2007-09-14T11:06:08", metadata.get(Metadata.CREATION_DATE)); - - // Check the document statistics - assertEquals("1", metadata.get(Office.PAGE_COUNT)); - assertEquals("1", metadata.get(Office.PARAGRAPH_COUNT)); - assertEquals("14", metadata.get(Office.WORD_COUNT)); - assertEquals("78", metadata.get(Office.CHARACTER_COUNT)); - assertEquals("0", metadata.get(Office.TABLE_COUNT)); - assertEquals("0", metadata.get(Office.OBJECT_COUNT)); - assertEquals("0", metadata.get(Office.IMAGE_COUNT)); - - // Check the Tika-1.0 style document statistics - assertEquals("1", metadata.get(Metadata.PAGE_COUNT)); - assertEquals("1", metadata.get(Metadata.PARAGRAPH_COUNT)); - assertEquals("14", metadata.get(Metadata.WORD_COUNT)); - assertEquals("78", metadata.get(Metadata.CHARACTER_COUNT)); - assertEquals("0", metadata.get(Metadata.TABLE_COUNT)); - assertEquals("0", metadata.get(Metadata.OBJECT_COUNT)); - assertEquals("0", metadata.get(Metadata.IMAGE_COUNT)); - - // Check the very old style statistics (these will be removed shortly) - assertEquals("0", metadata.get("nbTab")); - assertEquals("0", metadata.get("nbObject")); - assertEquals("0", metadata.get("nbImg")); - assertEquals("1", metadata.get("nbPage")); - assertEquals("1", metadata.get("nbPara")); - assertEquals("14", metadata.get("nbWord")); - assertEquals("78", metadata.get("nbCharacter")); - - // Custom metadata tags present but without values - assertEquals(null, metadata.get("custom:Info 1")); - assertEquals(null, metadata.get("custom:Info 2")); - assertEquals(null, metadata.get("custom:Info 3")); - assertEquals(null, metadata.get("custom:Info 4")); - - String content = handler.toString(); - assertTrue(content.contains( - "This is a sample Open Office document," - + " written in NeoOffice 2.2.1 for the Mac.")); - } - } - } - - /** 
- * Similar to {@link #testXMLParser()}, but using a different - * OO2 file with different metadata in it - */ - @Test - public void testOO2Metadata() throws Exception { - try (InputStream input = ODFParserTest.class.getResourceAsStream( - "/test-documents/testOpenOffice2.odf")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - new OpenDocumentParser().parse(input, handler, metadata); - - assertEquals( - "application/vnd.oasis.opendocument.formula", - metadata.get(Metadata.CONTENT_TYPE)); - assertEquals(null, metadata.get(TikaCoreProperties.MODIFIED)); - assertEquals("2006-01-27T11:55:22", metadata.get(Metadata.CREATION_DATE)); - assertEquals("The quick brown fox jumps over the lazy dog", - metadata.get(TikaCoreProperties.TITLE)); - assertEquals("Gym class featuring a brown fox and lazy dog", - metadata.get(TikaCoreProperties.DESCRIPTION)); - assertEquals("Gym class featuring a brown fox and lazy dog", - metadata.get(OfficeOpenXMLCore.SUBJECT)); - assertEquals("Gym class featuring a brown fox and lazy dog", - metadata.get(Metadata.SUBJECT)); - assertEquals("PT0S", metadata.get(Metadata.EDIT_TIME)); - assertEquals("1", metadata.get("editing-cycles")); - assertEquals( - "OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134", - metadata.get("generator")); - assertEquals("Pangram, fox, dog", metadata.get(Metadata.KEYWORDS)); - - // User defined metadata - assertEquals("Text 1", metadata.get("custom:Info 1")); - assertEquals("2", metadata.get("custom:Info 2")); - assertEquals("false", metadata.get("custom:Info 3")); - assertEquals("true", metadata.get("custom:Info 4")); - - // No statistics present - assertEquals(null, metadata.get(Metadata.PAGE_COUNT)); - assertEquals(null, metadata.get(Metadata.PARAGRAPH_COUNT)); - assertEquals(null, metadata.get(Metadata.WORD_COUNT)); - assertEquals(null, metadata.get(Metadata.CHARACTER_COUNT)); - assertEquals(null, metadata.get(Metadata.TABLE_COUNT)); - assertEquals(null, 
metadata.get(Metadata.OBJECT_COUNT)); - assertEquals(null, metadata.get(Metadata.IMAGE_COUNT)); - assertEquals(null, metadata.get("nbTab")); - assertEquals(null, metadata.get("nbObject")); - assertEquals(null, metadata.get("nbImg")); - assertEquals(null, metadata.get("nbPage")); - assertEquals(null, metadata.get("nbPara")); - assertEquals(null, metadata.get("nbWord")); - assertEquals(null, metadata.get("nbCharacter")); - - // Note - contents of maths files not currently supported - String content = handler.toString(); - assertEquals("", content); - } - } - - /** - * Similar to {@link #testXMLParser()}, but using an OO3 file - */ - @Test - public void testOO3Metadata() throws Exception { - try (InputStream input = ODFParserTest.class.getResourceAsStream( - "/test-documents/testODFwithOOo3.odt")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - new OpenDocumentParser().parse(input, handler, metadata); - + for (Parser parser : getParsers()) { + XMLResult r = getXML("testOpenOffice2.odt", parser); + Metadata metadata = r.metadata; assertEquals( "application/vnd.oasis.opendocument.text", metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("2009-10-05T21:22:38", metadata.get(TikaCoreProperties.MODIFIED)); - assertEquals("2009-10-05T19:04:01", metadata.get(TikaCoreProperties.CREATED)); - assertEquals("2009-10-05T19:04:01", metadata.get(Metadata.CREATION_DATE)); - assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("Test document", metadata.get(OfficeOpenXMLCore.SUBJECT)); - assertEquals("Test document", metadata.get(Metadata.SUBJECT)); - assertEquals("A rather complex document", metadata.get(TikaCoreProperties.DESCRIPTION)); - assertEquals("Bart Hanssens", metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("Bart Hanssens", metadata.get("initial-creator")); - assertEquals("2", metadata.get("editing-cycles")); - assertEquals("PT02H03M24S", metadata.get(Metadata.EDIT_TIME)); + 
assertEquals("en-US", metadata.get(Metadata.LANGUAGE)); + assertEquals("PT1M7S", metadata.get(Metadata.EDIT_TIME)); assertEquals( - "OpenOffice.org/3.1$Unix OpenOffice.org_project/310m19$Build-9420", + "NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161", metadata.get("generator")); - assertEquals("Apache, Lucene, Tika", metadata.get(Metadata.KEYWORDS)); - // User defined metadata - assertEquals("Bart Hanssens", metadata.get("custom:Editor")); - assertEquals(null, metadata.get("custom:Info 2")); - assertEquals(null, metadata.get("custom:Info 3")); - assertEquals(null, metadata.get("custom:Info 4")); + // Check date metadata, both old-style and new-style + assertEquals("2007-09-14T11:07:10", metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.MODIFIED)); + assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.DATE)); + assertEquals("2007-09-14T11:06:08", metadata.get(TikaCoreProperties.CREATED)); + assertEquals("2007-09-14T11:06:08", metadata.get(Metadata.CREATION_DATE)); // Check the document statistics - assertEquals("2", metadata.get(Office.PAGE_COUNT)); - assertEquals("13", metadata.get(Office.PARAGRAPH_COUNT)); - assertEquals("54", metadata.get(Office.WORD_COUNT)); - assertEquals("351", metadata.get(Office.CHARACTER_COUNT)); + assertEquals("1", metadata.get(Office.PAGE_COUNT)); + assertEquals("1", metadata.get(Office.PARAGRAPH_COUNT)); + assertEquals("14", metadata.get(Office.WORD_COUNT)); + assertEquals("78", metadata.get(Office.CHARACTER_COUNT)); assertEquals("0", metadata.get(Office.TABLE_COUNT)); - assertEquals("2", metadata.get(Office.OBJECT_COUNT)); + assertEquals("0", metadata.get(Office.OBJECT_COUNT)); assertEquals("0", metadata.get(Office.IMAGE_COUNT)); // Check the Tika-1.0 style document statistics - assertEquals("2", metadata.get(Metadata.PAGE_COUNT)); - assertEquals("13", metadata.get(Metadata.PARAGRAPH_COUNT)); - assertEquals("54", metadata.get(Metadata.WORD_COUNT)); - 
assertEquals("351", metadata.get(Metadata.CHARACTER_COUNT)); + assertEquals("1", metadata.get(Metadata.PAGE_COUNT)); + assertEquals("1", metadata.get(Metadata.PARAGRAPH_COUNT)); + assertEquals("14", metadata.get(Metadata.WORD_COUNT)); + assertEquals("78", metadata.get(Metadata.CHARACTER_COUNT)); assertEquals("0", metadata.get(Metadata.TABLE_COUNT)); - assertEquals("2", metadata.get(Metadata.OBJECT_COUNT)); + assertEquals("0", metadata.get(Metadata.OBJECT_COUNT)); assertEquals("0", metadata.get(Metadata.IMAGE_COUNT)); - // Check the old style statistics (these will be removed shortly) + // Check the very old style statistics (these will be removed shortly) assertEquals("0", metadata.get("nbTab")); - assertEquals("2", metadata.get("nbObject")); + assertEquals("0", metadata.get("nbObject")); assertEquals("0", metadata.get("nbImg")); - assertEquals("2", metadata.get("nbPage")); - assertEquals("13", metadata.get("nbPara")); - assertEquals("54", metadata.get("nbWord")); - assertEquals("351", metadata.get("nbCharacter")); + assertEquals("1", metadata.get("nbPage")); + assertEquals("1", metadata.get("nbPara")); + assertEquals("14", metadata.get("nbWord")); + assertEquals("78", metadata.get("nbCharacter")); + + // Custom metadata tags present but without values + assertEquals(null, metadata.get("custom:Info 1")); + assertEquals(null, metadata.get("custom:Info 2")); + assertEquals(null, metadata.get("custom:Info 3")); + assertEquals(null, metadata.get("custom:Info 4")); + + assertContains( + "This is a sample Open Office document," + + " written in NeoOffice 2.2.1 for the Mac.", + r.xml); - String content = handler.toString(); - assertTrue(content.contains( - "Apache Tika Tika is part of the Lucene project." 
- )); } } + /** + * Similar to {@link #testXMLParser()}, but using a different + * OO2 file with different metadata in it + */ @Test - public void testODPMasterFooter() throws Exception { - try (InputStream input = ODFParserTest.class.getResourceAsStream( - "/test-documents/testMasterFooter.odp")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - new AutoDetectParser().parse(input, handler, metadata); + public void testOO2Metadata() throws Exception { + XMLResult r = getXML("testOpenOffice2.odf", new OpenDocumentParser()); + Metadata metadata = r.metadata; + assertEquals( + "application/vnd.oasis.opendocument.formula", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals(null, metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("2006-01-27T11:55:22", metadata.get(Metadata.CREATION_DATE)); + assertEquals("The quick brown fox jumps over the lazy dog", + metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Gym class featuring a brown fox and lazy dog", + metadata.get(TikaCoreProperties.DESCRIPTION)); + assertEquals("Gym class featuring a brown fox and lazy dog", + metadata.get(OfficeOpenXMLCore.SUBJECT)); + assertEquals("Gym class featuring a brown fox and lazy dog", + metadata.get(Metadata.SUBJECT)); + assertEquals("PT0S", metadata.get(Metadata.EDIT_TIME)); + assertEquals("1", metadata.get("editing-cycles")); + assertEquals( + "OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134", + metadata.get("generator")); + assertEquals("Pangram, fox, dog", metadata.get(Metadata.KEYWORDS)); + + // User defined metadata + assertEquals("Text 1", metadata.get("custom:Info 1")); + assertEquals("2", metadata.get("custom:Info 2")); + assertEquals("false", metadata.get("custom:Info 3")); + assertEquals("true", metadata.get("custom:Info 4")); + + // No statistics present + assertEquals(null, metadata.get(Metadata.PAGE_COUNT)); + assertEquals(null, metadata.get(Metadata.PARAGRAPH_COUNT)); + assertEquals(null, 
metadata.get(Metadata.WORD_COUNT)); + assertEquals(null, metadata.get(Metadata.CHARACTER_COUNT)); + assertEquals(null, metadata.get(Metadata.TABLE_COUNT)); + assertEquals(null, metadata.get(Metadata.OBJECT_COUNT)); + assertEquals(null, metadata.get(Metadata.IMAGE_COUNT)); + assertEquals(null, metadata.get("nbTab")); + assertEquals(null, metadata.get("nbObject")); + assertEquals(null, metadata.get("nbImg")); + assertEquals(null, metadata.get("nbPage")); + assertEquals(null, metadata.get("nbPara")); + assertEquals(null, metadata.get("nbWord")); + assertEquals(null, metadata.get("nbCharacter")); + + // Note - contents of maths files not currently supported + assertContains("<body />", r.xml); - String content = handler.toString(); - assertContains("Master footer is here", content); - } - } + } + /** + * Similar to {@link #testXMLParser()}, but using an OO3 file + */ @Test - public void testODTFooter() throws Exception { - try (InputStream input = ODFParserTest.class.getResourceAsStream( - "/test-documents/testFooter.odt")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - new AutoDetectParser().parse(input, handler, metadata); + public void testOO3Metadata() throws Exception { + XMLResult r = getXML("testODFwithOOo3.odt", new OpenDocumentParser()); + Metadata metadata = r.metadata; + assertEquals( + "application/vnd.oasis.opendocument.text", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("2009-10-05T21:22:38", metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("2009-10-05T19:04:01", metadata.get(TikaCoreProperties.CREATED)); + assertEquals("2009-10-05T19:04:01", metadata.get(Metadata.CREATION_DATE)); + assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Test document", metadata.get(OfficeOpenXMLCore.SUBJECT)); + assertEquals("Test document", metadata.get(Metadata.SUBJECT)); + assertEquals("A rather complex document", metadata.get(TikaCoreProperties.DESCRIPTION)); + 
assertEquals("Bart Hanssens", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Bart Hanssens", metadata.get("initial-creator")); + assertEquals("2", metadata.get("editing-cycles")); + assertEquals("PT02H03M24S", metadata.get(Metadata.EDIT_TIME)); + assertEquals( + "OpenOffice.org/3.1$Unix OpenOffice.org_project/310m19$Build-9420", + metadata.get("generator")); + assertEquals("Apache, Lucene, Tika", metadata.get(Metadata.KEYWORDS)); + + // User defined metadata + assertEquals("Bart Hanssens", metadata.get("custom:Editor")); + assertEquals(null, metadata.get("custom:Info 2")); + assertEquals(null, metadata.get("custom:Info 3")); + assertEquals(null, metadata.get("custom:Info 4")); + + // Check the document statistics + assertEquals("2", metadata.get(Office.PAGE_COUNT)); + assertEquals("13", metadata.get(Office.PARAGRAPH_COUNT)); + assertEquals("54", metadata.get(Office.WORD_COUNT)); + assertEquals("351", metadata.get(Office.CHARACTER_COUNT)); + assertEquals("0", metadata.get(Office.TABLE_COUNT)); + assertEquals("2", metadata.get(Office.OBJECT_COUNT)); + assertEquals("0", metadata.get(Office.IMAGE_COUNT)); + + // Check the Tika-1.0 style document statistics + assertEquals("2", metadata.get(Metadata.PAGE_COUNT)); + assertEquals("13", metadata.get(Metadata.PARAGRAPH_COUNT)); + assertEquals("54", metadata.get(Metadata.WORD_COUNT)); + assertEquals("351", metadata.get(Metadata.CHARACTER_COUNT)); + assertEquals("0", metadata.get(Metadata.TABLE_COUNT)); + assertEquals("2", metadata.get(Metadata.OBJECT_COUNT)); + assertEquals("0", metadata.get(Metadata.IMAGE_COUNT)); + + // Check the old style statistics (these will be removed shortly) + assertEquals("0", metadata.get("nbTab")); + assertEquals("2", metadata.get("nbObject")); + assertEquals("0", metadata.get("nbImg")); + assertEquals("2", metadata.get("nbPage")); + assertEquals("13", metadata.get("nbPara")); + assertEquals("54", metadata.get("nbWord")); + assertEquals("351", metadata.get("nbCharacter")); + + 
assertContains( + "Tika is part of the Lucene project.", r.xml); - String content = handler.toString(); - assertContains("Here is some text...", content); - assertContains("Here is some text on page 2", content); - assertContains("Here is footer text", content); - } - } + + } + + @Test + public void testODPMasterFooter() throws Exception { + assertContains("Master footer is here", + getXML("testMasterFooter.odp").xml); + } + + @Test + public void testODTFooter() throws Exception { + XMLResult r = getXML("testFooter.odt"); + assertContains("Here is some text...", r.xml); + assertContains("Here is some text on page 2", r.xml); + assertContains("Here is footer text", r.xml); + } @Test public void testODSFooter() throws Exception { - try (InputStream input = ODFParserTest.class.getResourceAsStream( - "/test-documents/testFooter.ods")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - new AutoDetectParser().parse(input, handler, metadata); + assertContains("Here is a footer in the center area", + getXML("testFooter.ods").xml); - String content = handler.toString(); - assertContains("Here is a footer in the center area", content); - } } @Test public void testFromFile() throws Exception { - try (TikaInputStream tis = TikaInputStream.get(this.getClass().getResource( - "/test-documents/testODFwithOOo3.odt"))) { - assertEquals(true, tis.hasFile()); - OpenDocumentParser parser = new OpenDocumentParser(); + OpenDocumentParser parser = new OpenDocumentParser(); + Path tmp = null; + try { + tmp = Files.createTempFile("test-odf-", ".odt"); + Files.copy(getTestDocumentAsStream("testODFwithOOo3.odt"), tmp, + StandardCopyOption.REPLACE_EXISTING); Metadata metadata = new Metadata(); + TikaInputStream tis = TikaInputStream.get(tmp, metadata); + assertEquals(true, tis.hasFile()); ContentHandler handler = new BodyContentHandler(); parser.parse(tis, handler, metadata, new ParseContext()); @@ -323,25 +289,20 @@ public class ODFParserTest 
extends TikaTest { String content = handler.toString(); assertContains("Tika is part of the Lucene project.", content); + } finally { + Files.delete(tmp); } } - + @Test public void testNPEFromFile() throws Exception { - OpenDocumentParser parser = new OpenDocumentParser(); - try (TikaInputStream tis = TikaInputStream.get(this.getClass().getResource( - "/test-documents/testNPEOpenDocument.odt"))) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - parser.parse(tis, handler, metadata, new ParseContext()); + XMLResult r = getXML("testNPEOpenDocument.odt", new OpenDocumentParser()); + assertEquals( + "application/vnd.oasis.opendocument.text", + r.metadata.get(Metadata.CONTENT_TYPE)); - assertEquals( - "application/vnd.oasis.opendocument.text", - metadata.get(Metadata.CONTENT_TYPE)); + assertContains("primero hay que generar un par de claves", r.xml); - String content = handler.toString(); - assertContains("primero hay que generar un par de claves", content); - } } // TIKA-1063: Test basic style support. @@ -359,20 +320,17 @@ public class ODFParserTest extends TikaTest { //TIKA-1600: Test that null pointer doesn't break parsing. 
@Test public void testNullStylesInODTFooter() throws Exception { - Parser parser = new OpenDocumentParser(); - try (InputStream input = ODFParserTest.class.getResourceAsStream("/test-documents/testODT-TIKA-6000.odt")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - parser.parse(input, handler, metadata, new ParseContext()); - assertEquals("application/vnd.oasis.opendocument.text", metadata.get(Metadata.CONTENT_TYPE)); + XMLResult r = getXML("testODT-TIKA-6000.odt", new OpenDocumentParser(), new Metadata(), new ParseContext()); - String content = handler.toString(); + assertEquals("application/vnd.oasis.opendocument.text", r.metadata.get(Metadata.CONTENT_TYPE)); + + String content = r.xml; + + assertContains("Utilisation de ce document", content); + assertContains("Copyright and License", content); + assertContains("Changer la langue", content); + assertContains("La page d’accueil permet de faire une recherche simple", content); - assertContains("Utilisation de ce document", content); - assertContains("Copyright and License", content); - assertContains("Changer la langue", content); - assertContains("La page d’accueil permet de faire une recherche simple", content); - } } } http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java index 365de77..dc75be5 100644 --- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java +++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java @@ -17,15 +17,11 @@ package 
org.apache.tika.parser.rtf; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; -import java.io.File; -import java.io.FileInputStream; import java.io.InputStream; -import java.io.StringWriter; import java.util.ArrayList; import java.util.HashSet; import java.util.List; @@ -49,7 +45,6 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; -import org.apache.tika.sax.WriteOutContentHandler; import org.junit.Test; import org.xml.sax.ContentHandler; @@ -62,117 +57,98 @@ public class RTFParserTest extends TikaTest { @Test public void testBasicExtraction() throws Exception { - File file = getResourceAsFile("/test-documents/testRTF.rtf"); - Metadata metadata = new Metadata(); - StringWriter writer = new StringWriter(); - tika.getParser().parse( - new FileInputStream(file), - new WriteOutContentHandler(writer), - metadata, - new ParseContext()); - String content = writer.toString(); - - assertEquals("application/rtf", metadata.get(Metadata.CONTENT_TYPE)); - assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length); - assertContains("Test", content); - assertContains("indexation Word", content); + XMLResult r = getXML("testRTF.rtf"); + assertEquals("application/rtf", r.metadata.get(Metadata.CONTENT_TYPE)); + assertEquals(1, r.metadata.getValues(Metadata.CONTENT_TYPE).length); + assertContains("Test", r.xml); + assertContains("indexation Word", r.xml); } @Test public void testUmlautSpacesExtraction2() throws Exception { - String content = getText("testRTFUmlautSpaces2.rtf"); - content = content.replaceAll("\\s+", ""); - assertEquals("\u00DCbersicht", content); + assertContains("<p>\u00DCbersicht</p>", + getXML("testRTFUmlautSpaces2.rtf").xml); } @Test public 
void testUnicodeUCNControlWordCharacterDoublingExtraction() throws Exception { - String content = getText("testRTFUnicodeUCNControlWordCharacterDoubling.rtf"); + XMLResult r = getXML("testRTFUnicodeUCNControlWordCharacterDoubling.rtf"); - assertContains("\u5E74", content); - assertContains("\u5ff5", content); - assertContains("0 ", content); - assertContains("abc", content); - assertFalse("Doubled character \u5E74", content.contains("\u5E74\u5E74")); + assertContains("\u5E74", r.xml); + assertContains("\u5ff5", r.xml); + assertContains("0 ", r.xml); + assertContains("abc", r.xml); + assertNotContained("\u5E74\u5E74", r.xml); } @Test public void testHexEscapeInsideWord() throws Exception { - String content = getText("testRTFHexEscapeInsideWord.rtf"); - assertContains("ESP\u00cdRITO", content); + XMLResult r = getXML("testRTFHexEscapeInsideWord.rtf"); + assertContains("ESP\u00cdRITO", r.xml); } @Test public void testWindowsCodepage1250() throws Exception { - String content = getText("testRTFWindowsCodepage1250.rtf"); - assertContains("za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144", content); - assertContains("ZA\u017b\u00d3\u0141\u0106 G\u0118\u015aL\u0104 JA\u0179\u0143", content); + XMLResult r = getXML("testRTFWindowsCodepage1250.rtf"); + assertContains("za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144", r.xml); + assertContains("ZA\u017b\u00d3\u0141\u0106 G\u0118\u015aL\u0104 JA\u0179\u0143", r.xml); } @Test public void testTableCellSeparation() throws Exception { - File file = getResourceAsFile("/test-documents/testRTFTableCellSeparation.rtf"); - String content = tika.parseToString(file); - content = content.replaceAll("\\s+", " "); - assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content); + String content = getXML("testRTFTableCellSeparation.rtf").xml; + content = content.replaceAll("(\\s|<\\/?p>)+", " "); assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content); } @Test public void testTableCellSeparation2() throws 
Exception { - String content = getText("testRTFTableCellSeparation2.rtf"); + String content = getXML("testRTFTableCellSeparation2.rtf").xml.replaceAll("\\s+", " "); // TODO: why do we insert extra whitespace...? - content = content.replaceAll("\\s+", " "); - assertContains("Station Fax", content); + assertContains("Station</p> <p>Fax", content); } @Test public void testWordPadCzechCharactersExtraction() throws Exception { - File file = getResourceAsFile("/test-documents/testRTFWordPadCzechCharacters.rtf"); - String s1 = tika.parseToString(file); - assertTrue(s1.contains("\u010Cl\u00E1nek t\u00FDdne")); - assertTrue(s1.contains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty")); + XMLResult r = getXML("testRTFWordPadCzechCharacters.rtf"); + assertContains("\u010Cl\u00E1nek t\u00FDdne", r.xml); + assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", r.xml); } @Test public void testWord2010CzechCharactersExtraction() throws Exception { - File file = getResourceAsFile("/test-documents/testRTFWord2010CzechCharacters.rtf"); - String s1 = tika.parseToString(file); - assertTrue(s1.contains("\u010Cl\u00E1nek t\u00FDdne")); - assertTrue(s1.contains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty")); + XMLResult r = getXML("testRTFWord2010CzechCharacters.rtf"); + assertContains("\u010Cl\u00E1nek t\u00FDdne", r.xml); + assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", r.xml); } @Test public void testMS932Extraction() throws Exception { - File file = getResourceAsFile("/test-documents/testRTF-ms932.rtf"); - String s1 = tika.parseToString(file); - + XMLResult r = getXML("testRTF-ms932.rtf"); // Hello in Japanese - assertTrue(s1.contains("\u3053\u3093\u306b\u3061\u306f")); + assertContains("\u3053\u3093\u306b\u3061\u306f", r.xml); // Verify title, since it was also encoded with MS932: - Result r = getResult("testRTF-ms932.rtf"); + r = 
getXML("testRTF-ms932.rtf"); assertEquals("\u30bf\u30a4\u30c8\u30eb", r.metadata.get(TikaCoreProperties.TITLE)); } @Test public void testUmlautSpacesExtraction() throws Exception { - File file = getResourceAsFile("/test-documents/testRTFUmlautSpaces.rtf"); - String s1 = tika.parseToString(file); - assertTrue(s1.contains("\u00DCbersicht")); + XMLResult r = getXML("testRTFUmlautSpaces.rtf"); + assertContains("\u00DCbersicht", r.xml); } @Test public void testGothic() throws Exception { - String content = getText("testRTFUnicodeGothic.rtf"); - assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content); + XMLResult r = getXML("testRTFUnicodeGothic.rtf"); + assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", r.xml); } @Test public void testJapaneseText() throws Exception { - Result r = getResult("testRTFJapanese.rtf"); - String content = r.text; + XMLResult r = getXML("testRTFJapanese.rtf"); // Verify title -- this title uses upr escape inside // title info field: @@ -183,17 +159,17 @@ public class RTFParserTest extends TikaTest { assertEquals("StarWriter", r.metadata.get(TikaCoreProperties.COMMENTS)); // Special version of (GHQ) - assertContains("\uff08\uff27\uff28\uff31\uff09", content); + assertContains("\uff08\uff27\uff28\uff31\uff09", r.xml); // 6 other characters - assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", content); + assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", r.xml); } @Test public void testMaxLength() throws Exception { - File file = getResourceAsFile("/test-documents/testRTFJapanese.rtf"); Metadata metadata = new Metadata(); - InputStream stream = TikaInputStream.get(file, metadata); + InputStream stream = TikaInputStream.get( + getTestDocumentAsStream("testRTFJapanese.rtf")); // Test w/ default limit: Tika localTika = new Tika(); @@ -204,7 +180,7 @@ public class RTFParserTest extends TikaTest { // Test setting max length on the instance: 
localTika.setMaxStringLength(200); - stream = TikaInputStream.get(file, metadata); + stream = TikaInputStream.get(getTestDocumentAsStream("testRTFJapanese.rtf")); content = localTika.parseToString(stream, metadata); // parseToString closes for convenience: @@ -212,7 +188,7 @@ public class RTFParserTest extends TikaTest { assertTrue(content.length() <= 200); // Test setting max length per-call: - stream = TikaInputStream.get(file, metadata); + stream = TikaInputStream.get(getTestDocumentAsStream("testRTFJapanese.rtf")); content = localTika.parseToString(stream, metadata, 100); // parseToString closes for convenience: //stream.close(); @@ -221,14 +197,14 @@ public class RTFParserTest extends TikaTest { @Test public void testTextWithCurlyBraces() throws Exception { - String content = getText("testRTFWithCurlyBraces.rtf"); - assertContains("{ some text inside curly brackets }", content); + XMLResult r = getXML("testRTFWithCurlyBraces.rtf"); + assertContains("{ some text inside curly brackets }", r.xml); } @Test public void testControls() throws Exception { - Result r = getResult("testRTFControls.rtf"); - String content = r.text; + XMLResult r = getXML("testRTFControls.rtf"); + String content = r.xml; assertContains("Thiswordhasanem\u2014dash", content); assertContains("Thiswordhasanen\u2013dash", content); assertContains("Thiswordhasanon\u2011breakinghyphen", content); @@ -241,8 +217,8 @@ public class RTFParserTest extends TikaTest { @Test public void testInvalidUnicode() throws Exception { - Result r = getResult("testRTFInvalidUnicode.rtf"); - String content = r.text; + XMLResult r = getXML("testRTFInvalidUnicode.rtf"); + String content = r.xml; assertContains("Unpaired hi \ufffd here", content); assertContains("Unpaired lo \ufffd here", content); assertContains("Mismatched pair \ufffd\ufffd here", content); @@ -250,8 +226,8 @@ public class RTFParserTest extends TikaTest { @Test public void testVarious() throws Exception { - Result r = getResult("testRTFVarious.rtf"); 
- String content = r.text; + XMLResult r = getXML("testRTFVarious.rtf"); + String content = r.xml; assertContains("Footnote appears here", content); assertContains("This is a footnote.", content); assertContains("This is the header text.", content); @@ -267,10 +243,10 @@ public class RTFParserTest extends TikaTest { assertContains("(Kramer)", content); // Table - assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " ")); + assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("(\\s|<\\/?p>)+", " ")); // 2-columns - assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " ")); + assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("(\\s|<\\/?p>)+", " ")); assertContains("This is a hyperlink", content); assertContains("Here is a list:", content); for (int row = 1; row <= 3; row++) { @@ -393,17 +369,13 @@ public class RTFParserTest extends TikaTest { // TIKA-1192 @Test public void testListOverride() throws Exception { - Result r = getResult("testRTFListOverride.rtf"); - String content = r.text; - assertContains("Body", content); + assertContains("Body", getXML("testRTFListOverride.rtf").xml); } // TIKA-1305 @Test public void testCorruptListOverride() throws Exception { - Result r = getResult("testRTFCorruptListOverride.rtf"); - String content = r.text; - assertContains("apple", content); + assertContains("apple", getXML("testRTFCorruptListOverride.rtf").xml); } // TIKA-1010 @@ -565,31 +537,4 @@ public class RTFParserTest extends TikaTest { assertEquals(2, tracker.filenames.size()); } - private Result getResult(String filename) throws Exception { - File file = getResourceAsFile("/test-documents/" + filename); - - Metadata metadata = new Metadata(); - StringWriter writer = new StringWriter(); - tika.getParser().parse( - new FileInputStream(file), - new 
WriteOutContentHandler(writer), - metadata, - new ParseContext()); - String content = writer.toString(); - return new Result(content, metadata); - } - - private String getText(String filename) throws Exception { - return getResult(filename).text; - } - - private static class Result { - public final String text; - public final Metadata metadata; - - public Result(String text, Metadata metadata) { - this.text = text; - this.metadata = metadata; - } - } }
