Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Fri May 29 14:36:21 2015 @@ -61,27 +61,17 @@ public class RTFParserTest extends TikaT private Tika tika = new Tika(); - private static class Result { - public final String text; - public final Metadata metadata; - - public Result(String text, Metadata metadata) { - this.text = text; - this.metadata = metadata; - } - } - @Test public void testBasicExtraction() throws Exception { File file = getResourceAsFile("/test-documents/testRTF.rtf"); - + Metadata metadata = new Metadata(); StringWriter writer = new StringWriter(); tika.getParser().parse( - new FileInputStream(file), - new WriteOutContentHandler(writer), - metadata, - new ParseContext()); + new FileInputStream(file), + new WriteOutContentHandler(writer), + metadata, + new ParseContext()); String content = writer.toString(); assertEquals("application/rtf", metadata.get(Metadata.CONTENT_TYPE)); @@ -124,16 +114,16 @@ public class RTFParserTest extends TikaT public void testTableCellSeparation() throws Exception { File file = getResourceAsFile("/test-documents/testRTFTableCellSeparation.rtf"); String content = tika.parseToString(file); - content = content.replaceAll("\\s+"," "); + content = content.replaceAll("\\s+", " "); assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content); assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content); } - + @Test public void testTableCellSeparation2() throws Exception { String content = getText("testRTFTableCellSeparation2.rtf"); // TODO: why do we insert extra whitespace...? - content = content.replaceAll("\\s+"," "); + content = content.replaceAll("\\s+", " "); assertContains("Station Fax", content); } @@ -187,14 +177,14 @@ public class RTFParserTest extends TikaT // Verify title -- this title uses upr escape inside // title info field: assertEquals("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f\u3000", - r.metadata.get(TikaCoreProperties.TITLE)); + r.metadata.get(TikaCoreProperties.TITLE)); assertEquals("VMazel", r.metadata.get(TikaCoreProperties.CREATOR)); assertEquals("VMazel", r.metadata.get(Metadata.AUTHOR)); assertEquals("StarWriter", r.metadata.get(TikaCoreProperties.COMMENTS)); - + // Special version of (GHQ) assertContains("\uff08\uff27\uff28\uff31\uff09", content); - + // 6 other characters assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", content); } @@ -216,11 +206,11 @@ public class RTFParserTest extends TikaT localTika.setMaxStringLength(200); stream = TikaInputStream.get(file, metadata); content = localTika.parseToString(stream, metadata); - + // parseToString closes for convenience: //stream.close(); assertTrue(content.length() <= 200); - + // Test setting max length per-call: stream = TikaInputStream.get(file, metadata); content = localTika.parseToString(stream, metadata, 100); @@ -277,35 +267,35 @@ public class RTFParserTest extends TikaT assertContains("(Kramer)", content); // Table - assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," ")); + assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " ")); // 2-columns - assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," ")); + assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " ")); assertContains("This is a hyperlink", content); assertContains("Here is a list:", content); - for(int row=1;row<=3;row++) { + for (int row = 1; row <= 3; row++) { assertContains("Bullet " + row, content); } assertContains("Here is a numbered list:", content); - for(int row=1;row<=3;row++) { + for (int row = 1; row <= 3; row++) { assertContains("Number bullet " + row, content); } - for(int row=1;row<=2;row++) { - for(int col=1;col<=3;col++) { + for (int row = 1; row <= 2; row++) { + for (int col = 1; col <= 3; col++) { assertContains("Row " + row + " Col " + col, content); } } assertContains("Keyword1 Keyword2", content); assertEquals("Keyword1 Keyword2", - r.metadata.get(TikaCoreProperties.KEYWORDS)); + r.metadata.get(TikaCoreProperties.KEYWORDS)); assertContains("Subject is here", content); assertEquals("Subject is here", - r.metadata.get(OfficeOpenXMLCore.SUBJECT)); + r.metadata.get(OfficeOpenXMLCore.SUBJECT)); assertEquals("Subject is here", - r.metadata.get(Metadata.SUBJECT)); + r.metadata.get(Metadata.SUBJECT)); assertContains("Suddenly some Japanese text:", content); // Special version of (GHQ) @@ -314,7 +304,7 @@ public class RTFParserTest extends TikaT assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content); assertContains("And then some Gothic text:", content); - assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content); + assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content); } @Test @@ -350,7 +340,7 @@ public class RTFParserTest extends TikaT @Test public void testFontAfterBufferedText() throws Exception { assertContains("\u0423\u0432\u0430\u0436\u0430\u0435\u043c\u044b\u0439 \u043a\u043b\u0438\u0435\u043d\u0442!", - getXML("testFontAfterBufferedText.rtf").xml); + getXML("testFontAfterBufferedText.rtf").xml); } @Test @@ -380,28 +370,28 @@ public class RTFParserTest extends TikaT ContainerExtractor ex = new ParserContainerExtractor(); tis = TikaInputStream.get(getResourceAsStream("/test-documents/testBinControlWord.rtf")); assertEquals(true, ex.isSupported(tis)); - ex.extract(tis, ex, embHandler); + ex.extract(tis, ex, embHandler); } finally { tis.close(); } assertEquals(1, embHandler.bytes.size()); - + byte[] bytes = embHandler.bytes.get(0); assertEquals(10, bytes.length); //} - assertEquals(125, (int)bytes[4]); + assertEquals(125, (int) bytes[4]); //make sure that at least the last value is correct - assertEquals(-1, (int)bytes[9]); + assertEquals(-1, (int) bytes[9]); } // TIKA-999 @Test public void testMetaDataCounts() throws Exception { - XMLResult xml = getXML("test_embedded_package.rtf"); - assertEquals("1", xml.metadata.get(Office.PAGE_COUNT)); - assertEquals("7", xml.metadata.get(Office.WORD_COUNT)); - assertEquals("36", xml.metadata.get(Office.CHARACTER_COUNT)); - assertTrue(xml.metadata.get(Office.CREATION_DATE).startsWith("2012-09-02T")); + XMLResult xml = getXML("test_embedded_package.rtf"); + assertEquals("1", xml.metadata.get(Office.PAGE_COUNT)); + assertEquals("7", xml.metadata.get(Office.WORD_COUNT)); + assertEquals("36", xml.metadata.get(Office.CHARACTER_COUNT)); + assertTrue(xml.metadata.get(Office.CREATION_DATE).startsWith("2012-09-02T")); } // TIKA-1192 @@ -420,15 +410,14 @@ public class RTFParserTest extends TikaT assertContains("apple", content); } - // TIKA-1010 @Test public void testEmbeddedMonster() throws Exception { Set<MediaType> skipTypes = new HashSet<MediaType>(); skipTypes.add(MediaType.parse("application/x-emf")); skipTypes.add(MediaType.parse("application/x-msmetafile")); - - + + List<String> trueNames = new ArrayList<String>(); trueNames.add("file_0.doc"); trueNames.add("Hw.txt"); @@ -468,7 +457,7 @@ public class RTFParserTest extends TikaT trueTypes.add("application/msword"); trueTypes.add("application/vnd.openxmlformats-officedocument.wordprocessingml.document"); trueTypes.add("image/jpeg"); - + TrackingHandler tracker = new TrackingHandler(skipTypes); TikaInputStream tis = null; try { @@ -491,12 +480,12 @@ public class RTFParserTest extends TikaT assertNotNull(tracker.filenames.get(i)); //necessary to getName() because MSOffice extractor includes //directory: _1457338524/HW.txt - assertEquals("filename equals ", + assertEquals("filename equals ", expectedName, FilenameUtils.getName(tracker.filenames.get(i))); } assertEquals(trueTypes.get(i), tracker.mediaTypes.get(i).toString()); } - + tracker = new TrackingHandler(); tis = null; try { @@ -512,7 +501,7 @@ public class RTFParserTest extends TikaT assertEquals("thumbnail_26.emf", tracker.filenames.get(45)); assertEquals("thumbnail_27.wmf", tracker.filenames.get(46)); } - + //TIKA-1010 test regular (not "embedded") images/picts public void testRegularImages() throws Exception { Parser base = new AutoDetectParser(); @@ -526,15 +515,15 @@ public class RTFParserTest extends TikaT rootMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf"); try { tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf")); - parser.parse(tis, handler, rootMetadata, ctx); + parser.parse(tis, handler, rootMetadata, ctx); } finally { tis.close(); } - List<Metadata> metadatas = parser.getMetadata(); + List<Metadata> metadatas = parser.getMetadata(); Metadata meta_jpg_exif = metadatas.get(0);//("testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg"); Metadata meta_jpg = metadatas.get(2);//("testJPEG_\u666E\u6797\u65AF\u987F.jpg"); - + assertTrue(meta_jpg_exif != null); assertTrue(meta_jpg != null); assertTrue(Arrays.asList(meta_jpg_exif.getValues("dc:subject")).contains("serbor")); @@ -543,7 +532,7 @@ public class RTFParserTest extends TikaT assertFalse(Arrays.asList(meta_jpg.getValues("dc:subject")).contains("serbor")); assertEquals("false", meta_jpg.get(RTFMetadata.THUMBNAIL)); assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL)); - + assertEquals(40, meta_jpg.names().length); assertEquals(105, meta_jpg.names().length); } @@ -564,7 +553,6 @@ public class RTFParserTest extends TikaT "<p>four</p>", content); } - //TIKA-1010 test linked embedded doc @Test public void testEmbeddedLinkedDocument() throws Exception { @@ -602,14 +590,14 @@ public class RTFParserTest extends TikaT private Result getResult(String filename) throws Exception { File file = getResourceAsFile("/test-documents/" + filename); - + Metadata metadata = new Metadata(); StringWriter writer = new StringWriter(); tika.getParser().parse( - new FileInputStream(file), - new WriteOutContentHandler(writer), - metadata, - new ParseContext()); + new FileInputStream(file), + new WriteOutContentHandler(writer), + metadata, + new ParseContext()); String content = writer.toString(); return new Result(content, metadata); } @@ -617,4 +605,14 @@ public class RTFParserTest extends TikaT private String getText(String filename) throws Exception { return getResult(filename).text; } + + private static class Result { + public final String text; + public final Metadata metadata; + + public Result(String text, Metadata metadata) { + this.text = text; + this.metadata = metadata; + } + } }
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java Fri May 29 14:36:21 2015 @@ -16,7 +16,6 @@ */ package org.apache.tika.parser.txt; -import org.junit.Test; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -24,48 +23,50 @@ import java.io.IOException; import java.io.InputStream; import java.io.Reader; +import org.junit.Test; + public class CharsetDetectorTest { - - @Test - public void testTagDropper() throws IOException { - InputStream in = CharsetDetectorTest.class.getResourceAsStream( "/test-documents/resume.html" ); - - try { - CharsetDetector detector = new CharsetDetector(); - detector.enableInputFilter(true); - detector.setText(in); - CharsetMatch [] matches = detector.detectAll(); - CharsetMatch mm = null; - for ( CharsetMatch m : matches ) { - if ( mm == null || mm.getConfidence() < m.getConfidence() ) { - mm = m; + + @Test + public void testTagDropper() throws IOException { + InputStream in = CharsetDetectorTest.class.getResourceAsStream("/test-documents/resume.html"); + + try { + CharsetDetector detector = new CharsetDetector(); + detector.enableInputFilter(true); + detector.setText(in); + CharsetMatch[] matches = detector.detectAll(); + CharsetMatch mm = null; + for (CharsetMatch m : matches) { + if (mm == null || mm.getConfidence() < m.getConfidence()) { + mm = m; + } + } + assertTrue(mm != null); + assertEquals("UTF-8", mm.getName()); + } finally { + in.close(); } - } - assertTrue( mm != null ); - assertEquals( "UTF-8", mm.getName() ); - } finally { - in.close(); } - } /* https://issues.apache.org/jira/browse/TIKA-1248 * Verify empty or null declaredEncoding doesn't cause an exception * */ - - @Test - public void testEmptyOrNullDeclaredCharset() throws IOException { - InputStream in = CharsetDetectorTest.class.getResourceAsStream( "/test-documents/resume.html" ); - - try { - CharsetDetector detector = new CharsetDetector(); - Reader reader = detector.getReader(in, null); - assertTrue(reader.ready()); - - reader = detector.getReader(in, ""); - assertTrue(reader.ready()); - } finally { - in.close(); + + @Test + public void testEmptyOrNullDeclaredCharset() throws IOException { + InputStream in = CharsetDetectorTest.class.getResourceAsStream("/test-documents/resume.html"); + + try { + CharsetDetector detector = new CharsetDetector(); + Reader reader = detector.getReader(in, null); + assertTrue(reader.ready()); + + reader = detector.getReader(in, ""); + assertTrue(reader.ready()); + } finally { + in.close(); + } } - } } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Fri May 29 14:36:21 2015 @@ -41,9 +41,9 @@ public class TXTParserTest { @Test public void testEnglishText() throws Exception { String text = - "Hello, World! This is simple UTF-8 text content written" - + " in English to test autodetection of both the character" - + " encoding and the language of the input stream."; + "Hello, World! This is simple UTF-8 text content written" + + " in English to test autodetection of both the character" + + " encoding and the language of the input stream."; Metadata metadata = new Metadata(); StringWriter writer = new StringWriter(); @@ -65,7 +65,7 @@ public class TXTParserTest { assertContains("autodetection", content); assertContains("stream", content); } - + @Test public void testUTF8Text() throws Exception { String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n"; @@ -134,22 +134,22 @@ public class TXTParserTest { /** * Test case for TIKA-240: Drop the BOM when extracting plain text * - * @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a> + * @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a> */ @Test public void testDropByteOrderMark() throws Exception { - assertExtractText("UTF-8 BOM", "test", new byte[] { - (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't' }); - assertExtractText("UTF-16 BE BOM", "test", new byte[] { + assertExtractText("UTF-8 BOM", "test", new byte[]{ + (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't'}); + assertExtractText("UTF-16 BE BOM", "test", new byte[]{ (byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'}); - assertExtractText("UTF-16 LE BOM", "test", new byte[] { + assertExtractText("UTF-16 LE BOM", "test", new byte[]{ (byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0}); } /** * Test case for TIKA-335: using incoming charset * - * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a> + * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a> */ @Test public void testUseIncomingCharsetAsHint() throws Exception { @@ -160,14 +160,14 @@ public class TXTParserTest { Metadata metadata = new Metadata(); parser.parse( new ByteArrayInputStream(test2.getBytes("ISO-8859-1")), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=ISO-8859-15"); parser.parse( new ByteArrayInputStream(test2.getBytes("ISO-8859-1")), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated } @@ -175,7 +175,7 @@ public class TXTParserTest { /** * Test case for TIKA-341: using charset in content-type * - * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a> + * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a> */ @Test public void testUsingCharsetInContentTypeHeader() throws Exception { @@ -186,7 +186,7 @@ public class TXTParserTest { Metadata metadata = new Metadata(); parser.parse( new ByteArrayInputStream(test2.getBytes("ISO-8859-1")), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated @@ -194,7 +194,7 @@ public class TXTParserTest { metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-15"); parser.parse( new ByteArrayInputStream(test2.getBytes("ISO-8859-1")), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated } @@ -214,7 +214,7 @@ public class TXTParserTest { /** * Test case for TIKA-339: don't override incoming language * - * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a> + * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a> */ @Test public void testRetainIncomingLanguage() throws Exception { @@ -225,7 +225,7 @@ public class TXTParserTest { parser.parse( new ByteArrayInputStream(test.getBytes(IOUtils.UTF_8)), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE)); } @@ -266,11 +266,11 @@ public class TXTParserTest { assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); } - + /** * Test case for TIKA-771: "Hello, World!" in UTF-8/ASCII gets detected as IBM500 * - * @see <a href="https://issues.apache.org/jira/browse/TIKA-771">TIKA-771</a> + * @see <a href="https://issues.apache.org/jira/browse/TIKA-771">TIKA-771</a> */ @Test public void testCharsetDetectionWithShortSnipet() throws Exception { @@ -281,7 +281,7 @@ public class TXTParserTest { new ByteArrayInputStream(text.getBytes(IOUtils.UTF_8)), new BodyContentHandler(), metadata, new ParseContext()); assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); - + // Now verify that if we tell the parser the encoding is UTF-8, that's what // we get back (see TIKA-868) metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8"); Modified: tika/trunk/tika-server/pom.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/pom.xml?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-server/pom.xml (original) +++ tika/trunk/tika-server/pom.xml Fri May 29 14:36:21 2015 @@ -254,25 +254,6 @@ </execution> </executions> </plugin> - <plugin> - <groupId>com.qmino</groupId> - <artifactId>miredot-maven-plugin</artifactId> - <version>1.4</version> - <executions> - <execution> - <goals> - <goal>restdoc</goal> - </goals> - </execution> - </executions> - <configuration> - <licence> - <!-- Miredot license key valid until August 1st, 2016 when we can apply for a new one - http://s.apache.org/oE --> - UHJvamVjdHxvcmcuYXBhY2hlLnRpa2EudGlrYS1zZXJ2ZXJ8MjAxNi0wOC0wMXx0cnVlI01Dd0NGRklXRzRqRmNTZXNJb2laRElKZVF4RXpieUNTQWhSMHBmTzZCMUdMbDBPQ1B1WmJYQ3NpZElZSCtRPT0= - </licence> - <!-- insert other configuration here (optional) --> - </configuration> - </plugin> </plugins> </build> <profiles>
