Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java Fri May 29 14:36:21 2015 @@ -1,25 +1,25 @@ /** -******************************************************************************* -* Copyright (C) 2005, International Business Machines Corporation and * -* others. All Rights Reserved. * -******************************************************************************* -*/ + * ****************************************************************************** + * Copyright (C) 2005, International Business Machines Corporation and * + * others. All Rights Reserved. * + * ****************************************************************************** + */ package org.apache.tika.parser.txt; /** * Abstract class for recognizing a single charset. * Part of the implementation of ICU's CharsetDetector. - * + * * Each specific charset that can be recognized will have an instance * of some subclass of this class. All interaction between the overall * CharsetDetector and the stuff specific to an individual charset happens * via the interface provided here. - * + * * Instances of CharsetDetector DO NOT have or maintain * state pertaining to a specific match or detect operation. * The WILL be shared by multiple instances of CharsetDetector. * They encapsulate const charset-specific information. - * + * * @internal */ abstract class CharsetRecognizer { @@ -27,29 +27,28 @@ abstract class CharsetRecognizer { * Get the IANA name of this charset. * @return the charset name. */ - abstract String getName(); - + abstract String getName(); + /** * Get the ISO language code for this charset. * @return the language code, or <code>null</code> if the language cannot be determined. */ - public String getLanguage() - { + public String getLanguage() { return null; } - + /** * Test the match of this charset with the input text data * which is obtained via the CharsetDetector object. - * + * * @param det The CharsetDetector, which contains the input text * to be checked for being in this charset. - * @return Two values packed into one int (Damn java, anyhow) + * @return Two values packed into one int (Damn java, anyhow) * <br/> * bits 0-7: the match confidence, ranging from 0-100 * <br/> * bits 8-15: The match reason, an enum-like value. */ - abstract int match(CharsetDetector det); + abstract int match(CharsetDetector det); }
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java Fri May 29 14:36:21 2015 @@ -5,9 +5,9 @@ * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java Fri May 29 14:36:21 2015 @@ -40,20 +40,22 @@ import org.xml.sax.SAXException; * beginning of the stream and the given document metadata, most * notably the <code>charset</code> parameter of a * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE} value. - * <p> + * <p/> * This parser sets the following output metadata entries: * <dl> - * <dt>{@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE}</dt> - * <dd><code>text/plain; charset=...</code></dd> + * <dt>{@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE}</dt> + * <dd><code>text/plain; charset=...</code></dd> * </dl> */ public class TXTParser extends AbstractParser { - /** Serial version UID */ + /** + * Serial version UID + */ private static final long serialVersionUID = -6656102320836888910L; private static final Set<MediaType> SUPPORTED_TYPES = - Collections.singleton(MediaType.TEXT_PLAIN); + Collections.singleton(MediaType.TEXT_PLAIN); private static final ServiceLoader LOADER = new ServiceLoader(TXTParser.class.getClassLoader()); Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java Fri May 29 14:36:21 2015 @@ -76,12 +76,12 @@ public class RecursiveParserWrapperTest assertNull(content); } - + @Test public void testCharLimit() throws Exception { ParseContext context = new ParseContext(); Metadata metadata = new Metadata(); - + Parser wrapped = new AutoDetectParser(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60)); @@ -89,19 +89,20 @@ public class RecursiveParserWrapperTest "/test-documents/test_recursive_embedded.docx"); wrapper.parse(stream, new DefaultHandler(), metadata, context); List<Metadata> list = wrapper.getMetadata(); - + assertEquals(5, list.size()); - + int wlr = 0; for (Metadata m : list) { String limitReached = m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED); - if (limitReached != null && limitReached.equals("true")){ + if (limitReached != null && limitReached.equals("true")) { wlr++; } } assertEquals(1, wlr); } + @Test public void testMaxEmbedded() throws Exception { int maxEmbedded = 4; @@ -109,7 +110,7 @@ public class RecursiveParserWrapperTest ParseContext context = new ParseContext(); Metadata metadata = new Metadata(); String limitReached = null; - + Parser wrapped = new AutoDetectParser(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); @@ -124,7 +125,7 @@ public class RecursiveParserWrapperTest limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED); assertNull(limitReached); - + wrapper.reset(); stream.close(); @@ -137,29 +138,29 @@ public class RecursiveParserWrapperTest list = wrapper.getMetadata(); //add 1 for outer container file - assertEquals(maxEmbedded+1, list.size()); - + assertEquals(maxEmbedded + 1, list.size()); + limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED); assertEquals("true", limitReached); wrapper.reset(); stream.close(); - + //test setting value < 0 metadata = new Metadata(); stream = RecursiveParserWrapperTest.class.getResourceAsStream( "/test-documents/test_recursive_embedded.docx"); - + wrapper.setMaxEmbeddedResources(-2); wrapper.parse(stream, new DefaultHandler(), metadata, context); assertEquals(totalNoLimit, list.size()); limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED); assertNull(limitReached); } - + @Test public void testEmbeddedResourcePath() throws Exception { - + Set<String> targets = new HashSet<String>(); targets.add("test_recursive_embedded.docx/embed1.zip"); targets.add("test_recursive_embedded.docx/embed1.zip/embed2.zip"); @@ -172,15 +173,15 @@ public class RecursiveParserWrapperTest targets.add("test_recursive_embedded.docx/embed1.zip/embed1b.txt"); targets.add("test_recursive_embedded.docx/embed1.zip/embed1a.txt"); targets.add("test_recursive_embedded.docx/image1.emf"); - + Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx"); List<Metadata> list = getMetadata(metadata, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); Metadata container = list.get(0); String content = container.get(RecursiveParserWrapper.TIKA_CONTENT); - assertTrue(content.indexOf("<p class=\"header\" />") > -1); - + assertTrue(content.indexOf("<p class=\"header\" />") > -1); + Set<String> seen = new HashSet<String>(); for (Metadata m : list) { String path = m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH); @@ -224,7 +225,7 @@ public class RecursiveParserWrapperTest if (path == null) { path = "/test-documents/test_recursive_embedded.docx"; } else { - path = "/test-documents/"+path; + path = "/test-documents/" + path; } InputStream stream = null; try { Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Fri May 29 14:36:21 2015 @@ -157,6 +157,7 @@ public class HtmlParserTest { /** * Test case for TIKA-210 + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a> */ @Test @@ -169,6 +170,7 @@ public class HtmlParserTest { /** * Test case for TIKA-287 + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-287">TIKA-287</a> */ @Test @@ -216,8 +218,8 @@ public class HtmlParserTest { private void assertRelativeLink(String url, String base, String relative) throws Exception { String test = - "<html><head><base href=\"" + base + "\"></head>" - + "<body><a href=\"" + relative + "\">test</a></body></html>"; + "<html><head><base href=\"" + base + "\"></head>" + + "<body><a href=\"" + relative + "\">test</a></body></html>"; final List<String> links = new ArrayList<String>(); new HtmlParser().parse( new ByteArrayInputStream(test.getBytes(IOUtils.UTF_8)), @@ -238,12 +240,13 @@ public class HtmlParserTest { /** * Test case for TIKA-268 + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-268">TIKA-268</a> */ @Test public void testWhitespaceBetweenTableCells() throws Exception { String test = - "<html><body><table><tr><td>a</td><td>b</td></table></body></html>"; + "<html><body><table><tr><td>a</td><td>b</td></table></body></html>"; String content = new Tika().parseToString( new ByteArrayInputStream(test.getBytes(IOUtils.UTF_8))); assertContains("a", content); @@ -253,15 +256,16 @@ public class HtmlParserTest { /** * Test case for TIKA-332 + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-332">TIKA-332</a> */ @Test public void testHttpEquivCharset() throws Exception { String test = - "<html><head><meta http-equiv=\"content-type\"" - + " content=\"text/html; charset=ISO-8859-1\" />" - + "<title>the name is \u00e1ndre</title>" - + "</head><body></body></html>"; + "<html><head><meta http-equiv=\"content-type\"" + + " content=\"text/html; charset=ISO-8859-1\" />" + + "<title>the name is \u00e1ndre</title>" + + "</head><body></body></html>"; Metadata metadata = new Metadata(); new HtmlParser().parse( new ByteArrayInputStream(test.getBytes("ISO-8859-1")), @@ -271,14 +275,15 @@ public class HtmlParserTest { /** * Test case for TIKA-892 + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-892">TIKA-892</a> */ @Test public void testHtml5Charset() throws Exception { String test = "<html><head><meta charset=\"ISO-8859-15\" />" - + "<title>the name is \u00e1ndre</title>" - + "</head><body></body></html>"; + + "<title>the name is \u00e1ndre</title>" + + "</head><body></body></html>"; Metadata metadata = new Metadata(); new HtmlParser().parse( new ByteArrayInputStream(test.getBytes("ISO-8859-1")), @@ -288,40 +293,42 @@ public class HtmlParserTest { /** * Test case for TIKA-334 + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a> */ @Test public void testDetectOfCharset() throws Exception { String test = - "<html><head><title>\u017d</title></head><body></body></html>"; + "<html><head><title>\u017d</title></head><body></body></html>"; Metadata metadata = new Metadata(); - new HtmlParser().parse ( + new HtmlParser().parse( new ByteArrayInputStream(test.getBytes(IOUtils.UTF_8)), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("\u017d", metadata.get(TikaCoreProperties.TITLE)); } /** * Test case for TIKA-341 + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a> */ @Test public void testUsingCharsetInContentTypeHeader() throws Exception { final String test = - "<html><head><title>the name is \u00e1ndre</title></head>" - + "<body></body></html>"; + "<html><head><title>the name is \u00e1ndre</title></head>" + + "<body></body></html>"; Metadata metadata = new Metadata(); - new HtmlParser().parse ( + new HtmlParser().parse( new ByteArrayInputStream(test.getBytes(IOUtils.UTF_8)), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1"); - new HtmlParser().parse ( + new HtmlParser().parse( new ByteArrayInputStream(test.getBytes("ISO-8859-1")), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); } @@ -347,6 +354,7 @@ public class HtmlParserTest { /** * Test case for TIKA-339: Don't use language returned by CharsetDetector + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a> */ @Test @@ -363,61 +371,64 @@ public class HtmlParserTest { /** * Test case for TIKA-349 + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-349">TIKA-349</a> */ @Test public void testHttpEquivCharsetFunkyAttributes() throws Exception { String test1 = - "<html><head><meta http-equiv=\"content-type\"" - + " content=\"text/html; charset=ISO-8859-15; charset=iso-8859-15\" />" - + "<title>the name is \u00e1ndre</title>" - + "</head><body></body></html>"; + "<html><head><meta http-equiv=\"content-type\"" + + " content=\"text/html; charset=ISO-8859-15; charset=iso-8859-15\" />" + + "<title>the name is \u00e1ndre</title>" + + "</head><body></body></html>"; Metadata metadata = new Metadata(); - new HtmlParser().parse ( + new HtmlParser().parse( new ByteArrayInputStream(test1.getBytes("ISO-8859-1")), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // Some HTML pages have errors like ';;' versus '; ' as separator String test2 = - "<html><head><meta http-equiv=\"content-type\"" - + " content=\"text/html;;charset=ISO-8859-15\" />" - + "<title>the name is \u00e1ndre</title>" - + "</head><body></body></html>"; + "<html><head><meta http-equiv=\"content-type\"" + + " content=\"text/html;;charset=ISO-8859-15\" />" + + "<title>the name is \u00e1ndre</title>" + + "</head><body></body></html>"; metadata = new Metadata(); - new HtmlParser().parse ( + new HtmlParser().parse( new ByteArrayInputStream(test2.getBytes("ISO-8859-1")), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); } /** * Test case for TIKA-350 + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-350">TIKA-350</a> */ @Test public void testUsingFunkyCharsetInContentTypeHeader() throws Exception { final String test = - "<html><head><title>the name is \u00e1ndre</title></head>" - + "<body></body></html>"; + "<html><head><title>the name is \u00e1ndre</title></head>" + + "<body></body></html>"; Metadata metadata = new Metadata(); - new HtmlParser().parse ( + new HtmlParser().parse( new ByteArrayInputStream(test.getBytes(IOUtils.UTF_8)), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html"); - new HtmlParser().parse ( + new HtmlParser().parse( new ByteArrayInputStream(test.getBytes("ISO-8859-1")), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); } /** * Test case for TIKA-357 + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-357">TIKA-357</a> */ @Test @@ -426,13 +437,14 @@ public class HtmlParserTest { Metadata metadata = new Metadata(); new HtmlParser().parse( HtmlParserTest.class.getResourceAsStream(path), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING)); } /** * Test case for TIKA-420 + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-420">TIKA-420</a> */ @Test @@ -443,7 +455,7 @@ public class HtmlParserTest { BodyContentHandler handler = new BodyContentHandler(); new HtmlParser().parse( HtmlParserTest.class.getResourceAsStream(path), - new BoilerpipeContentHandler(handler), metadata, new ParseContext()); + new BoilerpipeContentHandler(handler), metadata, new ParseContext()); String content = handler.toString(); assertTrue(content.startsWith("This is the real meat")); @@ -454,14 +466,15 @@ public class HtmlParserTest { /** * Test case for TIKA-478. Don't emit <head> sub-elements inside of <body>. + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-478">TIKA-478</a> */ @Test public void testElementOrdering() throws Exception { final String test = "<html><head><title>Title</title>" + - "<meta http-equiv=\"content-type\" content=\"text/html\">" + - "<link rel=\"next\" href=\"next.html\" />" + - "</head><body><p>Simple Content</p></body></html>"; + "<meta http-equiv=\"content-type\" content=\"text/html\">" + + "<link rel=\"next\" href=\"next.html\" />" + + "</head><body><p>Simple Content</p></body></html>"; StringWriter sw = new StringWriter(); new HtmlParser().parse( @@ -492,13 +505,14 @@ public class HtmlParserTest { /** * Test case for TIKA-463. Don't skip elements that have URLs. + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a> */ @Test public void testImgUrlExtraction() throws Exception { final String test = "<html><head><title>Title</title>" + - "<base href=\"http://domain.com\" />" + - "</head><body><img src=\"image.jpg\" /></body></html>"; + "<base href=\"http://domain.com\" />" + + "</head><body><img src=\"image.jpg\" /></body></html>"; StringWriter sw = new StringWriter(); new HtmlParser().parse( @@ -513,13 +527,14 @@ public class HtmlParserTest { /** * Test case for TIKA-463. Don't skip elements that have URLs. + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a> */ @Test public void testFrameSrcExtraction() throws Exception { final String test = "<html><head><title>Title</title>" + - "<base href=\"http://domain.com\" />" + - "</head><frameset><frame src=\"frame.html\" /></frameset></html>"; + "<base href=\"http://domain.com\" />" + + "</head><frameset><frame src=\"frame.html\" /></frameset></html>"; StringWriter sw = new StringWriter(); new HtmlParser().parse( @@ -534,14 +549,15 @@ public class HtmlParserTest { /** * Test case for TIKA-463. Don't skip elements that have URLs. + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a> */ @Test public void testIFrameSrcExtraction() throws Exception { final String test = "<html><head><title>Title</title>" + - "<base href=\"http://domain.com\" />" + - "</head><body><iframe src =\"framed.html\" width=\"100%\" height=\"300\">" + - "<p>Your browser doesn't support iframes!</p></body></html>"; + "<base href=\"http://domain.com\" />" + + "</head><body><iframe src =\"framed.html\" width=\"100%\" height=\"300\">" + + "<p>Your browser doesn't support iframes!</p></body></html>"; StringWriter sw = new StringWriter(); new HtmlParser().parse( @@ -556,15 +572,16 @@ public class HtmlParserTest { /** * Test case for TIKA-463. Don't skip elements that have URLs. + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a> */ @Test public void testAreaExtraction() throws Exception { final String test = "<html><head><title>Title</title>" + - "<base href=\"http://domain.com\" />" + - "</head><body><p><map name=\"map\" id=\"map\">" + - "<area shape=\"rect\" href=\"map.html\" alt=\"\" />" + - "</map></p></body></html>"; + "<base href=\"http://domain.com\" />" + + "</head><body><p><map name=\"map\" id=\"map\">" + + "<area shape=\"rect\" href=\"map.html\" alt=\"\" />" + + "</map></p></body></html>"; StringWriter sw = new StringWriter(); new HtmlParser().parse( @@ -579,15 +596,16 @@ public class HtmlParserTest { /** * Test case for TIKA-463. Don't skip elements that have URLs. + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a> */ @Test public void testObjectExtraction() throws Exception { final String test = "<html><head><title>Title</title>" + - "<base href=\"http://domain.com\" />" + - "</head><body><p><object data=\"object.data\" type=\"text/html\">" + - "<param name=\"name\" value=\"value\" />" + - "</object></p></body></html>"; + "<base href=\"http://domain.com\" />" + + "</head><body><p><object data=\"object.data\" type=\"text/html\">" + + "<param name=\"name\" value=\"value\" />" + + "</object></p></body></html>"; StringWriter sw = new StringWriter(); new HtmlParser().parse( @@ -598,13 +616,14 @@ public class HtmlParserTest { // <object> tag should exist with fully resolved URLs assertTrue( - "<object> tag not correctly found in:\n" + result, - Pattern.matches("(?s).*<object data=\"http://domain.com/object.data\".*<param .* name=\"name\" value=\"value\"/>.*</object>.*$", result) + "<object> tag not correctly found in:\n" + result, + Pattern.matches("(?s).*<object data=\"http://domain.com/object.data\".*<param .* name=\"name\" value=\"value\"/>.*</object>.*$", result) ); } /** * Test case for change related to TIKA-463. Verify proper handling of <meta> tags. + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a> */ @Test @@ -629,13 +648,14 @@ public class HtmlParserTest { /** * Test case for TIKA-457. Better handling for broken HTML that has <frameset> inside of <body>. + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-457">TIKA-457</a> */ @Test public void testBrokenFrameset() throws Exception { final String test1 = "<html><head><title>Title</title>" + - "<base href=\"http://domain.com\" />" + - "</head><body><frameset><frame src=\"frame.html\" /></frameset></body></html>"; + "<base href=\"http://domain.com\" />" + + "</head><body><frameset><frame src=\"frame.html\" /></frameset></body></html>"; StringWriter sw1 = new StringWriter(); new HtmlParser().parse( @@ -652,11 +672,11 @@ public class HtmlParserTest { // Test the example from the Nutch project. final String test2 = "<html><head><title> my title </title></head><body>" + - "<frameset rows=\"20,*\"><frame src=\"top.html\"></frame>" + - "<frameset cols=\"20,*\"><frame src=\"left.html\"></frame>" + - "<frame src=\"invalid.html\"/></frame>" + - "<frame src=\"right.html\"></frame>" + - "</frameset></frameset></body></html>"; + "<frameset rows=\"20,*\"><frame src=\"top.html\"></frame>" + + "<frameset cols=\"20,*\"><frame src=\"left.html\"></frame>" + + "<frame src=\"invalid.html\"/></frame>" + + "<frame src=\"right.html\"></frame>" + + "</frameset></frameset></body></html>"; StringWriter sw2 = new StringWriter(); new HtmlParser().parse( @@ -678,6 +698,7 @@ public class HtmlParserTest { /** * Test case for TIKA-480: fix NPE when using BodyContentHandler or HtmlTransformer * as delegate for BoilerpipeContentHandler + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-480">TIKA-480</a> */ @Test @@ -688,7 +709,7 @@ public class HtmlParserTest { StringWriter sw = new StringWriter(); new HtmlParser().parse( HtmlParserTest.class.getResourceAsStream(path), - makeHtmlTransformer(sw), metadata, new ParseContext()); + makeHtmlTransformer(sw), metadata, new ParseContext()); String content = sw.toString(); @@ -701,14 +722,15 @@ public class HtmlParserTest { /** * Test case for TIKA-481. Verify href in <link> is resolved. + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-481">TIKA-481</a> */ @Test public void testLinkHrefResolution() throws Exception { final String test = "<html><head><title>Title</title>" + - "<base href=\"http://domain.com\" />" + - "<link rel=\"next\" href=\"next.html\" />" + - "</head><body></body></html>"; + "<base href=\"http://domain.com\" />" + + "<link rel=\"next\" href=\"next.html\" />" + + "</head><body></body></html>"; StringWriter sw = new StringWriter(); new HtmlParser().parse( @@ -731,7 +753,7 @@ public class HtmlParserTest { * @throws Exception */ private ContentHandler makeHtmlTransformer(Writer writer) throws Exception { - SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance(); + SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); @@ -742,6 +764,7 @@ public class HtmlParserTest { /** * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler. + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a> */ @Test @@ -756,7 +779,7 @@ public class HtmlParserTest { new HtmlParser().parse( HtmlParserTest.class.getResourceAsStream(path), - bpch, metadata, new ParseContext()); + bpch, metadata, new ParseContext()); String content = sw.toString(); assertTrue("Has empty table elements", content.contains("<body><table><tr><td><table><tr><td>")); @@ -793,9 +816,9 @@ public class HtmlParserTest { StringWriter sw = new StringWriter(); - new HtmlParser().parse ( + new HtmlParser().parse( new ByteArrayInputStream(html.getBytes(IOUtils.UTF_8)), - makeHtmlTransformer(sw), metadata, parseContext); + makeHtmlTransformer(sw), metadata, parseContext); String result = sw.toString(); // Make sure we don't get <body><BODY/></body> @@ -816,7 +839,7 @@ public class HtmlParserTest { BodyContentHandler handler = new BodyContentHandler(); new HtmlParser().parse( new ByteArrayInputStream(html.getBytes(IOUtils.UTF_8)), - handler, new Metadata(), new ParseContext()); + handler, new Metadata(), new ParseContext()); // Make sure we get <tab>, "one", newline, newline String result = handler.toString(); @@ -826,22 +849,23 @@ public class HtmlParserTest { /** * Test case for TIKA-961 + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-961">TIKA-961</a> */ @Test public void testBoilerplateWhitespace() throws Exception { String path = "/test-documents/boilerplate-whitespace.html"; - + Metadata metadata = new Metadata(); BodyContentHandler handler = new BodyContentHandler(); - + BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler); bpHandler.setIncludeMarkup(true); - + new HtmlParser().parse( HtmlParserTest.class.getResourceAsStream(path), - bpHandler, metadata, new ParseContext()); - + bpHandler, metadata, new ParseContext()); + String content = handler.toString(); // Should not contain item_aitem_b @@ -862,12 +886,12 @@ public class HtmlParserTest { @Test public void testOpenGraphMetadata() throws Exception { String test1 = - "<html><head><meta property=\"og:description\"" - + " content=\"some description\" />" - + "<meta property=\"og:image\" content=\"http://example.com/image1.jpg\" />" - + "<meta property=\"og:image\" content=\"http://example.com/image2.jpg\" />" - + "<title>hello</title>" - + "</head><body></body></html>"; + "<html><head><meta property=\"og:description\"" + + " content=\"some description\" />" + + "<meta property=\"og:image\" content=\"http://example.com/image1.jpg\" />" + + "<meta property=\"og:image\" content=\"http://example.com/image2.jpg\" />" + + "<title>hello</title>" + + "</head><body></body></html>"; Metadata metadata = new Metadata(); new HtmlParser().parse( new ByteArrayInputStream(test1.getBytes("ISO-8859-1")), @@ -883,19 +907,19 @@ public class HtmlParserTest { HtmlParserTest.class.getResourceAsStream("/test-documents/testUserDefinedCharset.mhtml"), new Metadata()); assertNotNull(content); } - + //TIKA-1001 @Test public void testNoisyMetaCharsetHeaders() throws Exception { - Tika tika = new Tika(); - String hit = "\u0623\u0639\u0631\u0628"; + Tika tika = new Tika(); + String hit = "\u0623\u0639\u0631\u0628"; - for (int i = 1; i <=4; i++){ - String fileName = "/test-documents/testHTMLNoisyMetaEncoding_"+i+".html"; - String content = tika.parseToString( - HtmlParserTest.class.getResourceAsStream(fileName)); - assertTrue("testing: " +fileName, content.contains(hit)); - } + for (int i = 1; i <= 4; i++) { + String fileName = "/test-documents/testHTMLNoisyMetaEncoding_" + i + ".html"; + String content = tika.parseToString( + HtmlParserTest.class.getResourceAsStream(fileName)); + assertTrue("testing: " + fileName, content.contains(hit)); + } } // TIKA-1193 @@ -907,7 +931,7 @@ public class HtmlParserTest { Metadata metadata = new Metadata(); LinkContentHandler linkContentHandler = new LinkContentHandler(); - new HtmlParser().parse ( + new HtmlParser().parse( new ByteArrayInputStream(test.getBytes("ISO-8859-1")), linkContentHandler, metadata, new ParseContext()); @@ -921,7 +945,7 @@ public class HtmlParserTest { ParseContext parseContext = new ParseContext(); parseContext.set(Schema.class, schema); linkContentHandler = new LinkContentHandler(); - new HtmlParser().parse ( + new HtmlParser().parse( new ByteArrayInputStream(test.getBytes("ISO-8859-1")), linkContentHandler, metadata, parseContext); @@ -939,56 +963,57 @@ public class HtmlParserTest { final int line = 0; final int col = 1; final int[] textPosition = new int[2]; - + new HtmlParser().parse(HtmlParserTest.class.getResourceAsStream("/test-documents/testHTML.html"), - new ContentHandler(){ - Locator locator; + new ContentHandler() { + Locator locator; + + public void setDocumentLocator(Locator locator) { + this.locator = locator; + } + + public void startDocument() throws SAXException { + } + + public void endDocument() throws SAXException { + } + + public void startPrefixMapping(String prefix, String uri) + throws SAXException { + } - public void setDocumentLocator(Locator locator) { - this.locator = locator; - } - - public void startDocument() throws SAXException { - } - - public void endDocument() throws SAXException { - } - - public void startPrefixMapping(String prefix, String uri) - throws SAXException { - } - - public void endPrefixMapping(String prefix) - throws SAXException { - } - - public void startElement(String uri, String localName, - String qName, Attributes atts) throws SAXException { - } - - public void endElement(String uri, String localName, - String qName) throws SAXException { - } - - public void characters(char[] ch, int start, int length) - throws SAXException { - String text = new String(ch, start, length); - if (text.equals("Test Indexation Html") && locator != null) { - textPosition[line] = locator.getLineNumber(); - textPosition[col] = locator.getColumnNumber(); - } - } - - public void ignorableWhitespace(char[] ch, int start, - int length) throws SAXException { - } - - public void processingInstruction(String target, String data) - throws SAXException { - } + public void endPrefixMapping(String prefix) + throws SAXException { + } + + public void startElement(String uri, String localName, + String qName, Attributes atts) throws SAXException { + } + + public void endElement(String uri, String localName, + String qName) throws SAXException { + } + + public void characters(char[] ch, int start, int length) + throws SAXException { + String text = new String(ch, start, length); + if (text.equals("Test Indexation Html") && locator != null) { + textPosition[line] = locator.getLineNumber(); + textPosition[col] = locator.getColumnNumber(); + } + } + + public void ignorableWhitespace(char[] ch, int start, + int length) throws SAXException { + } - public void skippedEntity(String name) throws SAXException { - }}, + public void processingInstruction(String target, String data) + throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + }, new Metadata(), new ParseContext()); @@ -997,20 +1022,20 @@ public class HtmlParserTest { // The column reported seems fuzzy, just test it is close enough. assertTrue(Math.abs(textPosition[col] - 47) < 10); } - - - /** - * Test case for TIKA-1303: HTML parse should use the first title tag to set value in meta data + + + /** + * Test case for TIKA-1303: HTML parse should use the first title tag to set value in meta data * and ignore any subsequent title tags found in HTML. - * + * * @see <a href="https://issues.apache.org/jira/browse/TIKA-1303">TIKA-1303</a> */ @Test - public void testFirstTitleValueisSetToMetadata() throws Exception{ + public void testFirstTitleValueisSetToMetadata() throws Exception { String test = "<html><title>Simple Content</title><body><h1></h1>" - + "<title>TitleToIgnore</title></body></html>"; + + "<title>TitleToIgnore</title></body></html>"; Metadata metadata = new Metadata(); - + new HtmlParser().parse( new ByteArrayInputStream(test.getBytes(IOUtils.UTF_8)), new BodyContentHandler(), metadata, new ParseContext()); @@ -1023,7 +1048,7 @@ public class HtmlParserTest { public void testMisleadingMetaContentTypeTags() throws Exception { //TIKA-1519 - String test = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-ELEVEN\">"+ + String test = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-ELEVEN\">" + "</head><title>title</title><body>body</body></html>"; Metadata metadata = new Metadata(); @@ -1033,7 +1058,7 @@ public class HtmlParserTest { assertEquals("text/html; charset=UTF-ELEVEN", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT)); assertEquals("text/html; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); - test = "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\">"+ + test = "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\">" + "</head><title>title</title><body>body</body></html>"; metadata = new Metadata(); @@ -1044,7 +1069,7 @@ public class HtmlParserTest { assertEquals("text/html; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); //test two content values - test = "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\" content=\"application/ms-word\">"+ + test = "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\" content=\"application/ms-word\">" + "</head><title>title</title><body>body</body></html>"; metadata = new Metadata(); @@ -1058,7 +1083,7 @@ public class HtmlParserTest { @Test public void testXHTMLWithMisleading() throws Exception { //first test an acceptable XHTML header with http-equiv tags - String test = "<?xml version=\"1.0\" ?>"+ + String test = "<?xml version=\"1.0\" ?>" + "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n" + "<head>\n" + @@ -1072,7 +1097,7 @@ public class HtmlParserTest { assertEquals("text/html; charset=iso-8859-1", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT)); assertEquals("application/xhtml+xml; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); - test = "<?xml version=\"1.0\" ?>"+ + test = "<?xml version=\"1.0\" ?>" + "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n" + "<head>\n" + Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/BPGParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/BPGParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/BPGParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/BPGParserTest.java Fri May 29 14:36:21 2015 @@ -16,6 +16,9 @@ */ package org.apache.tika.parser.image; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + import java.io.InputStream; import java.util.Arrays; import java.util.List; @@ -28,10 +31,6 @@ import org.apache.tika.parser.Parser; import org.junit.Test; import org.xml.sax.helpers.DefaultHandler; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - public class BPGParserTest { private final Parser parser = new BPGParser(); @@ -43,7 +42,7 @@ public class BPGParserTest { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/x-bpg"); InputStream stream = - getClass().getResourceAsStream("/test-documents/testBPG.bpg"); + getClass().getResourceAsStream("/test-documents/testBPG.bpg"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH)); @@ -60,14 +59,14 @@ public class BPGParserTest { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/x-bpg"); InputStream stream = - getClass().getResourceAsStream("/test-documents/testBPG_commented.bpg"); + getClass().getResourceAsStream("/test-documents/testBPG_commented.bpg"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); assertEquals("103", metadata.get(Metadata.IMAGE_WIDTH)); assertEquals("77", metadata.get(Metadata.IMAGE_LENGTH)); assertEquals("10", metadata.get(Metadata.BITS_PER_SAMPLE)); assertEquals("YCbCr Colour", metadata.get(Photoshop.COLOR_MODE)); - + // TODO Get the exif comment data to be properly extracted, see TIKA-1495 if (false) { assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE)); @@ -77,7 +76,7 @@ public class BPGParserTest { assertTrue(keywords.contains("bird watching")); assertEquals(keywords, Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS))); } - + // TODO Get the exif data to be properly extracted, see TIKA-1495 if (false) { assertEquals("1.0E-6", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1000000 @@ -90,7 +89,7 @@ public class BPGParserTest { assertEquals("1", metadata.get(Metadata.ORIENTATION)); assertEquals("300.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL)); assertEquals("300.0", metadata.get(Metadata.RESOLUTION_VERTICAL)); - assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT)); + assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT)); } } @@ -102,20 +101,20 @@ public class BPGParserTest { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/x-bpg"); InputStream stream = - getClass().getResourceAsStream("/test-documents/testBPG_GEO.bpg"); + getClass().getResourceAsStream("/test-documents/testBPG_GEO.bpg"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH)); assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH)); assertEquals("10", metadata.get(Metadata.BITS_PER_SAMPLE)); assertEquals("YCbCr Colour", metadata.get(Photoshop.COLOR_MODE)); - + // TODO Get the geographic data to be properly extracted, see TIKA-1495 if (false) { assertEquals("12.54321", metadata.get(Metadata.LATITUDE)); assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE)); } - + // TODO Get the exif data to be properly extracted, see TIKA-1495 if (false) { assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1600 Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java Fri May 29 14:36:21 2015 @@ -17,16 +17,13 @@ package org.apache.tika.parser.image; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.junit.Test; - -import com.drew.metadata.Directory; -import com.drew.metadata.MetadataException; -import com.drew.metadata.Tag; -import com.drew.metadata.exif.ExifIFD0Directory; -import com.drew.metadata.exif.ExifSubIFDDirectory; -import com.drew.metadata.jpeg.JpegCommentDirectory; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; import java.util.Arrays; import java.util.GregorianCalendar; @@ -35,29 +32,31 @@ import java.util.List; import java.util.Locale; import java.util.TimeZone; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNull; -import static org.junit.Assert.assertTrue; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.verify; -import static org.mockito.Mockito.when; +import com.drew.metadata.Directory; +import com.drew.metadata.MetadataException; +import com.drew.metadata.Tag; +import com.drew.metadata.exif.ExifIFD0Directory; +import com.drew.metadata.exif.ExifSubIFDDirectory; +import com.drew.metadata.jpeg.JpegCommentDirectory; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.Test; public class ImageMetadataExtractorTest { - - @SuppressWarnings({ "rawtypes", "unchecked" }) + + @SuppressWarnings({"rawtypes", "unchecked"}) @Test public void testHandleDirectories() throws MetadataException { Metadata metadata = mock(Metadata.class); ImageMetadataExtractor.DirectoryHandler handler1 = mock(ImageMetadataExtractor.DirectoryHandler.class); ImageMetadataExtractor e = new ImageMetadataExtractor(metadata, handler1); - + Directory directory = new JpegCommentDirectory(); Iterator directories = mock(Iterator.class); when(directories.hasNext()).thenReturn(true, false); when(directories.next()).thenReturn(directory); when(handler1.supports(JpegCommentDirectory.class)).thenReturn(true); - + e.handle(directories); verify(handler1).supports(JpegCommentDirectory.class); verify(handler1).handle(directory, metadata); @@ -70,7 +69,7 @@ public class ImageMetadataExtractorTest assertFalse(new ImageMetadataExtractor.ExifHandler().supports(Directory.class)); assertFalse(new ImageMetadataExtractor.ExifHandler().supports(JpegCommentDirectory.class)); } - + @Test public void testExifHandlerParseDate() throws MetadataException { ExifSubIFDDirectory exif = mock(ExifSubIFDDirectory.class); @@ -81,9 +80,9 @@ public class ImageMetadataExtractorTest when(exif.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn( calendar.getTime()); // jvm default timezone as in Metadata Extractor Metadata metadata = new Metadata(); - + new ImageMetadataExtractor.ExifHandler().handle(exif, metadata); - assertEquals("Should be ISO date without time zone", "2000-01-01T00:00:00", + assertEquals("Should be ISO date without time zone", "2000-01-01T00:00:00", metadata.get(TikaCoreProperties.CREATED)); } @@ -97,24 +96,24 @@ public class ImageMetadataExtractorTest when(exif.getDate(ExifIFD0Directory.TAG_DATETIME)).thenReturn( calendar.getTime()); // jvm default timezone as in Metadata Extractor Metadata metadata = new Metadata(); - + new ImageMetadataExtractor.ExifHandler().handle(exif, metadata); - assertEquals("Should try EXIF Date/Time if Original is not set", "1999-01-01T00:00:00", + assertEquals("Should try EXIF Date/Time if Original is not set", "1999-01-01T00:00:00", metadata.get(TikaCoreProperties.CREATED)); } - + @Test public void testExifHandlerParseDateError() throws MetadataException { ExifIFD0Directory exif = mock(ExifIFD0Directory.class); when(exif.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true); when(exif.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(null); Metadata metadata = new Metadata(); - + new ImageMetadataExtractor.ExifHandler().handle(exif, metadata); - assertEquals("Parsing should proceed without date", null, + assertEquals("Parsing should proceed without date", null, metadata.get(TikaCoreProperties.CREATED)); } - + @Test public void testCopyUnknownFieldsHandler() throws MetadataException { Directory d = mock(Directory.class); @@ -136,5 +135,5 @@ public class ImageMetadataExtractorTest metadata.get(Metadata.KEYWORDS)); assertNull(metadata.get(TikaCoreProperties.DESCRIPTION)); } - + } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java Fri May 29 14:36:21 2015 @@ -36,7 +36,7 @@ public class ImageParserTest { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/bmp"); InputStream stream = - getClass().getResourceAsStream("/test-documents/testBMP.bmp"); + getClass().getResourceAsStream("/test-documents/testBMP.bmp"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); assertEquals("75", metadata.get("height")); @@ -47,7 +47,7 @@ public class ImageParserTest { assertEquals("0", metadata.get("Dimension HorizontalPhysicalPixelSpacing")); assertEquals("BI_RGB", metadata.get("Compression CompressionTypeName")); assertEquals("image/bmp", metadata.get("Content-Type")); - + assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH)); assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH)); assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE)); @@ -58,7 +58,7 @@ public class ImageParserTest { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/gif"); InputStream stream = - getClass().getResourceAsStream("/test-documents/testGIF.gif"); + getClass().getResourceAsStream("/test-documents/testGIF.gif"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); assertEquals("75", metadata.get("height")); @@ -78,7 +78,7 @@ public class ImageParserTest { assertEquals("disposalMethod=none, userInputFlag=false, transparentColorFlag=false, delayTime=0, transparentColorIndex=0", metadata.get("GraphicControlExtension")); assertEquals("0", metadata.get("Dimension VerticalPixelOffset")); assertEquals("image/gif", metadata.get("Content-Type")); - + assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH)); assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH)); assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(TikaCoreProperties.COMMENTS)); @@ -89,7 +89,7 @@ public class ImageParserTest { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); InputStream stream = - getClass().getResourceAsStream("/test-documents/testJPEG.jpg"); + getClass().getResourceAsStream("/test-documents/testJPEG.jpg"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); assertEquals("75", metadata.get("height")); @@ -114,7 +114,7 @@ public class ImageParserTest { assertEquals("keyword=comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("Text TextEntry")); assertEquals("image/jpeg", metadata.get("Content-Type")); assertEquals("process=0, samplePrecision=8, numLines=75, samplesPerLine=100, numFrameComponents=3", metadata.get("markerSequence sof")); - + assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH)); assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH)); assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(TikaCoreProperties.COMMENTS)); @@ -125,7 +125,7 @@ public class ImageParserTest { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/png"); InputStream stream = - getClass().getResourceAsStream("/test-documents/testPNG.png"); + getClass().getResourceAsStream("/test-documents/testPNG.png"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); assertEquals("75", metadata.get("height")); @@ -152,7 +152,7 @@ public class ImageParserTest { assertEquals("true", metadata.get("Chroma BlackIsZero")); assertEquals("year=2008, month=5, day=6, hour=6, minute=18, second=47", metadata.get("Document ImageModificationTime")); assertEquals("image/png", metadata.get("Content-Type")); - + assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH)); assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH)); assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE)); Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java Fri May 29 14:36:21 2015 @@ -16,6 +16,8 @@ */ package org.apache.tika.parser.image; +import static junit.framework.Assert.assertEquals; + import java.io.InputStream; import org.apache.tika.metadata.Metadata; @@ -24,8 +26,6 @@ import org.apache.tika.parser.Parser; import org.junit.Test; import org.xml.sax.helpers.DefaultHandler; -import static junit.framework.Assert.assertEquals; - public class PSDParserTest { private final Parser parser = new PSDParser(); @@ -38,24 +38,24 @@ public class PSDParserTest { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/x-psd"); InputStream stream = - getClass().getResourceAsStream("/test-documents/testPSD.psd"); + getClass().getResourceAsStream("/test-documents/testPSD.psd"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); assertEquals("537", metadata.get(Metadata.IMAGE_WIDTH)); assertEquals("51", metadata.get(Metadata.IMAGE_LENGTH)); assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE)); } - + /** * Tests a very basic file, without much metadata, - * where some of the data lengths are padded to be even + * where some of the data lengths are padded to be even */ @Test public void testOddPSD() throws Exception { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/x-psd"); InputStream stream = - getClass().getResourceAsStream("/test-documents/testPSD2.psd"); + getClass().getResourceAsStream("/test-documents/testPSD2.psd"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); assertEquals("69", metadata.get(Metadata.IMAGE_WIDTH)); assertEquals("70", metadata.get(Metadata.IMAGE_LENGTH)); Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java Fri May 29 14:36:21 2015 @@ -19,18 +19,17 @@ package org.apache.tika.parser.image; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.parser.image.TiffParser; +import java.io.InputStream; +import java.util.Arrays; +import java.util.List; + import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; import org.junit.Test; import org.xml.sax.helpers.DefaultHandler; -import java.io.InputStream; -import java.util.Arrays; -import java.util.List; - public class TiffParserTest { private final Parser parser = new TiffParser(); @@ -39,23 +38,23 @@ public class TiffParserTest { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/tiff"); InputStream stream = - getClass().getResourceAsStream("/test-documents/testTIFF.tif"); + getClass().getResourceAsStream("/test-documents/testTIFF.tif"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); assertEquals("Licensed to the Apache Software Foundation (ASF) under one or " + - "more contributor license agreements. See the NOTICE file " + - "distributed with this work for additional information regarding " + - "copyright ownership.", metadata.get(TikaCoreProperties.DESCRIPTION)); - + "more contributor license agreements. See the NOTICE file " + + "distributed with this work for additional information regarding " + + "copyright ownership.", metadata.get(TikaCoreProperties.DESCRIPTION)); + // All EXIF/TIFF tags assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT)); - + // Core EXIF/TIFF tags assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH)); assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH)); assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE)); assertEquals("3", metadata.get(Metadata.SAMPLES_PER_PIXEL)); - + // Embedded XMP List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); assertTrue("got " + keywords, keywords.contains("cat")); Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/WebPParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/WebPParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/WebPParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/WebPParserTest.java Fri May 29 14:36:21 2015 @@ -33,16 +33,17 @@ import org.xml.sax.helpers.DefaultHandle public class WebPParserTest { Parser parser = new AutoDetectParser(); -/* - Two photos in test-documents (testWebp_Alpha_Lossy.webp and testWebp_Alpha_Lossless.webp) - are in the public domain. These files were retrieved from: - https://github.com/drewnoakes/metadata-extractor-images/tree/master/webp - These photos are also available here: - https://developers.google.com/speed/webp/gallery2#webp_links - Credits for the photo: - "Free Stock Photo in High Resolution - Yellow Rose 3 - Flowers" - Image Author: Jon Sullivan - */ + + /* + Two photos in test-documents (testWebp_Alpha_Lossy.webp and testWebp_Alpha_Lossless.webp) + are in the public domain. These files were retrieved from: + https://github.com/drewnoakes/metadata-extractor-images/tree/master/webp + These photos are also available here: + https://developers.google.com/speed/webp/gallery2#webp_links + Credits for the photo: + "Free Stock Photo in High Resolution - Yellow Rose 3 - Flowers" + Image Author: Jon Sullivan + */ @Test public void testSimple() throws Exception { Metadata metadata = new Metadata(); Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java Fri May 29 14:36:21 2015 @@ -27,11 +27,10 @@ import java.util.Collection; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.parser.image.xmp.JempboxExtractor; import org.junit.Test; public class JempboxExtractorTest { - + @Test public void testParseJpeg() throws IOException, TikaException { Metadata metadata = new Metadata(); @@ -42,21 +41,21 @@ public class JempboxExtractorTest { metadata.set(TikaCoreProperties.CREATOR, "previous author"); // ... or kept in case the field is multi-value metadata.add(TikaCoreProperties.KEYWORDS, "oldkeyword"); - + JempboxExtractor extractor = new JempboxExtractor(metadata); extractor.parse(stream); - + // DublinCore fields assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE)); assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION)); assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR)); - Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); + Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); assertTrue(keywords.contains("oldkeyword")); assertTrue(keywords.contains("grazelands")); assertTrue(keywords.contains("nature reserve")); assertTrue(keywords.contains("bird watching")); assertTrue(keywords.contains("coast")); - Collection<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT)); + Collection<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT)); assertTrue(subject.contains("oldkeyword")); assertTrue(subject.contains("grazelands")); assertTrue(subject.contains("nature reserve")); @@ -68,34 +67,34 @@ public class JempboxExtractorTest { public void testParseJpegPhotoshop() throws IOException, TikaException { Metadata metadata = new Metadata(); InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg"); - + JempboxExtractor extractor = new JempboxExtractor(metadata); extractor.parse(stream); - + // DublinCore fields assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE)); assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION)); assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR)); - Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); + Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); assertTrue(keywords.contains("bird watching")); assertTrue(keywords.contains("coast")); } - + @Test public void testParseJpegXnviewmp() throws IOException, TikaException { Metadata metadata = new Metadata(); InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg"); - + JempboxExtractor extractor = new JempboxExtractor(metadata); extractor.parse(stream); - + // XnViewMp fields not understood by Jempbox assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION)); Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); assertTrue(keywords.contains("coast")); assertTrue(keywords.contains("nature reserve")); } - + @Test public void testJoinCreators() { assertEquals("Mr B", new JempboxExtractor(null).joinCreators( Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java Fri May 29 14:36:21 2015 @@ -46,7 +46,8 @@ import org.xml.sax.ContentHandler; public class SQLite3ParserTest extends TikaTest { private final static String TEST_FILE_NAME = "testSqlite3b.db"; - private final static String TEST_FILE1 = "/test-documents/"+TEST_FILE_NAME;; + private final static String TEST_FILE1 = "/test-documents/" + TEST_FILE_NAME; + ; @Test public void testBasic() throws Exception { @@ -98,7 +99,7 @@ public class SQLite3ParserTest extends T //make sure that table cells and rows are properly marked to //yield \t and \n at the appropriate places @Test - public void testSpacesInBodyContentHandler() throws Exception { + public void testSpacesInBodyContentHandler() throws Exception { Parser p = new AutoDetectParser(); InputStream stream = null; Metadata metadata = new Metadata(); @@ -188,20 +189,20 @@ public class SQLite3ParserTest extends T String[] strings = new String[4]; for (int i = 1; i < byteCopier.bytes.size(); i++) { byte[] byteArr = byteCopier.bytes.get(i); - String s = new String(byteArr, 0, Math.min(byteArr.length,1000), "UTF-8"); + String s = new String(byteArr, 0, Math.min(byteArr.length, 1000), "UTF-8"); strings[i] = s; } byte[] oleBytes = new byte[]{ - (byte)-48, - (byte)-49, - (byte)17, - (byte)-32, - (byte)-95, - (byte)-79, - (byte)26, - (byte)-31, - (byte)0, - (byte)0, + (byte) -48, + (byte) -49, + (byte) 17, + (byte) -32, + (byte) -95, + (byte) -79, + (byte) 26, + (byte) -31, + (byte) 0, + (byte) 0, }; //test OLE for (int i = 0; i < 10; i++) { @@ -231,7 +232,6 @@ public class SQLite3ParserTest extends T } - public static class InputStreamResettingHandler implements EmbeddedResourceHandler { public List<byte[]> bytes = new ArrayList<byte[]>(); @@ -240,7 +240,7 @@ public class SQLite3ParserTest extends T public void handle(String filename, MediaType mediaType, InputStream stream) { ByteArrayOutputStream os = new ByteArrayOutputStream(); - if (! stream.markSupported()) { + if (!stream.markSupported()) { stream = TikaInputStream.get(stream); } stream.mark(1000000);
