Author: kkrugler
Date: Mon Aug 13 17:53:38 2012
New Revision: 1372530

URL: http://svn.apache.org/viewvc?rev=1372530&view=rev
Log:
TIKA-771: "Hello, World!" in UTF-8/ASCII gets detected as IBM500

Added test to confirm that it was fixed by Jukka's previous changes to
the charset detection & CONTENT_TYPE handling code.

Modified:
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=1372530&r1=1372529&r2=1372530&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
 Mon Aug 13 17:53:38 2012
@@ -252,5 +252,28 @@ public class TXTParserTest extends TestC
 
         assertEquals("text/plain; charset=ISO-8859-1", 
metadata.get(Metadata.CONTENT_TYPE));
     }
+    
+    /**
+     * Test case for TIKA-771: "Hello, World!" in UTF-8/ASCII gets detected as IBM500
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-771">TIKA-771</a>
+     */
+    public void testCharsetDetectionWithShortSnipet() throws Exception {
+        final String text = "Hello, World!";
+
+        Metadata metadata = new Metadata();
+        parser.parse(
+                new ByteArrayInputStream(text.getBytes("UTF-8")),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("text/plain; charset=ISO-8859-1", 
metadata.get(Metadata.CONTENT_TYPE));
+        
+        // Now verify that if we tell the parser the encoding is UTF-8, that's what
+        // we get back (see TIKA-868)
+        metadata.set(Metadata.CONTENT_TYPE, "application/binary; 
charset=UTF-8");
+        parser.parse(
+                new ByteArrayInputStream(text.getBytes("UTF-8")),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("text/plain; charset=UTF-8", 
metadata.get(Metadata.CONTENT_TYPE));
+    }
 
 }


Reply via email to