Author: kkrugler
Date: Mon Aug 13 17:53:38 2012
New Revision: 1372530
URL: http://svn.apache.org/viewvc?rev=1372530&view=rev
Log:
TIKA-771: "Hello, World!" in UTF-8/ASCII gets detected as IBM500
Added test to confirm that it was fixed by Jukka's previous changes to
the charset detection & CONTENT_TYPE handling code.
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=1372530&r1=1372529&r2=1372530&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Mon Aug 13 17:53:38 2012
@@ -252,5 +252,28 @@ public class TXTParserTest extends TestC
assertEquals("text/plain; charset=ISO-8859-1",
metadata.get(Metadata.CONTENT_TYPE));
}
+
+ /**
+ * Test case for TIKA-771: "Hello, World!" in UTF-8/ASCII gets detected as IBM500
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-771">TIKA-771</a>
+ */
+ public void testCharsetDetectionWithShortSnipet() throws Exception {
+ final String text = "Hello, World!";
+
+ Metadata metadata = new Metadata();
+ parser.parse(
+ new ByteArrayInputStream(text.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("text/plain; charset=ISO-8859-1",
metadata.get(Metadata.CONTENT_TYPE));
+
+ // Now verify that if we tell the parser the encoding is UTF-8, that's what
+ // we get back (see TIKA-868)
+ metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8");
+ parser.parse(
+ new ByteArrayInputStream(text.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("text/plain; charset=UTF-8",
metadata.get(Metadata.CONTENT_TYPE));
+ }
}