Author: jukka
Date: Sun Jun 28 18:14:51 2009
New Revision: 789134
URL: http://svn.apache.org/viewvc?rev=789134&view=rev
Log:
TIKA-240: Drop the BOM when extracting plain text
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=789134&r1=789133&r2=789134&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java Sun Jun 28 18:14:51 2009
@@ -17,6 +17,7 @@
package org.apache.tika.parser.txt;
import java.io.BufferedInputStream;
+import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
@@ -55,7 +56,14 @@
throw new TikaException("Unable to detect character encoding");
}
- Reader reader = match.getReader();
+ Reader reader = new BufferedReader(match.getReader());
+ // TIKA-240: Drop the BOM when extracting plain text
+ reader.mark(1);
+ int bom = reader.read();
+ if (bom != '\ufeff') { // zero-width no-break space
+ reader.reset();
+ }
+
metadata.set(Metadata.CONTENT_TYPE, "text/plain");
metadata.set(Metadata.CONTENT_ENCODING, match.getName());
Modified:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=789134&r1=789133&r2=789134&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Sun Jun 28 18:14:51 2009
@@ -81,4 +81,31 @@
assertEquals("\n", handler.toString());
}
+ /**
+ * Test case for TIKA-240: Drop the BOM when extracting plain text
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a>
+ */
+ public void testDropByteOrderMark() throws Exception {
+ assertExtractText("UTF-8 BOM", "test", new byte[] {
+ (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't' });
+ assertExtractText("UTF-16 BE BOM", "test", new byte[] {
+ (byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'});
+ assertExtractText("UTF-16 LE BOM", "test", new byte[] {
+ (byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0});
+ }
+
+ private void assertExtractText(String msg, String expected, byte[] input)
+ throws Exception {
+ ContentHandler handler = new BodyContentHandler() {
+ public void ignorableWhitespace(char[] ch, int off, int len) {
+ // Ignore the whitespace added by XHTMLContentHandler
+ }
+ };
+ Metadata metadata = new Metadata();
+ parser.parse(new ByteArrayInputStream(input), handler, metadata);
+ assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(msg, expected, handler.toString());
+ }
+
}