Author: jukka
Date: Wed Oct 10 05:07:41 2007
New Revision: 583443
URL: http://svn.apache.org/viewvc?rev=583443&view=rev
Log:
TIKA-40 - Tika needs to support diverse character encodings
- Use ICU4J to parse text content
- Support Metadata.CONTENT_ENCODING hints in TXTParser
- Added specific test cases for TXTParser
Added:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/
incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/
incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
(with props)
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/pom.xml
incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
Modified: incubator/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=583443&r1=583442&r2=583443&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Wed Oct 10 05:07:41 2007
@@ -82,3 +82,5 @@
37. TIKA-49 - Some files have old-style license headers, fixed (Robert Burrell
Donkin & bdelacretaz)
38. TIKA-51 - Leftover temp files after running Tika tests, fixed (bdelacretaz)
+
+39. TIKA-40 - Tika needs to support diverse character encodings (jukka)
Modified: incubator/tika/trunk/pom.xml
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/pom.xml?rev=583443&r1=583442&r2=583443&view=diff
==============================================================================
--- incubator/tika/trunk/pom.xml (original)
+++ incubator/tika/trunk/pom.xml Wed Oct 10 05:07:41 2007
@@ -189,6 +189,11 @@
<version>4aug2000r7-dev</version>
</dependency>
<dependency>
+ <groupId>com.ibm.icu</groupId>
+ <artifactId>icu4j</artifactId>
+ <version>3.4.4</version>
+ </dependency>
+ <dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.14</version>
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=583443&r1=583442&r2=583443&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java Wed Oct 10 05:07:41 2007
@@ -16,15 +16,17 @@
*/
package org.apache.tika.parser.txt;
-import java.io.BufferedReader;
+import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.io.InputStreamReader;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
+import com.ibm.icu.text.CharsetDetector;
+import com.ibm.icu.text.CharsetMatch;
+
/**
* Text parser
*/
@@ -32,16 +34,35 @@
public String parse(InputStream stream, Metadata metadata)
throws IOException, TikaException {
- StringBuilder sb = new StringBuilder();
- BufferedReader br = new BufferedReader(new InputStreamReader(stream));
+ CharsetDetector detector = new CharsetDetector();
+
+ // Use the declared character encoding, if available
+ String encoding = metadata.get(Metadata.CONTENT_ENCODING);
+ if (encoding != null) {
+ detector.setDeclaredEncoding(encoding);
+ }
- int charAsInt;
+ // CharsetDetector expects a stream to support marks
+ if (!stream.markSupported()) {
+ stream = new BufferedInputStream(stream);
+ }
+
+ detector.setText(stream);
+
+ CharsetMatch match = detector.detect();
+ if (match == null) {
+ throw new TikaException("Unable to detect character encoding");
+ }
- while ((charAsInt = br.read()) != -1) {
- sb.append((char) charAsInt);
+ metadata.set(Metadata.CONTENT_TYPE, "text/plain");
+ metadata.set(Metadata.CONTENT_ENCODING, match.getName());
+ String language = match.getLanguage();
+ if (language != null) {
+ metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
+ metadata.set(Metadata.LANGUAGE, match.getLanguage());
}
- return sb.toString();
+ return match.getString();
}
}
Added:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=583443&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Wed Oct 10 05:07:41 2007
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.ByteArrayInputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+
+import junit.framework.TestCase;
+
+public class TXTParserTest extends TestCase {
+
+ private Parser parser = new TXTParser();
+
+ public void testEnglishText() throws Exception {
+ String text =
+ "Hello, World! This is simple UTF-8 text content written"
+ + " in English to test autodetection of both the character"
+ + " encoding and the language of the input stream.";
+
+ Metadata metadata = new Metadata();
+ String content = parser.parse(
+ new ByteArrayInputStream(text.getBytes("UTF-8")), metadata);
+
+ assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
+ assertEquals("en", metadata.get(Metadata.LANGUAGE));
+ // TODO: ICU reports the content encoding as ISO-8859-1, even though
+ // it could just as well be ASCII or UTF-8, so for now we won't
+ // test for the Metadata.CONTENT_ENCODING field
+
+ assertTrue(content.contains("Hello"));
+ assertTrue(content.contains("World"));
+ assertTrue(content.contains("autodetection"));
+ assertTrue(content.contains("stream"));
+ }
+
+ public void testUTF8Text() throws Exception {
+ String text =
+ "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";
+
+ Metadata metadata = new Metadata();
+ String content = parser.parse(
+ new ByteArrayInputStream(text.getBytes("UTF-8")), metadata);
+
+ assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+
+ assertTrue(content.contains(text));
+ }
+
+ public void testEmptyText() throws Exception {
+ Metadata metadata = new Metadata();
+ String content = parser.parse(
+ new ByteArrayInputStream(new byte[0]), metadata);
+ assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("", content);
+ }
+
+}
Propchange:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
------------------------------------------------------------------------------
svn:eol-style = native