svn commit: r583443 - in /incubator/tika/trunk: ./ src/main/java/org/apache/tika/parser/txt/ src/test/java/org/apache/tika/parser/ src/test/java/org/apache/tika/parser/txt/

jukka Wed, 10 Oct 2007 05:08:33 -0700

Author: jukka
Date: Wed Oct 10 05:07:41 2007
New Revision: 583443

URL: http://svn.apache.org/viewvc?rev=583443&view=rev
Log:
TIKA-40 - Tika needs to support diverse character encodings
    - Use ICU4J to parse text content
    - Support Metadata.CONTENT_ENCODING hints in TXTParser
    - Added specific test cases for TXTParser


Added:
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/
    
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java   (with props)
Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/pom.xml
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=583443&r1=583442&r2=583443&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Wed Oct 10 05:07:41 2007
@@ -82,3 +82,5 @@
 37. TIKA-49 - Some files have old-style license headers, fixed (Robert Burrell Donkin & bdelacretaz)
 
 38. TIKA-51 - Leftover temp files after running Tika tests, fixed (bdelacretaz)
+
+39. TIKA-40 - Tika needs to support diverse character encodings (jukka)

Modified: incubator/tika/trunk/pom.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/pom.xml?rev=583443&r1=583442&r2=583443&view=diff
==============================================================================
--- incubator/tika/trunk/pom.xml (original)
+++ incubator/tika/trunk/pom.xml Wed Oct 10 05:07:41 2007
@@ -189,6 +189,11 @@
       <version>4aug2000r7-dev</version>
     </dependency>
     <dependency>
+      <groupId>com.ibm.icu</groupId>
+      <artifactId>icu4j</artifactId>
+      <version>3.4.4</version>
+    </dependency>
+    <dependency>
       <groupId>log4j</groupId>
       <artifactId>log4j</artifactId>
       <version>1.2.14</version>

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=583443&r1=583442&r2=583443&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java Wed Oct 10 05:07:41 2007
@@ -16,15 +16,17 @@
  */
 package org.apache.tika.parser.txt;
 
-import java.io.BufferedReader;
+import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.InputStreamReader;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.Parser;
 
+import com.ibm.icu.text.CharsetDetector;
+import com.ibm.icu.text.CharsetMatch;
+
 /**
  * Text parser
  */
@@ -32,16 +34,35 @@
 
     public String parse(InputStream stream, Metadata metadata)
             throws IOException, TikaException {
-        StringBuilder sb = new StringBuilder();
-        BufferedReader br = new BufferedReader(new InputStreamReader(stream));
+        CharsetDetector detector = new CharsetDetector();
+
+        // Use the declared character encoding, if available
+        String encoding = metadata.get(Metadata.CONTENT_ENCODING);
+        if (encoding != null) {
+            detector.setDeclaredEncoding(encoding);
+        }
 
-        int charAsInt;
+        // CharsetDetector expects a stream to support marks
+        if (!stream.markSupported()) {
+            stream = new BufferedInputStream(stream);
+        }
+
+        detector.setText(stream);
+
+        CharsetMatch match = detector.detect();
+        if (match == null) {
+            throw new TikaException("Unable to detect character encoding");
+        }
 
-        while ((charAsInt = br.read()) != -1) {
-            sb.append((char) charAsInt);
+        metadata.set(Metadata.CONTENT_TYPE, "text/plain");
+        metadata.set(Metadata.CONTENT_ENCODING, match.getName());
+        String language = match.getLanguage();
+        if (language != null) {
+            metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
+            metadata.set(Metadata.LANGUAGE, match.getLanguage());
         }
 
-        return sb.toString();
+        return match.getString();
     }
 
 }

Added: incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=583443&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Wed Oct 10 05:07:41 2007
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.ByteArrayInputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+
+import junit.framework.TestCase;
+
+public class TXTParserTest extends TestCase {
+
+    private Parser parser = new TXTParser();
+
+    public void testEnglishText() throws Exception {
+        String text =
+            "Hello, World! This is simple UTF-8 text content written"
+            + " in English to test autodetection of both the character"
+            + " encoding and the language of the input stream.";
+
+        Metadata metadata = new Metadata();
+        String content = parser.parse(
+                new ByteArrayInputStream(text.getBytes("UTF-8")), metadata);
+
+        assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
+        assertEquals("en", metadata.get(Metadata.LANGUAGE));
+        // TODO: ICU reports the content encoding as ISO-8859-1, even though
+        // it could just as well be ASCII or UTF-8, so  for now we won't
+        // test for the Metadata.CONTENT_ENCODING field
+
+        assertTrue(content.contains("Hello"));
+        assertTrue(content.contains("World"));
+        assertTrue(content.contains("autodetection"));
+        assertTrue(content.contains("stream"));
+    }
+
+    public void testUTF8Text() throws Exception {
+        String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";
+
+        Metadata metadata = new Metadata();
+        String content = parser.parse(
+                new ByteArrayInputStream(text.getBytes("UTF-8")), metadata);
+
+        assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+
+        assertTrue(content.contains(text));
+    }
+
+    public void testEmptyText() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = parser.parse(
+                new ByteArrayInputStream(new byte[0]), metadata);
+        assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("", content);
+    }
+
+}

Propchange: incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

svn commit: r583443 - in /incubator/tika/trunk: ./ src/main/java/org/apache/tika/parser/txt/ src/test/java/org/apache/tika/parser/ src/test/java/org/apache/tika/parser/txt/

Reply via email to