jukka
Sat, 12 Dec 2009 16:32:50 -0800
Author: jukka Date: Sun Dec 13 00:32:23 2009 New Revision: 890011 URL: http://svn.apache.org/viewvc?rev=890011&view=rev Log: TIKA-335: TXTParser should use incoming charset Patches by Ken Krugler. Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java?rev=890011&r1=890010&r2=890011&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java (original) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java Sun Dec 13 00:32:23 2009 @@ -9,6 +9,7 @@ import java.io.InputStream; import java.io.Reader; import java.io.IOException; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collections; import java.util.Arrays; @@ -71,7 +72,7 @@ * @stable ICU 3.4 */ public CharsetDetector setDeclaredEncoding(String encoding) { - fDeclaredEncoding = encoding; + setCanonicalDeclaredEncoding(encoding); return this; } @@ -95,6 +96,8 @@ private static final int kBufSize = 8000; + private static final int MAX_CONFIDENCE = 100; + /** * Set the input text (byte) data whose charset is to be detected. * <p/> @@ -188,19 +191,29 @@ int i; int detectResults; int confidence; - ArrayList matches = new ArrayList(); + ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>(); // Iterate over all possible charsets, remember all that // give a match quality > 0. for (i=0; i<fCSRecognizers.size(); i++) { - csr = (CharsetRecognizer)fCSRecognizers.get(i); + csr = fCSRecognizers.get(i); detectResults = csr.match(this); confidence = detectResults & 0x000000ff; if (confidence > 0) { + // Just to be safe, constrain + confidence = Math.min(confidence, MAX_CONFIDENCE); + + // Apply charset hint. + if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) { + // Reduce lack of confidence (delta between "sure" and current) by 50%. + confidence += (MAX_CONFIDENCE - confidence)/2; + } + CharsetMatch m = new CharsetMatch(this, csr, confidence); matches.add(m); } } + Collections.sort(matches); // CharsetMatch compares on confidence Collections.reverse(matches); // Put best match first. CharsetMatch [] resultArray = new CharsetMatch[matches.size()]; @@ -232,7 +245,7 @@ * @stable ICU 3.4 */ public Reader getReader(InputStream in, String declaredEncoding) { - fDeclaredEncoding = declaredEncoding; + setCanonicalDeclaredEncoding(declaredEncoding); try { setText(in); @@ -265,9 +278,8 @@ * * @stable ICU 3.4 */ - public String getString(byte[] in, String declaredEncoding) - { - fDeclaredEncoding = declaredEncoding; + public String getString(byte[] in, String declaredEncoding) { + setCanonicalDeclaredEncoding(declaredEncoding); try { setText(in); @@ -331,6 +343,18 @@ return previous; } + /** + * Try to set fDeclaredEncoding to the canonical name for <encoding>, if it exists. + * + * @param encoding - name of character encoding + */ + private void setCanonicalDeclaredEncoding(String encoding) { + Charset cs = Charset.forName(encoding); + if (cs != null) { + fDeclaredEncoding = cs.name(); + } + } + /* * MungeInput - after getting a set of raw input data to be analyzed, preprocess * it by removing what appears to be html markup. @@ -450,14 +474,14 @@ /* * List of recognizers for all charsets known to the implementation. */ - private static ArrayList fCSRecognizers = createRecognizers(); + private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers(); private static String [] fCharsetNames; /* * Create the singleton instances of the CharsetRecognizer classes */ - private static ArrayList createRecognizers() { - ArrayList recognizers = new ArrayList(); + private static ArrayList<CharsetRecognizer> createRecognizers() { + ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>(); recognizers.add(new CharsetRecog_UTF8()); Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java?rev=890011&r1=890010&r2=890011&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java (original) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java Sun Dec 13 00:32:23 2009 @@ -26,7 +26,7 @@ * * @stable ICU 3.4 */ -public class CharsetMatch implements Comparable { +public class CharsetMatch implements Comparable<CharsetMatch> { /** @@ -218,8 +218,7 @@ * @throws ClassCastException if the argument is not a CharsetMatch. * @stable ICU 3.4 */ - public int compareTo (Object o) { - CharsetMatch other = (CharsetMatch)o; + public int compareTo(CharsetMatch other) { int compareResult = 0; if (this.fConfidence > other.fConfidence) { compareResult = 1; @@ -260,4 +259,5 @@ private InputStream fInputStream = null; // User's input stream, or null if the user // gave us a byte array. + } Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=890011&r1=890010&r2=890011&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java Sun Dec 13 00:32:23 2009 @@ -63,7 +63,7 @@ public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) - throws IOException, SAXException, TikaException { + throws IOException, SAXException, TikaException { metadata.set(Metadata.CONTENT_TYPE, "text/plain"); // CharsetDetector expects a stream to support marks @@ -72,8 +72,14 @@ } // Detect the content encoding (the stream is reset to the beginning) - // TODO: Better use of the possible encoding hint in input metadata CharsetDetector detector = new CharsetDetector(); + String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING); + if (incomingCharset != null) { + detector.setDeclaredEncoding(incomingCharset); + } else { + // TODO: try to extract charset from CONTENT_TYPE in metadata + } + detector.setText(stream); for (CharsetMatch match : detector.detectAll()) { if (Charset.isSupported(match.getName())) { @@ -133,7 +139,7 @@ */ public void parse( InputStream stream, ContentHandler handler, Metadata metadata) - throws IOException, SAXException, TikaException { + throws IOException, SAXException, TikaException { parse(stream, handler, metadata, new ParseContext()); } Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=890011&r1=890010&r2=890011&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (original) +++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Sun Dec 13 00:32:23 2009 @@ -20,6 +20,7 @@ import java.io.StringWriter; import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.WriteOutContentHandler; @@ -42,7 +43,8 @@ parser.parse( new ByteArrayInputStream(text.getBytes("UTF-8")), new WriteOutContentHandler(writer), - metadata); + metadata, + new ParseContext()); String content = writer.toString(); assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE)); @@ -65,7 +67,7 @@ Metadata metadata = new Metadata(); parser.parse( new ByteArrayInputStream(text.getBytes("UTF-8")), - handler, metadata); + handler, metadata, new ParseContext()); assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); @@ -76,7 +78,7 @@ ContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); parser.parse( - new ByteArrayInputStream(new byte[0]), handler, metadata); + new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext()); assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("\n", handler.toString()); } @@ -95,6 +97,32 @@ (byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0}); } + /** + * Test case for TIKA-335: using incoming charset + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a> + */ + public void testUseIncomingCharsetAsHint() throws Exception { + // Could be UTF-8 or ISO 8859-1 or ... + // u00e1 is latin small letter a with acute + final String test2 = "the name is \u00e1ndre"; + + Metadata metadata = new Metadata(); + parser.parse( + new ByteArrayInputStream(test2.getBytes("UTF-8")), + new BodyContentHandler(), metadata, new ParseContext()); + + assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); + + metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-1"); + parser.parse( + new ByteArrayInputStream(test2.getBytes("UTF-8")), + new BodyContentHandler(), metadata, new ParseContext()); + + assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); + } + + private void assertExtractText(String msg, String expected, byte[] input) throws Exception { ContentHandler handler = new BodyContentHandler() { @@ -103,7 +131,7 @@ } }; Metadata metadata = new Metadata(); - parser.parse(new ByteArrayInputStream(input), handler, metadata); + parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext()); assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE)); assertEquals(msg, expected, handler.toString()); }