Author: jukka
Date: Sun Dec 13 00:32:23 2009
New Revision: 890011
URL: http://svn.apache.org/viewvc?rev=890011&view=rev
Log:
TIKA-335: TXTParser should use incoming charset
Patches by Ken Krugler.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java?rev=890011&r1=890010&r2=890011&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java Sun Dec 13 00:32:23 2009
@@ -9,6 +9,7 @@
import java.io.InputStream;
import java.io.Reader;
import java.io.IOException;
+import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Arrays;
@@ -71,7 +72,7 @@
* @stable ICU 3.4
*/
public CharsetDetector setDeclaredEncoding(String encoding) {
- fDeclaredEncoding = encoding;
+ setCanonicalDeclaredEncoding(encoding);
return this;
}
@@ -95,6 +96,8 @@
private static final int kBufSize = 8000;
+ private static final int MAX_CONFIDENCE = 100;
+
/**
* Set the input text (byte) data whose charset is to be detected.
* <p/>
@@ -188,19 +191,29 @@
int i;
int detectResults;
int confidence;
- ArrayList matches = new ArrayList();
+ ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
// Iterate over all possible charsets, remember all that
// give a match quality > 0.
for (i=0; i<fCSRecognizers.size(); i++) {
- csr = (CharsetRecognizer)fCSRecognizers.get(i);
+ csr = fCSRecognizers.get(i);
detectResults = csr.match(this);
confidence = detectResults & 0x000000ff;
if (confidence > 0) {
+ // Just to be safe, constrain
+ confidence = Math.min(confidence, MAX_CONFIDENCE);
+
+ // Apply charset hint.
+ if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
+ // Reduce lack of confidence (delta between "sure" and current) by 50%.
+ confidence += (MAX_CONFIDENCE - confidence)/2;
+ }
+
CharsetMatch m = new CharsetMatch(this, csr, confidence);
matches.add(m);
}
}
+
Collections.sort(matches); // CharsetMatch compares on confidence
Collections.reverse(matches); // Put best match first.
CharsetMatch [] resultArray = new CharsetMatch[matches.size()];
@@ -232,7 +245,7 @@
* @stable ICU 3.4
*/
public Reader getReader(InputStream in, String declaredEncoding) {
- fDeclaredEncoding = declaredEncoding;
+ setCanonicalDeclaredEncoding(declaredEncoding);
try {
setText(in);
@@ -265,9 +278,8 @@
*
* @stable ICU 3.4
*/
- public String getString(byte[] in, String declaredEncoding)
- {
- fDeclaredEncoding = declaredEncoding;
+ public String getString(byte[] in, String declaredEncoding) {
+ setCanonicalDeclaredEncoding(declaredEncoding);
try {
setText(in);
@@ -331,6 +343,18 @@
return previous;
}
+ /**
+ * Try to set fDeclaredEncoding to the canonical name for <encoding>, if it exists.
+ *
+ * @param encoding - name of character encoding
+ */
+ private void setCanonicalDeclaredEncoding(String encoding) {
+ Charset cs = Charset.forName(encoding);
+ if (cs != null) {
+ fDeclaredEncoding = cs.name();
+ }
+ }
+
/*
* MungeInput - after getting a set of raw input data to be analyzed, preprocess
* it by removing what appears to be html markup.
@@ -450,14 +474,14 @@
/*
* List of recognizers for all charsets known to the implementation.
*/
- private static ArrayList fCSRecognizers = createRecognizers();
+ private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
private static String [] fCharsetNames;
/*
* Create the singleton instances of the CharsetRecognizer classes
*/
- private static ArrayList createRecognizers() {
- ArrayList recognizers = new ArrayList();
+ private static ArrayList<CharsetRecognizer> createRecognizers() {
+ ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
recognizers.add(new CharsetRecog_UTF8());
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java?rev=890011&r1=890010&r2=890011&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java Sun Dec 13 00:32:23 2009
@@ -26,7 +26,7 @@
*
* @stable ICU 3.4
*/
-public class CharsetMatch implements Comparable {
+public class CharsetMatch implements Comparable<CharsetMatch> {
/**
@@ -218,8 +218,7 @@
* @throws ClassCastException if the argument is not a CharsetMatch.
* @stable ICU 3.4
*/
- public int compareTo (Object o) {
- CharsetMatch other = (CharsetMatch)o;
+ public int compareTo(CharsetMatch other) {
int compareResult = 0;
if (this.fConfidence > other.fConfidence) {
compareResult = 1;
@@ -260,4 +259,5 @@
private InputStream fInputStream = null; // User's input stream, or null if the user
// gave us a byte array.
+
}
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=890011&r1=890010&r2=890011&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java Sun Dec 13 00:32:23 2009
@@ -63,7 +63,7 @@
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+ throws IOException, SAXException, TikaException {
metadata.set(Metadata.CONTENT_TYPE, "text/plain");
// CharsetDetector expects a stream to support marks
@@ -72,8 +72,14 @@
}
// Detect the content encoding (the stream is reset to the beginning)
- // TODO: Better use of the possible encoding hint in input metadata
CharsetDetector detector = new CharsetDetector();
+ String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
+ if (incomingCharset != null) {
+ detector.setDeclaredEncoding(incomingCharset);
+ } else {
+ // TODO: try to extract charset from CONTENT_TYPE in metadata
+ }
+
detector.setText(stream);
for (CharsetMatch match : detector.detectAll()) {
if (Charset.isSupported(match.getName())) {
@@ -133,7 +139,7 @@
*/
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
- throws IOException, SAXException, TikaException {
+ throws IOException, SAXException, TikaException {
parse(stream, handler, metadata, new ParseContext());
}
Modified:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=890011&r1=890010&r2=890011&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Sun Dec 13 00:32:23 2009
@@ -20,6 +20,7 @@
import java.io.StringWriter;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
@@ -42,7 +43,8 @@
parser.parse(
new ByteArrayInputStream(text.getBytes("UTF-8")),
new WriteOutContentHandler(writer),
- metadata);
+ metadata,
+ new ParseContext());
String content = writer.toString();
assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
@@ -65,7 +67,7 @@
Metadata metadata = new Metadata();
parser.parse(
new ByteArrayInputStream(text.getBytes("UTF-8")),
- handler, metadata);
+ handler, metadata, new ParseContext());
assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
@@ -76,7 +78,7 @@
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
parser.parse(
- new ByteArrayInputStream(new byte[0]), handler, metadata);
+ new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("\n", handler.toString());
}
@@ -95,6 +97,32 @@
(byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0});
}
+ /**
+ * Test case for TIKA-335: using incoming charset
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
+ */
+ public void testUseIncomingCharsetAsHint() throws Exception {
+ // Could be UTF-8 or ISO 8859-1 or ...
+ // u00e1 is latin small letter a with acute
+ final String test2 = "the name is \u00e1ndre";
+
+ Metadata metadata = new Metadata();
+ parser.parse(
+ new ByteArrayInputStream(test2.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+
+ assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+
+ metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-1");
+ parser.parse(
+ new ByteArrayInputStream(test2.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+
+ assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ }
+
+
private void assertExtractText(String msg, String expected, byte[] input)
throws Exception {
ContentHandler handler = new BodyContentHandler() {
@@ -103,7 +131,7 @@
}
};
Metadata metadata = new Metadata();
- parser.parse(new ByteArrayInputStream(input), handler, metadata);
+ parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
assertEquals(msg, expected, handler.toString());
}