Repository: tika Updated Branches: refs/heads/master 8a68b5d47 -> bd9a9b911
TIKA-2041 - add important diffs between new copy/paste from ICU4J and legacy code which may have included Tika-specific mods. Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/bd9a9b91 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/bd9a9b91 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/bd9a9b91 Branch: refs/heads/master Commit: bd9a9b911b4e0205c9dfd4527063e6e1c0fd0c44 Parents: 8a68b5d Author: tballison <talli...@mitre.org> Authored: Thu Aug 11 16:03:43 2016 -0400 Committer: tballison <talli...@mitre.org> Committed: Thu Aug 11 16:03:43 2016 -0400 ---------------------------------------------------------------------- .../apache/tika/parser/txt/CharsetDetector.java | 61 ++++++++++++---- .../apache/tika/parser/txt/CharsetMatch.java | 33 ++++++++- .../tika/parser/txt/CharsetRecog_sbcs.java | 74 ++++++++++++++++++++ 3 files changed, 152 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/bd9a9b91/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java index 19ec341..de6a72a 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java @@ -11,6 +11,7 @@ package org.apache.tika.parser.txt; import java.io.IOException; import java.io.InputStream; import java.io.Reader; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -50,7 +51,8 @@ public class CharsetDetector { // actually choose the "real" charset. 
All assuming that the application just // wants the data, and doesn't care about a char set name. - private static final int kBufSize = 12000;//legacy value; more recent value is 8000 + private static final int kBufSize = 12000;//This is a Tika modification; ICU's is 8000 + private static final int MAX_CONFIDENCE = 100; /* * List of recognizers for all charsets known to the implementation. */ @@ -94,11 +96,12 @@ public class CharsetDetector { list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl(), true)); // IBM 420/424 recognizers are disabled by default - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), false)); - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), false)); - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), false)); - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), false)); + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru(), true)); ALL_CS_RECOGNIZERS = Collections.unmodifiableList(list); } @@ -174,7 +177,7 @@ public class CharsetDetector { * @stable ICU 3.4 */ public CharsetDetector setDeclaredEncoding(String encoding) { - fDeclaredEncoding = encoding; + setCanonicalDeclaredEncoding(encoding); return this; } // Value is rounded up, so zero really means zero occurences. @@ -277,18 +280,30 @@ public class CharsetDetector { * @stable ICU 3.4 */ public CharsetMatch[] detectAll() { - ArrayList<CharsetMatch> matches = new ArrayList<>(); - - MungeInput(); // Strip html markup, collect byte stats. 
+ CharsetRecognizer csr; + int i; + CharsetMatch charsetMatch; + int confidence; + ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>(); // Iterate over all possible charsets, remember all that // give a match quality > 0. - for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { - CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i); - boolean active = (fEnabledRecognizers != null) ? fEnabledRecognizers[i] : rcinfo.isDefaultEnabled; - if (active) { - CharsetMatch m = rcinfo.recognizer.match(this); - if (m != null) { + for (i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { + csr = ALL_CS_RECOGNIZERS.get(i).recognizer; + charsetMatch = csr.match(this); + if (charsetMatch != null) { + confidence = charsetMatch.getConfidence() & 0x000000ff; + if (confidence > 0) { + // Just to be safe, constrain + confidence = Math.min(confidence, MAX_CONFIDENCE); + + // Apply charset hint. + if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) { + // Reduce lack of confidence (delta between "sure" and current) by 50%. + confidence += (MAX_CONFIDENCE - confidence) / 2; + } + + CharsetMatch m = new CharsetMatch(this, csr, confidence); matches.add(m); } } @@ -401,6 +416,22 @@ public class CharsetDetector { return previous; } + /** + * Try to set fDeclaredEncoding to the canonical name for <encoding>, if it exists. + * + * @param encoding - name of character encoding + */ + private void setCanonicalDeclaredEncoding(String encoding) { + if ((encoding == null) || encoding.isEmpty()) { + return; + } + + Charset cs = Charset.forName(encoding); + if (cs != null) { + fDeclaredEncoding = cs.name(); + } + } + /* * MungeInput - after getting a set of raw input data to be analyzed, preprocess * it by removing what appears to be html markup. 
http://git-wip-us.apache.org/repos/asf/tika/blob/bd9a9b91/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java b/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java index 35b653f..06ff848 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java @@ -232,5 +232,36 @@ public class CharsetMatch implements Comparable<CharsetMatch> { } return compareResult; } - // the recognizer during the detect operation. + + /** + * compare this CharsetMatch to another based on confidence value + * @param o the CharsetMatch object to compare against + * @return true if equal + */ + public boolean equals(Object o) { + if (o instanceof CharsetMatch) { + CharsetMatch that = (CharsetMatch) o; + return (this.fConfidence == that.fConfidence); + } + + return false; + } + + /** + * generates a hashCode based on the confidence value + * @return the hashCode + */ + public int hashCode() { + return fConfidence; + } + // gave us a byte array. 
+ + public String toString() { + String s = "Match of " + fCharsetName; + if (getLanguage() != null) { + s += " in " + getLanguage(); + } + s += " with confidence " + fConfidence; + return s; + } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/bd9a9b91/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java b/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java index 32824be..951082d 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java @@ -13,6 +13,22 @@ package org.apache.tika.parser.txt; /** * This class recognizes single-byte encodings. Because the encoding scheme is so * simple, language statistics are used to do the matching. + * <p/> + * The Recognizer works by first mapping from bytes in the encoding under test + * into that Recognizer's ngram space. Normally this means performing a + * lowercase, and excluding codepoints that don't correspond to numbers of + * letters. (Accented letters may or may not be ignored or normalised, depending + * on the needs of the ngrams) + * Then, ngram analysis is run against the transformed text, and a confidence + * is calculated. + * <p/> + * For many of our Recognizers, we have one ngram set per language in each + * encoding, and do a simultaneous language+charset detection. + * <p/> + * When adding new Recognizers, the easiest way is to byte map to an existing + * encoding for which we have ngrams, excluding non text, and re-use the ngrams. 
+ * + * @internal */ abstract class CharsetRecog_sbcs extends CharsetRecognizer { @@ -889,6 +905,64 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer { public CharsetMatch match(CharsetDetector det) { int confidence = match(det, ngrams, byteMap); + return confidence == 0 ? null : new CharsetMatch(det, this, confidence, getName(), "tr"); + } + } + + static class CharsetRecog_IBM866_ru extends CharsetRecog_sbcs { + private static int[] ngrams = { + 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE, + 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED, + 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2, + 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520, + }; + + // bytemap converts cp866 chars to cp1251 chars, so ngrams are still unchanged + private static byte[] byteMap = { + (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, + (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, + (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, + (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, + (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00, + (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, + (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, + (byte) 0x20, (byte) 
0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, + (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, + (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, + (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, + (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, + (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, + (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, + (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, + (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, + (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, + (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, + (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7, + (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF, + (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, + (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, + (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, + (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, + (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, + (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, + 
(byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, + (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, + (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7, + (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF, + (byte) 0xB8, (byte) 0xB8, (byte) 0xBA, (byte) 0xBA, (byte) 0xBF, (byte) 0xBF, (byte) 0xA2, (byte) 0xA2, + (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, + }; + + public String getName() { + return "IBM866"; + } + + public String getLanguage() { + return "ru"; + } + + public CharsetMatch match(CharsetDetector det) { + int confidence = match(det, ngrams, byteMap); return confidence == 0 ? null : new CharsetMatch(det, this, confidence); } }