tika git commit: TIKA-2041 - add important diffs between new copy/paste from ICU4J and legacy code which may have included Tika-specific mods.

tallison Thu, 11 Aug 2016 13:04:33 -0700

Repository: tika
Updated Branches:
  refs/heads/master 8a68b5d47 -> bd9a9b911



TIKA-2041 - add important diffs between new copy/paste from ICU4J and legacy 
code which may have included Tika-specific mods.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/bd9a9b91
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/bd9a9b91
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/bd9a9b91

Branch: refs/heads/master
Commit: bd9a9b911b4e0205c9dfd4527063e6e1c0fd0c44
Parents: 8a68b5d
Author: tballison <talli...@mitre.org>
Authored: Thu Aug 11 16:03:43 2016 -0400
Committer: tballison <talli...@mitre.org>
Committed: Thu Aug 11 16:03:43 2016 -0400

----------------------------------------------------------------------
 .../apache/tika/parser/txt/CharsetDetector.java | 61 ++++++++++++----
 .../apache/tika/parser/txt/CharsetMatch.java    | 33 ++++++++-
 .../tika/parser/txt/CharsetRecog_sbcs.java      | 74 ++++++++++++++++++++
 3 files changed, 152 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/bd9a9b91/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
index 19ec341..de6a72a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
@@ -11,6 +11,7 @@ package org.apache.tika.parser.txt;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -50,7 +51,8 @@ public class CharsetDetector {
 //   actually choose the "real" charset.  All assuming that the application 
just
 //   wants the data, and doesn't care about a char set name.
 
-    private static final int kBufSize = 12000;//legacy value; more recent 
value is 8000
+    private static final int kBufSize = 12000;//This is a Tika modification; 
ICU's is 8000
+    private static final int MAX_CONFIDENCE = 100;
     /*
      * List of recognizers for all charsets known to the implementation.
      */
@@ -94,11 +96,12 @@ public class CharsetDetector {
         list.add(new CSRecognizerInfo(new 
CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl(), true));
 
         // IBM 420/424 recognizers are disabled by default
-        list.add(new CSRecognizerInfo(new 
CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), false));
-        list.add(new CSRecognizerInfo(new 
CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), false));
-        list.add(new CSRecognizerInfo(new 
CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), false));
-        list.add(new CSRecognizerInfo(new 
CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), false));
+        list.add(new CSRecognizerInfo(new 
CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), true));
+        list.add(new CSRecognizerInfo(new 
CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), true));
+        list.add(new CSRecognizerInfo(new 
CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), true));
+        list.add(new CSRecognizerInfo(new 
CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), true));
 
+        list.add(new CSRecognizerInfo(new 
CharsetRecog_sbcs.CharsetRecog_IBM866_ru(), true));
         ALL_CS_RECOGNIZERS = Collections.unmodifiableList(list);
     }
 
@@ -174,7 +177,7 @@ public class CharsetDetector {
      * @stable ICU 3.4
      */
     public CharsetDetector setDeclaredEncoding(String encoding) {
-        fDeclaredEncoding = encoding;
+        setCanonicalDeclaredEncoding(encoding);
         return this;
     }
     //   Value is rounded up, so zero really means zero occurences.
@@ -277,18 +280,30 @@ public class CharsetDetector {
      * @stable ICU 3.4
      */
     public CharsetMatch[] detectAll() {
-        ArrayList<CharsetMatch> matches = new ArrayList<>();
-
-        MungeInput();  // Strip html markup, collect byte stats.
+        CharsetRecognizer csr;
+        int i;
+        CharsetMatch charsetMatch;
+        int confidence;
+        ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
 
         //  Iterate over all possible charsets, remember all that
         //    give a match quality > 0.
-        for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
-            CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
-            boolean active = (fEnabledRecognizers != null) ? 
fEnabledRecognizers[i] : rcinfo.isDefaultEnabled;
-            if (active) {
-                CharsetMatch m = rcinfo.recognizer.match(this);
-                if (m != null) {
+        for (i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+            csr = ALL_CS_RECOGNIZERS.get(i).recognizer;
+            charsetMatch = csr.match(this);
+            if (charsetMatch != null) {
+                confidence = charsetMatch.getConfidence() & 0x000000ff;
+                if (confidence > 0) {
+                    // Just to be safe, constrain
+                    confidence = Math.min(confidence, MAX_CONFIDENCE);
+
+                    // Apply charset hint.
+                    if ((fDeclaredEncoding != null) && 
(fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
+                        // Reduce lack of confidence (delta between "sure" and 
current) by 50%.
+                        confidence += (MAX_CONFIDENCE - confidence) / 2;
+                    }
+
+                    CharsetMatch m = new CharsetMatch(this, csr, confidence);
                     matches.add(m);
                 }
             }
@@ -401,6 +416,22 @@ public class CharsetDetector {
         return previous;
     }
 
+    /**
+     * Try to set fDeclaredEncoding to the canonical name for <encoding>, if 
it exists.
+     *
+     * @param encoding - name of character encoding
+     */
+    private void setCanonicalDeclaredEncoding(String encoding) {
+        if ((encoding == null) || encoding.isEmpty()) {
+            return;
+        }
+
+        Charset cs = Charset.forName(encoding);
+        if (cs != null) {
+            fDeclaredEncoding = cs.name();
+        }
+    }
+
     /*
      *  MungeInput - after getting a set of raw input data to be analyzed, 
preprocess
      *               it by removing what appears to be html markup.

http://git-wip-us.apache.org/repos/asf/tika/blob/bd9a9b91/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
index 35b653f..06ff848 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
@@ -232,5 +232,36 @@ public class CharsetMatch implements 
Comparable<CharsetMatch> {
         }
         return compareResult;
     }
-    //   the recognizer during the detect operation.
+
+    /**
+     * compare this CharsetMatch to another based on confidence value
+     * @param o the CharsetMatch object to compare against
+     * @return true if equal
+     */
+    public boolean equals(Object o) {
+        if (o instanceof CharsetMatch) {
+            CharsetMatch that = (CharsetMatch) o;
+            return (this.fConfidence == that.fConfidence);
+        }
+
+        return false;
+    }
+
+    /**
+     * generates a hashCode based on the confidence value
+     * @return the hashCode
+     */
+    public int hashCode() {
+        return fConfidence;
+    }
+    //   gave us a byte array.
+
+    public String toString() {
+        String s = "Match of " + fCharsetName;
+        if (getLanguage() != null) {
+            s += " in " + getLanguage();
+        }
+        s += " with confidence " + fConfidence;
+        return s;
+    }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/bd9a9b91/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
index 32824be..951082d 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
@@ -13,6 +13,22 @@ package org.apache.tika.parser.txt;
 /**
  * This class recognizes single-byte encodings. Because the encoding scheme is 
so
  * simple, language statistics are used to do the matching.
+ * <p/>
+ * The Recognizer works by first mapping from bytes in the encoding under test
+ * into that Recognizer's ngram space. Normally this means performing a
+ * lowercase, and excluding codepoints that don't correspond to numbers or
+ * letters. (Accented letters may or may not be ignored or normalised, 
depending
+ * on the needs of the ngrams)
+ * Then, ngram analysis is run against the transformed text, and a confidence
+ * is calculated.
+ * <p/>
+ * For many of our Recognizers, we have one ngram set per language in each
+ * encoding, and do a simultaneous language+charset detection.
+ * <p/>
+ * When adding new Recognizers, the easiest way is to byte map to an existing
+ * encoding for which we have ngrams, excluding non text, and re-use the 
ngrams.
+ *
+ * @internal
  */
 abstract class CharsetRecog_sbcs extends CharsetRecognizer {
 
@@ -889,6 +905,64 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer 
{
 
         public CharsetMatch match(CharsetDetector det) {
             int confidence = match(det, ngrams, byteMap);
+            return confidence == 0 ? null : new CharsetMatch(det, this, 
confidence, getName(), "tr");
+        }
+    }
+
+    static class CharsetRecog_IBM866_ru extends CharsetRecog_sbcs {
+        private static int[] ngrams = {
+                0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 
0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 
0x20F1F2, 0x20F2EE,
+                0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 
0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 
0xEBE820, 0xEBFCED,
+                0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 
0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 
0xEEEC20, 0xEEF1F2,
+                0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 
0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 
0xF7F2EE, 0xFBF520,
+        };
+
+        // bytemap converts cp866 chars to cp1251 chars, so ngrams are still 
unchanged
+        private static byte[] byteMap = {
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 
0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 
0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 
0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 
0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 
0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 
0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 
0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 
0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 
0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
+                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 
0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
+                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 
0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
+                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 
0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 
0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
+                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 
0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
+                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 
0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
+                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 
0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 
0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
+                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 
0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
+                (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 
0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
+                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 
0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
+                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 
0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
+                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 
0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 
0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 
0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 
0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 
0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 
0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 
0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 
0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
+                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 
0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
+                (byte) 0xB8, (byte) 0xB8, (byte) 0xBA, (byte) 0xBA, (byte) 
0xBF, (byte) 0xBF, (byte) 0xA2, (byte) 0xA2,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 
0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+        };
+
+        public String getName() {
+            return "IBM866";
+        }
+
+        public String getLanguage() {
+            return "ru";
+        }
+
+        public CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det, ngrams, byteMap);
             return confidence == 0 ? null : new CharsetMatch(det, this, 
confidence);
         }
     }

tika git commit: TIKA-2041 - add important diffs between new copy/paste from ICU4J and legacy code which may have included Tika-specific mods.

Reply via email to