a...

tallison Fri, 29 May 2015 07:37:49 -0700

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
 Fri May 29 14:36:21 2015
@@ -1,18 +1,18 @@
 /**
-*******************************************************************************
-* Copyright (C) 2005-2009, International Business Machines Corporation and    *
-* others. All Rights Reserved.                                                *
-*******************************************************************************
-*/
+ * ******************************************************************************
+ * Copyright (C) 2005-2009, International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                *
+ * ******************************************************************************
+ */
 package org.apache.tika.parser.txt;
 
+import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
-import java.io.IOException;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
-import java.util.Collections;
 import java.util.Arrays;
+import java.util.Collections;
 
 
 /**
@@ -47,27 +47,150 @@ public class CharsetDetector {
 //   actually choose the "real" charset.  All assuming that the application 
just
 //   wants the data, and doesn't care about a char set name.
 
+    private static final int kBufSize = 12000;
+    private static final int MAX_CONFIDENCE = 100;
+    private static String[] fCharsetNames;
+    /*
+     * List of recognizers for all charsets known to the implementation.
+     */
+    private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
+    /*
+     *  The following items are accessed by individual CharsetRecongizers 
during
+     *     the recognition process
+     *
+     */
+    byte[] fInputBytes =       // The text to be checked.  Markup will have been
+            new byte[kBufSize];  //   removed if appropriate.
+    int fInputLen;          // Length of the byte data in fInputText.
+    short fByteStats[] =      // byte frequency statistics for the input text.
+            new short[256];  //   Value is percent, not absolute.
+    boolean fC1Bytes =          // True if any bytes in the range 0x80 - 0x9F 
are in the input;
+            false;
+    String fDeclaredEncoding;
+    //
+    //  Stuff private to CharsetDetector
+    //
+    byte[] fRawInput;     // Original, untouched input bytes.
+    //  If user gave us a byte array, this is it.
+    //  If user gave us a stream, it's read to a
+    //  buffer here.
+    int fRawLength;    // Length of data in fRawInput array.
+    InputStream fInputStream;  // User's input stream, or null if the user
+    boolean fStripTags =   // If true, setText() will strip tags from input 
text.
+            false;
+
     /**
      *   Constructor
-     * 
+     *
      * @stable ICU 3.4
      */
     public CharsetDetector() {
     }
 
     /**
+     * Get the names of all char sets that can be recognized by the char set 
detector.
+     *
+     * @return an array of the names of all charsets that can be recognized
+     * by the charset detector.
+     *
+     * @stable ICU 3.4
+     */
+    public static String[] getAllDetectableCharsets() {
+        return fCharsetNames;
+    }
+
+    /*
+     * Create the singleton instances of the CharsetRecognizer classes
+     */
+    private static ArrayList<CharsetRecognizer> createRecognizers() {
+        ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
+
+        recognizers.add(new CharsetRecog_UTF8());
+
+        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
+        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
+        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
+        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
+
+        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
+        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
+        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
+        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
+        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
+        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
+        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
+        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
+
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
+
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
+
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl());
+
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru());
+
+        // Create an array of all charset names, as a side effect.
+        // Needed for the getAllDetectableCharsets() API.
+        String[] charsetNames = new String[recognizers.size()];
+        int out = 0;
+
+        for (CharsetRecognizer recognizer : recognizers) {
+            String name = recognizer.getName();
+
+            if (out == 0 || !name.equals(charsetNames[out - 1])) {
+                charsetNames[out++] = name;
+            }
+        }
+
+        fCharsetNames = new String[out];
+        System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
+
+        return recognizers;
+    }
+
+    /**
      * Set the declared encoding for charset detection.
      *  The declared encoding of an input text is an encoding obtained
      *  from an http header or xml declaration or similar source that
-     *  can be provided as additional information to the charset detector.  
+     *  can be provided as additional information to the charset detector.
      *  A match between a declared encoding and a possible detected encoding
      *  will raise the quality of that detected encoding by a small delta,
      *  and will also appear as a "reason" for the match.
      * <p/>
      * A declared encoding that is incompatible with the input data being
      * analyzed will not be added to the list of possible encodings.
-     * 
-     *  @param encoding The declared encoding 
+     *
+     *  @param encoding The declared encoding
      *
      * @stable ICU 3.4
      */
@@ -75,28 +198,25 @@ public class CharsetDetector {
         setCanonicalDeclaredEncoding(encoding);
         return this;
     }
-    
+
     /**
      * Set the input text (byte) data whose charset is to be detected.
-     * 
+     *
      * @param in the input text of unknown encoding
-     * 
+     *
      * @return This CharsetDetector
      *
      * @stable ICU 3.4
      */
-    public CharsetDetector setText(byte [] in) {
-        fRawInput  = in;
+    public CharsetDetector setText(byte[] in) {
+        fRawInput = in;
         fRawLength = in.length;
-        
+
         MungeInput();
-        
+
         return this;
     }
-    
-    private static final int kBufSize = 12000;
-
-    private static final int MAX_CONFIDENCE = 100;
+    //   Value is rounded up, so zero really means zero occurences.
 
     /**
      * Set the input text (byte) data whose charset is to be detected.
@@ -108,45 +228,44 @@ public class CharsetDetector {
      *   be read depends on the characteristics of the data itself.
      *
      * @param in the input text of unknown encoding
-     * 
+     *
      * @return This CharsetDetector
      *
      * @stable ICU 3.4
      */
-    
+
     public CharsetDetector setText(InputStream in) throws IOException {
         fInputStream = in;
         fInputStream.mark(kBufSize);
         fRawInput = new byte[kBufSize];   // Always make a new buffer because the
-                                          //   previous one may have come from the caller,
-                                          //   in which case we can't touch it.
+        //   previous one may have come from the caller,
+        //   in which case we can't touch it.
         fRawLength = 0;
         int remainingLength = kBufSize;
-        while (remainingLength > 0 ) {
+        while (remainingLength > 0) {
             // read() may give data in smallish chunks, esp. for remote sources.  Hence, this loop.
-            int  bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
+            int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
             if (bytesRead <= 0) {
-                 break;
+                break;
             }
             fRawLength += bytesRead;
             remainingLength -= bytesRead;
         }
         fInputStream.reset();
-        
+
         MungeInput();                     // Strip html markup, collect byte 
stats.
         return this;
     }
 
-  
     /**
      * Return the charset that best matches the supplied input data.
-     * 
-     * Note though, that because the detection 
+     *
+     * Note though, that because the detection
      * only looks at the start of the input data,
      * there is a possibility that the returned charset will fail to handle
      * the full set of input data.
      * <p/>
-     * Raise an exception if 
+     * Raise an exception if
      *  <ul>
      *    <li>no charset appears to match the data.</li>
      *    <li>no input text has been provided</li>
@@ -163,65 +282,64 @@ public class CharsetDetector {
 //          is found.  This is something to be done later, after things are 
otherwise
 //          working.
         CharsetMatch matches[] = detectAll();
-        
+
         if (matches == null || matches.length == 0) {
             return null;
         }
-        
+
         return matches[0];
-     }
-    
+    }
+
     /**
      *  Return an array of all charsets that appear to be plausible
      *  matches with the input data.  The array is ordered with the
      *  best quality match first.
      * <p/>
-     * Raise an exception if 
+     * Raise an exception if
      *  <ul>
      *    <li>no charsets appear to match the input data.</li>
      *    <li>no input text has been provided</li>
      *  </ul>
-     * 
+     *
      * @return An array of CharsetMatch objects representing possibly matching 
charsets.
      *
      * @stable ICU 3.4
      */
     public CharsetMatch[] detectAll() {
         CharsetRecognizer csr;
-        int               i;
-        int               detectResults;
-        int               confidence;
+        int i;
+        int detectResults;
+        int confidence;
         ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
-        
+
         //  Iterate over all possible charsets, remember all that
         //    give a match quality > 0.
-        for (i=0; i<fCSRecognizers.size(); i++) {
+        for (i = 0; i < fCSRecognizers.size(); i++) {
             csr = fCSRecognizers.get(i);
             detectResults = csr.match(this);
             confidence = detectResults & 0x000000ff;
             if (confidence > 0) {
                 // Just to be safe, constrain
                 confidence = Math.min(confidence, MAX_CONFIDENCE);
-                
+
                 // Apply charset hint.
                 if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
                     // Reduce lack of confidence (delta between "sure" and current) by 50%.
-                    confidence += (MAX_CONFIDENCE - confidence)/2;
+                    confidence += (MAX_CONFIDENCE - confidence) / 2;
                 }
-                
-                CharsetMatch  m = new CharsetMatch(this, csr, confidence);
+
+                CharsetMatch m = new CharsetMatch(this, csr, confidence);
                 matches.add(m);
             }
         }
-        
+
         Collections.sort(matches);      // CharsetMatch compares on confidence
         Collections.reverse(matches);   //  Put best match first.
-        CharsetMatch [] resultArray = new CharsetMatch[matches.size()];
+        CharsetMatch[] resultArray = new CharsetMatch[matches.size()];
         resultArray = matches.toArray(resultArray);
         return resultArray;
     }
 
-    
     /**
      * Autodetect the charset of an inputStream, and return a Java Reader
      * to access the converted input data.
@@ -236,7 +354,7 @@ public class CharsetDetector {
      *    be read depends on the characteristics of the data itself.
      *<p/>
      * Raise an exception if no charsets appear to match the input data.
-     * 
+     *
      * @param in The source of the byte data in the unknown charset.
      *
      * @param declaredEncoding  A declared encoding for the data, if available,
@@ -246,16 +364,16 @@ public class CharsetDetector {
      */
     public Reader getReader(InputStream in, String declaredEncoding) {
         setCanonicalDeclaredEncoding(declaredEncoding);
-        
+
         try {
             setText(in);
-            
+
             CharsetMatch match = detect();
-            
+
             if (match == null) {
                 return null;
             }
-            
+
             return match.getReader();
         } catch (IOException e) {
             return null;
@@ -270,7 +388,7 @@ public class CharsetDetector {
      *   
<code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
      *<p/>
      * Raise an exception if no charsets appear to match the input data.
-     * 
+     *
      * @param in The source of the byte data in the unknown charset.
      *
      * @param declaredEncoding  A declared encoding for the data, if available,
@@ -280,85 +398,71 @@ public class CharsetDetector {
      */
     public String getString(byte[] in, String declaredEncoding) {
         setCanonicalDeclaredEncoding(declaredEncoding);
-       
+
         try {
             setText(in);
-            
+
             CharsetMatch match = detect();
-            
+
             if (match == null) {
                 return null;
             }
-            
+
             return match.getString(-1);
         } catch (IOException e) {
             return null;
         }
     }
+    //   gave us a byte array.
 
- 
-    /**
-     * Get the names of all char sets that can be recognized by the char set 
detector.
-     *
-     * @return an array of the names of all charsets that can be recognized
-     * by the charset detector.
-     *
-     * @stable ICU 3.4
-     */
-    public static String[] getAllDetectableCharsets() {
-        return fCharsetNames;
-    }
-    
     /**
      * Test whether or not input filtering is enabled.
-     * 
+     *
      * @return <code>true</code> if input text will be filtered.
-     * 
+     *
      * @see #enableInputFilter
      *
      * @stable ICU 3.4
      */
-    public boolean inputFilterEnabled()
-    {
+    public boolean inputFilterEnabled() {
         return fStripTags;
     }
-    
+
     /**
      * Enable filtering of input text. If filtering is enabled,
      * text within angle brackets ("<" and ">") will be removed
      * before detection.
-     * 
+     *
      * @param filter <code>true</code> to enable input text filtering.
-     * 
+     *
      * @return The previous setting.
      *
      * @stable ICU 3.4
      */
-    public boolean enableInputFilter(boolean filter)
-    {
+    public boolean enableInputFilter(boolean filter) {
         boolean previous = fStripTags;
-        
+
         fStripTags = filter;
-        
+
         return previous;
     }
-    
+
     /**
      * Try to set fDeclaredEncoding to the canonical name for <encoding>, if 
it exists.
-     * 
+     *
      * @param encoding - name of character encoding
      */
     private void setCanonicalDeclaredEncoding(String encoding) {
         if ((encoding == null) || encoding.isEmpty()) {
             return;
         }
-        
+
         Charset cs = Charset.forName(encoding);
         if (cs != null) {
             fDeclaredEncoding = cs.name();
         }
     }
-    
+
     /*
      *  MungeInput - after getting a set of raw input data to be analyzed, 
preprocess
      *               it by removing what appears to be html markup.
@@ -367,10 +471,10 @@ public class CharsetDetector {
         int srci = 0;
         int dsti = 0;
         byte b;
-        boolean  inMarkup = false;
-        int      openTags = 0;
-        int      badTags  = 0;
-        
+        boolean inMarkup = false;
+        int openTags = 0;
+        int badTags = 0;
+
         //
         //  html / xml markup stripping.
         //     quick and dirty, not 100% accurate, but hopefully good enough, 
statistically.
@@ -380,55 +484,55 @@ public class CharsetDetector {
         if (fStripTags) {
             for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
                 b = fRawInput[srci];
-                if (b == (byte)'<') {
+                if (b == (byte) '<') {
                     if (inMarkup) {
                         badTags++;
                     }
                     inMarkup = true;
                     openTags++;
                 }
-                
-                if (! inMarkup) {
+
+                if (!inMarkup) {
                     fInputBytes[dsti++] = b;
                 }
-                
-                if (b == (byte)'>') {
+
+                if (b == (byte) '>') {
                     inMarkup = false;
-                }        
+                }
             }
-            
+
             fInputLen = dsti;
         }
-        
+
         //
         //  If it looks like this input wasn't marked up, or if it looks like 
it's
         //    essentially nothing but markup abandon the markup stripping.
         //    Detection will have to work on the unstripped input.
         //
-        if (openTags<5 || openTags/5 < badTags || 
-                (fInputLen < 100 && fRawLength>600)) {
+        if (openTags < 5 || openTags / 5 < badTags ||
+                (fInputLen < 100 && fRawLength > 600)) {
             int limit = fRawLength;
-            
+
             if (limit > kBufSize) {
                 limit = kBufSize;
             }
-            
-            for (srci=0; srci<limit; srci++) {
+
+            for (srci = 0; srci < limit; srci++) {
                 fInputBytes[srci] = fRawInput[srci];
             }
             fInputLen = srci;
         }
-        
+
         //
         // Tally up the byte occurence statistics.
         //   These are available for use by the various detectors.
         //
-        Arrays.fill(fByteStats, (short)0);
-        for (srci=0; srci<fInputLen; srci++) {
+        Arrays.fill(fByteStats, (short) 0);
+        for (srci = 0; srci < fInputLen; srci++) {
             int val = fInputBytes[srci] & 0x00ff;
             fByteStats[val]++;
         }
-        
+
         fC1Bytes = false;
         for (int i = 0x80; i <= 0x9F; i += 1) {
             if (fByteStats[i] != 0) {
@@ -436,127 +540,5 @@ public class CharsetDetector {
                 break;
             }
         }
-     }
-
-    /*
-     *  The following items are accessed by individual CharsetRecongizers 
during
-     *     the recognition process
-     * 
-     */
-    byte[]      fInputBytes =       // The text to be checked.  Markup will 
have been
-                   new byte[kBufSize];  //   removed if appropriate.
-    
-    int         fInputLen;          // Length of the byte data in fInputText.
-    
-    short       fByteStats[] =      // byte frequency statistics for the input 
text.
-                   new short[256];  //   Value is percent, not absolute.
-                                    //   Value is rounded up, so zero really 
means zero occurences.
-    
-    boolean     fC1Bytes =          // True if any bytes in the range 0x80 - 
0x9F are in the input;
-                   false;
-    
-    String      fDeclaredEncoding;
-    
-    
-
-    //
-    //  Stuff private to CharsetDetector
-    //
-    byte[]               fRawInput;     // Original, untouched input bytes.
-                                        //  If user gave us a byte array, this 
is it.
-                                        //  If user gave us a stream, it's 
read to a 
-                                        //  buffer here.
-    int                  fRawLength;    // Length of data in fRawInput array.
-    
-    InputStream          fInputStream;  // User's input stream, or null if the 
user
-                                        //   gave us a byte array.
-     
-    boolean              fStripTags =   // If true, setText() will strip tags 
from input text.
-                           false;
-    
-    
-    /*
-     * List of recognizers for all charsets known to the implementation.
-     */
-    private static ArrayList<CharsetRecognizer> fCSRecognizers = 
createRecognizers();
-    private static String [] fCharsetNames;
-    
-    /*
-     * Create the singleton instances of the CharsetRecognizer classes
-     */
-    private static ArrayList<CharsetRecognizer> createRecognizers() {
-        ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
-        
-        recognizers.add(new CharsetRecog_UTF8());
-        
-        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
-        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
-        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
-        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
-        
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
-        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
-        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
-        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
-        
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
-        
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
-
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl());
-        
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru());
-
-        // Create an array of all charset names, as a side effect.
-        // Needed for the getAllDetectableCharsets() API.
-        String[] charsetNames = new String [recognizers.size()];
-        int out = 0;
-        
-        for (CharsetRecognizer recognizer : recognizers) {
-            String name = recognizer.getName();
-            
-            if (out == 0 || ! name.equals(charsetNames[out - 1])) {
-                charsetNames[out++] = name;
-            }
-        }
-        
-        fCharsetNames = new String[out];
-        System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
-        
-        return recognizers;
     }
 }


Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
 Fri May 29 14:36:21 2015
@@ -1,9 +1,9 @@
 /**
-*******************************************************************************
-* Copyright (C) 2005-2007, International Business Machines Corporation and    *
-* others. All Rights Reserved.                                                *
-*******************************************************************************
-*/
+ * 
******************************************************************************
+ * Copyright (C) 2005-2007, International Business Machines Corporation and    
*
+ * others. All Rights Reserved.                                                
*
+ * 
******************************************************************************
+ */
 package org.apache.tika.parser.txt;
 
 import java.io.ByteArrayInputStream;
@@ -28,13 +28,70 @@ import java.io.Reader;
  */
 public class CharsetMatch implements Comparable<CharsetMatch> {
 
-    
+
+    /**
+     * Bit flag indicating the match is based on the the encoding scheme.
+     *
+     * @see #getMatchType
+     * @stable ICU 3.4
+     */
+    static public final int ENCODING_SCHEME = 1;
+    /**
+     * Bit flag indicating the match is based on the presence of a BOM.
+     *
+     * @see #getMatchType
+     * @stable ICU 3.4
+     */
+    static public final int BOM = 2;
+    /**
+     * Bit flag indicating he match is based on the declared encoding.
+     *
+     * @see #getMatchType
+     * @stable ICU 3.4
+     */
+    static public final int DECLARED_ENCODING = 4;
+    /**
+     * Bit flag indicating the match is based on language statistics.
+     *
+     * @see #getMatchType
+     * @stable ICU 3.4
+     */
+    static public final int LANG_STATISTICS = 8;
+    //
+    //   Private Data
+    //
+    private int fConfidence;
+    private CharsetRecognizer fRecognizer;
+    private byte[] fRawInput = null;     // Original, untouched input bytes.
+    //  If user gave us a byte array, this is it.
+    private int fRawLength;           // Length of data in fRawInput array.
+    private InputStream fInputStream = null;  // User's input stream, or null 
if the user
+
+    /*
+     *  Constructor.  Implementation internal
+     */
+    CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
+        fRecognizer = rec;
+        fConfidence = conf;
+
+        // The references to the original aplication input data must be copied 
out
+        //   of the charset recognizer to here, in case the application resets 
the
+        //   recognizer before using this CharsetMatch.
+        if (det.fInputStream == null) {
+            // We only want the existing input byte data if it came straight 
from the user,
+            //   not if is just the head of a stream.
+            fRawInput = det.fRawInput;
+            fRawLength = det.fRawLength;
+        }
+        fInputStream = det.fInputStream;
+    }
+
     /**
      * Create a java.io.Reader for reading the Unicode character data 
corresponding
      * to the original byte data supplied to the Charset detect operation.
      * <p/>
      * CAUTION:  if the source of the byte data was an InputStream, a Reader
-     * can be created for only one matching char set using this method.  If 
more 
+     * can be created for only one matching char set using this method.  If 
more
      * than one charset needs to be tried, the caller will need to reset
      * the InputStream and create InputStreamReaders itself, based on the 
charset name.
      *
@@ -44,11 +101,11 @@ public class CharsetMatch implements Com
      */
     public Reader getReader() {
         InputStream inputStream = fInputStream;
-        
+
         if (inputStream == null) {
             inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
         }
-        
+
         try {
             inputStream.reset();
             return new InputStreamReader(inputStream, getName());
@@ -65,7 +122,7 @@ public class CharsetMatch implements Com
      *
      * @stable ICU 3.4
      */
-    public String getString()  throws java.io.IOException {
+    public String getString() throws java.io.IOException {
         return getString(-1);
 
     }
@@ -90,24 +147,24 @@ public class CharsetMatch implements Com
             StringBuffer sb = new StringBuffer();
             char[] buffer = new char[1024];
             Reader reader = getReader();
-            int max = maxLength < 0? Integer.MAX_VALUE : maxLength;
+            int max = maxLength < 0 ? Integer.MAX_VALUE : maxLength;
             int bytesRead = 0;
-            
+
             while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
                 sb.append(buffer, 0, bytesRead);
                 max -= bytesRead;
             }
-            
+
             reader.close();
-            
+
             return sb.toString();
         } else {
-            result = new String(fRawInput, getName());            
+            result = new String(fRawInput, getName());
         }
         return result;
 
     }
-    
+
     /**
      * Get an indication of the confidence in the charset detected.
      * Confidence values range from 0-100, with larger numbers indicating
@@ -121,42 +178,9 @@ public class CharsetMatch implements Com
     public int getConfidence() {
         return fConfidence;
     }
-    
 
     /**
-     * Bit flag indicating the match is based on the the encoding scheme.
-     *
-     * @see #getMatchType
-     * @stable ICU 3.4
-     */
-    static public final int ENCODING_SCHEME    = 1;
-    
-    /**
-     * Bit flag indicating the match is based on the presence of a BOM.
-     * 
-     * @see #getMatchType
-     * @stable ICU 3.4
-     */
-    static public final int BOM                = 2;
-    
-    /**
-     * Bit flag indicating he match is based on the declared encoding.
-     * 
-     * @see #getMatchType
-     * @stable ICU 3.4
-     */
-    static public final int DECLARED_ENCODING  = 4;
-    
-    /**
-     * Bit flag indicating the match is based on language statistics.
-     *
-     * @see #getMatchType
-     * @stable ICU 3.4
-     */
-    static public final int LANG_STATISTICS    = 8;
-    
-    /**
-     * Return flags indicating what it was about the input data 
+     * Return flags indicating what it was about the input data
      * that caused this charset to be considered as a possible match.
      * The result is a bitfield containing zero or more of the flags
      * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
@@ -176,7 +200,7 @@ public class CharsetMatch implements Com
     }
 
     /**
-     * Get the name of the detected charset.  
+     * Get the name of the detected charset.
      * The name will be one that can be used with other APIs on the
      * platform that accept charset names.  It is the "Canonical name"
      * as defined by the class java.nio.charset.Charset; for
@@ -193,9 +217,9 @@ public class CharsetMatch implements Com
     public String getName() {
         return fRecognizer.getName();
     }
-    
+
     /**
-     * Get the ISO code for the language of the detected charset.  
+     * Get the ISO code for the language of the detected charset.
      *
      * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
      *
@@ -207,11 +231,11 @@ public class CharsetMatch implements Com
 
     /**
      * Compare to other CharsetMatch objects.
-     * Comparison is based on the match confidence value, which 
-     *   allows CharsetDetector.detectAll() to order its results. 
+     * Comparison is based on the match confidence value, which
+     *   allows CharsetDetector.detectAll() to order its results.
      *
      * @param o the CharsetMatch object to compare against.
-     * @return  a negative integer, zero, or a positive integer as the 
+     * @return a negative integer, zero, or a positive integer as the
      *          confidence level of this CharsetMatch
      *          is less than, equal to, or greater than that of
      *          the argument.
@@ -249,45 +273,14 @@ public class CharsetMatch implements Com
     public int hashCode() {
         return fConfidence;
     }
-    
-    /*
-     *  Constructor.  Implementation internal
-     */
-    CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
-        fRecognizer = rec;
-        fConfidence = conf;
-        
-        // The references to the original aplication input data must be copied out
-        //   of the charset recognizer to here, in case the application resets the
-        //   recognizer before using this CharsetMatch.
-        if (det.fInputStream == null) {
-            // We only want the existing input byte data if it came straight from the user,
-            //   not if is just the head of a stream.
-            fRawInput    = det.fRawInput;
-            fRawLength   = det.fRawLength;
-        }
-        fInputStream = det.fInputStream;
-    }
-
-    
-    //
-    //   Private Data
-    //
-    private int                 fConfidence;
-    private CharsetRecognizer   fRecognizer;
-    private byte[]              fRawInput = null;     // Original, untouched 
input bytes.
-                                                      //  If user gave us a 
byte array, this is it.
-    private int                 fRawLength;           // Length of data in 
fRawInput array.
-
-    private InputStream         fInputStream = null;  // User's input stream, 
or null if the user
-                                                      //   gave us a byte 
array.
+    //   gave us a byte array.
 
     public String toString() {
-       String s = "Match of " + fRecognizer.getName();
-       if(fRecognizer.getLanguage() != null) {
-          s += " in " + fRecognizer.getLanguage();
-       }
-       s += " with confidence " + fConfidence;
-       return s;
+        String s = "Match of " + fRecognizer.getName();
+        if (fRecognizer.getLanguage() != null) {
+            s += " in " + fRecognizer.getLanguage();
+        }
+        s += " with confidence " + fConfidence;
+        return s;
     }
 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java Fri May 29 14:36:21 2015
@@ -7,98 +7,95 @@
 package org.apache.tika.parser.txt;
 
 /**
- *  class CharsetRecog_2022  part of the ICU charset detection imlementation.
- *                           This is a superclass for the individual detectors for
- *                           each of the detectable members of the ISO 2022 family
- *                           of encodings.
- * 
- *                           The separate classes are nested within this class.
- * 
+ * class CharsetRecog_2022  part of the ICU charset detection imlementation.
+ * This is a superclass for the individual detectors for
+ * each of the detectable members of the ISO 2022 family
+ * of encodings.
+ * <p/>
+ * The separate classes are nested within this class.
+ *
  * @internal
  */
 abstract class CharsetRecog_2022 extends CharsetRecognizer {
 
-    
+
     /**
      * Matching function shared among the 2022 detectors JP, CN and KR
      * Counts up the number of legal an unrecognized escape sequences in
      * the sample of text, and computes a score based on the total number &
      * the proportion that fit the encoding.
-     * 
-     * 
-     * @param text the byte buffer containing text to analyse
-     * @param textLen  the size of the text in the byte.
+     *
+     * @param text            the byte buffer containing text to analyse
+     * @param textLen         the size of the text in the byte.
      * @param escapeSequences the byte escape sequences to test for.
      * @return match quality, in the range of 0-100.
      */
-    int   match(byte [] text, int textLen, byte [][] escapeSequences) {
-        int     i, j;
-        int     escN;
-        int     hits   = 0;
-        int     misses = 0;
-        int     shifts = 0;
-        int     quality;
+    int match(byte[] text, int textLen, byte[][] escapeSequences) {
+        int i, j;
+        int escN;
+        int hits = 0;
+        int misses = 0;
+        int shifts = 0;
+        int quality;
         scanInput:
-            for (i=0; i<textLen; i++) {
-                if (text[i] == 0x1b) {
-                    checkEscapes:
-                        for (escN=0; escN<escapeSequences.length; escN++) {
-                            byte [] seq = escapeSequences[escN];
-                            
-                            if ((textLen - i) < seq.length) {
-                                continue checkEscapes;
-                            }
-                            
-                            for (j=1; j<seq.length; j++) {
-                                if (seq[j] != text[i+j])  {
-                                    continue checkEscapes;
-                                }                                   
-                            }
-                            
-                            hits++; 
-                            i += seq.length-1;
-                            continue scanInput;
+        for (i = 0; i < textLen; i++) {
+            if (text[i] == 0x1b) {
+                checkEscapes:
+                for (escN = 0; escN < escapeSequences.length; escN++) {
+                    byte[] seq = escapeSequences[escN];
+
+                    if ((textLen - i) < seq.length) {
+                        continue checkEscapes;
+                    }
+
+                    for (j = 1; j < seq.length; j++) {
+                        if (seq[j] != text[i + j]) {
+                            continue checkEscapes;
                         }
-                
-                        misses++;                  
-                }
-                
-                if (text[i] == 0x0e || text[i] == 0x0f) {
-                    // Shift in/out
-                    shifts++;
+                    }
+
+                    hits++;
+                    i += seq.length - 1;
+                    continue scanInput;
                 }
+
+                misses++;
             }
-        
+
+            if (text[i] == 0x0e || text[i] == 0x0f) {
+                // Shift in/out
+                shifts++;
+            }
+        }
+
         if (hits == 0) {
             return 0;
         }
-        
+
         //
         // Initial quality is based on relative proportion of recongized vs.
         //   unrecognized escape sequences. 
         //   All good:  quality = 100;
         //   half or less good: quality = 0;
         //   linear inbetween.
-        quality = (100*hits - 100*misses) / (hits + misses);
-        
+        quality = (100 * hits - 100 * misses) / (hits + misses);
+
         // Back off quality if there were too few escape sequences seen.
         //   Include shifts in this computation, so that KR does not get penalized
         //   for having only a single Escape sequence, but many shifts.
-        if (hits+shifts < 5) {
-            quality -= (5-(hits+shifts))*10;
+        if (hits + shifts < 5) {
+            quality -= (5 - (hits + shifts)) * 10;
         }
-        
+
         if (quality < 0) {
             quality = 0;
-        }        
+        }
         return quality;
     }
 
-    
- 
-    
+
     static class CharsetRecog_2022JP extends CharsetRecog_2022 {
-        private byte [] [] escapeSequences = {
+        private byte[][] escapeSequences = {
                 {0x1b, 0x24, 0x28, 0x43},   // KS X 1001:1992
                 {0x1b, 0x24, 0x28, 0x44},   // JIS X 212-1990
                 {0x1b, 0x24, 0x40},         // JIS C 6226-1978
@@ -111,34 +108,34 @@ abstract class CharsetRecog_2022 extends
                 {0x1b, 0x28, 0x4a},         // JIS-Roman
                 {0x1b, 0x2e, 0x41},         // ISO 8859-1
                 {0x1b, 0x2e, 0x46}          // ISO 8859-7
-                };
-        
+        };
+
         String getName() {
             return "ISO-2022-JP";
         }
-        
-        int   match(CharsetDetector det) {
+
+        int match(CharsetDetector det) {
             return match(det.fInputBytes, det.fInputLen, escapeSequences);
         }
     }
 
     static class CharsetRecog_2022KR extends CharsetRecog_2022 {
-        private byte [] [] escapeSequences = {
-                {0x1b, 0x24, 0x29, 0x43}   
-                 };
-        
+        private byte[][] escapeSequences = {
+                {0x1b, 0x24, 0x29, 0x43}
+        };
+
         String getName() {
             return "ISO-2022-KR";
         }
-        
-        int   match(CharsetDetector det) {
+
+        int match(CharsetDetector det) {
             return match(det.fInputBytes, det.fInputLen, escapeSequences);
         }
-        
+
     }
 
     static class CharsetRecog_2022CN extends CharsetRecog_2022 {
-        private byte [] [] escapeSequences = {
+        private byte[][] escapeSequences = {
                 {0x1b, 0x24, 0x29, 0x41},   // GB 2312-80
                 {0x1b, 0x24, 0x29, 0x47},   // CNS 11643-1992 Plane 1
                 {0x1b, 0x24, 0x2A, 0x48},   // CNS 11643-1992 Plane 2
@@ -151,16 +148,16 @@ abstract class CharsetRecog_2022 extends
                 {0x1b, 0x4e},               // SS2
                 {0x1b, 0x4f},               // SS3
         };
-        
+
         String getName() {
             return "ISO-2022-CN";
         }
-        
-        
-        int   match(CharsetDetector det) {
+
+
+        int match(CharsetDetector det) {
             return match(det.fInputBytes, det.fInputLen, escapeSequences);
         }
     }
-    
-    }
+
+}
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java Fri May 29 14:36:21 2015
@@ -1,9 +1,9 @@
 /**
-*******************************************************************************
-* Copyright (C) 2005 - 2007, International Business Machines Corporation and  *
-* others. All Rights Reserved.                                                *
-*******************************************************************************
-*/
+ * 
******************************************************************************
+ * Copyright (C) 2005 - 2007, International Business Machines Corporation and  
*
+ * others. All Rights Reserved.                                                
*
+ * 
******************************************************************************
+ */
 package org.apache.tika.parser.txt;
 
 /**
@@ -21,29 +21,29 @@ class CharsetRecog_UTF8 extends CharsetR
      * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
      */
     int match(CharsetDetector det) {
-        boolean     hasBOM = false;
-        int         numValid = 0;
-        int         numInvalid = 0;
-        byte        input[] = det.fRawInput;
-        int         i;
-        int         trailBytes = 0;
-        int         confidence;
-        
-        if (det.fRawLength >= 3 && 
+        boolean hasBOM = false;
+        int numValid = 0;
+        int numInvalid = 0;
+        byte input[] = det.fRawInput;
+        int i;
+        int trailBytes = 0;
+        int confidence;
+
+        if (det.fRawLength >= 3 &&
                 (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) {
             hasBOM = true;
         }
-        
+
         // Scan for multi-byte sequences
-        for (i=0; i<det.fRawLength; i++) {
+        for (i = 0; i < det.fRawLength; i++) {
             int b = input[i];
             if ((b & 0x80) == 0) {
                 continue;   // ASCII
             }
-            
+
             // Hi bit on char found.  Figure out how long the sequence should be
             if ((b & 0x0e0) == 0x0c0) {
-                trailBytes = 1;                
+                trailBytes = 1;
             } else if ((b & 0x0f0) == 0x0e0) {
                 trailBytes = 2;
             } else if ((b & 0x0f8) == 0xf0) {
@@ -55,11 +55,11 @@ class CharsetRecog_UTF8 extends CharsetR
                 }
                 trailBytes = 0;
             }
-                
+
             // Verify that we've got the right number of trail bytes in the sequence
-            for (;;) {
+            for (; ; ) {
                 i++;
-                if (i>=det.fRawLength) {
+                if (i >= det.fRawLength) {
                     break;
                 }
                 b = input[i];
@@ -72,24 +72,24 @@ class CharsetRecog_UTF8 extends CharsetR
                     break;
                 }
             }
-                        
+
         }
-        
+
         // Cook up some sort of confidence score, based on presense of a BOM
         //    and the existence of valid and/or invalid multi-byte sequences.
         confidence = 0;
-        if (hasBOM && numInvalid==0) {
+        if (hasBOM && numInvalid == 0) {
             confidence = 100;
-        } else if (hasBOM && numValid > numInvalid*10) {
+        } else if (hasBOM && numValid > numInvalid * 10) {
             confidence = 80;
         } else if (numValid > 3 && numInvalid == 0) {
-            confidence = 100;            
+            confidence = 100;
         } else if (numValid > 0 && numInvalid == 0) {
             confidence = 80;
         } else if (numValid == 0 && numInvalid == 0) {
             // Plain ASCII.  
-            confidence = 10;            
-        } else if (numValid > numInvalid*10) {
+            confidence = 10;
+        } else if (numValid > numInvalid * 10) {
             // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
             confidence = 25;
         }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java Fri May 29 14:36:21 2015
@@ -10,7 +10,7 @@ package org.apache.tika.parser.txt;
 /**
  * This class matches UTF-16 and UTF-32, both big- and little-endian. The
  * BOM will be used if it is present.
- * 
+ *
  * @internal
  */
 abstract class CharsetRecog_Unicode extends CharsetRecognizer {
@@ -24,130 +24,115 @@ abstract class CharsetRecog_Unicode exte
      * @see 
com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
      */
     abstract int match(CharsetDetector det);
-    
-    static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode
-    {
-        String getName()
-        {
+
+    static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode {
+        String getName() {
             return "UTF-16BE";
         }
-        
-        int match(CharsetDetector det)
-        {
+
+        int match(CharsetDetector det) {
             byte[] input = det.fRawInput;
-            
-            if (input.length>=2 && ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF)) {
+
+            if (input.length >= 2 && ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF)) {
                 return 100;
             }
-            
+
             // TODO: Do some statistics to check for unsigned UTF-16BE
             return 0;
         }
     }
-    
-    static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode
-    {
-        String getName()
-        {
+
+    static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode {
+        String getName() {
             return "UTF-16LE";
         }
-        
-        int match(CharsetDetector det)
-        {
+
+        int match(CharsetDetector det) {
             byte[] input = det.fRawInput;
-            
-            if (input.length >= 2 && ((input[0] & 0xFF) == 0xFF && (input[1] & 
0xFF) == 0xFE))
-            {
-               // An LE BOM is present.
-               if (input.length>=4 && input[2] == 0x00 && input[3] == 0x00) {
-                   // It is probably UTF-32 LE, not UTF-16
-                   return 0;
-               }
-               return 100;
-            }        
-            
+
+            if (input.length >= 2 && ((input[0] & 0xFF) == 0xFF && (input[1] & 
0xFF) == 0xFE)) {
+                // An LE BOM is present.
+                if (input.length >= 4 && input[2] == 0x00 && input[3] == 0x00) 
{
+                    // It is probably UTF-32 LE, not UTF-16
+                    return 0;
+                }
+                return 100;
+            }
+
             // TODO: Do some statistics to check for unsigned UTF-16LE
             return 0;
         }
     }
-    
-    static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode
-    {
+
+    static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode {
         abstract int getChar(byte[] input, int index);
-        
+
         abstract String getName();
-        
-        int match(CharsetDetector det)
-        {
-            byte[] input   = det.fRawInput;
-            int limit      = (det.fRawLength / 4) * 4;
-            int numValid   = 0;
+
+        int match(CharsetDetector det) {
+            byte[] input = det.fRawInput;
+            int limit = (det.fRawLength / 4) * 4;
+            int numValid = 0;
             int numInvalid = 0;
             boolean hasBOM = false;
             int confidence = 0;
-            
-            if (limit==0) {
+
+            if (limit == 0) {
                 return 0;
             }
             if (getChar(input, 0) == 0x0000FEFF) {
                 hasBOM = true;
             }
-            
-            for(int i = 0; i < limit; i += 4) {
+
+            for (int i = 0; i < limit; i += 4) {
                 int ch = getChar(input, i);
-                
+
                 if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 
0xDFFF)) {
                     numInvalid += 1;
                 } else {
                     numValid += 1;
                 }
             }
-            
-            
+
+
             // Cook up some sort of confidence score, based on presence of a 
BOM
             //    and the existence of valid and/or invalid multi-byte 
sequences.
-            if (hasBOM && numInvalid==0) {
+            if (hasBOM && numInvalid == 0) {
                 confidence = 100;
-            } else if (hasBOM && numValid > numInvalid*10) {
+            } else if (hasBOM && numValid > numInvalid * 10) {
                 confidence = 80;
             } else if (numValid > 3 && numInvalid == 0) {
-                confidence = 100;            
+                confidence = 100;
             } else if (numValid > 0 && numInvalid == 0) {
                 confidence = 80;
-            } else if (numValid > numInvalid*10) {
+            } else if (numValid > numInvalid * 10) {
                 // Probably corrupt UTF-32BE data.  Valid sequences aren't 
likely by chance.
                 confidence = 25;
             }
-            
+
             return confidence;
         }
     }
-    
-    static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32
-    {
-        int getChar(byte[] input, int index)
-        {
+
+    static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32 {
+        int getChar(byte[] input, int index) {
             return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) 
<< 16 |
-                   (input[index + 2] & 0xFF) <<  8 | (input[index + 3] & 0xFF);
+                    (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF);
         }
-        
-        String getName()
-        {
+
+        String getName() {
             return "UTF-32BE";
         }
     }
 
-    
-    static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32
-    {
-        int getChar(byte[] input, int index)
-        {
+
+    static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32 {
+        int getChar(byte[] input, int index) {
             return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) 
<< 16 |
-                   (input[index + 1] & 0xFF) <<  8 | (input[index + 0] & 0xFF);
+                    (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF);
         }
-        
-        String getName()
-        {
+
+        String getName() {
             return "UTF-32LE";
         }
     }

svn commit: r1682489 [7/14] - in /tika/trunk: tika-parsers/src/main/java/org/apache/tika/parser/html/ tika-parsers/src/main/java/org/apache/tika/parser/image/ tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/ tika-parsers/src/main/java/org/a...

Reply via email to