Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java Fri May 29 14:36:21 2015 @@ -1,18 +1,18 @@ /** -******************************************************************************* -* Copyright (C) 2005-2009, International Business Machines Corporation and * -* others. All Rights Reserved. * -******************************************************************************* -*/ + * ****************************************************************************** + * Copyright (C) 2005-2009, International Business Machines Corporation and * + * others. All Rights Reserved. * + * ****************************************************************************** + */ package org.apache.tika.parser.txt; +import java.io.IOException; import java.io.InputStream; import java.io.Reader; -import java.io.IOException; import java.nio.charset.Charset; import java.util.ArrayList; -import java.util.Collections; import java.util.Arrays; +import java.util.Collections; /** @@ -47,27 +47,150 @@ public class CharsetDetector { // actually choose the "real" charset. All assuming that the application just // wants the data, and doesn't care about a char set name. + private static final int kBufSize = 12000; + private static final int MAX_CONFIDENCE = 100; + private static String[] fCharsetNames; + /* + * List of recognizers for all charsets known to the implementation. + */ + private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers(); + /* + * The following items are accessed by individual CharsetRecongizers during + * the recognition process + * + */ + byte[] fInputBytes = // The text to be checked. Markup will have been + new byte[kBufSize]; // removed if appropriate. + int fInputLen; // Length of the byte data in fInputText. + short fByteStats[] = // byte frequency statistics for the input text. + new short[256]; // Value is percent, not absolute. + boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input; + false; + String fDeclaredEncoding; + // + // Stuff private to CharsetDetector + // + byte[] fRawInput; // Original, untouched input bytes. + // If user gave us a byte array, this is it. + // If user gave us a stream, it's read to a + // buffer here. + int fRawLength; // Length of data in fRawInput array. + InputStream fInputStream; // User's input stream, or null if the user + boolean fStripTags = // If true, setText() will strip tags from input text. + false; + /** * Constructor - * + * * @stable ICU 3.4 */ public CharsetDetector() { } /** + * Get the names of all char sets that can be recognized by the char set detector. + * + * @return an array of the names of all charsets that can be recognized + * by the charset detector. + * + * @stable ICU 3.4 + */ + public static String[] getAllDetectableCharsets() { + return fCharsetNames; + } + + /* + * Create the singleton instances of the CharsetRecognizer classes + */ + private static ArrayList<CharsetRecognizer> createRecognizers() { + ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>(); + + recognizers.add(new CharsetRecog_UTF8()); + + recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE()); + recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE()); + recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE()); + recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE()); + + recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis()); + recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP()); + recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN()); + recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR()); + recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030()); + recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp()); + recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr()); + recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5()); + + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr()); + + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr()); + + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl()); + + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru()); + + // Create an array of all charset names, as a side effect. + // Needed for the getAllDetectableCharsets() API. + String[] charsetNames = new String[recognizers.size()]; + int out = 0; + + for (CharsetRecognizer recognizer : recognizers) { + String name = recognizer.getName(); + + if (out == 0 || !name.equals(charsetNames[out - 1])) { + charsetNames[out++] = name; + } + } + + fCharsetNames = new String[out]; + System.arraycopy(charsetNames, 0, fCharsetNames, 0, out); + + return recognizers; + } + + /** * Set the declared encoding for charset detection. * The declared encoding of an input text is an encoding obtained * from an http header or xml declaration or similar source that - * can be provided as additional information to the charset detector. + * can be provided as additional information to the charset detector. * A match between a declared encoding and a possible detected encoding * will raise the quality of that detected encoding by a small delta, * and will also appear as a "reason" for the match. * <p/> * A declared encoding that is incompatible with the input data being * analyzed will not be added to the list of possible encodings. - * - * @param encoding The declared encoding + * + * @param encoding The declared encoding * * @stable ICU 3.4 */ @@ -75,28 +198,25 @@ public class CharsetDetector { setCanonicalDeclaredEncoding(encoding); return this; } - + /** * Set the input text (byte) data whose charset is to be detected. - * + * * @param in the input text of unknown encoding - * + * * @return This CharsetDetector * * @stable ICU 3.4 */ - public CharsetDetector setText(byte [] in) { - fRawInput = in; + public CharsetDetector setText(byte[] in) { + fRawInput = in; fRawLength = in.length; - + MungeInput(); - + return this; } - - private static final int kBufSize = 12000; - - private static final int MAX_CONFIDENCE = 100; + // Value is rounded up, so zero really means zero occurences. /** * Set the input text (byte) data whose charset is to be detected. @@ -108,45 +228,44 @@ public class CharsetDetector { * be read depends on the characteristics of the data itself. * * @param in the input text of unknown encoding - * + * * @return This CharsetDetector * * @stable ICU 3.4 */ - + public CharsetDetector setText(InputStream in) throws IOException { fInputStream = in; fInputStream.mark(kBufSize); fRawInput = new byte[kBufSize]; // Always make a new buffer because the - // previous one may have come from the caller, - // in which case we can't touch it. + // previous one may have come from the caller, + // in which case we can't touch it. fRawLength = 0; int remainingLength = kBufSize; - while (remainingLength > 0 ) { + while (remainingLength > 0) { // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop. - int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength); + int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength); if (bytesRead <= 0) { - break; + break; } fRawLength += bytesRead; remainingLength -= bytesRead; } fInputStream.reset(); - + MungeInput(); // Strip html markup, collect byte stats. return this; } - /** * Return the charset that best matches the supplied input data. - * - * Note though, that because the detection + * + * Note though, that because the detection * only looks at the start of the input data, * there is a possibility that the returned charset will fail to handle * the full set of input data. * <p/> - * Raise an exception if + * Raise an exception if * <ul> * <li>no charset appears to match the data.</li> * <li>no input text has been provided</li> @@ -163,65 +282,64 @@ public class CharsetDetector { // is found. This is something to be done later, after things are otherwise // working. CharsetMatch matches[] = detectAll(); - + if (matches == null || matches.length == 0) { return null; } - + return matches[0]; - } - + } + /** * Return an array of all charsets that appear to be plausible * matches with the input data. The array is ordered with the * best quality match first. * <p/> - * Raise an exception if + * Raise an exception if * <ul> * <li>no charsets appear to match the input data.</li> * <li>no input text has been provided</li> * </ul> - * + * * @return An array of CharsetMatch objects representing possibly matching charsets. * * @stable ICU 3.4 */ public CharsetMatch[] detectAll() { CharsetRecognizer csr; - int i; - int detectResults; - int confidence; + int i; + int detectResults; + int confidence; ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>(); - + // Iterate over all possible charsets, remember all that // give a match quality > 0. - for (i=0; i<fCSRecognizers.size(); i++) { + for (i = 0; i < fCSRecognizers.size(); i++) { csr = fCSRecognizers.get(i); detectResults = csr.match(this); confidence = detectResults & 0x000000ff; if (confidence > 0) { // Just to be safe, constrain confidence = Math.min(confidence, MAX_CONFIDENCE); - + // Apply charset hint. if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) { // Reduce lack of confidence (delta between "sure" and current) by 50%. - confidence += (MAX_CONFIDENCE - confidence)/2; + confidence += (MAX_CONFIDENCE - confidence) / 2; } - - CharsetMatch m = new CharsetMatch(this, csr, confidence); + + CharsetMatch m = new CharsetMatch(this, csr, confidence); matches.add(m); } } - + Collections.sort(matches); // CharsetMatch compares on confidence Collections.reverse(matches); // Put best match first. - CharsetMatch [] resultArray = new CharsetMatch[matches.size()]; + CharsetMatch[] resultArray = new CharsetMatch[matches.size()]; resultArray = matches.toArray(resultArray); return resultArray; } - /** * Autodetect the charset of an inputStream, and return a Java Reader * to access the converted input data. @@ -236,7 +354,7 @@ public class CharsetDetector { * be read depends on the characteristics of the data itself. *<p/> * Raise an exception if no charsets appear to match the input data. - * + * * @param in The source of the byte data in the unknown charset. * * @param declaredEncoding A declared encoding for the data, if available, @@ -246,16 +364,16 @@ public class CharsetDetector { */ public Reader getReader(InputStream in, String declaredEncoding) { setCanonicalDeclaredEncoding(declaredEncoding); - + try { setText(in); - + CharsetMatch match = detect(); - + if (match == null) { return null; } - + return match.getReader(); } catch (IOException e) { return null; @@ -270,7 +388,7 @@ public class CharsetDetector { * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code> *<p/> * Raise an exception if no charsets appear to match the input data. - * + * * @param in The source of the byte data in the unknown charset. * * @param declaredEncoding A declared encoding for the data, if available, @@ -280,85 +398,71 @@ public class CharsetDetector { */ public String getString(byte[] in, String declaredEncoding) { setCanonicalDeclaredEncoding(declaredEncoding); - + try { setText(in); - + CharsetMatch match = detect(); - + if (match == null) { return null; } - + return match.getString(-1); } catch (IOException e) { return null; } } + // gave us a byte array. - - /** - * Get the names of all char sets that can be recognized by the char set detector. - * - * @return an array of the names of all charsets that can be recognized - * by the charset detector. - * - * @stable ICU 3.4 - */ - public static String[] getAllDetectableCharsets() { - return fCharsetNames; - } - /** * Test whether or not input filtering is enabled. - * + * * @return <code>true</code> if input text will be filtered. - * + * * @see #enableInputFilter * * @stable ICU 3.4 */ - public boolean inputFilterEnabled() - { + public boolean inputFilterEnabled() { return fStripTags; } - + /** * Enable filtering of input text. If filtering is enabled, * text within angle brackets ("<" and ">") will be removed * before detection. - * + * * @param filter <code>true</code> to enable input text filtering. - * + * * @return The previous setting. * * @stable ICU 3.4 */ - public boolean enableInputFilter(boolean filter) - { + public boolean enableInputFilter(boolean filter) { boolean previous = fStripTags; - + fStripTags = filter; - + return previous; } - + /** * Try to set fDeclaredEncoding to the canonical name for <encoding>, if it exists. - * + * * @param encoding - name of character encoding */ private void setCanonicalDeclaredEncoding(String encoding) { if ((encoding == null) || encoding.isEmpty()) { return; } - + Charset cs = Charset.forName(encoding); if (cs != null) { fDeclaredEncoding = cs.name(); } } - + /* * MungeInput - after getting a set of raw input data to be analyzed, preprocess * it by removing what appears to be html markup. @@ -367,10 +471,10 @@ public class CharsetDetector { int srci = 0; int dsti = 0; byte b; - boolean inMarkup = false; - int openTags = 0; - int badTags = 0; - + boolean inMarkup = false; + int openTags = 0; + int badTags = 0; + // // html / xml markup stripping. // quick and dirty, not 100% accurate, but hopefully good enough, statistically. @@ -380,55 +484,55 @@ public class CharsetDetector { if (fStripTags) { for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) { b = fRawInput[srci]; - if (b == (byte)'<') { + if (b == (byte) '<') { if (inMarkup) { badTags++; } inMarkup = true; openTags++; } - - if (! inMarkup) { + + if (!inMarkup) { fInputBytes[dsti++] = b; } - - if (b == (byte)'>') { + + if (b == (byte) '>') { inMarkup = false; - } + } } - + fInputLen = dsti; } - + // // If it looks like this input wasn't marked up, or if it looks like it's // essentially nothing but markup abandon the markup stripping. // Detection will have to work on the unstripped input. // - if (openTags<5 || openTags/5 < badTags || - (fInputLen < 100 && fRawLength>600)) { + if (openTags < 5 || openTags / 5 < badTags || + (fInputLen < 100 && fRawLength > 600)) { int limit = fRawLength; - + if (limit > kBufSize) { limit = kBufSize; } - - for (srci=0; srci<limit; srci++) { + + for (srci = 0; srci < limit; srci++) { fInputBytes[srci] = fRawInput[srci]; } fInputLen = srci; } - + // // Tally up the byte occurence statistics. // These are available for use by the various detectors. // - Arrays.fill(fByteStats, (short)0); - for (srci=0; srci<fInputLen; srci++) { + Arrays.fill(fByteStats, (short) 0); + for (srci = 0; srci < fInputLen; srci++) { int val = fInputBytes[srci] & 0x00ff; fByteStats[val]++; } - + fC1Bytes = false; for (int i = 0x80; i <= 0x9F; i += 1) { if (fByteStats[i] != 0) { @@ -436,127 +540,5 @@ public class CharsetDetector { break; } } - } - - /* - * The following items are accessed by individual CharsetRecongizers during - * the recognition process - * - */ - byte[] fInputBytes = // The text to be checked. Markup will have been - new byte[kBufSize]; // removed if appropriate. - - int fInputLen; // Length of the byte data in fInputText. - - short fByteStats[] = // byte frequency statistics for the input text. - new short[256]; // Value is percent, not absolute. - // Value is rounded up, so zero really means zero occurences. - - boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input; - false; - - String fDeclaredEncoding; - - - - // - // Stuff private to CharsetDetector - // - byte[] fRawInput; // Original, untouched input bytes. - // If user gave us a byte array, this is it. - // If user gave us a stream, it's read to a - // buffer here. - int fRawLength; // Length of data in fRawInput array. - - InputStream fInputStream; // User's input stream, or null if the user - // gave us a byte array. - - boolean fStripTags = // If true, setText() will strip tags from input text. - false; - - - /* - * List of recognizers for all charsets known to the implementation. - */ - private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers(); - private static String [] fCharsetNames; - - /* - * Create the singleton instances of the CharsetRecognizer classes - */ - private static ArrayList<CharsetRecognizer> createRecognizers() { - ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>(); - - recognizers.add(new CharsetRecog_UTF8()); - - recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE()); - recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE()); - recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE()); - recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE()); - - recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis()); - recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP()); - recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN()); - recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR()); - recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030()); - recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp()); - recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr()); - recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5()); - - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr()); - - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr()); - - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl()); - - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru()); - - // Create an array of all charset names, as a side effect. - // Needed for the getAllDetectableCharsets() API. - String[] charsetNames = new String [recognizers.size()]; - int out = 0; - - for (CharsetRecognizer recognizer : recognizers) { - String name = recognizer.getName(); - - if (out == 0 || ! name.equals(charsetNames[out - 1])) { - charsetNames[out++] = name; - } - } - - fCharsetNames = new String[out]; - System.arraycopy(charsetNames, 0, fCharsetNames, 0, out); - - return recognizers; } }
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java Fri May 29 14:36:21 2015 @@ -1,9 +1,9 @@ /** -******************************************************************************* -* Copyright (C) 2005-2007, International Business Machines Corporation and * -* others. All Rights Reserved. * -******************************************************************************* -*/ + * ****************************************************************************** + * Copyright (C) 2005-2007, International Business Machines Corporation and * + * others. All Rights Reserved. * + * ****************************************************************************** + */ package org.apache.tika.parser.txt; import java.io.ByteArrayInputStream; @@ -28,13 +28,70 @@ import java.io.Reader; */ public class CharsetMatch implements Comparable<CharsetMatch> { - + + /** + * Bit flag indicating the match is based on the the encoding scheme. + * + * @see #getMatchType + * @stable ICU 3.4 + */ + static public final int ENCODING_SCHEME = 1; + /** + * Bit flag indicating the match is based on the presence of a BOM. + * + * @see #getMatchType + * @stable ICU 3.4 + */ + static public final int BOM = 2; + /** + * Bit flag indicating he match is based on the declared encoding. + * + * @see #getMatchType + * @stable ICU 3.4 + */ + static public final int DECLARED_ENCODING = 4; + /** + * Bit flag indicating the match is based on language statistics. + * + * @see #getMatchType + * @stable ICU 3.4 + */ + static public final int LANG_STATISTICS = 8; + // + // Private Data + // + private int fConfidence; + private CharsetRecognizer fRecognizer; + private byte[] fRawInput = null; // Original, untouched input bytes. + // If user gave us a byte array, this is it. + private int fRawLength; // Length of data in fRawInput array. + private InputStream fInputStream = null; // User's input stream, or null if the user + + /* + * Constructor. Implementation internal + */ + CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) { + fRecognizer = rec; + fConfidence = conf; + + // The references to the original aplication input data must be copied out + // of the charset recognizer to here, in case the application resets the + // recognizer before using this CharsetMatch. + if (det.fInputStream == null) { + // We only want the existing input byte data if it came straight from the user, + // not if is just the head of a stream. + fRawInput = det.fRawInput; + fRawLength = det.fRawLength; + } + fInputStream = det.fInputStream; + } + /** * Create a java.io.Reader for reading the Unicode character data corresponding * to the original byte data supplied to the Charset detect operation. * <p/> * CAUTION: if the source of the byte data was an InputStream, a Reader - * can be created for only one matching char set using this method. If more + * can be created for only one matching char set using this method. If more * than one charset needs to be tried, the caller will need to reset * the InputStream and create InputStreamReaders itself, based on the charset name. * @@ -44,11 +101,11 @@ public class CharsetMatch implements Com */ public Reader getReader() { InputStream inputStream = fInputStream; - + if (inputStream == null) { inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength); } - + try { inputStream.reset(); return new InputStreamReader(inputStream, getName()); @@ -65,7 +122,7 @@ public class CharsetMatch implements Com * * @stable ICU 3.4 */ - public String getString() throws java.io.IOException { + public String getString() throws java.io.IOException { return getString(-1); } @@ -90,24 +147,24 @@ public class CharsetMatch implements Com StringBuffer sb = new StringBuffer(); char[] buffer = new char[1024]; Reader reader = getReader(); - int max = maxLength < 0? Integer.MAX_VALUE : maxLength; + int max = maxLength < 0 ? Integer.MAX_VALUE : maxLength; int bytesRead = 0; - + while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) { sb.append(buffer, 0, bytesRead); max -= bytesRead; } - + reader.close(); - + return sb.toString(); } else { - result = new String(fRawInput, getName()); + result = new String(fRawInput, getName()); } return result; } - + /** * Get an indication of the confidence in the charset detected. * Confidence values range from 0-100, with larger numbers indicating @@ -121,42 +178,9 @@ public class CharsetMatch implements Com public int getConfidence() { return fConfidence; } - /** - * Bit flag indicating the match is based on the the encoding scheme. - * - * @see #getMatchType - * @stable ICU 3.4 - */ - static public final int ENCODING_SCHEME = 1; - - /** - * Bit flag indicating the match is based on the presence of a BOM. - * - * @see #getMatchType - * @stable ICU 3.4 - */ - static public final int BOM = 2; - - /** - * Bit flag indicating he match is based on the declared encoding. - * - * @see #getMatchType - * @stable ICU 3.4 - */ - static public final int DECLARED_ENCODING = 4; - - /** - * Bit flag indicating the match is based on language statistics. - * - * @see #getMatchType - * @stable ICU 3.4 - */ - static public final int LANG_STATISTICS = 8; - - /** - * Return flags indicating what it was about the input data + * Return flags indicating what it was about the input data * that caused this charset to be considered as a possible match. * The result is a bitfield containing zero or more of the flags * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS. @@ -176,7 +200,7 @@ public class CharsetMatch implements Com } /** - * Get the name of the detected charset. + * Get the name of the detected charset. * The name will be one that can be used with other APIs on the * platform that accept charset names. It is the "Canonical name" * as defined by the class java.nio.charset.Charset; for @@ -193,9 +217,9 @@ public class CharsetMatch implements Com public String getName() { return fRecognizer.getName(); } - + /** - * Get the ISO code for the language of the detected charset. + * Get the ISO code for the language of the detected charset. * * @return The ISO code for the language or <code>null</code> if the language cannot be determined. * @@ -207,11 +231,11 @@ public class CharsetMatch implements Com /** * Compare to other CharsetMatch objects. - * Comparison is based on the match confidence value, which - * allows CharsetDetector.detectAll() to order its results. + * Comparison is based on the match confidence value, which + * allows CharsetDetector.detectAll() to order its results. * * @param o the CharsetMatch object to compare against. - * @return a negative integer, zero, or a positive integer as the + * @return a negative integer, zero, or a positive integer as the * confidence level of this CharsetMatch * is less than, equal to, or greater than that of * the argument. @@ -249,45 +273,14 @@ public class CharsetMatch implements Com public int hashCode() { return fConfidence; } - - /* - * Constructor. Implementation internal - */ - CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) { - fRecognizer = rec; - fConfidence = conf; - - // The references to the original aplication input data must be copied out - // of the charset recognizer to here, in case the application resets the - // recognizer before using this CharsetMatch. - if (det.fInputStream == null) { - // We only want the existing input byte data if it came straight from the user, - // not if is just the head of a stream. - fRawInput = det.fRawInput; - fRawLength = det.fRawLength; - } - fInputStream = det.fInputStream; - } - - - // - // Private Data - // - private int fConfidence; - private CharsetRecognizer fRecognizer; - private byte[] fRawInput = null; // Original, untouched input bytes. - // If user gave us a byte array, this is it. - private int fRawLength; // Length of data in fRawInput array. - - private InputStream fInputStream = null; // User's input stream, or null if the user - // gave us a byte array. + // gave us a byte array. public String toString() { - String s = "Match of " + fRecognizer.getName(); - if(fRecognizer.getLanguage() != null) { - s += " in " + fRecognizer.getLanguage(); - } - s += " with confidence " + fConfidence; - return s; + String s = "Match of " + fRecognizer.getName(); + if (fRecognizer.getLanguage() != null) { + s += " in " + fRecognizer.getLanguage(); + } + s += " with confidence " + fConfidence; + return s; } } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java Fri May 29 14:36:21 2015 @@ -7,98 +7,95 @@ package org.apache.tika.parser.txt; /** - * class CharsetRecog_2022 part of the ICU charset detection imlementation. - * This is a superclass for the individual detectors for - * each of the detectable members of the ISO 2022 family - * of encodings. - * - * The separate classes are nested within this class. - * + * class CharsetRecog_2022 part of the ICU charset detection imlementation. + * This is a superclass for the individual detectors for + * each of the detectable members of the ISO 2022 family + * of encodings. + * <p/> + * The separate classes are nested within this class. + * * @internal */ abstract class CharsetRecog_2022 extends CharsetRecognizer { - + /** * Matching function shared among the 2022 detectors JP, CN and KR * Counts up the number of legal an unrecognized escape sequences in * the sample of text, and computes a score based on the total number & * the proportion that fit the encoding. - * - * - * @param text the byte buffer containing text to analyse - * @param textLen the size of the text in the byte. + * + * @param text the byte buffer containing text to analyse + * @param textLen the size of the text in the byte. * @param escapeSequences the byte escape sequences to test for. * @return match quality, in the range of 0-100. */ - int match(byte [] text, int textLen, byte [][] escapeSequences) { - int i, j; - int escN; - int hits = 0; - int misses = 0; - int shifts = 0; - int quality; + int match(byte[] text, int textLen, byte[][] escapeSequences) { + int i, j; + int escN; + int hits = 0; + int misses = 0; + int shifts = 0; + int quality; scanInput: - for (i=0; i<textLen; i++) { - if (text[i] == 0x1b) { - checkEscapes: - for (escN=0; escN<escapeSequences.length; escN++) { - byte [] seq = escapeSequences[escN]; - - if ((textLen - i) < seq.length) { - continue checkEscapes; - } - - for (j=1; j<seq.length; j++) { - if (seq[j] != text[i+j]) { - continue checkEscapes; - } - } - - hits++; - i += seq.length-1; - continue scanInput; + for (i = 0; i < textLen; i++) { + if (text[i] == 0x1b) { + checkEscapes: + for (escN = 0; escN < escapeSequences.length; escN++) { + byte[] seq = escapeSequences[escN]; + + if ((textLen - i) < seq.length) { + continue checkEscapes; + } + + for (j = 1; j < seq.length; j++) { + if (seq[j] != text[i + j]) { + continue checkEscapes; } - - misses++; - } - - if (text[i] == 0x0e || text[i] == 0x0f) { - // Shift in/out - shifts++; + } + + hits++; + i += seq.length - 1; + continue scanInput; } + + misses++; } - + + if (text[i] == 0x0e || text[i] == 0x0f) { + // Shift in/out + shifts++; + } + } + if (hits == 0) { return 0; } - + // // Initial quality is based on relative proportion of recongized vs. // unrecognized escape sequences. // All good: quality = 100; // half or less good: quality = 0; // linear inbetween. - quality = (100*hits - 100*misses) / (hits + misses); - + quality = (100 * hits - 100 * misses) / (hits + misses); + // Back off quality if there were too few escape sequences seen. // Include shifts in this computation, so that KR does not get penalized // for having only a single Escape sequence, but many shifts. - if (hits+shifts < 5) { - quality -= (5-(hits+shifts))*10; + if (hits + shifts < 5) { + quality -= (5 - (hits + shifts)) * 10; } - + if (quality < 0) { quality = 0; - } + } return quality; } - - - + static class CharsetRecog_2022JP extends CharsetRecog_2022 { - private byte [] [] escapeSequences = { + private byte[][] escapeSequences = { {0x1b, 0x24, 0x28, 0x43}, // KS X 1001:1992 {0x1b, 0x24, 0x28, 0x44}, // JIS X 212-1990 {0x1b, 0x24, 0x40}, // JIS C 6226-1978 @@ -111,34 +108,34 @@ abstract class CharsetRecog_2022 extends {0x1b, 0x28, 0x4a}, // JIS-Roman {0x1b, 0x2e, 0x41}, // ISO 8859-1 {0x1b, 0x2e, 0x46} // ISO 8859-7 - }; - + }; + String getName() { return "ISO-2022-JP"; } - - int match(CharsetDetector det) { + + int match(CharsetDetector det) { return match(det.fInputBytes, det.fInputLen, escapeSequences); } } static class CharsetRecog_2022KR extends CharsetRecog_2022 { - private byte [] [] escapeSequences = { - {0x1b, 0x24, 0x29, 0x43} - }; - + private byte[][] escapeSequences = { + {0x1b, 0x24, 0x29, 0x43} + }; + String getName() { return "ISO-2022-KR"; } - - int match(CharsetDetector det) { + + int match(CharsetDetector det) { return match(det.fInputBytes, det.fInputLen, escapeSequences); } - + } static class CharsetRecog_2022CN extends CharsetRecog_2022 { - private byte [] [] escapeSequences = { + private byte[][] escapeSequences = { {0x1b, 0x24, 0x29, 0x41}, // GB 2312-80 {0x1b, 0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1 {0x1b, 0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2 @@ -151,16 +148,16 @@ abstract class CharsetRecog_2022 extends {0x1b, 0x4e}, // SS2 {0x1b, 0x4f}, // SS3 }; - + String getName() { return "ISO-2022-CN"; } - - - int match(CharsetDetector det) { + + + int match(CharsetDetector det) { return match(det.fInputBytes, det.fInputLen, escapeSequences); } } - - } + +} Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java Fri May 29 14:36:21 2015 @@ -1,9 +1,9 @@ /** -******************************************************************************* -* Copyright (C) 2005 - 2007, International Business Machines Corporation and * -* others. All Rights Reserved. * -******************************************************************************* -*/ + * ****************************************************************************** + * Copyright (C) 2005 - 2007, International Business Machines Corporation and * + * others. All Rights Reserved. * + * ****************************************************************************** + */ package org.apache.tika.parser.txt; /** @@ -21,29 +21,29 @@ class CharsetRecog_UTF8 extends CharsetR * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector) */ int match(CharsetDetector det) { - boolean hasBOM = false; - int numValid = 0; - int numInvalid = 0; - byte input[] = det.fRawInput; - int i; - int trailBytes = 0; - int confidence; - - if (det.fRawLength >= 3 && + boolean hasBOM = false; + int numValid = 0; + int numInvalid = 0; + byte input[] = det.fRawInput; + int i; + int trailBytes = 0; + int confidence; + + if (det.fRawLength >= 3 && (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) { hasBOM = true; } - + // Scan for multi-byte sequences - for (i=0; i<det.fRawLength; i++) { + for (i = 0; i < det.fRawLength; i++) { int b = input[i]; if ((b & 0x80) == 0) { continue; // ASCII } - + // Hi bit on char found. Figure out how long the sequence should be if ((b & 0x0e0) == 0x0c0) { - trailBytes = 1; + trailBytes = 1; } else if ((b & 0x0f0) == 0x0e0) { trailBytes = 2; } else if ((b & 0x0f8) == 0xf0) { @@ -55,11 +55,11 @@ class CharsetRecog_UTF8 extends CharsetR } trailBytes = 0; } - + // Verify that we've got the right number of trail bytes in the sequence - for (;;) { + for (; ; ) { i++; - if (i>=det.fRawLength) { + if (i >= det.fRawLength) { break; } b = input[i]; @@ -72,24 +72,24 @@ class CharsetRecog_UTF8 extends CharsetR break; } } - + } - + // Cook up some sort of confidence score, based on presense of a BOM // and the existence of valid and/or invalid multi-byte sequences. confidence = 0; - if (hasBOM && numInvalid==0) { + if (hasBOM && numInvalid == 0) { confidence = 100; - } else if (hasBOM && numValid > numInvalid*10) { + } else if (hasBOM && numValid > numInvalid * 10) { confidence = 80; } else if (numValid > 3 && numInvalid == 0) { - confidence = 100; + confidence = 100; } else if (numValid > 0 && numInvalid == 0) { confidence = 80; } else if (numValid == 0 && numInvalid == 0) { // Plain ASCII. - confidence = 10; - } else if (numValid > numInvalid*10) { + confidence = 10; + } else if (numValid > numInvalid * 10) { // Probably corruput utf-8 data. Valid sequences aren't likely by chance. confidence = 25; } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java Fri May 29 14:36:21 2015 @@ -10,7 +10,7 @@ package org.apache.tika.parser.txt; /** * This class matches UTF-16 and UTF-32, both big- and little-endian. The * BOM will be used if it is present. - * + * * @internal */ abstract class CharsetRecog_Unicode extends CharsetRecognizer { @@ -24,130 +24,115 @@ abstract class CharsetRecog_Unicode exte * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector) */ abstract int match(CharsetDetector det); - - static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode - { - String getName() - { + + static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode { + String getName() { return "UTF-16BE"; } - - int match(CharsetDetector det) - { + + int match(CharsetDetector det) { byte[] input = det.fRawInput; - - if (input.length>=2 && ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF)) { + + if (input.length >= 2 && ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF)) { return 100; } - + // TODO: Do some statistics to check for unsigned UTF-16BE return 0; } } - - static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode - { - String getName() - { + + static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode { + String getName() { return "UTF-16LE"; } - - int match(CharsetDetector det) - { + + int match(CharsetDetector det) { byte[] input = det.fRawInput; - - if (input.length >= 2 && ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE)) - { - // An LE BOM is present. - if (input.length>=4 && input[2] == 0x00 && input[3] == 0x00) { - // It is probably UTF-32 LE, not UTF-16 - return 0; - } - return 100; - } - + + if (input.length >= 2 && ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE)) { + // An LE BOM is present. + if (input.length >= 4 && input[2] == 0x00 && input[3] == 0x00) { + // It is probably UTF-32 LE, not UTF-16 + return 0; + } + return 100; + } + // TODO: Do some statistics to check for unsigned UTF-16LE return 0; } } - - static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode - { + + static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode { abstract int getChar(byte[] input, int index); - + abstract String getName(); - - int match(CharsetDetector det) - { - byte[] input = det.fRawInput; - int limit = (det.fRawLength / 4) * 4; - int numValid = 0; + + int match(CharsetDetector det) { + byte[] input = det.fRawInput; + int limit = (det.fRawLength / 4) * 4; + int numValid = 0; int numInvalid = 0; boolean hasBOM = false; int confidence = 0; - - if (limit==0) { + + if (limit == 0) { return 0; } if (getChar(input, 0) == 0x0000FEFF) { hasBOM = true; } - - for(int i = 0; i < limit; i += 4) { + + for (int i = 0; i < limit; i += 4) { int ch = getChar(input, i); - + if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) { numInvalid += 1; } else { numValid += 1; } } - - + + // Cook up some sort of confidence score, based on presence of a BOM // and the existence of valid and/or invalid multi-byte sequences. - if (hasBOM && numInvalid==0) { + if (hasBOM && numInvalid == 0) { confidence = 100; - } else if (hasBOM && numValid > numInvalid*10) { + } else if (hasBOM && numValid > numInvalid * 10) { confidence = 80; } else if (numValid > 3 && numInvalid == 0) { - confidence = 100; + confidence = 100; } else if (numValid > 0 && numInvalid == 0) { confidence = 80; - } else if (numValid > numInvalid*10) { + } else if (numValid > numInvalid * 10) { // Probably corrupt UTF-32BE data. Valid sequences aren't likely by chance. confidence = 25; } - + return confidence; } } - - static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32 - { - int getChar(byte[] input, int index) - { + + static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32 { + int getChar(byte[] input, int index) { return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 | - (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF); + (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF); } - - String getName() - { + + String getName() { return "UTF-32BE"; } } - - static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32 - { - int getChar(byte[] input, int index) - { + + static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32 { + int getChar(byte[] input, int index) { return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 | - (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF); + (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF); } - - String getName() - { + + String getName() { return "UTF-32LE"; } }
