[
https://issues.apache.org/jira/browse/TIKA-2484?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Tim Allison updated TIKA-2484:
------------------------------
Attachment: charset.zip
File from [~AndreasMeier]
> Improve CharsetDetector to recognize UTF-16LE/BE,UTF-32LE/BE and UTF-7
> with/without BOMs correctly
> --------------------------------------------------------------------------------------------------
>
> Key: TIKA-2484
> URL: https://issues.apache.org/jira/browse/TIKA-2484
> Project: Tika
> Issue Type: Improvement
> Components: parser
> Affects Versions: 1.16, 1.17
> Reporter: Andreas Meier
> Priority: Minor
> Attachments: IUC10-ar.UTF-16BE.with-BOM,
> IUC10-ar.UTF-16BE.without-BOM, IUC10-ar.UTF-16LE.with-BOM,
> IUC10-ar.UTF-16LE.without-BOM, IUC10-ar.UTF-32BE.with-BOM,
> IUC10-ar.UTF-32BE.without-BOM, IUC10-ar.UTF-32LE.with-BOM,
> IUC10-ar.UTF-32LE.without-BOM, IUC10-ar.UTF-7.with-BOM,
> IUC10-ar.UTF-7.without-BOM, IUC10-fr.UTF-16BE.with-BOM,
> IUC10-fr.UTF-16BE.without-BOM, IUC10-fr.UTF-16LE.with-BOM,
> IUC10-fr.UTF-16LE.without-BOM, IUC10-fr.UTF-32BE.with-BOM,
> IUC10-fr.UTF-32BE.without-BOM, IUC10-fr.UTF-32LE.with-BOM,
> IUC10-fr.UTF-32LE.without-BOM, IUC10-fr.UTF-7.with-BOM,
> IUC10-fr.UTF-7.without-BOM, charset.zip
>
>
> I would like to help to improve the recognition accuracy of the
> CharsetDetector.
> Therefore I created a testset of plain/text-files to check the quality of
> org.apache.tika.parser.txt.CharsetDetector: charset.tar.gz
> (Testset created out of
> http://source.icu-project.org/repos/icu/icu4j/tags/release-4-8/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml)
> The Testset was processed using TIKA1.17 (ID: 877d621, HEAD from 26.10.2017)
> and ICU4J 59.1 CharsetDetector with custom UTF-7 improvements. Here are the
> results:
> {noformat}
> TIKA-1.17
> charset.tar.gz
> Correct recognitions: 165/341
> {noformat}
> {noformat}
> TIKA-1.17+ UTF-7 recognizer:
> charset.tar.gz
> Correct recognitions: 213/341
> {noformat}
> {noformat}
> ICU4j 59.1 + UTF-7 recognizer:
> charset.tar.gz
> Correct recognitions: 333/341
> {noformat}
> As UTF-7 recognizer I used these two simple classes:
> {code:java}
> package test.utils;
> import java.util.Arrays;
> /**
> * Pattern state container for the Boyer-Moore algorithm
> */
> public final class BoyerMoorePattern
> {
> private final byte[] pattern;
> private final int[] skipArray;
> public BoyerMoorePattern(byte[] pattern)
> {
> this.pattern = pattern;
> skipArray = new int[256];
> Arrays.fill(skipArray, -1);
> // Initialize with pattern values
> for (int i = 0; i < pattern.length; i++)
> {
> skipArray[pattern[i] & 0xFF] = i;
> }
> }
> /**
> * Get the pattern length
> *
> * @return length as int
> */
> public int getLength()
> {
> return pattern.length;
> }
> /**
> * Searches for the first occurrence of the pattern in the input byte
> array.
> *
> * @param data - The data we want to search in
> * @param startIdx - The startindex
> * @param endIdx - The endindex
> * @return offset as int or -1 if not found at all
> */
> public final int searchPattern(byte[] data, int startIdx, int endIdx)
> {
> int patternLength = pattern.length;
> int skip = 0;
> for (int i = startIdx; i <= endIdx - patternLength; i += skip)
> {
> skip = 0;
> for (int j = patternLength - 1; j >= 0; j--)
> {
> if (pattern[j] != data[i + j])
> {
> skip = Math.max(1, j - skipArray[data[i + j] & 0xFF]);
> break;
> }
> }
> if (skip == 0)
> {
> return i;
> }
> }
> return -1;
> }
> /**
> * Searches for the first occurrence of the pattern in the input byte
> array.
> *
> * @param data - The data we want to search in
> * @param startIdx - The startindex
> * @return offset as int
> */
> public final int searchPattern(byte[] data, int startIdx)
> {
> return searchPattern(data, startIdx, data.length);
> }
> /**
> * Searches for the first occurrence of the pattern in the input byte
> array.
> *
> * @param data - The data we want to search in
> * @return offset as int or -1 if not found at all
> */
> public final int searchPattern(byte[] data)
> {
> return searchPattern(data, 0, data.length);
> }
> }
> {code}
> {code:java}
> package test;
> import java.io.IOException;
> import java.io.InputStream;
> import java.nio.charset.Charset;
> import java.util.logging.Logger;
> import java.util.regex.Matcher;
> import java.util.regex.Pattern;
> import org.apache.commons.io.IOUtils;
> import org.apache.tika.detect.EncodingDetector;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.parser.txt.CharsetDetector;
> import org.apache.tika.parser.txt.CharsetMatch;
> import test.utils.BoyerMoorePattern;
> public class MyEncodingDetector implements EncodingDetector {
>
> public Charset detect(InputStream input, Metadata metadata)
> throws IOException {
>
> CharsetDetector detector;
> CharsetMatch match;
> detector = new CharsetDetector();
> detector.setText(input);
> match = detector.detect();
> match = detector.detect();
>
> String charsetName = isItUtf7(match,
> IOUtils.toByteArray(input)); // determines whether the match is UTF-7 or not
>
> if (charsetName != null) {
> return Charset.forName(charsetName);
> }
> return null;
> }
> /**
> * Checks for BOM and determines whether it is UTF-7 or not.
> *
> * @param match - The default match we expect, if it is not UTF-7
> * @param data - The bytearray we want to check
> *
> * @return match
> */
> private String isItUtf7(CharsetMatch match, byte[] data)
> {
> if (isUTF7withBOM(data) || isUTF7withoutBOM(data)) {
> return "UTF-7";
> } else {
> if (match != null) {
> return match.getName();
> }
> return null;
> }
> }
>
> private boolean isUTF7withBOM(byte[] data) {
> if ((data.length > 4 && data[0] == 43 && data[1] == 47 && data[2] ==
> 118)
> && (data[3] == 56 || data[3] == 57 || data[3] == 43 ||
> data[3] == 47))
> {
> // Checking byte-array for "byte order marks" (BOM):
> // 43 47 118 56
> // 43 47 118 57
> // 43 47 118 43
> // 43 47 118 47
> return true;
> }
> return false;
> }
>
> private boolean isUTF7withoutBOM(byte[] data) {
> byte[] utf7StartPattern = "+".getBytes();
> byte[] utf7EndPattern = "-".getBytes();
> BoyerMoorePattern bmpattern = new
> BoyerMoorePattern(utf7StartPattern); // create a new pattern with the bytes
> int startPosSP = bmpattern.searchPattern(data);
>
> BoyerMoorePattern empattern = new BoyerMoorePattern(utf7EndPattern);
> // create a new pattern with the bytes
> int startPosEP = empattern.searchPattern(data);
>
> if (startPosSP != -1 && startPosEP != -1) {
> // the pattern was found, so we can create a regular expression
> for the basic pattern now
> Pattern p = Pattern.compile("\\+[a-zA-Z]\\w{2,}\\-"); // a
> word with length of at least 3 characters or more
> Matcher m = p.matcher(new String(data));
>
> int numberMatches = 0;
> while (m.find()) {
> numberMatches++;
> }
>
> System.out.println("Number of possible UTF-7 regex matches: " +
> numberMatches);
> int minimumMatches = 3;
>
> if (numberMatches > minimumMatches) { // if there are more
> than minimumMatches "+xxx-" words the expected encoding shall be UTF-7
> return true;
> }
> }
>
> return false;
> }
> }
> {code}
> There might be some false positive (FP) recognitions with the current regex
> and the number of matches.
> A better approach might be to set the minimumMatches in dependence of the
> amount of text given to the detector.
> This is just a simple first try, nothing for production use. It does not even
> cover all possible UTF-7 strings.
> By the way:
> I am perfectly aware of the fact that the current testset does only cover a
> few encodings. However, the specified files address the main weakness of the
> current CharsetDetector.
> I don't know the history that led to the creation of the CharsetDetector in
> TIKA and why ICU4J was rebuilt with extensions like the cp866 ngram
> detection, instead of participating in icu4j development.
> Wouldn't it be better to forward the changes of the CharsetDetector to the
> ICU4J developers so they can implement missing encodings?
> Is it planned to include the newest version of ICU4J in future releases of
> TIKA?
> What about neural networks to determine some or all charsets? (given that
> there are enough testfiles)
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)