Repository: any23
Updated Branches:
  refs/heads/master 6f1266a9a -> 0aa3d54c4
ANY23-411 fix encoding detector

Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/0aa3d54c
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/0aa3d54c
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/0aa3d54c

Branch: refs/heads/master
Commit: 0aa3d54c41aa90d6dce5aa790f6f490c82e7c7f3
Parents: 6f1266a
Author: Hans <[email protected]>
Authored: Thu Oct 25 16:19:09 2018 -0500
Committer: Hans <[email protected]>
Committed: Thu Oct 25 16:19:09 2018 -0500

----------------------------------------------------------------------
 .../apache/any23/encoding/EncodingDetector.java | 13 ++++++++
 .../extractor/SingleDocumentExtraction.java     |  2 +-
 .../any23/encoding/TikaEncodingDetector.java    | 32 +++++++++++++++++++-
 3 files changed, 45 insertions(+), 2 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/any23/blob/0aa3d54c/api/src/main/java/org/apache/any23/encoding/EncodingDetector.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/org/apache/any23/encoding/EncodingDetector.java b/api/src/main/java/org/apache/any23/encoding/EncodingDetector.java
index 9e4cf2b..b9de1ba 100644
--- a/api/src/main/java/org/apache/any23/encoding/EncodingDetector.java
+++ b/api/src/main/java/org/apache/any23/encoding/EncodingDetector.java
@@ -37,4 +37,17 @@ public interface EncodingDetector {
      */
     String guessEncoding(InputStream input) throws IOException;
 
+    /**
+     * Guesses the data encoding.
+     *
+     * @param input the input stream containing the data.
+     * @param contentType the declared content type of the data.
+     * @return a string compliant to
+     * <a href="http://www.iana.org/assignments/character-sets">IANA Charset Specification</a>.
+     * @throws IOException if there is an error whilst guessing the encoding.
+ */ + default String guessEncoding(InputStream input, String contentType) throws IOException { + return guessEncoding(input); + } + } http://git-wip-us.apache.org/repos/asf/any23/blob/0aa3d54c/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java index 77ed28c..e84ab61 100644 --- a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java +++ b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java @@ -572,7 +572,7 @@ public class SingleDocumentExtraction { try { ensureHasLocalCopy(); InputStream is = new BufferedInputStream(localDocumentSource.openInputStream()); - String encoding = this.encoderDetector.guessEncoding(is); + String encoding = this.encoderDetector.guessEncoding(is, localDocumentSource.getContentType()); is.close(); return encoding; } catch (Exception e) { http://git-wip-us.apache.org/repos/asf/any23/blob/0aa3d54c/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java ---------------------------------------------------------------------- diff --git a/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java b/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java index 066de33..10cc34b 100644 --- a/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java +++ b/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java @@ -18,6 +18,7 @@ package org.apache.any23.encoding; import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; import org.apache.tika.parser.html.HtmlEncodingDetector; import org.apache.tika.parser.txt.CharsetDetector; import org.apache.tika.parser.txt.CharsetMatch; @@ -38,11 +39,18 @@ import java.util.regex.Pattern; * * @author 
Michele Mostarda ( [email protected] ) * @author Davide Palmisano ( [email protected] ) + * @author Hans Brende ([email protected]) * @version $Id$ */ public class TikaEncodingDetector implements EncodingDetector { - public String guessEncoding(InputStream is) throws IOException { + @Override + public String guessEncoding(InputStream input) throws IOException { + return guessEncoding(input, null); + } + + @Override + public String guessEncoding(InputStream is, String contentType) throws IOException { if (!is.markSupported()) { is = new BufferedInputStream(is); } @@ -54,6 +62,22 @@ public class TikaEncodingDetector implements EncodingDetector { Charset htmlCharset = htmlEncodingDetector.detect(is, new Metadata()); CharsetDetector charsetDetector = new CharsetDetector(65536); + + String incomingCharset = null; + if (contentType != null) { + MediaType mt = MediaType.parse(contentType); + if (mt != null) { + incomingCharset = mt.getParameters().get("charset"); + } + } + + if (incomingCharset != null) { + incomingCharset = CharsetUtils.clean(incomingCharset); + if (incomingCharset != null) { + charsetDetector.setDeclaredEncoding(incomingCharset); + } + } + //enableInputFilter() needs to precede setText() to have any effect charsetDetector.enableInputFilter(true); charsetDetector.setText(is); @@ -64,9 +88,15 @@ public class TikaEncodingDetector implements EncodingDetector { try { Charset charset = CharsetUtils.forName(match.getName()); int confidence = match.getConfidence(); + if (StandardCharsets.UTF_8.equals(charset)) { + confidence *= 4; + } if (charset.equals(htmlCharset) || charset.equals(xmlCharset)) { confidence *= 16; } + if (charset.name().equals(incomingCharset)) { + confidence *= 16; + } if (confidence > bestConfidence) { bestCharset = charset; bestConfidence = confidence;
