Repository: any23
Updated Branches:
  refs/heads/master e0fed529a -> 22b3047d5


ANY23-385 improve encoding detection


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/22b3047d
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/22b3047d
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/22b3047d

Branch: refs/heads/master
Commit: 22b3047d55f5e5b8fcba9c912424c9ed45313163
Parents: e0fed52
Author: Hans <[email protected]>
Authored: Sun Aug 5 18:39:01 2018 -0500
Committer: Hans <[email protected]>
Committed: Sun Aug 5 18:39:01 2018 -0500

----------------------------------------------------------------------
 .../any23/encoding/TikaEncodingDetector.java    | 88 +++++++++++++++++++-
 .../encoding/TikaEncodingDetectorTest.java      | 17 ++++
 2 files changed, 101 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/22b3047d/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java
----------------------------------------------------------------------
diff --git a/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java b/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java
index cdd19d8..066de33 100644
--- a/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java
+++ b/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java
@@ -17,12 +17,20 @@
 
 package org.apache.any23.encoding;
 
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.html.HtmlEncodingDetector;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
+import org.apache.tika.utils.CharsetUtils;
 
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 /**
  * An implementation of {@link EncodingDetector} based on
@@ -35,11 +43,83 @@ import java.io.InputStream;
 public class TikaEncodingDetector implements EncodingDetector {
 
     public String guessEncoding(InputStream is) throws IOException {
-        CharsetDetector charsetDetector = new CharsetDetector();
-        charsetDetector.setText( is instanceof BufferedInputStream ? is : new BufferedInputStream(is) );
+        if (!is.markSupported()) {
+            is = new BufferedInputStream(is);
+        }
+
+        Charset xmlCharset = detectXmlEncoding(is, 1024);
+
+        HtmlEncodingDetector htmlEncodingDetector = new HtmlEncodingDetector();
+        htmlEncodingDetector.setMarkLimit(16384);
+        Charset htmlCharset = htmlEncodingDetector.detect(is, new Metadata());
+
+        CharsetDetector charsetDetector = new CharsetDetector(65536);
+        //enableInputFilter() needs to precede setText() to have any effect
         charsetDetector.enableInputFilter(true);
-        CharsetMatch cm = charsetDetector.detect();
-        return cm.getName();
+        charsetDetector.setText(is);
+
+        Charset bestCharset = null;
+        int bestConfidence = 0;
+        for (CharsetMatch match : charsetDetector.detectAll()) {
+            try {
+                Charset charset = CharsetUtils.forName(match.getName());
+                int confidence = match.getConfidence();
+                if (charset.equals(htmlCharset) || charset.equals(xmlCharset)) {
+                    confidence *= 16;
+                }
+                if (confidence > bestConfidence) {
+                    bestCharset = charset;
+                    bestConfidence = confidence;
+                }
+            } catch (Exception e) {
+                    //ignore
+            }
+        }
+
+        if (bestConfidence >= 100)
+            return bestCharset.name();
+        if (htmlCharset != null)
+            return htmlCharset.name();
+        if (xmlCharset != null)
+            return xmlCharset.name();
+        if (bestCharset != null)
+            return bestCharset.name();
+        return null;
+    }
+
+    private static final Pattern xmlEncoding = Pattern.compile(
+            "(?is)\\A\\s*<\\?\\s*xml\\s+[^<>]*encoding\\s*=\\s*(?:['\"]\\s*)?([-_:.a-z0-9]+)");
+
+    static Charset detectXmlEncoding(InputStream input, int markLimit) throws IOException {
+        if (input == null) {
+            return null;
+        }
+        input.mark(markLimit);
+        byte[] buffer = new byte[markLimit];
+        int n = 0;
+        int m = input.read(buffer);
+        while (m != -1 && n < buffer.length) {
+            n += m;
+            m = input.read(buffer, n, buffer.length - n);
+        }
+        input.reset();
+
+        // Interpret the head as ASCII and try to spot an XML declaration
+        // with a possible character encoding hint
+
+        String head = StandardCharsets.US_ASCII.decode(ByteBuffer.wrap(buffer, 0, n)).toString();
+
+        Matcher matcher = xmlEncoding.matcher(head);
+
+        if (matcher.find()) {
+            try {
+                return CharsetUtils.forName(matcher.group(1));
+            } catch (Exception e) {
+                return null;
+            }
+        } else {
+            return null;
+        }
     }
 
 }

http://git-wip-us.apache.org/repos/asf/any23/blob/22b3047d/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java
----------------------------------------------------------------------
diff --git a/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java b/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java
index 8ebd2ee..9467e8b 100644
--- a/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java
+++ b/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java
@@ -22,8 +22,11 @@ import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 
 /**
  * Test case for {@link TikaEncodingDetector}.
@@ -76,6 +79,20 @@ public class TikaEncodingDetectorTest {
          assertEncoding( "UTF-8", "/html/encoding-test.html" );
     }
 
+    @Test
+    public void testXMLEncodingPattern() throws IOException {
+        String[] strings = {
+                "<?xml encoding=\"UTF-8\"?>",
+                " \n<?xMl encoding   = 'utf-8'?>",
+                "\n <?Xml enCoding=Utf8?>"
+        };
+        for (String s : strings) {
+            byte[] bytes = s.getBytes(StandardCharsets.US_ASCII);
+            Charset detected = TikaEncodingDetector.detectXmlEncoding(new ByteArrayInputStream(bytes), 256);
+            Assert.assertEquals(detected, StandardCharsets.UTF_8);
+        }
+    }
+
     private void assertEncoding(final String expected, final String resource) throws IOException {
         try (InputStream fis = getClass().getResourceAsStream(resource)) {
             String encoding = detector.guessEncoding(fis);

Reply via email to