This is an automated email from the ASF dual-hosted git repository.

lfcnassif pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 768526160 TIKA-3774: fix ignoreCharsets param of Icu4jEncodingDetector
     new d5b66db06 Merge branch 'main' of https://github.com/apache/tika into main
768526160 is described below

commit 768526160b3d12fc4df4671e093e101ccc44eb22
Author: Luis Nassif <[email protected]>
AuthorDate: Mon May 23 23:17:18 2022 -0300

    TIKA-3774: fix ignoreCharsets param of Icu4jEncodingDetector
---
 .../apache/tika/parser/txt/Icu4jEncodingDetector.java    |   2 +-
 .../org/apache/tika/parser/txt/CharsetDetectorTest.java  |  10 ++++++++--
 .../test-configs/tika-config-ignore-charset.xml          |   1 +
 .../resources/test-documents/test_ignore_IBM420.html     | Bin 0 -> 1869 bytes
 4 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
index ce9ee9fa4..f89b27c12 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
@@ -75,7 +75,7 @@ public class Icu4jEncodingDetector implements EncodingDetector {
             try {
                 String n = match.getNormalizedName();
                 if (ignoreCharsets.contains(n)) {
-                    return null;
+                    continue;
                 }
                 return CharsetUtils.forName(match.getNormalizedName());
             } catch (IllegalArgumentException e) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
index d41e3498f..3e6594cb0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
@@ -136,10 +136,16 @@ public class CharsetDetectorTest extends TikaTest {
         TikaConfig tikaConfig = new TikaConfig(
                 getResourceAsStream("/test-configs/tika-config-ignore-charset.xml"));
 
-        Metadata m = new Metadata();
+        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
 
+        Metadata m = new Metadata();
         m.set(TikaCoreProperties.RESOURCE_NAME_KEY, "texty-text.txt");
         assertContains("ACTIVE AGE", getXML("testIgnoreCharset.txt",
-                new AutoDetectParser(tikaConfig), m).xml);
+                parser, m).xml);
+
+        m = new Metadata();
+        m.set(TikaCoreProperties.RESOURCE_NAME_KEY, "texty-text.txt");
+        assertContains("Please check your email", getXML("test_ignore_IBM420.html",
+                parser, m).xml);
     }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-ignore-charset.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-ignore-charset.xml
index 0b61f20c9..2ca84a940 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-ignore-charset.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-ignore-charset.xml
@@ -23,6 +23,7 @@
         <encodingDetector class="org.apache.tika.parser.txt.Icu4jEncodingDetector">
             <params>
                 <param name="ignoreCharsets" type="list">
+                    <string>IBM420</string>
                     <string>IBM424</string>
                 </param>
             </params>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/test_ignore_IBM420.html b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/test_ignore_IBM420.html
new file mode 100644
index 000000000..2aecab221
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/test_ignore_IBM420.html differ

Reply via email to