[
https://issues.apache.org/jira/browse/TIKA-4685?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18063685#comment-18063685
]
Hudson commented on TIKA-4685:
------------------------------
FAILURE: Integrated in Jenkins build Tika » tika-main-jdk17 #1240 (See
[https://ci-builds.apache.org/job/Tika/job/tika-main-jdk17/1240/])
TIKA-4685 chardet (#2677) (github:
[https://github.com/apache/tika/commit/9627b999583cbd3bbcc8e2ce0ecba4532c9e81be])
* (add) tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java
* (add)
tika-encoding-detectors/tika-encoding-detector-icu4j/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
* (add)
tika-ml/tika-ml-core/src/main/java/org/apache/tika/langdetect/charsoup/TextFeatureExtractor.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetDetectionResult.java
* (add) tika-encoding-detectors/tika-encoding-detector-icu4j/pom.xml
* (edit)
tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
* (add)
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
* (add)
tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/test_ignore_IBM420.html
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java
* (edit)
tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java
* (add) tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/BOMDetectorTest.java
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/LinearModelTest.java
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/BOMDetector.java
* (add)
tika-ml/tika-ml-core/src/main/java/org/apache/tika/langdetect/charsoup/WordTokenizer.java
* (edit)
tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
* (add) tika-encoding-detectors/pom.xml
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-parameterize-encoding-detector.json
* (add)
tika-encoding-detectors/tika-encoding-detector-icu4j/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
* (delete)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/TextQualityDiagTest.java
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
* (edit)
tika-core/src/main/java/org/apache/tika/detect/OverrideEncodingDetector.java
* (add) tika-encoding-detectors/tika-encoding-detector-mojibuster/pom.xml
* (add)
tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
* (add)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/SjisLangSignalTest.java
* (add)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkEncodingRules.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
* (edit)
tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java
* (delete)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java
* (add)
tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/testIgnoreCharset.txt
* (add) tika-ml/tika-ml-chardetect/src/test/python/anneal.py
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
* (add)
tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/multi-language.txt
* (add)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin
* (add)
tika-encoding-detectors/tika-encoding-detector-icu4j/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
* (add) tika-ml/tika-ml-core/src/main/java/org/apache/tika/ml/LinearModel.java
* (add)
tika-encoding-detectors/tika-encoding-detector-icu4j/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/python/download_madlad.py
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
* (add)
tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/CharsetConfusablesTest.java
* (add)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CharsetConfusables.java
* (add)
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/DiagnoseCharsetDetector.java
* (add)
tika-encoding-detectors/tika-encoding-detector-icu4j/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
* (add) tika-encoding-detectors/tika-encoding-detector-charsoup/pom.xml
* (edit) .mvn/maven.config
* (add) tika-encoding-detectors/tika-encoding-detector-universal/pom.xml
* (add)
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/ReplacementCharset.java
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
* (edit) pom.xml
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
* (add)
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BuildCharsetTrainingData.java
* (add) tika-ml/tika-ml-chardetect/pom.xml
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/pom.xml
* (add)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
* (add)
tika-encoding-detectors/tika-encoding-detector-icu4j/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
* (add)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
* (add) tika-ml/pom.xml
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
* (add)
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BenchmarkCharsetDetectors.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json
* (add)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/EbcdicRoutingTest.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
* (add)
tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
* (add)
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java
* (add)
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
* (add) tika-ml/tika-ml-chardetect/README.md
* (add)
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/XUserDefinedCharset.java
* (add)
tika-ml/tika-ml-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect.bin
* (add)
tika-encoding-detectors/tika-encoding-detector-icu4j/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-exclude-encoding-detector-default.json
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/MetaProcessor.java
* (add)
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java
* (add)
tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractorTest.java
* (edit) docs/pom.xml
* (edit) docs/modules/ROOT/nav.adoc
* (add)
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/MetaProcessor.java
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
* (add)
tika-ml/tika-ml-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
* (edit) tika-langdetect/tika-langdetect-charsoup/pom.xml
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java
* (add)
tika-ml/tika-ml-core/src/main/java/org/apache/tika/ml/FeatureExtractor.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java
* (add)
tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/resume.html
* (delete)
tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
* (add) tika-core/src/test/java/org/apache/tika/detect/BOMDetectorTest.java
* (add)
tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/testTXT_win-1252.txt
* (edit) tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
* (add)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java
* (add)
tika-encoding-detectors/tika-encoding-detector-universal/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java
* (edit) .gitignore
* (add)
tika-encoding-detectors/tika-encoding-detector-universal/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/pom.xml
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/XUserDefinedCharset.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
* (add)
tika-core/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
* (add)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
* (edit) tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java
* (add)
tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/TextQualityDiagTest.java
* (add) tika-ml/tika-ml-core/pom.xml
* (add)
tika-ml/tika-ml-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptCategory.java
* (add)
tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java
* (delete)
tika-core/src/main/java/org/apache/tika/detect/WideUnicodeDetector.java
* (delete)
tika-core/src/test/java/org/apache/tika/detect/WideUnicodeDetectorTest.java
* (add)
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
* (add)
tika-encoding-detectors/tika-encoding-detector-html/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java
* (add)
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/EvalCharsetDetectors.java
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
* (add)
tika-encoding-detectors/tika-encoding-detector-icu4j/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
* (add)
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java
* (add)
tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
* (add)
tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/configs/tika-config-ignore-charset.json
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
* (add)
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetDetectionResult.java
* (add)
tika-encoding-detectors/tika-encoding-detector-icu4j/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
* (add) docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
* (add) tika-encoding-detectors/tika-encoding-detector-html/pom.xml
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
* (delete)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/ReplacementCharset.java
* (add)
tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java
* (add) tika-ml/tika-ml-core/src/main/java/org/apache/tika/ml/Prediction.java
> Add a new charset detector for 4.x
> ----------------------------------
>
> Key: TIKA-4685
> URL: https://issues.apache.org/jira/browse/TIKA-4685
> Project: Tika
> Issue Type: Task
> Reporter: Tim Allison
> Priority: Major
>
> While I was building out the maxent model for the updated language detector,
> I realized we had the resources (language files by language) and a maxent
> model just sitting around and ready to build a new charset detector based on
> byte ngrams.
> I have something working that appears to be quite good. We can replace both
> universal and icu4j. There's a chance that the results are hallucinated or
> that there's something surprising going on, but I think we should merge this
> and see what happens on our regression set.
--
This message was sent by Atlassian Jira
(v8.20.10#820010)