This is an automated email from the ASF dual-hosted git repository.
tallison pushed a change to branch TIKA-4659-eval-lite
in repository https://gitbox.apache.org/repos/asf/tika.git
from e3aefe821d TIKA-4659 -- tika-eval-lite
add 40e47d1a8a TIKA-4659 -- tika-eval-lite
No new revisions were added by this update.
Summary of changes:
tika-app/pom.xml | 5 +
.../tika/detect/DefaultEncodingDetector.java | 29 ++
.../tika/detect/EncodingDetectorContext.java | 91 ++++++
.../tika/detect/TextQualityEncodingDetector.java | 283 ++++++++++++++++
.../org.apache.tika.textquality.TextQualityScorer | 15 +
.../tika-parsers-standard-package/pom.xml | 6 +
.../detect/TextQualityEncodingDetectorTest.java | 360 +++++++++++++++++++++
.../testArabicMisleadingCharset.html | 12 +
tika-server/tika-server-standard/pom.xml | 5 +
9 files changed, 806 insertions(+)
create mode 100644 tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java
create mode 100644 tika-core/src/main/java/org/apache/tika/detect/TextQualityEncodingDetector.java
create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TextQualityEncodingDetectorTest.java
create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html