[
https://issues.apache.org/jira/browse/TIKA-4662?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18064810#comment-18064810
]
Hudson commented on TIKA-4662:
------------------------------
SUCCESS: Integrated in Jenkins build Tika » tika-main-jdk17 #1247 (See
[https://ci-builds.apache.org/job/Tika/job/tika-main-jdk17/1247/])
TIKA-4662 (#2623) -- update language detection (github:
[https://github.com/apache/tika/commit/81c328236868133378259beb98e5c8bba2fc6b36])
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/slv
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/smi
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/hin
* (edit)
tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/LanguageResource.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/nep
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ben
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ces
* (edit)
tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/textstats/TextStatsTest.java
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/python/diagnose_kor_eng.py
* (delete)
tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect.bin
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/fra
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ewe
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/META-INF/services/org.apache.tika.language.detect.LanguageDetector
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/glv
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/kir
* (add)
docs/modules/ROOT/pages/advanced/lang-detection/short-text-language-decisions.md
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/msa
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/dsb
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/lus
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tgk
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ile
* (delete) tika-langdetect/tika-langdetect-tika/pom.xml
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/mar
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/kin
* (edit) tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/guj
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tyv
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupModelRoutingTest.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ara
* (add)
tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect-short-v1-20260310.bin
* (edit)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CrossDomainEval.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/div
* (edit)
tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsB/file1.pdf.json
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/pol
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/ssw
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/bam
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/hun
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/spa
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/chv
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/fas
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/aka
* (edit) tika-example/pom.xml
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/stq
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/diq
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/wln
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/avk
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/bre
* (edit) tika-app/pom.xml
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/por
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/rue
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/war
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/hbs
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/ProfilingWriter.java
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/bpy
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/pnb
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/de.ngp
* (edit) tika-translate/pom.xml
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/lup
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageIdentifier.java
* (edit) tika-encoding-detectors/tika-encoding-detector-mojibuster/pom.xml
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/cor
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/ava
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/roh
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/vep
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/nya
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/lfn
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/nob
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/sin
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/trv
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/ndo
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/oss
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/isl
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/lim
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/amh
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tel
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/smn
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/eng
* (add) docs/modules/ROOT/pages/configuration/encoding-detectors.adoc
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/mlg
* (edit)
tika-langdetect/tika-langdetect-charsoup/src/test/python/filter_pashto.py
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/eo.ngp
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/hrv
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/mkd
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/hif
* (edit)
tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tsn
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/kor
* (edit)
tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageProfilerBuilder.java
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/arz
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ckb
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/lat
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/sv.ngp
* (edit)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CorpusReader.java
* (edit) tika-langdetect/pom.xml
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/asm
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/nap
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/nl.ngp
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/bak
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/yue
* (add) docs/modules/ROOT/pages/advanced/lang-detection/supported-languages.md
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/gle
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/mhr
* (delete)
tika-ml/tika-ml-core/src/main/java/org/apache/tika/langdetect/charsoup/WordTokenizer.java
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/hat
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/mzn
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/lmo
* (add)
tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ShortTextFeatureExtractor.java
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/fa.ngp
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/da.ngp
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/szy
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/pt.ngp
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/vls
* (edit)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/SjisLangSignalTest.java
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/mai
* (edit) tika-server/tika-server-standard/pom.xml
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ilo
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/ConfusionDump.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ido
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/sgs
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/no.ngp
* (delete)
tika-ml/tika-ml-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect.bin
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/heb
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/glg
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/skr
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/sun
* (edit)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CompareDetectors.java
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageProfilerBuilderTest.java
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/que
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/it.ngp
* (edit)
tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java
* (add)
tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupDetectorConfig.java
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/eml
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/ProfilingWriterTest.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/deu
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/is.ngp
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/srd
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/kan
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/sv.test
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/ResearchFeatureExtractor.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/cnh
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/fry
* (delete)
tika-ml/tika-ml-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptCategory.java
* (edit) tika-bom/pom.xml
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/arg
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/sna
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/hau
* (edit)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/LangIdRegressionTest.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/bjn
* (edit) tika-example/src/main/java/org/apache/tika/example/Language.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/fin
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/nan
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageIdentifierTest.java
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/nav
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ydd
* (edit)
tika-example/src/main/java/org/apache/tika/example/LanguageDetectorExample.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ind
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/uig
* (add)
tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/ConfusableGroups.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/srp
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/es.ngp
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/udm
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/aze
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/uk.ngp
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/pms
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/lav
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/vie
* (add) docs/modules/ROOT/pages/advanced/lang-detection/flores-STANDARD.log
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/hye
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/TrainShortModel.java
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/fr.test
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ell
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/ca.ngp
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/en.test
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/bar
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/dag
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/swe
* (add) docs/modules/ROOT/pages/advanced/charsoup-supported-languages.adoc
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/pan
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/min
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/sme
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/uzn
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/slk
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/tika.language.properties
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tam
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/da.test
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/cdo-x-rom
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/PrepareCorpus.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/bxr
* (delete)
tika-ml/tika-ml-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/tum
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/python/filter_uppercase.py
* (add)
tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupMetadataFilter.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tat
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/sat
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/kat
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/nl.test
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/azb
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/sot
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/ven
* (add) docs/modules/ROOT/pages/advanced/lang-detection/flores-SHORT_TEXT.log
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/kha
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/est
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/mal
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/fr.ngp
* (edit)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractorTest.java
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageProfileTest.java
* (edit)
tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TranslateResource.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/dan
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/lit
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupDetectorConfigTest.java
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/langbuilder/welsh_corpus.txt
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/zul
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/yor
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/myv
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/nld
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/ast
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/el.ngp
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/sco
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/sah
* (edit)
tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/jbo
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/et.ngp
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/khm
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/som
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/grn
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/run
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/zea
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ina
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/szl
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
* (edit)
tika-translate/src/main/java/org/apache/tika/language/translate/impl/AbstractTranslator.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ukr
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/python/collect_wikipedia.py
* (edit) tika-server/tika-server-core/pom.xml
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/bcl
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/rus
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ltz
* (edit)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/Phase2SmokeTest.java
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/python/eval_fasttext.py
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/hsb
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/ace
* (edit)
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/bos
* (add)
tika-langdetect/tika-langdetect-charsoup/src/main/resources/org/apache/tika/langdetect/charsoup/confusables.txt
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/python/filter_contamination.py
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/epo
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/pap
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/kab
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/nno
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ceb
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tso
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/hu.ngp
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/bul
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/hak-x-rom
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/et.test
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/kaa
* (add) docs/modules/ROOT/pages/advanced/lang-detection/flores200-dev-eval.md
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/vro
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/kaz
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/swh
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/san
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/lao
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/el.test
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/fi.test
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/alt
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/ami
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/snd
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/xho
* (edit)
tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/tools/CommonTokenGenerator.java
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/kal
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/zho
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/sl.ngp
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/de.test
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tha
* (add) docs/modules/ROOT/pages/advanced/lang-detection/flores-AUTOMATIC.log
* (add)
tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect-v7-20260306.bin
* (edit)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/BucketSaturationAnalyzer.java
* (add)
docs/modules/ROOT/pages/advanced/lang-detection/language-drop-decisions.md
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/uzb
* (edit) tika-langdetect/tika-langdetect-charsoup/pom.xml
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/nso
* (edit)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractorTest.java
* (edit)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/QuickF1Eval.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/gom
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/prs
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/mwl
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/kom
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/bua
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/python/summarize_wikipedia.py
* (edit)
tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptCategory.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/cos
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/mya
* (edit) docs/modules/ROOT/pages/configuration/index.adoc
* (edit)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/AblationRunner.java
* (edit)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/Phase2Trainer.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/gla
* (edit)
tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/LanguageResourceTest.java
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/khk
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/KoreanFalsePositives.java
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/scn
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/mon
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ibo
* (edit)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/ModelQuantizer.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/kur
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/ru.ngp
* (edit)
tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/TikaLanguageDetector.java
* (add)
tika-langdetect/tika-langdetect-charsoup/src/main/python/extract_madlad_to_wiki.py
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/vol
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/en.ngp
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/tet
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tur
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageProfile.java
* (edit) tika-bundles/tika-bundle-standard/pom.xml
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/es.test
* (add) tika-langdetect/tika-langdetect-charsoup/src/test/python/clean_madlad.py
* (edit)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/TrainLanguageModel.java
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/sk.ngp
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/hyw
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/gl.ngp
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/pus
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/mlt
* (edit)
tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ita
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/th.ngp
* (edit) docs/modules/ROOT/pages/advanced/language-detection-build.adoc
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/ro.ngp
* (edit) docs/modules/ROOT/pages/advanced/language-detection.adoc
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/cat
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/kpv
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/lez
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ban
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/krc
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/tir
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/be-x-old
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tuk
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/tay
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/frr
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/wuu
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/nds
* (edit)
tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/FeatureExtractor.java
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/koi
* (edit) tika-bundles/pom.xml
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ksh
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ori
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/jpn
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/eus
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ron
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/urd
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/mri
* (edit)
tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ext
* (edit)
tika-example/src/test/java/org/apache/tika/example/LanguageDetectorExampleTest.java
* (edit)
tika-translate/src/main/java/org/apache/tika/language/translate/impl/JoshuaNetworkTranslator.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/cym
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CalibrateConfidence.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/csb
* (delete)
tika-ml/tika-ml-core/src/main/java/org/apache/tika/langdetect/charsoup/TextFeatureExtractor.java
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/pl.ngp
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/ProfilingHandler.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/fao
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/hil
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/sqi
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/lad
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/xmf
* (edit)
tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/be.ngp
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/fi.ngp
* (delete)
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/lt.ngp
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/jav
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/it.test
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/pfl
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/python/check_script_consistency.py
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/mrj
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/lug
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tgl
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/pam
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/nqo
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/orm
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/lt.test
* (edit)
tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/che
* (delete)
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/pt.test
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/gsw
* (add)
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/DiagnoseUnknownScript.java
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/new
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/smo
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/afr
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/olo
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/bel
> Modernize lang-detector for at least 4.x
> ----------------------------------------
>
> Key: TIKA-4662
> URL: https://issues.apache.org/jira/browse/TIKA-4662
> Project: Tika
> Issue Type: Task
> Reporter: Tim Allison
> Priority: Minor
>
> We were using OpenNLP's maxent code with a custom-built model. I recently did
> some work to modernize that a bit and to improve feature extraction, speed,
> and model size. Let's upgrade for 4.x (at least).
--
This message was sent by Atlassian Jira
(v8.20.10#820010)