This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4690-add-generative-models in repository https://gitbox.apache.org/repos/asf/tika.git
commit c9d96d81a7dc1d5d5919520e65b79f1075470e8c Merge: a15df1625d 81c3282368 Author: tballison <[email protected]> AuthorDate: Tue Mar 10 16:02:48 2026 -0400 Merge branch 'main' into add-generative-models Made-with: Cursor # Conflicts: # tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java .../advanced/charsoup-supported-languages.adoc | 192 + .../advanced/lang-detection/flores-AUTOMATIC.log | 536 + .../advanced/lang-detection/flores-SHORT_TEXT.log | 414 + .../advanced/lang-detection/flores-STANDARD.log | 488 + .../advanced/lang-detection/flores200-dev-eval.md | 204 + .../lang-detection/language-drop-decisions.md | 175 + .../short-text-language-decisions.md | 384 + .../advanced/lang-detection/supported-languages.md | 240 + .../pages/advanced/language-detection-build.adoc | 647 +- .../ROOT/pages/advanced/language-detection.adoc | 264 +- .../pages/configuration/encoding-detectors.adoc | 203 + docs/modules/ROOT/pages/configuration/index.adoc | 1 + tika-app/pom.xml | 2 +- tika-bom/pom.xml | 10 - tika-bundles/pom.xml | 1 - tika-bundles/tika-bundle-standard/pom.xml | 1 - .../tika/language/detect/LanguageDetector.java | 21 +- .../charsoup/CharSoupEncodingDetector.java | 16 +- .../tika-encoding-detector-mojibuster/pom.xml | 2 +- .../ml/chardetect/ZipFilenameDetectionTest.java | 8 + .../apache/tika/eval/app/SimpleComparerTest.java | 18 +- .../resources/test-dirs/extractsB/file1.pdf.json | 2 +- .../src/main/resources/common_tokens/ace | 1578 + .../src/main/resources/common_tokens/afr | 60006 +++++++++--------- .../src/main/resources/common_tokens/aka | 9272 +++ .../src/main/resources/common_tokens/alt | 2668 + .../src/main/resources/common_tokens/amh | 19139 +++--- .../src/main/resources/common_tokens/ami | 2574 + .../src/main/resources/common_tokens/ara | 60008 +++++++++---------- .../src/main/resources/common_tokens/arg | 32637 +++++++--- .../src/main/resources/common_tokens/arz | 17123 ------ .../src/main/resources/common_tokens/asm | 10793 ++-- .../src/main/resources/common_tokens/ast | 30020 ---------- .../src/main/resources/common_tokens/ava | 2453 + .../src/main/resources/common_tokens/avk | 6710 +++ .../src/main/resources/common_tokens/azb | 26484 ++++++++ .../src/main/resources/common_tokens/aze | 60008 +++++++++---------- .../src/main/resources/common_tokens/bak | 59988 +++++++++--------- .../src/main/resources/common_tokens/bam | 1192 - .../src/main/resources/common_tokens/ban | 12613 ++-- .../src/main/resources/common_tokens/bar | 31854 +++++----- .../src/main/resources/common_tokens/bcl | 14470 ++++- .../src/main/resources/common_tokens/be-x-old | 30020 ++++++++++ .../src/main/resources/common_tokens/bel | 60008 +++++++++---------- .../src/main/resources/common_tokens/ben | 44061 +++++--------- .../src/main/resources/common_tokens/bjn | 8705 +-- .../src/main/resources/common_tokens/bos | 30020 ---------- .../src/main/resources/common_tokens/bpy | 1001 - .../src/main/resources/common_tokens/bre | 47871 +++++++++------ .../src/main/resources/common_tokens/bua | 2734 - .../src/main/resources/common_tokens/bul | 60008 +++++++++---------- .../src/main/resources/common_tokens/bxr | 3556 ++ .../src/main/resources/common_tokens/cat | 60008 +++++++++---------- .../src/main/resources/common_tokens/cdo-x-rom | 633 + .../src/main/resources/common_tokens/ceb | 60002 +++++++++--------- .../src/main/resources/common_tokens/ces | 60008 +++++++++---------- .../src/main/resources/common_tokens/che | 36578 +++++++++-- .../src/main/resources/common_tokens/chv | 24222 +++++--- .../src/main/resources/common_tokens/ckb | 46397 ++++++++------ .../src/main/resources/common_tokens/cnh | 17224 ++++++ .../src/main/resources/common_tokens/cor | 3558 ++ .../src/main/resources/common_tokens/cos | 10474 +++- .../src/main/resources/common_tokens/csb | 3585 +- .../src/main/resources/common_tokens/cym | 50185 +++++++++------- .../src/main/resources/common_tokens/dag | 4433 ++ .../src/main/resources/common_tokens/dan | 60008 +++++++++---------- .../src/main/resources/common_tokens/deu | 60008 +++++++++---------- .../src/main/resources/common_tokens/diq | 11793 ++-- .../src/main/resources/common_tokens/div | 31671 +--------- .../src/main/resources/common_tokens/dsb | 3542 +- .../src/main/resources/common_tokens/ell | 60008 +++++++++---------- .../src/main/resources/common_tokens/eml | 5000 -- .../src/main/resources/common_tokens/eng | 60008 +++++++++---------- .../src/main/resources/common_tokens/epo | 60008 +++++++++---------- .../src/main/resources/common_tokens/est | 60008 +++++++++---------- .../src/main/resources/common_tokens/eus | 59994 +++++++++--------- .../src/main/resources/common_tokens/ewe | 12375 +++- .../src/main/resources/common_tokens/ext | 5545 +- .../src/main/resources/common_tokens/fao | 39567 +++--------- .../src/main/resources/common_tokens/fas | 60008 +++++++++---------- .../src/main/resources/common_tokens/fin | 60008 +++++++++---------- .../src/main/resources/common_tokens/fra | 60008 +++++++++---------- .../src/main/resources/common_tokens/frr | 6648 +- .../src/main/resources/common_tokens/fry | 59942 +++++++++--------- .../src/main/resources/common_tokens/gla | 6667 ++ .../src/main/resources/common_tokens/gle | 52183 ++++++++-------- .../src/main/resources/common_tokens/glg | 59850 +++++++++--------- .../src/main/resources/common_tokens/glv | 6204 +- .../src/main/resources/common_tokens/gom | 13378 ++--- .../src/main/resources/common_tokens/grn | 7802 +-- .../src/main/resources/common_tokens/gsw | 59486 +++++++++--------- .../src/main/resources/common_tokens/guj | 24230 ++------ .../src/main/resources/common_tokens/hak-x-rom | 693 + .../src/main/resources/common_tokens/hat | 6627 -- .../src/main/resources/common_tokens/hau | 29608 ++++++--- .../src/main/resources/common_tokens/hbs | 30020 ---------- .../src/main/resources/common_tokens/heb | 60008 +++++++++---------- .../src/main/resources/common_tokens/hif | 2137 - .../src/main/resources/common_tokens/hil | 18496 ++++++ .../src/main/resources/common_tokens/hin | 36421 ++--------- .../src/main/resources/common_tokens/hrv | 60008 +++++++++---------- .../src/main/resources/common_tokens/hsb | 13309 ++-- .../src/main/resources/common_tokens/hun | 60008 +++++++++---------- .../src/main/resources/common_tokens/hye | 60008 +++++++++---------- .../src/main/resources/common_tokens/hyw | 19125 ++++++ .../src/main/resources/common_tokens/ibo | 19379 +++++- .../src/main/resources/common_tokens/ido | 21997 ++++--- .../src/main/resources/common_tokens/ile | 4013 +- .../src/main/resources/common_tokens/ilo | 10135 ++-- .../src/main/resources/common_tokens/ina | 15743 ++--- .../src/main/resources/common_tokens/ind | 60008 +++++++++---------- .../src/main/resources/common_tokens/isl | 60008 +++++++++---------- .../src/main/resources/common_tokens/ita | 60008 +++++++++---------- .../src/main/resources/common_tokens/jav | 59433 +++++++++--------- .../src/main/resources/common_tokens/jbo | 1071 + .../src/main/resources/common_tokens/jpn | 60008 +++++++++---------- .../src/main/resources/common_tokens/kaa | 4617 ++ .../src/main/resources/common_tokens/kab | 3263 + .../src/main/resources/common_tokens/kal | 6341 -- .../src/main/resources/common_tokens/kan | 58858 +++++++++--------- .../src/main/resources/common_tokens/kat | 60006 +++++++++--------- .../src/main/resources/common_tokens/kaz | 59998 +++++++++--------- .../src/main/resources/common_tokens/kha | 9653 +++ .../src/main/resources/common_tokens/khk | 4187 -- .../src/main/resources/common_tokens/khm | 8745 +++ .../src/main/resources/common_tokens/kin | 15225 ++--- .../src/main/resources/common_tokens/kir | 60004 +++++++++--------- .../src/main/resources/common_tokens/koi | 1373 - .../src/main/resources/common_tokens/kom | 2382 - .../src/main/resources/common_tokens/kor | 60008 +++++++++---------- .../src/main/resources/common_tokens/kpv | 3445 ++ .../src/main/resources/common_tokens/krc | 1974 - .../src/main/resources/common_tokens/ksh | 5117 +- .../src/main/resources/common_tokens/kur | 32152 ++++++---- .../src/main/resources/common_tokens/lad | 1681 - .../src/main/resources/common_tokens/lao | 1479 +- .../src/main/resources/common_tokens/lat | 59988 +++++++++--------- .../src/main/resources/common_tokens/lav | 60008 +++++++++---------- .../src/main/resources/common_tokens/lez | 3913 ++ .../src/main/resources/common_tokens/lfn | 5582 ++ .../src/main/resources/common_tokens/lim | 48870 ++++++--------- .../src/main/resources/common_tokens/lit | 60008 +++++++++---------- .../src/main/resources/common_tokens/lmo | 6924 --- .../src/main/resources/common_tokens/ltz | 60006 +++++++++--------- .../src/main/resources/common_tokens/lug | 35119 ++--------- .../src/main/resources/common_tokens/lup | 905 - .../src/main/resources/common_tokens/lus | 40004 ++++++++---- .../src/main/resources/common_tokens/mai | 755 - .../src/main/resources/common_tokens/mal | 59240 +++++++++--------- .../src/main/resources/common_tokens/mar | 39628 +++--------- .../src/main/resources/common_tokens/mhr | 9769 +-- .../src/main/resources/common_tokens/min | 29209 +++++---- .../src/main/resources/common_tokens/mkd | 60008 +++++++++---------- .../src/main/resources/common_tokens/mlg | 38614 +++++++++--- .../src/main/resources/common_tokens/mlt | 47776 ++++++--------- .../src/main/resources/common_tokens/mon | 57279 +++++++++--------- .../src/main/resources/common_tokens/mri | 9729 --- .../src/main/resources/common_tokens/mrj | 3902 +- .../src/main/resources/common_tokens/msa | 60006 +++++++++--------- .../src/main/resources/common_tokens/mwl | 25861 ++++---- .../src/main/resources/common_tokens/mya | 30020 ++++++++++ .../src/main/resources/common_tokens/myv | 3154 +- .../src/main/resources/common_tokens/mzn | 10405 ++-- .../src/main/resources/common_tokens/nan | 6673 --- .../src/main/resources/common_tokens/nap | 2039 - .../src/main/resources/common_tokens/nav | 533 - .../src/main/resources/common_tokens/ndo | 3142 - .../src/main/resources/common_tokens/nds | 54491 +++++++++-------- .../src/main/resources/common_tokens/nep | 35836 ++--------- .../src/main/resources/common_tokens/new | 2545 - .../src/main/resources/common_tokens/nld | 60008 +++++++++---------- .../src/main/resources/common_tokens/nno | 60006 +++++++++--------- .../src/main/resources/common_tokens/nob | 60008 +++++++++---------- .../src/main/resources/common_tokens/nqo | 2779 + .../src/main/resources/common_tokens/nso | 5150 +- .../src/main/resources/common_tokens/nya | 30020 ++++++++++ .../src/main/resources/common_tokens/olo | 2220 + .../src/main/resources/common_tokens/ori | 12884 ++-- .../src/main/resources/common_tokens/orm | 31991 +++++++++- .../src/main/resources/common_tokens/oss | 7575 ++- .../src/main/resources/common_tokens/pam | 6310 +- .../src/main/resources/common_tokens/pan | 11564 +--- .../src/main/resources/common_tokens/pap | 12598 ++-- .../src/main/resources/common_tokens/pfl | 4325 +- .../src/main/resources/common_tokens/pms | 6552 -- .../src/main/resources/common_tokens/pnb | 57576 +++++++++--------- .../src/main/resources/common_tokens/pol | 60008 +++++++++---------- .../src/main/resources/common_tokens/por | 60008 +++++++++---------- .../src/main/resources/common_tokens/prs | 12167 ---- .../src/main/resources/common_tokens/pus | 51255 +++++++++------- .../src/main/resources/common_tokens/que | 2170 - .../src/main/resources/common_tokens/roh | 33539 ++++------- .../src/main/resources/common_tokens/ron | 60008 +++++++++---------- .../src/main/resources/common_tokens/rue | 6615 +- .../src/main/resources/common_tokens/run | 3534 -- .../src/main/resources/common_tokens/rus | 60008 +++++++++---------- .../src/main/resources/common_tokens/sah | 31037 +++++----- .../src/main/resources/common_tokens/san | 14998 ++--- .../src/main/resources/common_tokens/sat | 6387 ++ .../src/main/resources/common_tokens/scn | 7559 --- .../src/main/resources/common_tokens/sco | 12070 ---- .../src/main/resources/common_tokens/sgs | 5547 +- .../src/main/resources/common_tokens/sin | 34762 +++++------ .../src/main/resources/common_tokens/skr | 6326 ++ .../src/main/resources/common_tokens/slk | 60008 +++++++++---------- .../src/main/resources/common_tokens/slv | 60008 +++++++++---------- .../src/main/resources/common_tokens/sme | 5891 +- .../src/main/resources/common_tokens/smi | 1676 - .../src/main/resources/common_tokens/smn | 2934 + .../src/main/resources/common_tokens/smo | 24490 ++++++++ .../src/main/resources/common_tokens/sna | 29768 ++------- .../src/main/resources/common_tokens/snd | 26767 +++++---- .../src/main/resources/common_tokens/som | 32074 +++------- .../src/main/resources/common_tokens/sot | 3535 -- .../src/main/resources/common_tokens/spa | 60008 +++++++++---------- .../src/main/resources/common_tokens/sqi | 60008 +++++++++---------- .../src/main/resources/common_tokens/srd | 3796 -- .../src/main/resources/common_tokens/srp | 60008 +++++++++---------- .../src/main/resources/common_tokens/ssw | 2035 - .../src/main/resources/common_tokens/stq | 3423 ++ .../src/main/resources/common_tokens/sun | 45923 ++++++-------- .../src/main/resources/common_tokens/swe | 60006 +++++++++--------- .../src/main/resources/common_tokens/swh | 28444 +++++++-- .../src/main/resources/common_tokens/szl | 8763 ++- .../src/main/resources/common_tokens/szy | 4825 ++ .../src/main/resources/common_tokens/tam | 44673 +++++--------- .../src/main/resources/common_tokens/tat | 60008 +++++++++---------- .../src/main/resources/common_tokens/tay | 1220 + .../src/main/resources/common_tokens/tel | 44436 +++++--------- .../src/main/resources/common_tokens/tet | 24501 ++++++++ .../src/main/resources/common_tokens/tgk | 57085 +++++++++--------- .../src/main/resources/common_tokens/tgl | 59972 +++++++++--------- .../src/main/resources/common_tokens/tha | 53250 +++++++--------- .../src/main/resources/common_tokens/tir | 30020 ++++++++++ .../src/main/resources/common_tokens/trv | 3333 + .../src/main/resources/common_tokens/tsn | 17815 ++++-- .../src/main/resources/common_tokens/tso | 15810 +++-- .../src/main/resources/common_tokens/tuk | 30868 ++++------ .../src/main/resources/common_tokens/tum | 4881 ++ .../src/main/resources/common_tokens/tur | 60008 +++++++++---------- .../src/main/resources/common_tokens/tyv | 8541 ++- .../src/main/resources/common_tokens/udm | 31389 +++++++++- .../src/main/resources/common_tokens/uig | 31403 +++++----- .../src/main/resources/common_tokens/ukr | 60008 +++++++++---------- .../src/main/resources/common_tokens/urd | 60006 +++++++++--------- .../src/main/resources/common_tokens/uzb | 59970 +++++++++--------- .../src/main/resources/common_tokens/uzn | 30020 ---------- .../src/main/resources/common_tokens/ven | 2457 - .../src/main/resources/common_tokens/vep | 7276 +++ .../src/main/resources/common_tokens/vie | 51687 +++++++--------- .../src/main/resources/common_tokens/vls | 15292 ++--- .../src/main/resources/common_tokens/vol | 7216 +-- .../src/main/resources/common_tokens/vro | 4730 +- .../src/main/resources/common_tokens/war | 55938 ++++++++--------- .../src/main/resources/common_tokens/wln | 12282 ++-- .../src/main/resources/common_tokens/wuu | 30020 ---------- .../src/main/resources/common_tokens/xho | 51611 +++++++++------- .../src/main/resources/common_tokens/xmf | 15787 +++-- .../src/main/resources/common_tokens/ydd | 21613 ++++--- .../src/main/resources/common_tokens/yor | 7833 ++- .../src/main/resources/common_tokens/yue | 9176 +++ .../src/main/resources/common_tokens/zea | 2318 - .../src/main/resources/common_tokens/zho | 60008 +++++++++---------- .../src/main/resources/common_tokens/zul | 35021 ++--------- .../core/metadata/TikaEvalMetadataFilterTest.java | 13 +- .../tika/eval/core/textstats/TextStatsTest.java | 28 +- .../core/tokens/tools/CommonTokenGenerator.java | 92 +- tika-example/pom.xml | 2 +- .../java/org/apache/tika/example/Language.java | 5 +- .../tika/example/LanguageDetectorExample.java | 3 +- .../java/org/apache/tika/example/MyFirstTika.java | 3 +- .../tika/example/LanguageDetectorExampleTest.java | 2 +- .../tika/pipes/kafka/tests/TikaPipesKafkaTest.java | 5 +- tika-langdetect/pom.xml | 3 +- .../charsoup/CharSoupFeatureExtractor.java | 12 +- .../tika/langdetect/charsoup/CharSoupModel.java | 126 +- .../tika/langdetect/charsoup/FeatureExtractor.java | 39 + .../charsoup/ScriptAwareFeatureExtractor.java | 248 +- .../tika/langdetect/charsoup/ScriptCategory.java | 13 +- .../charsoup/ShortTextFeatureExtractor.java | 348 + .../charsoup/langdetect-short-v1-20260310.bin | Bin 0 -> 3999308 bytes .../langdetect/charsoup/langdetect-v7-20260306.bin | Bin 0 -> 3328628 bytes .../apache/tika/langdetect/charsoup/langdetect.bin | Bin 1641016 -> 0 bytes tika-langdetect/tika-langdetect-charsoup/pom.xml | 12 + .../charsoup/CharSoupDetectorConfig.java | 120 + .../charsoup/CharSoupLanguageDetector.java | 680 +- .../charsoup/CharSoupMetadataFilter.java | 62 + .../tika/langdetect/charsoup/ConfusableGroups.java | 72 + .../src/main/python/extract_madlad_to_wiki.py | 182 + .../tika/langdetect/charsoup/confusables.txt | 52 + .../charsoup/CharSoupDetectorConfigTest.java | 125 + .../charsoup/CharSoupFeatureExtractorTest.java | 2 +- .../charsoup/CharSoupModelRoutingTest.java | 281 + .../langdetect/charsoup/LangIdRegressionTest.java | 25 +- .../charsoup/ScriptAwareFeatureExtractorTest.java | 33 +- .../langdetect/charsoup/SjisLangSignalTest.java | 59 +- .../langdetect/charsoup/tools/AblationRunner.java | 652 +- .../charsoup/tools/BucketSaturationAnalyzer.java | 76 +- .../charsoup/tools/CalibrateConfidence.java | 217 + .../charsoup/tools/CompareDetectors.java | 1470 +- .../langdetect/charsoup/tools/ConfusionDump.java | 184 + .../langdetect/charsoup/tools/CorpusReader.java | 73 +- .../langdetect/charsoup/tools/CrossDomainEval.java | 309 +- .../charsoup/tools/DiagnoseUnknownScript.java | 145 + .../charsoup/tools/KoreanFalsePositives.java | 146 + .../langdetect/charsoup/tools/ModelQuantizer.java | 28 +- .../langdetect/charsoup/tools/Phase2SmokeTest.java | 2 +- .../langdetect/charsoup/tools/Phase2Trainer.java | 102 + .../langdetect/charsoup/tools/PrepareCorpus.java | 992 + .../langdetect/charsoup/tools/QuickF1Eval.java | 134 +- .../charsoup/tools/ResearchFeatureExtractor.java | 457 + .../charsoup/tools/TrainLanguageModel.java | 982 +- .../langdetect/charsoup/tools/TrainShortModel.java | 179 + .../src/test/python/check_script_consistency.py | 253 + .../src/test/python/clean_madlad.py | 308 + .../src/test/python/collect_wikipedia.py | 556 + .../src/test/python/diagnose_kor_eng.py | 256 + .../src/test/python/eval_fasttext.py | 290 + .../src/test/python/filter_contamination.py | 240 + .../src/test/python/filter_pashto.py | 66 +- .../src/test/python/filter_uppercase.py | 145 + .../src/test/python/summarize_wikipedia.py | 170 + tika-langdetect/tika-langdetect-tika/pom.xml | 75 - .../tika/langdetect/tika/LanguageIdentifier.java | 260 - .../tika/langdetect/tika/LanguageProfile.java | 317 - .../langdetect/tika/LanguageProfilerBuilder.java | 767 - .../tika/langdetect/tika/ProfilingWriter.java | 103 - .../tika/langdetect/tika/TikaLanguageDetector.java | 92 - ...rg.apache.tika.language.detect.LanguageDetector | 16 - .../org/apache/tika/langdetect/tika/be.ngp | 1014 - .../org/apache/tika/langdetect/tika/ca.ngp | 1014 - .../org/apache/tika/langdetect/tika/da.ngp | 1014 - .../org/apache/tika/langdetect/tika/de.ngp | 1014 - .../org/apache/tika/langdetect/tika/el.ngp | 1014 - .../org/apache/tika/langdetect/tika/en.ngp | 1014 - .../org/apache/tika/langdetect/tika/eo.ngp | 1014 - .../org/apache/tika/langdetect/tika/es.ngp | 1014 - .../org/apache/tika/langdetect/tika/et.ngp | 1014 - .../org/apache/tika/langdetect/tika/fa.ngp | 1015 - .../org/apache/tika/langdetect/tika/fi.ngp | 1014 - .../org/apache/tika/langdetect/tika/fr.ngp | 1014 - .../org/apache/tika/langdetect/tika/gl.ngp | 1014 - .../org/apache/tika/langdetect/tika/hu.ngp | 1014 - .../org/apache/tika/langdetect/tika/is.ngp | 1014 - .../org/apache/tika/langdetect/tika/it.ngp | 1014 - .../org/apache/tika/langdetect/tika/lt.ngp | 1209 - .../org/apache/tika/langdetect/tika/nl.ngp | 1014 - .../org/apache/tika/langdetect/tika/no.ngp | 1014 - .../org/apache/tika/langdetect/tika/pl.ngp | 1014 - .../org/apache/tika/langdetect/tika/pt.ngp | 1014 - .../org/apache/tika/langdetect/tika/ro.ngp | 1014 - .../org/apache/tika/langdetect/tika/ru.ngp | 1014 - .../org/apache/tika/langdetect/tika/sk.ngp | 1014 - .../org/apache/tika/langdetect/tika/sl.ngp | 1014 - .../org/apache/tika/langdetect/tika/sv.ngp | 1014 - .../org/apache/tika/langdetect/tika/th.ngp | 1014 - .../tika/langdetect/tika/tika.language.properties | 56 - .../org/apache/tika/langdetect/tika/uk.ngp | 1014 - .../langdetect/tika/LanguageIdentifierTest.java | 185 - .../tika/langdetect/tika/LanguageProfileTest.java | 58 - .../tika/LanguageProfilerBuilderTest.java | 96 - .../tika/langdetect/tika/ProfilingHandler.java | 67 - .../tika/langdetect/tika/ProfilingWriterTest.java | 45 - .../org/apache/tika/langdetect/tika/da.test | 108 - .../org/apache/tika/langdetect/tika/de.test | 104 - .../org/apache/tika/langdetect/tika/el.test | 109 - .../org/apache/tika/langdetect/tika/en.test | 105 - .../org/apache/tika/langdetect/tika/es.test | 107 - .../org/apache/tika/langdetect/tika/et.test | 17 - .../org/apache/tika/langdetect/tika/fi.test | 106 - .../org/apache/tika/langdetect/tika/fr.test | 105 - .../org/apache/tika/langdetect/tika/it.test | 109 - .../langdetect/tika/langbuilder/welsh_corpus.txt | 2602 - .../org/apache/tika/langdetect/tika/lt.test | 32 - .../org/apache/tika/langdetect/tika/nl.test | 105 - .../org/apache/tika/langdetect/tika/pt.test | 105 - .../org/apache/tika/langdetect/tika/sv.test | 108 - .../charsoup/CharSoupFeatureExtractor.java | 456 - .../tika/langdetect/charsoup/ScriptCategory.java | 117 - .../langdetect/charsoup/TextFeatureExtractor.java | 59 - .../tika/langdetect/charsoup/WordTokenizer.java | 225 - .../apache/tika/langdetect/charsoup/langdetect.bin | Bin 1641016 -> 0 bytes .../apache/tika/parser/pkg/PackageParserTest.java | 3 + tika-server/tika-server-core/pom.xml | 2 +- .../server/core/resource/LanguageResource.java | 24 +- .../server/core/resource/TranslateResource.java | 4 +- .../tika/server/core/LanguageResourceTest.java | 2 +- tika-server/tika-server-standard/pom.xml | 6 + tika-translate/pom.xml | 2 +- .../translate/impl/AbstractTranslator.java | 24 +- .../translate/impl/JoshuaNetworkTranslator.java | 2 +- 391 files changed, 3482069 insertions(+), 3445298 deletions(-) diff --cc tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java index 2680c11a70,81871ae319..c97a6bdf6a --- a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java +++ b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java @@@ -274,16 -325,12 +325,14 @@@ public class ScriptAwareFeatureExtracto private static boolean isSpace(int cp) { return cp == ' ' || cp == '\t' - || Character.getType(cp) - == Character.SPACE_SEPARATOR; + || Character.getType(cp) == Character.SPACE_SEPARATOR; } - static boolean isCjkOrKana(int cp) { - if (Character.isIdeographic(cp)) return true; + public static boolean isCjkOrKana(int cp) { + if (Character.isIdeographic(cp)) { + return true; + } - Character.UnicodeScript us = - Character.UnicodeScript.of(cp); + Character.UnicodeScript us = Character.UnicodeScript.of(cp); return us == Character.UnicodeScript.HIRAGANA || us == Character.UnicodeScript.KATAKANA; }
