This is an automated email from the ASF dual-hosted git repository.
tallison pushed a change to branch TIKA-4662
in repository https://gitbox.apache.org/repos/asf/tika.git
from 1797fb0c01 TIKA-4662 -- checkstyle
add 9b775e6568 Bump org.xerial:sqlite-jdbc from 3.51.1.0 to 3.51.2.0
(#2607)
add 0dee7db5de Bump software.amazon.awssdk:bom from 2.41.28 to 2.41.29
(#2609)
add 768cd4d2bf Bump org.springframework:spring-context from 7.0.3 to 7.0.4
(#2608)
add 8918c66384 TIKA-4664 - add Poppler renderer, replace MuPDF, add OCR
safety limits (#2612)
add 2c98c63677 TIKA-4666 - add VLM parsers (Claude, Gemini, OpenAI) (#2614)
add 314a6e7fea TIKA-4668 -- modernize versioning with $revision (#2616)
add b6345d30f1 Merge origin/main into TIKA-4662
add fd8561403a TIKA-4662 -- update common tokens and rebuild model
No new revisions were added by this update.
Summary of changes:
.gitignore | 1 +
docs/modules/ROOT/examples/claude-vlm-basic.json | 10 +
docs/modules/ROOT/examples/claude-vlm-full.json | 18 +
docs/modules/ROOT/examples/gemini-vlm-basic.json | 10 +
docs/modules/ROOT/examples/gemini-vlm-full.json | 18 +
docs/modules/ROOT/examples/openai-vlm-basic.json | 11 +
docs/modules/ROOT/examples/openai-vlm-full.json | 18 +
docs/modules/ROOT/examples/vlm-pdf-parsing.json | 16 +
docs/modules/ROOT/nav.adoc | 2 +
docs/modules/ROOT/pages/advanced/index.adoc | 2 +
.../pages/advanced/language-detection-build.adoc | 561 +-
.../ROOT/pages/advanced/language-detection.adoc | 123 +-
.../ROOT/pages/advanced/local-vlm-server.adoc | 445 +
.../pages/configuration/parsers/vlm-parsers.adoc | 236 +
docs/pom.xml | 2 +-
pom.xml | 2 +-
tika-annotation-processor/pom.xml | 2 +-
tika-app/pom.xml | 2 +-
tika-bom/pom.xml | 155 +-
tika-bundles/pom.xml | 4 +-
tika-bundles/tika-bundle-standard/pom.xml | 4 +-
tika-core/pom.xml | 2 +-
tika-detectors/pom.xml | 2 +-
tika-detectors/tika-detector-magika/pom.xml | 2 +-
tika-detectors/tika-detector-siegfried/pom.xml | 2 +-
tika-e2e-tests/pom.xml | 32 +-
tika-e2e-tests/tika-grpc/pom.xml | 2 +-
tika-eval/pom.xml | 2 +-
tika-eval/tika-eval-app/pom.xml | 2 +-
.../org/apache/tika/eval/app/ProfilerBase.java | 2 +-
.../eval/app/tools/BatchTopCommonTokenCounter.java | 60 -
.../eval/app/tools/SlowCompositeReaderWrapper.java | 391 -
.../tika/eval/app/tools/TopCommonTokenCounter.java | 363 -
.../apache/tika/eval/app/AnalyzerManagerTest.java | 76 +-
.../apache/tika/eval/app/SimpleComparerTest.java | 12 +-
.../eval/app/tools/TopCommonTokenCounterTest.java | 80 +-
.../resources/test-dirs/extractsB/file1.pdf.json | 2 +-
tika-eval/tika-eval-core/pom.xml | 2 +-
.../tika/eval/core/langid/LanguageIDWrapper.java | 9 +-
.../tika/eval/core/tokens/AnalyzerManager.java | 33 +-
.../eval/core/tokens/CommonTokenCountManager.java | 8 -
.../tika/eval/core/tokens/TikaEvalTokenizer.java | 300 +
.../src/main/resources/common_tokens/afr | 60004 +++++++++---------
.../src/main/resources/common_tokens/amh | 25580 ++++----
.../src/main/resources/common_tokens/ara | 60008 +++++++++---------
.../src/main/resources/common_tokens/arg | 9618 +++
.../src/main/resources/common_tokens/arz | 17123 ++++++
.../src/main/resources/common_tokens/asm | 14736 ++---
.../src/main/resources/common_tokens/ast | 43989 +++++++++-----
.../src/main/resources/common_tokens/aze | 60004 +++++++++---------
.../src/main/resources/common_tokens/bak | 53066 +++++++++-------
.../src/main/resources/common_tokens/bam | 1192 +
.../src/main/resources/common_tokens/ban | 9402 +--
.../src/main/resources/common_tokens/bar | 17900 ++++++
.../src/main/resources/common_tokens/bcl | 2863 +
.../src/main/resources/common_tokens/bel | 60008 +++++++++---------
.../src/main/resources/common_tokens/ben | 43908 +++++++++-----
.../src/main/resources/common_tokens/ben-rom | 30022 ----------
.../src/main/resources/common_tokens/bih | 903 -
.../src/main/resources/common_tokens/bjn | 4081 ++
.../src/main/resources/common_tokens/bos | 60004 +++++++++---------
.../src/main/resources/common_tokens/bpy | 1001 +
.../src/main/resources/common_tokens/bre | 26954 ++++++---
.../src/main/resources/common_tokens/bua | 2734 +
.../src/main/resources/common_tokens/bul | 59964 +++++++++---------
.../src/main/resources/common_tokens/cat | 60000 +++++++++---------
.../src/main/resources/common_tokens/ceb | 55570 +++++++++--------
.../src/main/resources/common_tokens/ces | 59978 +++++++++---------
.../src/main/resources/common_tokens/che | 10343 ++--
.../src/main/resources/common_tokens/chv | 10199 ++++
.../src/main/resources/common_tokens/ckb | 16758 +++++-
.../src/main/resources/common_tokens/cmn | 30022 ----------
.../src/main/resources/common_tokens/cos | 2835 +
.../src/main/resources/common_tokens/csb | 1668 +
.../src/main/resources/common_tokens/cym | 30933 ++++++----
.../src/main/resources/common_tokens/dan | 59964 +++++++++---------
.../src/main/resources/common_tokens/deu | 59986 +++++++++---------
.../src/main/resources/common_tokens/diq | 5984 ++
.../src/main/resources/common_tokens/div | 60010 +++++++++----------
.../src/main/resources/common_tokens/dsb | 1320 +
.../src/main/resources/common_tokens/ekk | 30022 ----------
.../src/main/resources/common_tokens/ell | 60004 +++++++++---------
.../src/main/resources/common_tokens/eml | 5000 ++
.../src/main/resources/common_tokens/eng | 59996 +++++++++---------
.../src/main/resources/common_tokens/epo | 59970 +++++++++---------
.../src/main/resources/common_tokens/est | 59990 +++++++++---------
.../src/main/resources/common_tokens/eus | 59746 +++++++++---------
.../src/main/resources/common_tokens/ewe | 1484 +
.../src/main/resources/common_tokens/ext | 2768 +
.../src/main/resources/common_tokens/fao | 59778 +++++++++---------
.../src/main/resources/common_tokens/fas | 60010 +++++++++----------
.../src/main/resources/common_tokens/fin | 59978 +++++++++---------
.../src/main/resources/common_tokens/fra | 59992 +++++++++---------
.../src/main/resources/common_tokens/frr | 1994 +
.../src/main/resources/common_tokens/fry | 46948 +++++++++------
.../src/main/resources/common_tokens/ful | 9679 ---
.../src/main/resources/common_tokens/gla | 26778 ---------
.../src/main/resources/common_tokens/gle | 47428 ++++++++-------
.../src/main/resources/common_tokens/glg | 60010 +++++++++----------
.../src/main/resources/common_tokens/glv | 2583 +
.../src/main/resources/common_tokens/gom | 18860 +++---
.../src/main/resources/common_tokens/grn | 3448 ++
.../src/main/resources/common_tokens/gsw | 42562 +++++++++----
.../src/main/resources/common_tokens/gug | 5528 --
.../src/main/resources/common_tokens/guj | 46584 ++++++--------
.../src/main/resources/common_tokens/hat | 11055 ++--
.../src/main/resources/common_tokens/hau | 37315 +++---------
.../src/main/resources/common_tokens/hbs | 30020 ++++++++++
.../src/main/resources/common_tokens/heb | 60010 +++++++++----------
.../src/main/resources/common_tokens/hif | 2137 +
.../src/main/resources/common_tokens/hin | 60006 +++++++++---------
.../src/main/resources/common_tokens/hin-rom | 30022 ----------
.../src/main/resources/common_tokens/hrv | 60008 +++++++++---------
.../src/main/resources/common_tokens/hsb | 5205 ++
.../src/main/resources/common_tokens/hun | 60000 +++++++++---------
.../src/main/resources/common_tokens/hye | 60010 +++++++++----------
.../src/main/resources/common_tokens/ibo | 11859 +---
.../src/main/resources/common_tokens/ido | 8656 +++
.../src/main/resources/common_tokens/ile | 956 +
.../src/main/resources/common_tokens/ilo | 3845 ++
.../src/main/resources/common_tokens/ina | 6880 +++
.../src/main/resources/common_tokens/ind | 59978 +++++++++---------
.../src/main/resources/common_tokens/isl | 59592 +++++++++---------
.../src/main/resources/common_tokens/ita | 59976 +++++++++---------
.../src/main/resources/common_tokens/jav | 42845 +++++++++----
.../src/main/resources/common_tokens/jpn | 60010 +++++++++----------
.../src/main/resources/common_tokens/kal | 6341 ++
.../src/main/resources/common_tokens/kan | 43538 +++++++++-----
.../src/main/resources/common_tokens/kat | 60004 +++++++++---------
.../src/main/resources/common_tokens/kaz | 60010 +++++++++----------
.../src/main/resources/common_tokens/khk | 4187 ++
.../src/main/resources/common_tokens/khm | 30022 ----------
.../src/main/resources/common_tokens/kin | 12065 ++--
.../src/main/resources/common_tokens/kir | 55878 +++++++++--------
.../src/main/resources/common_tokens/knn | 5022 --
.../src/main/resources/common_tokens/koi | 1373 +
.../src/main/resources/common_tokens/kom | 2382 +
.../src/main/resources/common_tokens/kor | 60010 +++++++++----------
.../src/main/resources/common_tokens/krc | 1974 +
.../src/main/resources/common_tokens/ksh | 2841 +
.../src/main/resources/common_tokens/kur | 42426 ++++---------
.../src/main/resources/common_tokens/lad | 1681 +
.../src/main/resources/common_tokens/lao | 30826 +---------
.../src/main/resources/common_tokens/lat | 42560 +++++++++----
.../src/main/resources/common_tokens/lav | 60008 +++++++++---------
.../src/main/resources/common_tokens/lim | 41907 +++++++++----
.../src/main/resources/common_tokens/lin | 5655 --
.../src/main/resources/common_tokens/lit | 60004 +++++++++---------
.../src/main/resources/common_tokens/lmo | 6924 +++
.../src/main/resources/common_tokens/ltz | 40377 +++++++++----
.../src/main/resources/common_tokens/lug | 53486 +++++++++--------
.../src/main/resources/common_tokens/lup | 905 +
.../src/main/resources/common_tokens/lus | 10034 ++++
.../src/main/resources/common_tokens/lvs | 30022 ----------
.../src/main/resources/common_tokens/mai | 755 +
.../src/main/resources/common_tokens/mal | 31778 +++++++++-
.../src/main/resources/common_tokens/mar | 56710 +++++++++---------
.../src/main/resources/common_tokens/mhr | 8357 ++-
.../src/main/resources/common_tokens/min | 16659 +++--
.../src/main/resources/common_tokens/mkd | 60010 +++++++++----------
.../src/main/resources/common_tokens/mlg | 11905 ++--
.../src/main/resources/common_tokens/mlt | 53228 ++++++++--------
.../src/main/resources/common_tokens/mon | 44624 +++++++++-----
.../src/main/resources/common_tokens/mri | 14151 +++--
.../src/main/resources/common_tokens/mrj | 1994 +
.../src/main/resources/common_tokens/msa | 58366 +++++++++---------
.../src/main/resources/common_tokens/mwl | 12264 ++++
.../src/main/resources/common_tokens/mya | 30022 ----------
.../src/main/resources/common_tokens/mya-zaw | 30022 ----------
.../src/main/resources/common_tokens/myv | 1132 +
.../src/main/resources/common_tokens/mzn | 5501 ++
.../src/main/resources/common_tokens/nan | 9022 ++-
.../src/main/resources/common_tokens/nap | 2039 +
.../src/main/resources/common_tokens/nav | 533 +
.../src/main/resources/common_tokens/ndo | 3142 +
.../src/main/resources/common_tokens/nds | 35077 +++++++----
.../src/main/resources/common_tokens/nep | 50092 +++++++++-------
.../src/main/resources/common_tokens/new | 3537 +-
.../src/main/resources/common_tokens/nld | 59966 +++++++++---------
.../src/main/resources/common_tokens/nno | 60010 +++++++++----------
.../src/main/resources/common_tokens/nob | 60000 +++++++++---------
.../src/main/resources/common_tokens/nso | 8919 ++-
.../src/main/resources/common_tokens/oci | 12539 ----
.../src/main/resources/common_tokens/ori | 13578 +++--
.../src/main/resources/common_tokens/orm | 31985 +---------
.../src/main/resources/common_tokens/oss | 3369 ++
.../src/main/resources/common_tokens/pam | 3054 +
.../src/main/resources/common_tokens/pan | 18234 +++---
.../src/main/resources/common_tokens/pap | 9143 +++
.../src/main/resources/common_tokens/pes | 30022 ----------
.../src/main/resources/common_tokens/pfl | 2452 +
.../src/main/resources/common_tokens/pms | 6552 ++
.../src/main/resources/common_tokens/pnb | 37685 +++++++++---
.../src/main/resources/common_tokens/pol | 60000 +++++++++---------
.../src/main/resources/common_tokens/por | 59990 +++++++++---------
.../src/main/resources/common_tokens/prs | 12167 ++++
.../src/main/resources/common_tokens/pus | 34381 +++++++----
.../src/main/resources/common_tokens/que | 2170 +
.../src/main/resources/common_tokens/quz | 4441 --
.../src/main/resources/common_tokens/roh | 35391 ++++++-----
.../src/main/resources/common_tokens/ron | 60004 +++++++++---------
.../src/main/resources/common_tokens/rue | 2797 +
.../src/main/resources/common_tokens/run | 3534 ++
.../src/main/resources/common_tokens/rus | 60002 +++++++++---------
.../src/main/resources/common_tokens/sah | 14433 +++++
.../src/main/resources/common_tokens/san | 11972 +++-
.../src/main/resources/common_tokens/scn | 7559 +++
.../src/main/resources/common_tokens/sco | 12070 ++++
.../src/main/resources/common_tokens/sgs | 2547 +
.../src/main/resources/common_tokens/sin | 27552 ++++++---
.../src/main/resources/common_tokens/slk | 60008 +++++++++---------
.../src/main/resources/common_tokens/slv | 60010 +++++++++----------
.../src/main/resources/common_tokens/sme | 4120 ++
.../src/main/resources/common_tokens/smi | 1676 +
.../src/main/resources/common_tokens/sna | 23750 ++++++++
.../src/main/resources/common_tokens/snd | 18834 ++++--
.../src/main/resources/common_tokens/som | 37241 +++++++-----
.../src/main/resources/common_tokens/sot | 3535 ++
.../src/main/resources/common_tokens/spa | 59990 +++++++++---------
.../src/main/resources/common_tokens/sqi | 60010 +++++++++----------
.../src/main/resources/common_tokens/srd | 4285 +-
.../src/main/resources/common_tokens/srp | 60010 +++++++++----------
.../src/main/resources/common_tokens/ssw | 2278 +-
.../src/main/resources/common_tokens/sun | 29316 +++++++++
.../src/main/resources/common_tokens/swa | 9604 ---
.../src/main/resources/common_tokens/swe | 59992 +++++++++---------
.../src/main/resources/common_tokens/swh | 4172 ++
.../src/main/resources/common_tokens/szl | 2654 +
.../src/main/resources/common_tokens/tam | 42694 +++++++++----
.../src/main/resources/common_tokens/tam-rom | 30022 ----------
.../src/main/resources/common_tokens/tat | 59968 +++++++++---------
.../src/main/resources/common_tokens/tel | 54347 +++++++++--------
.../src/main/resources/common_tokens/tel-rom | 30022 ----------
.../src/main/resources/common_tokens/tgk | 49781 +++++++++------
.../src/main/resources/common_tokens/tgl | 59996 +++++++++---------
.../src/main/resources/common_tokens/tha | 33064 +++++++++-
.../src/main/resources/common_tokens/tsn | 19224 ++----
.../src/main/resources/common_tokens/tso | 4677 ++
.../src/main/resources/common_tokens/tuk | 35478 +++++------
.../src/main/resources/common_tokens/tur | 60006 +++++++++---------
.../src/main/resources/common_tokens/tyv | 2933 +
.../src/main/resources/common_tokens/udm | 1401 +
.../src/main/resources/common_tokens/uig | 34819 +++++------
.../src/main/resources/common_tokens/ukr | 59998 +++++++++---------
.../src/main/resources/common_tokens/urd | 59962 +++++++++---------
.../src/main/resources/common_tokens/urd-rom | 30022 ----------
.../src/main/resources/common_tokens/uzb | 56433 +++++++++--------
.../src/main/resources/common_tokens/uzn | 30020 ++++++++++
.../src/main/resources/common_tokens/ven | 2457 +
.../src/main/resources/common_tokens/vie | 55964 +++++++++--------
.../src/main/resources/common_tokens/vls | 7607 +++
.../src/main/resources/common_tokens/vol | 5010 +-
.../src/main/resources/common_tokens/vro | 2069 +
.../src/main/resources/common_tokens/war | 47024 +++++++++------
.../src/main/resources/common_tokens/wln | 5792 ++
.../src/main/resources/common_tokens/wol | 4502 --
.../src/main/resources/common_tokens/wuu | 30020 ++++++++++
.../src/main/resources/common_tokens/xho | 31235 +++++++---
.../src/main/resources/common_tokens/xmf | 6407 ++
.../src/main/resources/common_tokens/ydd | 9537 +++
.../src/main/resources/common_tokens/yid | 7668 ---
.../src/main/resources/common_tokens/yor | 4315 +-
.../src/main/resources/common_tokens/zea | 2318 +
.../src/main/resources/common_tokens/zho | 30020 ++++++++++
.../src/main/resources/common_tokens/zho-simp | 30022 ----------
.../src/main/resources/common_tokens/zho-trad | 30022 ----------
.../src/main/resources/common_tokens/zul | 45631 +++++++++-----
.../apache/tika/eval/core/langid/LangIdTest.java | 18 +-
.../tika/eval/core/textstats/TextStatsTest.java | 2 +-
.../core/tokens/TikaEvalTokenizerFuzzTest.java | 164 +
.../core/tokens}/tools/CommonTokenGenerator.java | 236 +-
tika-example/pom.xml | 2 +-
tika-grpc/pom.xml | 2 +-
tika-handlers/pom.xml | 2 +-
tika-handlers/tika-handler-boilerpipe/pom.xml | 2 +-
tika-integration-tests/pom.xml | 2 +-
.../tika-pipes-kafka-integration-tests/pom.xml | 2 +-
.../pom.xml | 2 +-
.../tika-pipes-s3-integration-tests/pom.xml | 2 +-
.../tika-pipes-solr-integration-tests/pom.xml | 2 +-
.../tika-resource-loading-tests/pom.xml | 2 +-
tika-integration-tests/tika-woodstox-tests/pom.xml | 2 +-
tika-java7/pom.xml | 2 +-
tika-langdetect/pom.xml | 3 +-
.../pom.xml | 22 +-
.../charsoup/CharSoupFeatureExtractor.java | 43 +-
.../tika/langdetect/charsoup/CharSoupModel.java | 260 +-
.../tika/langdetect/charsoup/FeatureExtractor.java | 0
.../charsoup/ScriptAwareFeatureExtractor.java | 0
.../tika/langdetect/charsoup/ScriptCategory.java | 0
.../tika/langdetect/charsoup/WordTokenizer.java | 0
.../apache/tika/langdetect/charsoup/langdetect.bin | Bin 0 -> 1641016 bytes
tika-langdetect/tika-langdetect-charsoup/pom.xml | 8 +-
...Detector.java => CharSoupLanguageDetector.java} | 10 +-
.../apache/tika/langdetect/charsoup/langdetect.bin | Bin 1632811 -> 0 bytes
.../langdetect/charsoup/LangIdRegressionTest.java | 154 +
.../langdetect/charsoup/tools/AblationRunner.java | 2 +-
.../charsoup/tools/CompareDetectors.java | 2 +-
.../langdetect/charsoup/tools/ConfusionDumper.java | 2 +-
.../langdetect/charsoup/tools/CrossDomainEval.java | 2 +-
.../langdetect/charsoup/tools/Phase2Trainer.java | 127 +-
.../langdetect/charsoup/tools/QuickF1Eval.java | 4 +-
.../charsoup/tools/TrainLanguageModel.java | 863 +-
.../langdetect/charsoup/tools/TrigramAblation.java | 2 +-
.../src/test/python/download_corpus.py | 86 +-
.../src/test/python/filter_pashto.py | 89 +
tika-langdetect/tika-langdetect-lingo24/pom.xml | 2 +-
tika-langdetect/tika-langdetect-mitll-text/pom.xml | 2 +-
tika-langdetect/tika-langdetect-opennlp/pom.xml | 2 +-
tika-langdetect/tika-langdetect-optimaize/pom.xml | 2 +-
.../tika-langdetect-test-commons/pom.xml | 2 +-
tika-langdetect/tika-langdetect-tika/pom.xml | 2 +-
tika-parent/pom.xml | 37 +-
tika-parsers/pom.xml | 2 +-
tika-parsers/tika-parsers-extended/pom.xml | 2 +-
.../tika-parser-scientific-module/pom.xml | 2 +-
.../tika-parser-scientific-package/pom.xml | 2 +-
.../tika-parser-sqlite3-module/pom.xml | 2 +-
.../tika-parser-sqlite3-package/pom.xml | 2 +-
.../pom.xml | 2 +-
tika-parsers/tika-parsers-ml/pom.xml | 3 +-
.../tika-parsers-ml/tika-parser-nlp-module/pom.xml | 2 +-
.../tika-parser-nlp-package/pom.xml | 2 +-
.../tika-parser-vlm-ocr-module/pom.xml | 132 +
.../apache/tika/parser/vlm/AbstractVLMParser.java | 464 +
.../apache/tika/parser/vlm/ClaudeVLMParser.java | 227 +
.../apache/tika/parser/vlm/GeminiVLMParser.java | 238 +
.../tika/parser/vlm/MarkdownToXHTMLEmitter.java | 409 +
.../apache/tika/parser/vlm/OpenAIVLMParser.java | 266 +
.../org/apache/tika/parser/vlm/VLMOCRConfig.java | 307 +
.../tika/parser/vlm/ClaudeVLMParserTest.java | 285 +
.../tika/parser/vlm/GeminiVLMParserTest.java | 260 +
.../parser/vlm/MarkdownToXHTMLEmitterTest.java | 253 +
.../tika/parser/vlm/OpenAIVLMParserTest.java | 291 +
.../tika-parsers-ml/tika-transcribe-aws/pom.xml | 2 +-
tika-parsers/tika-parsers-standard/pom.xml | 2 +-
.../tika-parsers-standard-modules/pom.xml | 2 +-
.../tika-parser-apple-module/pom.xml | 2 +-
.../tika-parser-audiovideo-module/pom.xml | 2 +-
.../tika-parser-cad-module/pom.xml | 2 +-
.../tika-parser-code-module/pom.xml | 2 +-
.../tika-parser-crypto-module/pom.xml | 2 +-
.../tika-parser-digest-commons/pom.xml | 2 +-
.../tika-parser-font-module/pom.xml | 2 +-
.../tika-parser-html-module/pom.xml | 2 +-
.../tika-parser-image-module/pom.xml | 2 +-
.../tika-parser-jdbc-commons/pom.xml | 2 +-
.../tika-parser-mail-commons/pom.xml | 2 +-
.../tika-parser-mail-module/pom.xml | 2 +-
.../tika-parser-microsoft-module/pom.xml | 2 +-
.../tika-parser-miscoffice-module/pom.xml | 2 +-
.../tika-parser-news-module/pom.xml | 2 +-
.../tika-parser-ocr-module/pom.xml | 2 +-
.../tika-parser-pdf-module/pom.xml | 2 +-
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 27 +
.../java/org/apache/tika/parser/pdf/OcrConfig.java | 60 +
.../apache/tika/parser/pdf/PDFParserConfig.java | 33 +
.../tika/renderer/pdf/mutool/MuPDFRenderer.java | 150 -
.../tika/renderer/pdf/poppler/PopplerRenderer.java | 293 +
.../renderer/pdf/poppler/PopplerRendererTest.java | 167 +
.../tika-parser-pkg-module/pom.xml | 2 +-
.../tika-parser-text-module/pom.xml | 2 +-
.../tika-parser-webarchive-module/pom.xml | 2 +-
.../tika-parser-xml-module/pom.xml | 2 +-
.../tika-parser-xmp-commons/pom.xml | 2 +-
.../tika-parser-zip-commons/pom.xml | 2 +-
.../tika-parsers-standard-package/pom.xml | 2 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 21 +-
...fig.json => tika-rendering-poppler-config.json} | 2 +-
tika-pipes/pom.xml | 2 +-
tika-pipes/tika-async-cli/pom.xml | 2 +-
tika-pipes/tika-httpclient-commons/pom.xml | 2 +-
tika-pipes/tika-pipes-api/pom.xml | 2 +-
tika-pipes/tika-pipes-config-store-ignite/pom.xml | 2 +-
.../tika/pipes/ignite/IgniteConfigStoreTest.java | 10 +-
tika-pipes/tika-pipes-core/pom.xml | 2 +-
tika-pipes/tika-pipes-fork-parser/pom.xml | 2 +-
tika-pipes/tika-pipes-integration-tests/pom.xml | 2 +-
tika-pipes/tika-pipes-iterator-commons/pom.xml | 2 +-
tika-pipes/tika-pipes-plugins/pom.xml | 2 +-
.../tika-pipes-atlassian-jwt/pom.xml | 2 +-
.../tika-pipes-plugins/tika-pipes-az-blob/pom.xml | 2 +-
.../tika-pipes-plugins/tika-pipes-csv/pom.xml | 2 +-
.../tika-pipes-file-system/pom.xml | 2 +-
.../tika-pipes-plugins/tika-pipes-gcs/pom.xml | 2 +-
.../tika-pipes-google-drive/pom.xml | 2 +-
.../tika-pipes-plugins/tika-pipes-http/pom.xml | 2 +-
.../tika-pipes-plugins/tika-pipes-jdbc/pom.xml | 2 +-
.../tika-pipes-plugins/tika-pipes-json/pom.xml | 2 +-
.../tika-pipes-plugins/tika-pipes-kafka/pom.xml | 2 +-
.../tika-pipes-microsoft-graph/pom.xml | 2 +-
.../tika-pipes-opensearch/pom.xml | 2 +-
.../tika-pipes-plugins/tika-pipes-s3/pom.xml | 2 +-
.../tika-pipes-plugins/tika-pipes-solr/pom.xml | 2 +-
tika-pipes/tika-pipes-reporter-commons/pom.xml | 2 +-
tika-plugins-core/pom.xml | 2 +-
tika-serialization/pom.xml | 2 +-
tika-server/pom.xml | 2 +-
tika-server/tika-server-client/pom.xml | 2 +-
tika-server/tika-server-core/pom.xml | 2 +-
tika-server/tika-server-standard/pom.xml | 2 +-
tika-translate/pom.xml | 2 +-
tika-xmp/pom.xml | 2 +-
403 files changed, 3409790 insertions(+), 3047475 deletions(-)
create mode 100644 docs/modules/ROOT/examples/claude-vlm-basic.json
create mode 100644 docs/modules/ROOT/examples/claude-vlm-full.json
create mode 100644 docs/modules/ROOT/examples/gemini-vlm-basic.json
create mode 100644 docs/modules/ROOT/examples/gemini-vlm-full.json
create mode 100644 docs/modules/ROOT/examples/openai-vlm-basic.json
create mode 100644 docs/modules/ROOT/examples/openai-vlm-full.json
create mode 100644 docs/modules/ROOT/examples/vlm-pdf-parsing.json
create mode 100644 docs/modules/ROOT/pages/advanced/local-vlm-server.adoc
create mode 100644
docs/modules/ROOT/pages/configuration/parsers/vlm-parsers.adoc
delete mode 100644
tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/BatchTopCommonTokenCounter.java
delete mode 100644
tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java
delete mode 100644
tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TopCommonTokenCounter.java
create mode 100644
tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TikaEvalTokenizer.java
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/arg
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/arz
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/bam
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/bar
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/bcl
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ben-rom
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/bih
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/bjn
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/bpy
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/bua
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/chv
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/cmn
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/cos
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/csb
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/diq
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/dsb
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ekk
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/eml
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ewe
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ext
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/frr
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ful
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/gla
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/glv
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/grn
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/gug
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/hbs
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/hif
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/hin-rom
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/hsb
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ido
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ile
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ilo
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ina
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/kal
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/khk
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/khm
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/knn
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/koi
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/kom
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/krc
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ksh
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/lad
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/lin
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/lmo
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/lup
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/lus
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/lvs
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/mai
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/mrj
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/mwl
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/mya
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/mya-zaw
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/myv
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/mzn
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/nap
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/nav
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ndo
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/oci
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/oss
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/pam
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/pap
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/pes
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/pfl
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/pms
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/prs
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/que
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/quz
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/rue
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/run
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/sah
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/scn
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/sco
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/sgs
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/sme
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/smi
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/sna
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/sot
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/sun
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/swa
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/swh
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/szl
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/tam-rom
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/tel-rom
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/tso
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/tyv
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/udm
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/urd-rom
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/uzn
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ven
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/vls
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/vro
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/wln
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/wol
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/wuu
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/xmf
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ydd
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/yid
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/zea
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/zho
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/zho-simp
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/zho-trad
create mode 100644
tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/TikaEvalTokenizerFuzzTest.java
rename
{tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup
=>
tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens}/tools/CommonTokenGenerator.java
(50%)
copy tika-langdetect/{tika-langdetect-test-commons =>
tika-langdetect-charsoup-core}/pom.xml (75%)
rename tika-langdetect/{tika-langdetect-charsoup =>
tika-langdetect-charsoup-core}/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
(90%)
rename tika-langdetect/{tika-langdetect-charsoup =>
tika-langdetect-charsoup-core}/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java
(58%)
rename tika-langdetect/{tika-langdetect-charsoup =>
tika-langdetect-charsoup-core}/src/main/java/org/apache/tika/langdetect/charsoup/FeatureExtractor.java
(100%)
rename tika-langdetect/{tika-langdetect-charsoup =>
tika-langdetect-charsoup-core}/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java
(100%)
rename tika-langdetect/{tika-langdetect-charsoup =>
tika-langdetect-charsoup-core}/src/main/java/org/apache/tika/langdetect/charsoup/ScriptCategory.java
(100%)
rename tika-langdetect/{tika-langdetect-charsoup =>
tika-langdetect-charsoup-core}/src/main/java/org/apache/tika/langdetect/charsoup/WordTokenizer.java
(100%)
create mode 100644
tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect.bin
rename
tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/{TikaLanguageDetector.java
=> CharSoupLanguageDetector.java} (98%)
delete mode 100644
tika-langdetect/tika-langdetect-charsoup/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect.bin
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/LangIdRegressionTest.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/python/filter_pashto.py
create mode 100644
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/pom.xml
create mode 100644
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/AbstractVLMParser.java
create mode 100644
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/ClaudeVLMParser.java
create mode 100644
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/GeminiVLMParser.java
create mode 100644
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitter.java
create mode 100644
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/OpenAIVLMParser.java
create mode 100644
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/VLMOCRConfig.java
create mode 100644
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/ClaudeVLMParserTest.java
create mode 100644
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/GeminiVLMParserTest.java
create mode 100644
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitterTest.java
create mode 100644
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/OpenAIVLMParserTest.java
delete mode 100644
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java
create mode 100644
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/poppler/PopplerRenderer.java
create mode 100644
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/renderer/pdf/poppler/PopplerRendererTest.java
rename
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/{tika-rendering-mupdf-config.json
=> tika-rendering-poppler-config.json} (85%)