This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4662
in repository https://gitbox.apache.org/repos/asf/tika.git


    from 1797fb0c01 TIKA-4662 -- checkstyle
     add 9b775e6568 Bump org.xerial:sqlite-jdbc from 3.51.1.0 to 3.51.2.0 (#2607)
     add 0dee7db5de Bump software.amazon.awssdk:bom from 2.41.28 to 2.41.29 (#2609)
     add 768cd4d2bf Bump org.springframework:spring-context from 7.0.3 to 7.0.4 (#2608)
     add 8918c66384 TIKA-4664 - add Poppler renderer, replace MuPDF, add OCR safety limits (#2612)
     add 2c98c63677 TIKA-4666 - add VLM parsers (Claude, Gemini, OpenAI) (#2614)
     add 314a6e7fea TIKA-4668 -- modernize versioning with $revision (#2616)
     add b6345d30f1 Merge origin/main into TIKA-4662
     add fd8561403a TIKA-4662 -- update common tokens and rebuild model

No new revisions were added by this update.

Summary of changes:
 .gitignore                                         |     1 +
 docs/modules/ROOT/examples/claude-vlm-basic.json   |    10 +
 docs/modules/ROOT/examples/claude-vlm-full.json    |    18 +
 docs/modules/ROOT/examples/gemini-vlm-basic.json   |    10 +
 docs/modules/ROOT/examples/gemini-vlm-full.json    |    18 +
 docs/modules/ROOT/examples/openai-vlm-basic.json   |    11 +
 docs/modules/ROOT/examples/openai-vlm-full.json    |    18 +
 docs/modules/ROOT/examples/vlm-pdf-parsing.json    |    16 +
 docs/modules/ROOT/nav.adoc                         |     2 +
 docs/modules/ROOT/pages/advanced/index.adoc        |     2 +
 .../pages/advanced/language-detection-build.adoc   |   561 +-
 .../ROOT/pages/advanced/language-detection.adoc    |   123 +-
 .../ROOT/pages/advanced/local-vlm-server.adoc      |   445 +
 .../pages/configuration/parsers/vlm-parsers.adoc   |   236 +
 docs/pom.xml                                       |     2 +-
 pom.xml                                            |     2 +-
 tika-annotation-processor/pom.xml                  |     2 +-
 tika-app/pom.xml                                   |     2 +-
 tika-bom/pom.xml                                   |   155 +-
 tika-bundles/pom.xml                               |     4 +-
 tika-bundles/tika-bundle-standard/pom.xml          |     4 +-
 tika-core/pom.xml                                  |     2 +-
 tika-detectors/pom.xml                             |     2 +-
 tika-detectors/tika-detector-magika/pom.xml        |     2 +-
 tika-detectors/tika-detector-siegfried/pom.xml     |     2 +-
 tika-e2e-tests/pom.xml                             |    32 +-
 tika-e2e-tests/tika-grpc/pom.xml                   |     2 +-
 tika-eval/pom.xml                                  |     2 +-
 tika-eval/tika-eval-app/pom.xml                    |     2 +-
 .../org/apache/tika/eval/app/ProfilerBase.java     |     2 +-
 .../eval/app/tools/BatchTopCommonTokenCounter.java |    60 -
 .../eval/app/tools/SlowCompositeReaderWrapper.java |   391 -
 .../tika/eval/app/tools/TopCommonTokenCounter.java |   363 -
 .../apache/tika/eval/app/AnalyzerManagerTest.java  |    76 +-
 .../apache/tika/eval/app/SimpleComparerTest.java   |    12 +-
 .../eval/app/tools/TopCommonTokenCounterTest.java  |    80 +-
 .../resources/test-dirs/extractsB/file1.pdf.json   |     2 +-
 tika-eval/tika-eval-core/pom.xml                   |     2 +-
 .../tika/eval/core/langid/LanguageIDWrapper.java   |     9 +-
 .../tika/eval/core/tokens/AnalyzerManager.java     |    33 +-
 .../eval/core/tokens/CommonTokenCountManager.java  |     8 -
 .../tika/eval/core/tokens/TikaEvalTokenizer.java   |   300 +
 .../src/main/resources/common_tokens/afr           | 60004 +++++++++---------
 .../src/main/resources/common_tokens/amh           | 25580 ++++----
 .../src/main/resources/common_tokens/ara           | 60008 +++++++++---------
 .../src/main/resources/common_tokens/arg           |  9618 +++
 .../src/main/resources/common_tokens/arz           | 17123 ++++++
 .../src/main/resources/common_tokens/asm           | 14736 ++---
 .../src/main/resources/common_tokens/ast           | 43989 +++++++++-----
 .../src/main/resources/common_tokens/aze           | 60004 +++++++++---------
 .../src/main/resources/common_tokens/bak           | 53066 +++++++++-------
 .../src/main/resources/common_tokens/bam           |  1192 +
 .../src/main/resources/common_tokens/ban           |  9402 +--
 .../src/main/resources/common_tokens/bar           | 17900 ++++++
 .../src/main/resources/common_tokens/bcl           |  2863 +
 .../src/main/resources/common_tokens/bel           | 60008 +++++++++---------
 .../src/main/resources/common_tokens/ben           | 43908 +++++++++-----
 .../src/main/resources/common_tokens/ben-rom       | 30022 ----------
 .../src/main/resources/common_tokens/bih           |   903 -
 .../src/main/resources/common_tokens/bjn           |  4081 ++
 .../src/main/resources/common_tokens/bos           | 60004 +++++++++---------
 .../src/main/resources/common_tokens/bpy           |  1001 +
 .../src/main/resources/common_tokens/bre           | 26954 ++++++---
 .../src/main/resources/common_tokens/bua           |  2734 +
 .../src/main/resources/common_tokens/bul           | 59964 +++++++++---------
 .../src/main/resources/common_tokens/cat           | 60000 +++++++++---------
 .../src/main/resources/common_tokens/ceb           | 55570 +++++++++--------
 .../src/main/resources/common_tokens/ces           | 59978 +++++++++---------
 .../src/main/resources/common_tokens/che           | 10343 ++--
 .../src/main/resources/common_tokens/chv           | 10199 ++++
 .../src/main/resources/common_tokens/ckb           | 16758 +++++-
 .../src/main/resources/common_tokens/cmn           | 30022 ----------
 .../src/main/resources/common_tokens/cos           |  2835 +
 .../src/main/resources/common_tokens/csb           |  1668 +
 .../src/main/resources/common_tokens/cym           | 30933 ++++++----
 .../src/main/resources/common_tokens/dan           | 59964 +++++++++---------
 .../src/main/resources/common_tokens/deu           | 59986 +++++++++---------
 .../src/main/resources/common_tokens/diq           |  5984 ++
 .../src/main/resources/common_tokens/div           | 60010 +++++++++----------
 .../src/main/resources/common_tokens/dsb           |  1320 +
 .../src/main/resources/common_tokens/ekk           | 30022 ----------
 .../src/main/resources/common_tokens/ell           | 60004 +++++++++---------
 .../src/main/resources/common_tokens/eml           |  5000 ++
 .../src/main/resources/common_tokens/eng           | 59996 +++++++++---------
 .../src/main/resources/common_tokens/epo           | 59970 +++++++++---------
 .../src/main/resources/common_tokens/est           | 59990 +++++++++---------
 .../src/main/resources/common_tokens/eus           | 59746 +++++++++---------
 .../src/main/resources/common_tokens/ewe           |  1484 +
 .../src/main/resources/common_tokens/ext           |  2768 +
 .../src/main/resources/common_tokens/fao           | 59778 +++++++++---------
 .../src/main/resources/common_tokens/fas           | 60010 +++++++++----------
 .../src/main/resources/common_tokens/fin           | 59978 +++++++++---------
 .../src/main/resources/common_tokens/fra           | 59992 +++++++++---------
 .../src/main/resources/common_tokens/frr           |  1994 +
 .../src/main/resources/common_tokens/fry           | 46948 +++++++++------
 .../src/main/resources/common_tokens/ful           |  9679 ---
 .../src/main/resources/common_tokens/gla           | 26778 ---------
 .../src/main/resources/common_tokens/gle           | 47428 ++++++++-------
 .../src/main/resources/common_tokens/glg           | 60010 +++++++++----------
 .../src/main/resources/common_tokens/glv           |  2583 +
 .../src/main/resources/common_tokens/gom           | 18860 +++---
 .../src/main/resources/common_tokens/grn           |  3448 ++
 .../src/main/resources/common_tokens/gsw           | 42562 +++++++++----
 .../src/main/resources/common_tokens/gug           |  5528 --
 .../src/main/resources/common_tokens/guj           | 46584 ++++++--------
 .../src/main/resources/common_tokens/hat           | 11055 ++--
 .../src/main/resources/common_tokens/hau           | 37315 +++---------
 .../src/main/resources/common_tokens/hbs           | 30020 ++++++++++
 .../src/main/resources/common_tokens/heb           | 60010 +++++++++----------
 .../src/main/resources/common_tokens/hif           |  2137 +
 .../src/main/resources/common_tokens/hin           | 60006 +++++++++---------
 .../src/main/resources/common_tokens/hin-rom       | 30022 ----------
 .../src/main/resources/common_tokens/hrv           | 60008 +++++++++---------
 .../src/main/resources/common_tokens/hsb           |  5205 ++
 .../src/main/resources/common_tokens/hun           | 60000 +++++++++---------
 .../src/main/resources/common_tokens/hye           | 60010 +++++++++----------
 .../src/main/resources/common_tokens/ibo           | 11859 +---
 .../src/main/resources/common_tokens/ido           |  8656 +++
 .../src/main/resources/common_tokens/ile           |   956 +
 .../src/main/resources/common_tokens/ilo           |  3845 ++
 .../src/main/resources/common_tokens/ina           |  6880 +++
 .../src/main/resources/common_tokens/ind           | 59978 +++++++++---------
 .../src/main/resources/common_tokens/isl           | 59592 +++++++++---------
 .../src/main/resources/common_tokens/ita           | 59976 +++++++++---------
 .../src/main/resources/common_tokens/jav           | 42845 +++++++++----
 .../src/main/resources/common_tokens/jpn           | 60010 +++++++++----------
 .../src/main/resources/common_tokens/kal           |  6341 ++
 .../src/main/resources/common_tokens/kan           | 43538 +++++++++-----
 .../src/main/resources/common_tokens/kat           | 60004 +++++++++---------
 .../src/main/resources/common_tokens/kaz           | 60010 +++++++++----------
 .../src/main/resources/common_tokens/khk           |  4187 ++
 .../src/main/resources/common_tokens/khm           | 30022 ----------
 .../src/main/resources/common_tokens/kin           | 12065 ++--
 .../src/main/resources/common_tokens/kir           | 55878 +++++++++--------
 .../src/main/resources/common_tokens/knn           |  5022 --
 .../src/main/resources/common_tokens/koi           |  1373 +
 .../src/main/resources/common_tokens/kom           |  2382 +
 .../src/main/resources/common_tokens/kor           | 60010 +++++++++----------
 .../src/main/resources/common_tokens/krc           |  1974 +
 .../src/main/resources/common_tokens/ksh           |  2841 +
 .../src/main/resources/common_tokens/kur           | 42426 ++++---------
 .../src/main/resources/common_tokens/lad           |  1681 +
 .../src/main/resources/common_tokens/lao           | 30826 +---------
 .../src/main/resources/common_tokens/lat           | 42560 +++++++++----
 .../src/main/resources/common_tokens/lav           | 60008 +++++++++---------
 .../src/main/resources/common_tokens/lim           | 41907 +++++++++----
 .../src/main/resources/common_tokens/lin           |  5655 --
 .../src/main/resources/common_tokens/lit           | 60004 +++++++++---------
 .../src/main/resources/common_tokens/lmo           |  6924 +++
 .../src/main/resources/common_tokens/ltz           | 40377 +++++++++----
 .../src/main/resources/common_tokens/lug           | 53486 +++++++++--------
 .../src/main/resources/common_tokens/lup           |   905 +
 .../src/main/resources/common_tokens/lus           | 10034 ++++
 .../src/main/resources/common_tokens/lvs           | 30022 ----------
 .../src/main/resources/common_tokens/mai           |   755 +
 .../src/main/resources/common_tokens/mal           | 31778 +++++++++-
 .../src/main/resources/common_tokens/mar           | 56710 +++++++++---------
 .../src/main/resources/common_tokens/mhr           |  8357 ++-
 .../src/main/resources/common_tokens/min           | 16659 +++--
 .../src/main/resources/common_tokens/mkd           | 60010 +++++++++----------
 .../src/main/resources/common_tokens/mlg           | 11905 ++--
 .../src/main/resources/common_tokens/mlt           | 53228 ++++++++--------
 .../src/main/resources/common_tokens/mon           | 44624 +++++++++-----
 .../src/main/resources/common_tokens/mri           | 14151 +++--
 .../src/main/resources/common_tokens/mrj           |  1994 +
 .../src/main/resources/common_tokens/msa           | 58366 +++++++++---------
 .../src/main/resources/common_tokens/mwl           | 12264 ++++
 .../src/main/resources/common_tokens/mya           | 30022 ----------
 .../src/main/resources/common_tokens/mya-zaw       | 30022 ----------
 .../src/main/resources/common_tokens/myv           |  1132 +
 .../src/main/resources/common_tokens/mzn           |  5501 ++
 .../src/main/resources/common_tokens/nan           |  9022 ++-
 .../src/main/resources/common_tokens/nap           |  2039 +
 .../src/main/resources/common_tokens/nav           |   533 +
 .../src/main/resources/common_tokens/ndo           |  3142 +
 .../src/main/resources/common_tokens/nds           | 35077 +++++++----
 .../src/main/resources/common_tokens/nep           | 50092 +++++++++-------
 .../src/main/resources/common_tokens/new           |  3537 +-
 .../src/main/resources/common_tokens/nld           | 59966 +++++++++---------
 .../src/main/resources/common_tokens/nno           | 60010 +++++++++----------
 .../src/main/resources/common_tokens/nob           | 60000 +++++++++---------
 .../src/main/resources/common_tokens/nso           |  8919 ++-
 .../src/main/resources/common_tokens/oci           | 12539 ----
 .../src/main/resources/common_tokens/ori           | 13578 +++--
 .../src/main/resources/common_tokens/orm           | 31985 +---------
 .../src/main/resources/common_tokens/oss           |  3369 ++
 .../src/main/resources/common_tokens/pam           |  3054 +
 .../src/main/resources/common_tokens/pan           | 18234 +++---
 .../src/main/resources/common_tokens/pap           |  9143 +++
 .../src/main/resources/common_tokens/pes           | 30022 ----------
 .../src/main/resources/common_tokens/pfl           |  2452 +
 .../src/main/resources/common_tokens/pms           |  6552 ++
 .../src/main/resources/common_tokens/pnb           | 37685 +++++++++---
 .../src/main/resources/common_tokens/pol           | 60000 +++++++++---------
 .../src/main/resources/common_tokens/por           | 59990 +++++++++---------
 .../src/main/resources/common_tokens/prs           | 12167 ++++
 .../src/main/resources/common_tokens/pus           | 34381 +++++++----
 .../src/main/resources/common_tokens/que           |  2170 +
 .../src/main/resources/common_tokens/quz           |  4441 --
 .../src/main/resources/common_tokens/roh           | 35391 ++++++-----
 .../src/main/resources/common_tokens/ron           | 60004 +++++++++---------
 .../src/main/resources/common_tokens/rue           |  2797 +
 .../src/main/resources/common_tokens/run           |  3534 ++
 .../src/main/resources/common_tokens/rus           | 60002 +++++++++---------
 .../src/main/resources/common_tokens/sah           | 14433 +++++
 .../src/main/resources/common_tokens/san           | 11972 +++-
 .../src/main/resources/common_tokens/scn           |  7559 +++
 .../src/main/resources/common_tokens/sco           | 12070 ++++
 .../src/main/resources/common_tokens/sgs           |  2547 +
 .../src/main/resources/common_tokens/sin           | 27552 ++++++---
 .../src/main/resources/common_tokens/slk           | 60008 +++++++++---------
 .../src/main/resources/common_tokens/slv           | 60010 +++++++++----------
 .../src/main/resources/common_tokens/sme           |  4120 ++
 .../src/main/resources/common_tokens/smi           |  1676 +
 .../src/main/resources/common_tokens/sna           | 23750 ++++++++
 .../src/main/resources/common_tokens/snd           | 18834 ++++--
 .../src/main/resources/common_tokens/som           | 37241 +++++++-----
 .../src/main/resources/common_tokens/sot           |  3535 ++
 .../src/main/resources/common_tokens/spa           | 59990 +++++++++---------
 .../src/main/resources/common_tokens/sqi           | 60010 +++++++++----------
 .../src/main/resources/common_tokens/srd           |  4285 +-
 .../src/main/resources/common_tokens/srp           | 60010 +++++++++----------
 .../src/main/resources/common_tokens/ssw           |  2278 +-
 .../src/main/resources/common_tokens/sun           | 29316 +++++++++
 .../src/main/resources/common_tokens/swa           |  9604 ---
 .../src/main/resources/common_tokens/swe           | 59992 +++++++++---------
 .../src/main/resources/common_tokens/swh           |  4172 ++
 .../src/main/resources/common_tokens/szl           |  2654 +
 .../src/main/resources/common_tokens/tam           | 42694 +++++++++----
 .../src/main/resources/common_tokens/tam-rom       | 30022 ----------
 .../src/main/resources/common_tokens/tat           | 59968 +++++++++---------
 .../src/main/resources/common_tokens/tel           | 54347 +++++++++--------
 .../src/main/resources/common_tokens/tel-rom       | 30022 ----------
 .../src/main/resources/common_tokens/tgk           | 49781 +++++++++------
 .../src/main/resources/common_tokens/tgl           | 59996 +++++++++---------
 .../src/main/resources/common_tokens/tha           | 33064 +++++++++-
 .../src/main/resources/common_tokens/tsn           | 19224 ++----
 .../src/main/resources/common_tokens/tso           |  4677 ++
 .../src/main/resources/common_tokens/tuk           | 35478 +++++------
 .../src/main/resources/common_tokens/tur           | 60006 +++++++++---------
 .../src/main/resources/common_tokens/tyv           |  2933 +
 .../src/main/resources/common_tokens/udm           |  1401 +
 .../src/main/resources/common_tokens/uig           | 34819 +++++------
 .../src/main/resources/common_tokens/ukr           | 59998 +++++++++---------
 .../src/main/resources/common_tokens/urd           | 59962 +++++++++---------
 .../src/main/resources/common_tokens/urd-rom       | 30022 ----------
 .../src/main/resources/common_tokens/uzb           | 56433 +++++++++--------
 .../src/main/resources/common_tokens/uzn           | 30020 ++++++++++
 .../src/main/resources/common_tokens/ven           |  2457 +
 .../src/main/resources/common_tokens/vie           | 55964 +++++++++--------
 .../src/main/resources/common_tokens/vls           |  7607 +++
 .../src/main/resources/common_tokens/vol           |  5010 +-
 .../src/main/resources/common_tokens/vro           |  2069 +
 .../src/main/resources/common_tokens/war           | 47024 +++++++++------
 .../src/main/resources/common_tokens/wln           |  5792 ++
 .../src/main/resources/common_tokens/wol           |  4502 --
 .../src/main/resources/common_tokens/wuu           | 30020 ++++++++++
 .../src/main/resources/common_tokens/xho           | 31235 +++++++---
 .../src/main/resources/common_tokens/xmf           |  6407 ++
 .../src/main/resources/common_tokens/ydd           |  9537 +++
 .../src/main/resources/common_tokens/yid           |  7668 ---
 .../src/main/resources/common_tokens/yor           |  4315 +-
 .../src/main/resources/common_tokens/zea           |  2318 +
 .../src/main/resources/common_tokens/zho           | 30020 ++++++++++
 .../src/main/resources/common_tokens/zho-simp      | 30022 ----------
 .../src/main/resources/common_tokens/zho-trad      | 30022 ----------
 .../src/main/resources/common_tokens/zul           | 45631 +++++++++-----
 .../apache/tika/eval/core/langid/LangIdTest.java   |    18 +-
 .../tika/eval/core/textstats/TextStatsTest.java    |     2 +-
 .../core/tokens/TikaEvalTokenizerFuzzTest.java     |   164 +
 .../core/tokens}/tools/CommonTokenGenerator.java   |   236 +-
 tika-example/pom.xml                               |     2 +-
 tika-grpc/pom.xml                                  |     2 +-
 tika-handlers/pom.xml                              |     2 +-
 tika-handlers/tika-handler-boilerpipe/pom.xml      |     2 +-
 tika-integration-tests/pom.xml                     |     2 +-
 .../tika-pipes-kafka-integration-tests/pom.xml     |     2 +-
 .../pom.xml                                        |     2 +-
 .../tika-pipes-s3-integration-tests/pom.xml        |     2 +-
 .../tika-pipes-solr-integration-tests/pom.xml      |     2 +-
 .../tika-resource-loading-tests/pom.xml            |     2 +-
 tika-integration-tests/tika-woodstox-tests/pom.xml |     2 +-
 tika-java7/pom.xml                                 |     2 +-
 tika-langdetect/pom.xml                            |     3 +-
 .../pom.xml                                        |    22 +-
 .../charsoup/CharSoupFeatureExtractor.java         |    43 +-
 .../tika/langdetect/charsoup/CharSoupModel.java    |   260 +-
 .../tika/langdetect/charsoup/FeatureExtractor.java |     0
 .../charsoup/ScriptAwareFeatureExtractor.java      |     0
 .../tika/langdetect/charsoup/ScriptCategory.java   |     0
 .../tika/langdetect/charsoup/WordTokenizer.java    |     0
 .../apache/tika/langdetect/charsoup/langdetect.bin |   Bin 0 -> 1641016 bytes
 tika-langdetect/tika-langdetect-charsoup/pom.xml   |     8 +-
 ...Detector.java => CharSoupLanguageDetector.java} |    10 +-
 .../apache/tika/langdetect/charsoup/langdetect.bin |   Bin 1632811 -> 0 bytes
 .../langdetect/charsoup/LangIdRegressionTest.java  |   154 +
 .../langdetect/charsoup/tools/AblationRunner.java  |     2 +-
 .../charsoup/tools/CompareDetectors.java           |     2 +-
 .../langdetect/charsoup/tools/ConfusionDumper.java |     2 +-
 .../langdetect/charsoup/tools/CrossDomainEval.java |     2 +-
 .../langdetect/charsoup/tools/Phase2Trainer.java   |   127 +-
 .../langdetect/charsoup/tools/QuickF1Eval.java     |     4 +-
 .../charsoup/tools/TrainLanguageModel.java         |   863 +-
 .../langdetect/charsoup/tools/TrigramAblation.java |     2 +-
 .../src/test/python/download_corpus.py             |    86 +-
 .../src/test/python/filter_pashto.py               |    89 +
 tika-langdetect/tika-langdetect-lingo24/pom.xml    |     2 +-
 tika-langdetect/tika-langdetect-mitll-text/pom.xml |     2 +-
 tika-langdetect/tika-langdetect-opennlp/pom.xml    |     2 +-
 tika-langdetect/tika-langdetect-optimaize/pom.xml  |     2 +-
 .../tika-langdetect-test-commons/pom.xml           |     2 +-
 tika-langdetect/tika-langdetect-tika/pom.xml       |     2 +-
 tika-parent/pom.xml                                |    37 +-
 tika-parsers/pom.xml                               |     2 +-
 tika-parsers/tika-parsers-extended/pom.xml         |     2 +-
 .../tika-parser-scientific-module/pom.xml          |     2 +-
 .../tika-parser-scientific-package/pom.xml         |     2 +-
 .../tika-parser-sqlite3-module/pom.xml             |     2 +-
 .../tika-parser-sqlite3-package/pom.xml            |     2 +-
 .../pom.xml                                        |     2 +-
 tika-parsers/tika-parsers-ml/pom.xml               |     3 +-
 .../tika-parsers-ml/tika-parser-nlp-module/pom.xml |     2 +-
 .../tika-parser-nlp-package/pom.xml                |     2 +-
 .../tika-parser-vlm-ocr-module/pom.xml             |   132 +
 .../apache/tika/parser/vlm/AbstractVLMParser.java  |   464 +
 .../apache/tika/parser/vlm/ClaudeVLMParser.java    |   227 +
 .../apache/tika/parser/vlm/GeminiVLMParser.java    |   238 +
 .../tika/parser/vlm/MarkdownToXHTMLEmitter.java    |   409 +
 .../apache/tika/parser/vlm/OpenAIVLMParser.java    |   266 +
 .../org/apache/tika/parser/vlm/VLMOCRConfig.java   |   307 +
 .../tika/parser/vlm/ClaudeVLMParserTest.java       |   285 +
 .../tika/parser/vlm/GeminiVLMParserTest.java       |   260 +
 .../parser/vlm/MarkdownToXHTMLEmitterTest.java     |   253 +
 .../tika/parser/vlm/OpenAIVLMParserTest.java       |   291 +
 .../tika-parsers-ml/tika-transcribe-aws/pom.xml    |     2 +-
 tika-parsers/tika-parsers-standard/pom.xml         |     2 +-
 .../tika-parsers-standard-modules/pom.xml          |     2 +-
 .../tika-parser-apple-module/pom.xml               |     2 +-
 .../tika-parser-audiovideo-module/pom.xml          |     2 +-
 .../tika-parser-cad-module/pom.xml                 |     2 +-
 .../tika-parser-code-module/pom.xml                |     2 +-
 .../tika-parser-crypto-module/pom.xml              |     2 +-
 .../tika-parser-digest-commons/pom.xml             |     2 +-
 .../tika-parser-font-module/pom.xml                |     2 +-
 .../tika-parser-html-module/pom.xml                |     2 +-
 .../tika-parser-image-module/pom.xml               |     2 +-
 .../tika-parser-jdbc-commons/pom.xml               |     2 +-
 .../tika-parser-mail-commons/pom.xml               |     2 +-
 .../tika-parser-mail-module/pom.xml                |     2 +-
 .../tika-parser-microsoft-module/pom.xml           |     2 +-
 .../tika-parser-miscoffice-module/pom.xml          |     2 +-
 .../tika-parser-news-module/pom.xml                |     2 +-
 .../tika-parser-ocr-module/pom.xml                 |     2 +-
 .../tika-parser-pdf-module/pom.xml                 |     2 +-
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |    27 +
 .../java/org/apache/tika/parser/pdf/OcrConfig.java |    60 +
 .../apache/tika/parser/pdf/PDFParserConfig.java    |    33 +
 .../tika/renderer/pdf/mutool/MuPDFRenderer.java    |   150 -
 .../tika/renderer/pdf/poppler/PopplerRenderer.java |   293 +
 .../renderer/pdf/poppler/PopplerRendererTest.java  |   167 +
 .../tika-parser-pkg-module/pom.xml                 |     2 +-
 .../tika-parser-text-module/pom.xml                |     2 +-
 .../tika-parser-webarchive-module/pom.xml          |     2 +-
 .../tika-parser-xml-module/pom.xml                 |     2 +-
 .../tika-parser-xmp-commons/pom.xml                |     2 +-
 .../tika-parser-zip-commons/pom.xml                |     2 +-
 .../tika-parsers-standard-package/pom.xml          |     2 +-
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |    21 +-
 ...fig.json => tika-rendering-poppler-config.json} |     2 +-
 tika-pipes/pom.xml                                 |     2 +-
 tika-pipes/tika-async-cli/pom.xml                  |     2 +-
 tika-pipes/tika-httpclient-commons/pom.xml         |     2 +-
 tika-pipes/tika-pipes-api/pom.xml                  |     2 +-
 tika-pipes/tika-pipes-config-store-ignite/pom.xml  |     2 +-
 .../tika/pipes/ignite/IgniteConfigStoreTest.java   |    10 +-
 tika-pipes/tika-pipes-core/pom.xml                 |     2 +-
 tika-pipes/tika-pipes-fork-parser/pom.xml          |     2 +-
 tika-pipes/tika-pipes-integration-tests/pom.xml    |     2 +-
 tika-pipes/tika-pipes-iterator-commons/pom.xml     |     2 +-
 tika-pipes/tika-pipes-plugins/pom.xml              |     2 +-
 .../tika-pipes-atlassian-jwt/pom.xml               |     2 +-
 .../tika-pipes-plugins/tika-pipes-az-blob/pom.xml  |     2 +-
 .../tika-pipes-plugins/tika-pipes-csv/pom.xml      |     2 +-
 .../tika-pipes-file-system/pom.xml                 |     2 +-
 .../tika-pipes-plugins/tika-pipes-gcs/pom.xml      |     2 +-
 .../tika-pipes-google-drive/pom.xml                |     2 +-
 .../tika-pipes-plugins/tika-pipes-http/pom.xml     |     2 +-
 .../tika-pipes-plugins/tika-pipes-jdbc/pom.xml     |     2 +-
 .../tika-pipes-plugins/tika-pipes-json/pom.xml     |     2 +-
 .../tika-pipes-plugins/tika-pipes-kafka/pom.xml    |     2 +-
 .../tika-pipes-microsoft-graph/pom.xml             |     2 +-
 .../tika-pipes-opensearch/pom.xml                  |     2 +-
 .../tika-pipes-plugins/tika-pipes-s3/pom.xml       |     2 +-
 .../tika-pipes-plugins/tika-pipes-solr/pom.xml     |     2 +-
 tika-pipes/tika-pipes-reporter-commons/pom.xml     |     2 +-
 tika-plugins-core/pom.xml                          |     2 +-
 tika-serialization/pom.xml                         |     2 +-
 tika-server/pom.xml                                |     2 +-
 tika-server/tika-server-client/pom.xml             |     2 +-
 tika-server/tika-server-core/pom.xml               |     2 +-
 tika-server/tika-server-standard/pom.xml           |     2 +-
 tika-translate/pom.xml                             |     2 +-
 tika-xmp/pom.xml                                   |     2 +-
 403 files changed, 3409790 insertions(+), 3047475 deletions(-)
 create mode 100644 docs/modules/ROOT/examples/claude-vlm-basic.json
 create mode 100644 docs/modules/ROOT/examples/claude-vlm-full.json
 create mode 100644 docs/modules/ROOT/examples/gemini-vlm-basic.json
 create mode 100644 docs/modules/ROOT/examples/gemini-vlm-full.json
 create mode 100644 docs/modules/ROOT/examples/openai-vlm-basic.json
 create mode 100644 docs/modules/ROOT/examples/openai-vlm-full.json
 create mode 100644 docs/modules/ROOT/examples/vlm-pdf-parsing.json
 create mode 100644 docs/modules/ROOT/pages/advanced/local-vlm-server.adoc
 create mode 100644 docs/modules/ROOT/pages/configuration/parsers/vlm-parsers.adoc
 delete mode 100644 
tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/BatchTopCommonTokenCounter.java
 delete mode 100644 
tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java
 delete mode 100644 
tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TopCommonTokenCounter.java
 create mode 100644 
tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TikaEvalTokenizer.java
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/arg
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/arz
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/bam
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/bar
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/bcl
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ben-rom
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/bih
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/bjn
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/bpy
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/bua
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/chv
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/cmn
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/cos
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/csb
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/diq
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/dsb
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ekk
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/eml
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ewe
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ext
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/frr
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ful
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/gla
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/glv
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/grn
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/gug
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/hbs
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/hif
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/hin-rom
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/hsb
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ido
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ile
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ilo
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ina
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/kal
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/khk
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/khm
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/knn
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/koi
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/kom
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/krc
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ksh
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/lad
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/lin
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/lmo
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/lup
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/lus
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/lvs
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/mai
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/mrj
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/mwl
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/mya
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/mya-zaw
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/myv
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/mzn
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/nap
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/nav
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ndo
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/oci
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/oss
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/pam
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/pap
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/pes
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/pfl
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/pms
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/prs
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/que
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/quz
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/rue
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/run
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/sah
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/scn
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/sco
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/sgs
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/sme
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/smi
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/sna
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/sot
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/sun
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/swa
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/swh
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/szl
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/tam-rom
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/tel-rom
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/tso
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/tyv
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/udm
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/urd-rom
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/uzn
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ven
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/vls
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/vro
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/wln
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/wol
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/wuu
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/xmf
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ydd
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/yid
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/zea
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/zho
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/zho-simp
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/zho-trad
 create mode 100644 
tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/TikaEvalTokenizerFuzzTest.java
 rename 
{tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup
 => 
tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens}/tools/CommonTokenGenerator.java
 (50%)
 copy tika-langdetect/{tika-langdetect-test-commons => 
tika-langdetect-charsoup-core}/pom.xml (75%)
 rename tika-langdetect/{tika-langdetect-charsoup => 
tika-langdetect-charsoup-core}/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
 (90%)
 rename tika-langdetect/{tika-langdetect-charsoup => 
tika-langdetect-charsoup-core}/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java
 (58%)
 rename tika-langdetect/{tika-langdetect-charsoup => 
tika-langdetect-charsoup-core}/src/main/java/org/apache/tika/langdetect/charsoup/FeatureExtractor.java
 (100%)
 rename tika-langdetect/{tika-langdetect-charsoup => 
tika-langdetect-charsoup-core}/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java
 (100%)
 rename tika-langdetect/{tika-langdetect-charsoup => 
tika-langdetect-charsoup-core}/src/main/java/org/apache/tika/langdetect/charsoup/ScriptCategory.java
 (100%)
 rename tika-langdetect/{tika-langdetect-charsoup => 
tika-langdetect-charsoup-core}/src/main/java/org/apache/tika/langdetect/charsoup/WordTokenizer.java
 (100%)
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect.bin
 rename 
tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/{TikaLanguageDetector.java
 => CharSoupLanguageDetector.java} (98%)
 delete mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect.bin
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/LangIdRegressionTest.java
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/python/filter_pashto.py
 create mode 100644 
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/pom.xml
 create mode 100644 
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/AbstractVLMParser.java
 create mode 100644 
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/ClaudeVLMParser.java
 create mode 100644 
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/GeminiVLMParser.java
 create mode 100644 
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitter.java
 create mode 100644 
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/OpenAIVLMParser.java
 create mode 100644 
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/VLMOCRConfig.java
 create mode 100644 
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/ClaudeVLMParserTest.java
 create mode 100644 
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/GeminiVLMParserTest.java
 create mode 100644 
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitterTest.java
 create mode 100644 
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/OpenAIVLMParserTest.java
 delete mode 100644 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java
 create mode 100644 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/poppler/PopplerRenderer.java
 create mode 100644 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/renderer/pdf/poppler/PopplerRendererTest.java
 rename 
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/{tika-rendering-mupdf-config.json
 => tika-rendering-poppler-config.json} (85%)


Reply via email to