This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4734
in repository https://gitbox.apache.org/repos/asf/tika.git

commit dfe180f691bcc4ab22b81fee0bdd52149f9de14b
Merge: 0b0dc4f75c cabd1f2d44
Author: tallison <[email protected]>
AuthorDate: Wed May 27 10:39:09 2026 -0400

    Merge branch 'main' into TIKA-4734

 .github/workflows/docker-snapshot.yml              |    4 +-
 .mvn/extensions.xml                                |    2 +-
 .skills/tika-eval-h2-query.md                      |   92 ++
 docs/modules/ROOT/nav.adoc                         |    1 +
 .../integration-testing/run-uat-script.adoc        |   10 +-
 .../advanced/integration-testing/tika-app.adoc     |    8 +-
 .../integration-testing/tika-eval-regression.adoc  |  364 +++++
 .../advanced/integration-testing/tika-server.adoc  |   18 +-
 .../pages/maintainers/release-guides/docker.adoc   |    4 +-
 .../release-guides/release-artifacts.adoc          |   10 +-
 .../pages/maintainers/release-guides/tika.adoc     |    2 +-
 docs/modules/ROOT/pages/migration-to-4x/index.adoc |    2 +-
 .../migration-to-4x/migrating-tika-server-4x.adoc  |   10 +-
 .../pages/migration-to-4x/migrating-to-4x.adoc     |    4 +-
 docs/modules/ROOT/pages/pipes/configuration.adoc   |    6 +-
 docs/modules/ROOT/pages/pipes/cpu-sizing.adoc      |    2 +-
 docs/modules/ROOT/pages/pipes/parse-modes.adoc     |    6 +-
 docs/modules/ROOT/pages/pipes/troubleshooting.adoc |  131 ++
 docs/modules/ROOT/pages/using-tika/cli/index.adoc  |   74 +-
 .../ROOT/pages/using-tika/server/index.adoc        |    4 +-
 docs/modules/ROOT/pages/using-tika/server/tls.adoc |    2 +-
 pom.xml                                            |   52 +-
 tika-app/pom.xml                                   |    4 +
 .../main/java/org/apache/tika/cli/AsyncHelper.java |   21 +-
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |   78 +-
 .../apache/tika/cli/XmlToJsonConfigConverter.java  |    8 +-
 .../java/org/apache/tika/cli/AsyncHelperTest.java  |   30 +-
 .../tika/cli/XmlToJsonConfigConverterTest.java     |    6 +-
 .../test/resources/configs/config-template.json    |    2 -
 .../src/test/resources/configs/tika-config2.json   |    2 +-
 .../ParsingEmbeddedDocumentExtractor.java          |   29 +-
 .../tika/sax/BasicContentHandlerFactory.java       |   30 +
 .../org/apache/tika/sax/StrictXHTMLValidator.java  |  229 +++
 .../org/apache/tika/sax/XHTMLBalancingHandler.java |  123 ++
 .../src/test/java/org/apache/tika/TikaTest.java    |   37 +-
 .../apache/tika/sax/XHTMLBalancingHandlerTest.java |  130 ++
 .../test/resources/tika-config-ignite-local.json   |   20 +-
 .../src/test/resources/tika-config-ignite.json     |   20 +-
 tika-e2e-tests/tika-server/pom.xml                 |    3 +-
 .../tika/server/e2e/TikaServerHttp2Test.java       |   25 +-
 .../apache/tika/ml/chardetect/AdaptiveProbe.java   |   80 +
 .../tika/ml/chardetect/CharsetConfusables.java     |   27 +
 .../tika/ml/chardetect/HtmlByteStripper.java       |  198 ++-
 .../ml/chardetect/MojibusterEncodingDetector.java  |  198 ++-
 .../NaiveBayesBigramEncodingDetector.java          |  488 +++++-
 .../org/apache/tika/ml/chardetect/nb-bigram.bin    |  Bin 975490 -> 1016638 bytes
 .../tika/ml/chardetect/AdaptiveProbeTest.java      |  118 ++
 .../apache/tika/ml/chardetect/CalibrateTopK.java   |  353 ++++
 .../apache/tika/ml/chardetect/CheckUtf8OnFile.java |   83 +
 .../tika/ml/chardetect/HtmlByteStripperTest.java   |  245 +++
 .../ml/chardetect/InspectBigramContributions.java  |  221 +++
 .../apache/tika/ml/chardetect/TraceMojibuster.java |  233 +++
 tika-eval/tika-eval-app/pom.xml                    |    4 +
 .../src/test/resources/s3/tika-config-s3.json      |    2 +-
 .../chardetect/tools/BuildCharsetTrainingData.java |   23 +-
 .../chardetect/tools/DiagnoseDiscrimination.java   |  399 +++++
 .../chardetect/tools/RebalanceCharsetTraining.java |  209 +++
 .../ml/chardetect/tools/TrainNaiveBayesBigram.java |   56 +-
 tika-ml/tika-ml-junkdetect/pom.xml                 |   12 +
 .../{V7Tables.java => BigramTables.java}           |   36 +-
 .../tika/ml/junkdetect/HtmlContentCleaner.java     |  108 ++
 .../apache/tika/ml/junkdetect/JunkDetector.java    |  842 ++++++----
 .../ml/junkdetect/JunkFilterEncodingDetector.java  |  274 ++--
 .../tika/ml/junkdetect/TextQualityFeatures.java    |  608 +++++++
 .../ml/junkdetect/tools/AnalyzeHanByBlock.java     |  201 ---
 .../ml/junkdetect/tools/BuildJunkTrainingData.java |    7 +
 .../ml/junkdetect/tools/CountPerScriptBigrams.java |  326 ----
 .../tika/ml/junkdetect/tools/EvalJunkDetector.java |  777 ---------
 .../junkdetect/tools/EvalJunkOnCharsetDevtest.java |  688 --------
 .../tools/JunkDetectorTrainingConfig.java          |    9 +-
 .../junkdetect/tools/PrototypeCodepointHash.java   | 1208 --------------
 .../tika/ml/junkdetect/tools/TrainJunkModel.java   | 1701 ++++++++++----------
 .../org/apache/tika/ml/junkdetect/junkdetect.bin   |  Bin 2810396 -> 2321862 bytes
 .../apache/tika/ml/junkdetect/EntityRefProbe.java  |  164 --
 ...rV7Test.java => JunkDetectorRoundTripTest.java} |  165 +-
 .../tika/ml/junkdetect/JunkDetectorSmokeTest.java  |    7 +-
 .../junkdetect/JunkFilterEncodingDetectorTest.java |   43 +
 .../ml/junkdetect/LatinSiblingComparisonTest.java  |  141 ++
 .../ml/junkdetect/TextQualityFeaturesTest.java     |  201 +++
 .../apache/tika/ml/junkdetect/TraceJunkFilter.java |  536 ++++++
 .../tools/BuildJunkAugmentationData.java           |  862 ++++++++++
 .../tools/BuildJunkAugmentationDataTest.java       |  429 +++++
 .../tools/JunkDetectorTrainingConfigTest.java      |    5 +-
 tika-parent/pom.xml                                |   14 +-
 .../apache/tika/parser/pkg/PackageParserTest.java  |    4 +
 .../resources/configs/tika-config-rendering.json   |    2 +-
 .../tika/parser/iwork/PagesContentHandler.java     |   19 +-
 .../java/org/apache/tika/parser/prt/PRTParser.java |   85 +-
 .../apache/tika/parser/code/SourceCodeParser.java  |   25 +
 .../tika/parser/code/SourceCodeParserTest.java     |    1 -
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    |   15 +-
 .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java  |   55 +
 .../ooxml/OOXMLWordAndPowerPointTextHandler.java   |    7 +
 .../microsoft/ooxml/SAXBasedMetadataExtractor.java |   89 +-
 .../ooxml/SXSLFPowerPointExtractorDecorator.java   |   16 +-
 .../ooxml/SXWPFWordExtractorDecorator.java         |    6 +
 .../ooxml/XSSFExcelExtractorDecorator.java         |   36 +
 .../ooxml/SAXBasedMetadataExtractorTest.java       |  216 +++
 .../test/resources/configs/tika-libpst-config.json |    2 +-
 .../resources/configs/tika-libpst-eml-config.json  |    2 +-
 .../org/apache/tika/parser/epub/EpubParser.java    |   44 +-
 .../tika/parser/odf/OpenDocumentBodyHandler.java   |   77 +
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |   25 +-
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |   14 +
 .../test-documents/testPDF_jsActionOnPage.pdf      |   26 +
 .../java/org/apache/tika/parser/pkg/ZipParser.java |   29 +-
 .../java/org/apache/tika/parser/txt/TXTParser.java |   27 +-
 .../java/org/apache/tika/parser/tmx/TMXParser.java |    2 +
 .../apache/tika/parser/xliff/XLIFF12Parser.java    |    3 +-
 .../org/apache/tika/async/cli/PluginsWriter.java   |  203 ++-
 .../apache/tika/async/cli/SimpleAsyncConfig.java   |   13 +
 .../org/apache/tika/async/cli/TikaAsyncCLI.java    |   16 +-
 .../apache/tika/async/cli/AsyncCliParserTest.java  |   25 +
 .../apache/tika/async/cli/AsyncProcessorTest.java  |   42 +
 ...plate.json => config-content-only-default.json} |    7 +-
 .../test/resources/configs/config-template.json    |    2 -
 tika-pipes/tika-pipes-core/pom.xml                 |    9 +
 .../tika/pipes/core/AbstractComponentManager.java  |   15 +
 .../tika/pipes/core/PerClientServerManager.java    |   45 +-
 .../org/apache/tika/pipes/core/PipesConfig.java    |   26 +-
 .../apache/tika/pipes/core/ServerProcessIO.java    |  112 ++
 .../tika/pipes/core/SharedServerManager.java       |   41 +-
 .../tika/pipes/core/config/ConfigMerger.java       |    3 -
 .../tika/pipes/core/config/ConfigOverrides.java    |   14 +-
 .../apache/tika/pipes/core/server/PipesServer.java |   54 +
 .../apache/tika/pipes/core/server/PipesWorker.java |    3 +
 .../tika/pipes/core/config/ConfigMergerTest.java   |    3 +-
 .../core/testutil/AbstractConfigExamplesTest.java  |   89 +
 .../apache/tika/pipes/fork/PipesForkParser.java    |    1 -
 .../tika/pipes/fork/PipesForkParserConfig.java     |   11 -
 tika-pipes/tika-pipes-plugins/pom.xml              |    7 +
 .../pipes/atlassianjwt/ConfigExamplesTest.java     |   33 +-
 .../tika/pipes/azblob/ConfigExamplesTest.java      |   71 +-
 .../apache/tika/pipes/csv/ConfigExamplesTest.java  |   33 +-
 .../apache/tika/pipes/es/ConfigExamplesTest.java   |   63 +-
 .../tika/pipes/emitter/fs/FileSystemEmitter.java   |   18 +-
 .../fs/FileSystemEmitterRuntimeConfigTest.java     |   31 +
 .../apache/tika/pipes/fs/ConfigExamplesTest.java   |   28 +-
 .../config-examples/file-system-emitter.json       |    7 +-
 .../config-examples/file-system-fetcher.json       |    7 +-
 .../config-examples/file-system-pipeline.json      |   14 +-
 .../apache/tika/pipes/gcs/ConfigExamplesTest.java  |   53 +-
 .../tika/pipes/googledrive/ConfigExamplesTest.java |   33 +-
 .../apache/tika/pipes/http/ConfigExamplesTest.java |   34 +-
 .../apache/tika/pipes/jdbc/ConfigExamplesTest.java |   52 +-
 .../apache/tika/pipes/json/ConfigExamplesTest.java |   33 +-
 .../tika/pipes/kafka/ConfigExamplesTest.java       |   51 +-
 .../tika-pipes-microsoft-graph/pom.xml             |    2 +-
 .../pipes/microsoftgraph/ConfigExamplesTest.java   |   33 +-
 .../tika/pipes/opensearch/ConfigExamplesTest.java  |   50 +-
 .../apache/tika/pipes/s3/ConfigExamplesTest.java   |   53 +-
 .../apache/tika/pipes/solr/ConfigExamplesTest.java |   52 +-
 .../config/loader/AbstractSpiComponentLoader.java  |   45 +-
 .../tika/config/loader/ComponentInstantiator.java  |   14 +-
 .../apache/tika/config/loader/ParserLoader.java    |    7 +
 .../apache/tika/config/loader/TikaJsonConfig.java  |    2 +-
 .../apache/tika/config/loader/TikaLoaderTest.java  |  115 +-
 tika-server/README.md                              |   10 +-
 tika-server/docker-build/CHANGES.md                |    4 +-
 tika-server/docker-build/README.md                 |    8 +-
 .../docker-build/docker-compose-tika-customocr.yml |   10 +-
 .../docker-build/docker-compose-tika-grobid.yml    |   10 +-
 tika-server/docker-build/full/Dockerfile           |   24 +-
 tika-server/docker-build/full/Dockerfile.snapshot  |    2 +-
 tika-server/docker-build/minimal/Dockerfile        |   24 +-
 .../docker-build/minimal/Dockerfile.snapshot       |    2 +-
 .../tika/server/core/IntegrationTestBase.java      |   31 +
 .../server/core/benchmark/TikaServerBenchmark.java |    2 +-
 .../bin/install_tika_service.sh                    |   21 +-
 tika-server/tika-server-standard/bin/tika          |   10 +-
 tika-server/tika-server-standard/bin/tika.in.sh    |    2 +-
 tika-server/tika-server-standard/pom.xml           |   45 +
 .../src/main/assembly/assembly.xml                 |    5 +-
 tika-translate/pom.xml                             |    2 +-
 174 files changed, 10721 insertions(+), 5899 deletions(-)


Reply via email to