This is an automated email from the ASF dual-hosted git repository.

tballison pushed a change to branch TIKA-4734
in repository https://gitbox.apache.org/repos/asf/tika.git


    from 0b0dc4f75c TIKA-4734 -- fix xml config converter
     add da1801a84c TIKA-4733 -- improve release artifact robustness and documentation (#2825)
     add 0b38268d4f TIKA-4735 -- fix content-only (#2826)
     add c2b15c9ce1 TIKA-4733 -- fix docker-snapshot.yml to match new release zip artifacts (#2827)
     add 19b4c66927 TIKA-4728 - fix xhtml in widgets (#2817)
     add 4b66205620 TIKA-4736 -- image extraction fails (#2828)
     add 8ef279d581 TIKA-4327: update aws, netty, woodstox, plexus
     add 933fb96d10 Bump com.github.luben:zstd-jni from 1.5.7-8 to 1.5.7-9 (#2829)
     add 795f30c368 Bump com.microsoft.graph:microsoft-graph from 6.64.0 to 6.65.0 (#2835)
     add 1aea9db9a2 Bump org.apache.kafka:kafka-clients from 4.2.0 to 4.3.0 (#2834)
     add 3622658b9a Bump software.amazon.awssdk:bom from 2.44.10 to 2.44.12 (#2833)
     add 29f287fe0c Bump org.apache.maven.plugins:maven-site-plugin from 3.21.0 to 3.22.0 (#2832)
     add 3b1f68ec11 Bump org.ow2.asm:asm from 9.10 to 9.10.1 (#2830)
     add cfddd1afc0 Bump eu.maveniverse.maven.nisse:extension from 0.9.0 to 0.9.2 (#2831)
     add 4bfbdf22cf TIKA-4737 -- improve docs for tika-pipes via tika-app (#2836)
     add 0cbdb26e24 TIKA-4740 -- fix flaky windows test
     add 1abcd65381 TIKA-4740 -- update docs
     add d02dc13903 TIKA-4740 -- tika-server-core fix (#2841)
     add a2bc3513ac TIKA-4731 - improve charset detection and junk detection (#2839)
     add 4f6ad8b0f3 TIKA-4739 (#2837)
     add cabd1f2d44 fix potential sax dos (#2838)
     new dfe180f691 Merge branch 'main' into TIKA-4734
     new a1560f046e further update

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .github/workflows/docker-snapshot.yml              |    4 +-
 .mvn/extensions.xml                                |    2 +-
 .skills/tika-eval-h2-query.md                      |   92 ++
 docs/modules/ROOT/nav.adoc                         |    1 +
 .../integration-testing/run-uat-script.adoc        |   10 +-
 .../advanced/integration-testing/tika-app.adoc     |    8 +-
 .../integration-testing/tika-eval-regression.adoc  |  364 +++++
 .../advanced/integration-testing/tika-server.adoc  |   18 +-
 .../pages/maintainers/release-guides/docker.adoc   |    4 +-
 .../release-guides/release-artifacts.adoc          |   10 +-
 .../pages/maintainers/release-guides/tika.adoc     |    2 +-
 docs/modules/ROOT/pages/migration-to-4x/index.adoc |    2 +-
 .../migration-to-4x/migrating-tika-server-4x.adoc  |   10 +-
 .../pages/migration-to-4x/migrating-to-4x.adoc     |    4 +-
 docs/modules/ROOT/pages/pipes/configuration.adoc   |    6 +-
 docs/modules/ROOT/pages/pipes/cpu-sizing.adoc      |    2 +-
 docs/modules/ROOT/pages/pipes/parse-modes.adoc     |    6 +-
 docs/modules/ROOT/pages/pipes/troubleshooting.adoc |  131 ++
 docs/modules/ROOT/pages/using-tika/cli/index.adoc  |   74 +-
 .../ROOT/pages/using-tika/server/index.adoc        |    4 +-
 docs/modules/ROOT/pages/using-tika/server/tls.adoc |    2 +-
 pom.xml                                            |   52 +-
 tika-app/pom.xml                                   |    4 +
 .../main/java/org/apache/tika/cli/AsyncHelper.java |   21 +-
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |   78 +-
 .../apache/tika/cli/XmlToJsonConfigConverter.java  |    8 +-
 .../java/org/apache/tika/cli/AsyncHelperTest.java  |   30 +-
 .../test/java/org/apache/tika/cli/TikaCLITest.java |    8 +-
 .../tika/cli/XmlToJsonConfigConverterTest.java     |    6 +-
 .../test/resources/configs/config-template.json    |    2 -
 .../src/test/resources/configs/tika-config2.json   |    2 +-
 .../ParsingEmbeddedDocumentExtractor.java          |   29 +-
 .../tika/sax/BasicContentHandlerFactory.java       |   30 +
 .../org/apache/tika/sax/StrictXHTMLValidator.java  |  229 +++
 .../org/apache/tika/sax/XHTMLBalancingHandler.java |  123 ++
 .../src/test/java/org/apache/tika/TikaTest.java    |   37 +-
 .../apache/tika/sax/XHTMLBalancingHandlerTest.java |  130 ++
 .../test/resources/tika-config-ignite-local.json   |   20 +-
 .../src/test/resources/tika-config-ignite.json     |   20 +-
 tika-e2e-tests/tika-server/pom.xml                 |    3 +-
 .../tika/server/e2e/TikaServerHttp2Test.java       |   25 +-
 .../apache/tika/ml/chardetect/AdaptiveProbe.java   |   80 +
 .../tika/ml/chardetect/CharsetConfusables.java     |   27 +
 .../tika/ml/chardetect/HtmlByteStripper.java       |  198 ++-
 .../ml/chardetect/MojibusterEncodingDetector.java  |  198 ++-
 .../NaiveBayesBigramEncodingDetector.java          |  488 +++++-
 .../org/apache/tika/ml/chardetect/nb-bigram.bin    |  Bin 975490 -> 1016638 bytes
 .../tika/ml/chardetect/AdaptiveProbeTest.java      |  118 ++
 .../apache/tika/ml/chardetect/CalibrateTopK.java   |  353 ++++
 .../apache/tika/ml/chardetect/CheckUtf8OnFile.java |   83 +
 .../tika/ml/chardetect/HtmlByteStripperTest.java   |  245 +++
 .../ml/chardetect/InspectBigramContributions.java  |  221 +++
 .../apache/tika/ml/chardetect/TraceMojibuster.java |  233 +++
 tika-eval/tika-eval-app/pom.xml                    |    4 +
 .../src/test/resources/s3/tika-config-s3.json      |    2 +-
 .../chardetect/tools/BuildCharsetTrainingData.java |   23 +-
 .../chardetect/tools/DiagnoseDiscrimination.java   |  399 +++++
 .../chardetect/tools/RebalanceCharsetTraining.java |  209 +++
 .../ml/chardetect/tools/TrainNaiveBayesBigram.java |   56 +-
 tika-ml/tika-ml-junkdetect/pom.xml                 |   12 +
 .../{V7Tables.java => BigramTables.java}           |   36 +-
 .../tika/ml/junkdetect/HtmlContentCleaner.java     |  108 ++
 .../apache/tika/ml/junkdetect/JunkDetector.java    |  842 ++++++----
 .../ml/junkdetect/JunkFilterEncodingDetector.java  |  274 ++--
 .../tika/ml/junkdetect/TextQualityFeatures.java    |  608 +++++++
 .../ml/junkdetect/tools/AnalyzeHanByBlock.java     |  201 ---
 .../ml/junkdetect/tools/BuildJunkTrainingData.java |    7 +
 .../ml/junkdetect/tools/CountPerScriptBigrams.java |  326 ----
 .../tika/ml/junkdetect/tools/EvalJunkDetector.java |  777 ---------
 .../junkdetect/tools/EvalJunkOnCharsetDevtest.java |  688 --------
 .../tools/JunkDetectorTrainingConfig.java          |    9 +-
 .../junkdetect/tools/PrototypeCodepointHash.java   | 1208 --------------
 .../tika/ml/junkdetect/tools/TrainJunkModel.java   | 1701 ++++++++++----------
 .../org/apache/tika/ml/junkdetect/junkdetect.bin   |  Bin 2810396 -> 2321862 bytes
 .../apache/tika/ml/junkdetect/EntityRefProbe.java  |  164 --
 ...rV7Test.java => JunkDetectorRoundTripTest.java} |  165 +-
 .../tika/ml/junkdetect/JunkDetectorSmokeTest.java  |    7 +-
 .../junkdetect/JunkFilterEncodingDetectorTest.java |   43 +
 .../ml/junkdetect/LatinSiblingComparisonTest.java  |  141 ++
 .../ml/junkdetect/TextQualityFeaturesTest.java     |  201 +++
 .../apache/tika/ml/junkdetect/TraceJunkFilter.java |  536 ++++++
 .../tools/BuildJunkAugmentationData.java           |  862 ++++++++++
 .../tools/BuildJunkAugmentationDataTest.java       |  429 +++++
 .../tools/JunkDetectorTrainingConfigTest.java      |    5 +-
 tika-parent/pom.xml                                |   14 +-
 .../apache/tika/parser/pkg/PackageParserTest.java  |    4 +
 .../resources/configs/tika-config-rendering.json   |    2 +-
 .../tika/parser/iwork/PagesContentHandler.java     |   19 +-
 .../java/org/apache/tika/parser/prt/PRTParser.java |   85 +-
 .../apache/tika/parser/code/SourceCodeParser.java  |   25 +
 .../tika/parser/code/SourceCodeParserTest.java     |    1 -
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    |   15 +-
 .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java  |   55 +
 .../ooxml/OOXMLWordAndPowerPointTextHandler.java   |    7 +
 .../microsoft/ooxml/SAXBasedMetadataExtractor.java |   89 +-
 .../ooxml/SXSLFPowerPointExtractorDecorator.java   |   16 +-
 .../ooxml/SXWPFWordExtractorDecorator.java         |    6 +
 .../ooxml/XSSFExcelExtractorDecorator.java         |   36 +
 .../ooxml/SAXBasedMetadataExtractorTest.java       |  216 +++
 .../test/resources/configs/tika-libpst-config.json |    2 +-
 .../resources/configs/tika-libpst-eml-config.json  |    2 +-
 .../org/apache/tika/parser/epub/EpubParser.java    |   44 +-
 .../tika/parser/odf/OpenDocumentBodyHandler.java   |   77 +
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |   25 +-
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |   14 +
 .../test-documents/testPDF_jsActionOnPage.pdf      |   26 +
 .../java/org/apache/tika/parser/pkg/ZipParser.java |   29 +-
 .../java/org/apache/tika/parser/txt/TXTParser.java |   27 +-
 .../java/org/apache/tika/parser/tmx/TMXParser.java |    2 +
 .../apache/tika/parser/xliff/XLIFF12Parser.java    |    3 +-
 .../org/apache/tika/async/cli/PluginsWriter.java   |  203 ++-
 .../apache/tika/async/cli/SimpleAsyncConfig.java   |   13 +
 .../org/apache/tika/async/cli/TikaAsyncCLI.java    |   16 +-
 .../apache/tika/async/cli/AsyncCliParserTest.java  |   25 +
 .../apache/tika/async/cli/AsyncProcessorTest.java  |   42 +
 ...plate.json => config-content-only-default.json} |    7 +-
 .../test/resources/configs/config-template.json    |    2 -
 tika-pipes/tika-pipes-core/pom.xml                 |    9 +
 .../tika/pipes/core/AbstractComponentManager.java  |   15 +
 .../tika/pipes/core/PerClientServerManager.java    |   45 +-
 .../org/apache/tika/pipes/core/PipesConfig.java    |   26 +-
 .../apache/tika/pipes/core/ServerProcessIO.java    |  112 ++
 .../tika/pipes/core/SharedServerManager.java       |   41 +-
 .../tika/pipes/core/config/ConfigMerger.java       |    3 -
 .../tika/pipes/core/config/ConfigOverrides.java    |   14 +-
 .../apache/tika/pipes/core/server/PipesServer.java |   54 +
 .../apache/tika/pipes/core/server/PipesWorker.java |    3 +
 .../tika/pipes/core/config/ConfigMergerTest.java   |    3 +-
 .../core/testutil/AbstractConfigExamplesTest.java  |   89 +
 .../apache/tika/pipes/fork/PipesForkParser.java    |    1 -
 .../tika/pipes/fork/PipesForkParserConfig.java     |   11 -
 tika-pipes/tika-pipes-plugins/pom.xml              |    7 +
 .../pipes/atlassianjwt/ConfigExamplesTest.java     |   33 +-
 .../tika/pipes/azblob/ConfigExamplesTest.java      |   71 +-
 .../apache/tika/pipes/csv/ConfigExamplesTest.java  |   33 +-
 .../apache/tika/pipes/es/ConfigExamplesTest.java   |   63 +-
 .../tika/pipes/emitter/fs/FileSystemEmitter.java   |   18 +-
 .../fs/FileSystemEmitterRuntimeConfigTest.java     |   31 +
 .../apache/tika/pipes/fs/ConfigExamplesTest.java   |   28 +-
 .../config-examples/file-system-emitter.json       |    7 +-
 .../config-examples/file-system-fetcher.json       |    7 +-
 .../config-examples/file-system-pipeline.json      |   14 +-
 .../apache/tika/pipes/gcs/ConfigExamplesTest.java  |   53 +-
 .../tika/pipes/googledrive/ConfigExamplesTest.java |   33 +-
 .../apache/tika/pipes/http/ConfigExamplesTest.java |   34 +-
 .../apache/tika/pipes/jdbc/ConfigExamplesTest.java |   52 +-
 .../apache/tika/pipes/json/ConfigExamplesTest.java |   33 +-
 .../tika/pipes/kafka/ConfigExamplesTest.java       |   51 +-
 .../tika-pipes-microsoft-graph/pom.xml             |    2 +-
 .../pipes/microsoftgraph/ConfigExamplesTest.java   |   33 +-
 .../tika/pipes/opensearch/ConfigExamplesTest.java  |   50 +-
 .../apache/tika/pipes/s3/ConfigExamplesTest.java   |   53 +-
 .../apache/tika/pipes/solr/ConfigExamplesTest.java |   52 +-
 .../config/loader/AbstractSpiComponentLoader.java  |   45 +-
 .../tika/config/loader/ComponentInstantiator.java  |   14 +-
 .../apache/tika/config/loader/ParserLoader.java    |    7 +
 .../apache/tika/config/loader/TikaJsonConfig.java  |    2 +-
 .../apache/tika/config/loader/TikaLoaderTest.java  |  115 +-
 tika-server/README.md                              |   10 +-
 tika-server/docker-build/CHANGES.md                |    4 +-
 tika-server/docker-build/README.md                 |    8 +-
 .../docker-build/docker-compose-tika-customocr.yml |   10 +-
 .../docker-build/docker-compose-tika-grobid.yml    |   10 +-
 tika-server/docker-build/full/Dockerfile           |   24 +-
 tika-server/docker-build/full/Dockerfile.snapshot  |    2 +-
 tika-server/docker-build/minimal/Dockerfile        |   24 +-
 .../docker-build/minimal/Dockerfile.snapshot       |    2 +-
 .../tika/server/core/IntegrationTestBase.java      |   31 +
 .../server/core/benchmark/TikaServerBenchmark.java |    2 +-
 .../bin/install_tika_service.sh                    |   21 +-
 tika-server/tika-server-standard/bin/tika          |   10 +-
 tika-server/tika-server-standard/bin/tika.in.sh    |    2 +-
 tika-server/tika-server-standard/pom.xml           |   45 +
 .../src/main/assembly/assembly.xml                 |    5 +-
 tika-translate/pom.xml                             |    2 +-
 175 files changed, 10725 insertions(+), 5903 deletions(-)
 create mode 100644 .skills/tika-eval-h2-query.md
 create mode 100644 docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc
 create mode 100644 docs/modules/ROOT/pages/pipes/troubleshooting.adoc
 create mode 100644 tika-core/src/main/java/org/apache/tika/sax/StrictXHTMLValidator.java
 create mode 100644 tika-core/src/main/java/org/apache/tika/sax/XHTMLBalancingHandler.java
 create mode 100644 tika-core/src/test/java/org/apache/tika/sax/XHTMLBalancingHandlerTest.java
 create mode 100644 tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java
 create mode 100644 tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/AdaptiveProbeTest.java
 create mode 100644 tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CalibrateTopK.java
 create mode 100644 tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CheckUtf8OnFile.java
 create mode 100644 tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/HtmlByteStripperTest.java
 create mode 100644 tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/InspectBigramContributions.java
 create mode 100644 tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/TraceMojibuster.java
 create mode 100644 tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/DiagnoseDiscrimination.java
 create mode 100644 tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/RebalanceCharsetTraining.java
 rename tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/{V7Tables.java => BigramTables.java} (86%)
 create mode 100644 tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/HtmlContentCleaner.java
 create mode 100644 tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
 delete mode 100644 tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/AnalyzeHanByBlock.java
 delete mode 100644 tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/CountPerScriptBigrams.java
 delete mode 100644 tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java
 delete mode 100644 tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkOnCharsetDevtest.java
 delete mode 100644 tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java
 delete mode 100644 tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/EntityRefProbe.java
 rename tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/{JunkDetectorV7Test.java => JunkDetectorRoundTripTest.java} (67%)
 create mode 100644 tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/LatinSiblingComparisonTest.java
 create mode 100644 tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/TextQualityFeaturesTest.java
 create mode 100644 tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/TraceJunkFilter.java
 create mode 100644 tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationData.java
 create mode 100644 tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java
 create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java
 create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_jsActionOnPage.pdf
 copy tika-pipes/tika-async-cli/src/test/resources/configs/{config-template.json => config-content-only-default.json} (86%)
 create mode 100644 tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/ServerProcessIO.java
 create mode 100644 tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/testutil/AbstractConfigExamplesTest.java

Reply via email to