This is an automated email from the ASF dual-hosted git repository.
tballison pushed a change to branch TIKA-4734
in repository https://gitbox.apache.org/repos/asf/tika.git
from 0b0dc4f75c TIKA-4734 -- fix xml config converter
add da1801a84c TIKA-4733 -- improve release artifact robustness and documentation (#2825)
add 0b38268d4f TIKA-4735 -- fix content-only (#2826)
add c2b15c9ce1 TIKA-4733 -- fix docker-snapshot.yml to match new release zip artifacts (#2827)
add 19b4c66927 TIKA-4728 - fix xhtml in widgets (#2817)
add 4b66205620 TIKA-4736 -- image extraction fails (#2828)
add 8ef279d581 TIKA-4327: update aws, netty, woodstox, plexus
add 933fb96d10 Bump com.github.luben:zstd-jni from 1.5.7-8 to 1.5.7-9
(#2829)
add 795f30c368 Bump com.microsoft.graph:microsoft-graph from 6.64.0 to
6.65.0 (#2835)
add 1aea9db9a2 Bump org.apache.kafka:kafka-clients from 4.2.0 to 4.3.0
(#2834)
add 3622658b9a Bump software.amazon.awssdk:bom from 2.44.10 to 2.44.12
(#2833)
add 29f287fe0c Bump org.apache.maven.plugins:maven-site-plugin from 3.21.0
to 3.22.0 (#2832)
add 3b1f68ec11 Bump org.ow2.asm:asm from 9.10 to 9.10.1 (#2830)
add cfddd1afc0 Bump eu.maveniverse.maven.nisse:extension from 0.9.0 to
0.9.2 (#2831)
add 4bfbdf22cf TIKA-4737 -- improve docs for tika-pipes via tika-app
(#2836)
add 0cbdb26e24 TIKA-4740 -- fix flaky windows test
add 1abcd65381 TIKA-4740 -- update docs
add d02dc13903 TIKA-4740 -- tika-server-core fix (#2841)
add a2bc3513ac TIKA-4731 - improve charset detection and junk detection
(#2839)
add 4f6ad8b0f3 TIKA-4739 (#2837)
add cabd1f2d44 fix potential sax dos (#2838)
new dfe180f691 Merge branch 'main' into TIKA-4734
new a1560f046e further update
The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
.github/workflows/docker-snapshot.yml | 4 +-
.mvn/extensions.xml | 2 +-
.skills/tika-eval-h2-query.md | 92 ++
docs/modules/ROOT/nav.adoc | 1 +
.../integration-testing/run-uat-script.adoc | 10 +-
.../advanced/integration-testing/tika-app.adoc | 8 +-
.../integration-testing/tika-eval-regression.adoc | 364 +++++
.../advanced/integration-testing/tika-server.adoc | 18 +-
.../pages/maintainers/release-guides/docker.adoc | 4 +-
.../release-guides/release-artifacts.adoc | 10 +-
.../pages/maintainers/release-guides/tika.adoc | 2 +-
docs/modules/ROOT/pages/migration-to-4x/index.adoc | 2 +-
.../migration-to-4x/migrating-tika-server-4x.adoc | 10 +-
.../pages/migration-to-4x/migrating-to-4x.adoc | 4 +-
docs/modules/ROOT/pages/pipes/configuration.adoc | 6 +-
docs/modules/ROOT/pages/pipes/cpu-sizing.adoc | 2 +-
docs/modules/ROOT/pages/pipes/parse-modes.adoc | 6 +-
docs/modules/ROOT/pages/pipes/troubleshooting.adoc | 131 ++
docs/modules/ROOT/pages/using-tika/cli/index.adoc | 74 +-
.../ROOT/pages/using-tika/server/index.adoc | 4 +-
docs/modules/ROOT/pages/using-tika/server/tls.adoc | 2 +-
pom.xml | 52 +-
tika-app/pom.xml | 4 +
.../main/java/org/apache/tika/cli/AsyncHelper.java | 21 +-
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 78 +-
.../apache/tika/cli/XmlToJsonConfigConverter.java | 8 +-
.../java/org/apache/tika/cli/AsyncHelperTest.java | 30 +-
.../test/java/org/apache/tika/cli/TikaCLITest.java | 8 +-
.../tika/cli/XmlToJsonConfigConverterTest.java | 6 +-
.../test/resources/configs/config-template.json | 2 -
.../src/test/resources/configs/tika-config2.json | 2 +-
.../ParsingEmbeddedDocumentExtractor.java | 29 +-
.../tika/sax/BasicContentHandlerFactory.java | 30 +
.../org/apache/tika/sax/StrictXHTMLValidator.java | 229 +++
.../org/apache/tika/sax/XHTMLBalancingHandler.java | 123 ++
.../src/test/java/org/apache/tika/TikaTest.java | 37 +-
.../apache/tika/sax/XHTMLBalancingHandlerTest.java | 130 ++
.../test/resources/tika-config-ignite-local.json | 20 +-
.../src/test/resources/tika-config-ignite.json | 20 +-
tika-e2e-tests/tika-server/pom.xml | 3 +-
.../tika/server/e2e/TikaServerHttp2Test.java | 25 +-
.../apache/tika/ml/chardetect/AdaptiveProbe.java | 80 +
.../tika/ml/chardetect/CharsetConfusables.java | 27 +
.../tika/ml/chardetect/HtmlByteStripper.java | 198 ++-
.../ml/chardetect/MojibusterEncodingDetector.java | 198 ++-
.../NaiveBayesBigramEncodingDetector.java | 488 +++++-
.../org/apache/tika/ml/chardetect/nb-bigram.bin | Bin 975490 -> 1016638 bytes
.../tika/ml/chardetect/AdaptiveProbeTest.java | 118 ++
.../apache/tika/ml/chardetect/CalibrateTopK.java | 353 ++++
.../apache/tika/ml/chardetect/CheckUtf8OnFile.java | 83 +
.../tika/ml/chardetect/HtmlByteStripperTest.java | 245 +++
.../ml/chardetect/InspectBigramContributions.java | 221 +++
.../apache/tika/ml/chardetect/TraceMojibuster.java | 233 +++
tika-eval/tika-eval-app/pom.xml | 4 +
.../src/test/resources/s3/tika-config-s3.json | 2 +-
.../chardetect/tools/BuildCharsetTrainingData.java | 23 +-
.../chardetect/tools/DiagnoseDiscrimination.java | 399 +++++
.../chardetect/tools/RebalanceCharsetTraining.java | 209 +++
.../ml/chardetect/tools/TrainNaiveBayesBigram.java | 56 +-
tika-ml/tika-ml-junkdetect/pom.xml | 12 +
.../{V7Tables.java => BigramTables.java} | 36 +-
.../tika/ml/junkdetect/HtmlContentCleaner.java | 108 ++
.../apache/tika/ml/junkdetect/JunkDetector.java | 842 ++++++----
.../ml/junkdetect/JunkFilterEncodingDetector.java | 274 ++--
.../tika/ml/junkdetect/TextQualityFeatures.java | 608 +++++++
.../ml/junkdetect/tools/AnalyzeHanByBlock.java | 201 ---
.../ml/junkdetect/tools/BuildJunkTrainingData.java | 7 +
.../ml/junkdetect/tools/CountPerScriptBigrams.java | 326 ----
.../tika/ml/junkdetect/tools/EvalJunkDetector.java | 777 ---------
.../junkdetect/tools/EvalJunkOnCharsetDevtest.java | 688 --------
.../tools/JunkDetectorTrainingConfig.java | 9 +-
.../junkdetect/tools/PrototypeCodepointHash.java | 1208 --------------
.../tika/ml/junkdetect/tools/TrainJunkModel.java | 1701 ++++++++++----------
.../org/apache/tika/ml/junkdetect/junkdetect.bin | Bin 2810396 -> 2321862 bytes
.../apache/tika/ml/junkdetect/EntityRefProbe.java | 164 --
...rV7Test.java => JunkDetectorRoundTripTest.java} | 165 +-
.../tika/ml/junkdetect/JunkDetectorSmokeTest.java | 7 +-
.../junkdetect/JunkFilterEncodingDetectorTest.java | 43 +
.../ml/junkdetect/LatinSiblingComparisonTest.java | 141 ++
.../ml/junkdetect/TextQualityFeaturesTest.java | 201 +++
.../apache/tika/ml/junkdetect/TraceJunkFilter.java | 536 ++++++
.../tools/BuildJunkAugmentationData.java | 862 ++++++++++
.../tools/BuildJunkAugmentationDataTest.java | 429 +++++
.../tools/JunkDetectorTrainingConfigTest.java | 5 +-
tika-parent/pom.xml | 14 +-
.../apache/tika/parser/pkg/PackageParserTest.java | 4 +
.../resources/configs/tika-config-rendering.json | 2 +-
.../tika/parser/iwork/PagesContentHandler.java | 19 +-
.../java/org/apache/tika/parser/prt/PRTParser.java | 85 +-
.../apache/tika/parser/code/SourceCodeParser.java | 25 +
.../tika/parser/code/SourceCodeParserTest.java | 1 -
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 15 +-
.../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 55 +
.../ooxml/OOXMLWordAndPowerPointTextHandler.java | 7 +
.../microsoft/ooxml/SAXBasedMetadataExtractor.java | 89 +-
.../ooxml/SXSLFPowerPointExtractorDecorator.java | 16 +-
.../ooxml/SXWPFWordExtractorDecorator.java | 6 +
.../ooxml/XSSFExcelExtractorDecorator.java | 36 +
.../ooxml/SAXBasedMetadataExtractorTest.java | 216 +++
.../test/resources/configs/tika-libpst-config.json | 2 +-
.../resources/configs/tika-libpst-eml-config.json | 2 +-
.../org/apache/tika/parser/epub/EpubParser.java | 44 +-
.../tika/parser/odf/OpenDocumentBodyHandler.java | 77 +
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 25 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 14 +
.../test-documents/testPDF_jsActionOnPage.pdf | 26 +
.../java/org/apache/tika/parser/pkg/ZipParser.java | 29 +-
.../java/org/apache/tika/parser/txt/TXTParser.java | 27 +-
.../java/org/apache/tika/parser/tmx/TMXParser.java | 2 +
.../apache/tika/parser/xliff/XLIFF12Parser.java | 3 +-
.../org/apache/tika/async/cli/PluginsWriter.java | 203 ++-
.../apache/tika/async/cli/SimpleAsyncConfig.java | 13 +
.../org/apache/tika/async/cli/TikaAsyncCLI.java | 16 +-
.../apache/tika/async/cli/AsyncCliParserTest.java | 25 +
.../apache/tika/async/cli/AsyncProcessorTest.java | 42 +
...plate.json => config-content-only-default.json} | 7 +-
.../test/resources/configs/config-template.json | 2 -
tika-pipes/tika-pipes-core/pom.xml | 9 +
.../tika/pipes/core/AbstractComponentManager.java | 15 +
.../tika/pipes/core/PerClientServerManager.java | 45 +-
.../org/apache/tika/pipes/core/PipesConfig.java | 26 +-
.../apache/tika/pipes/core/ServerProcessIO.java | 112 ++
.../tika/pipes/core/SharedServerManager.java | 41 +-
.../tika/pipes/core/config/ConfigMerger.java | 3 -
.../tika/pipes/core/config/ConfigOverrides.java | 14 +-
.../apache/tika/pipes/core/server/PipesServer.java | 54 +
.../apache/tika/pipes/core/server/PipesWorker.java | 3 +
.../tika/pipes/core/config/ConfigMergerTest.java | 3 +-
.../core/testutil/AbstractConfigExamplesTest.java | 89 +
.../apache/tika/pipes/fork/PipesForkParser.java | 1 -
.../tika/pipes/fork/PipesForkParserConfig.java | 11 -
tika-pipes/tika-pipes-plugins/pom.xml | 7 +
.../pipes/atlassianjwt/ConfigExamplesTest.java | 33 +-
.../tika/pipes/azblob/ConfigExamplesTest.java | 71 +-
.../apache/tika/pipes/csv/ConfigExamplesTest.java | 33 +-
.../apache/tika/pipes/es/ConfigExamplesTest.java | 63 +-
.../tika/pipes/emitter/fs/FileSystemEmitter.java | 18 +-
.../fs/FileSystemEmitterRuntimeConfigTest.java | 31 +
.../apache/tika/pipes/fs/ConfigExamplesTest.java | 28 +-
.../config-examples/file-system-emitter.json | 7 +-
.../config-examples/file-system-fetcher.json | 7 +-
.../config-examples/file-system-pipeline.json | 14 +-
.../apache/tika/pipes/gcs/ConfigExamplesTest.java | 53 +-
.../tika/pipes/googledrive/ConfigExamplesTest.java | 33 +-
.../apache/tika/pipes/http/ConfigExamplesTest.java | 34 +-
.../apache/tika/pipes/jdbc/ConfigExamplesTest.java | 52 +-
.../apache/tika/pipes/json/ConfigExamplesTest.java | 33 +-
.../tika/pipes/kafka/ConfigExamplesTest.java | 51 +-
.../tika-pipes-microsoft-graph/pom.xml | 2 +-
.../pipes/microsoftgraph/ConfigExamplesTest.java | 33 +-
.../tika/pipes/opensearch/ConfigExamplesTest.java | 50 +-
.../apache/tika/pipes/s3/ConfigExamplesTest.java | 53 +-
.../apache/tika/pipes/solr/ConfigExamplesTest.java | 52 +-
.../config/loader/AbstractSpiComponentLoader.java | 45 +-
.../tika/config/loader/ComponentInstantiator.java | 14 +-
.../apache/tika/config/loader/ParserLoader.java | 7 +
.../apache/tika/config/loader/TikaJsonConfig.java | 2 +-
.../apache/tika/config/loader/TikaLoaderTest.java | 115 +-
tika-server/README.md | 10 +-
tika-server/docker-build/CHANGES.md | 4 +-
tika-server/docker-build/README.md | 8 +-
.../docker-build/docker-compose-tika-customocr.yml | 10 +-
.../docker-build/docker-compose-tika-grobid.yml | 10 +-
tika-server/docker-build/full/Dockerfile | 24 +-
tika-server/docker-build/full/Dockerfile.snapshot | 2 +-
tika-server/docker-build/minimal/Dockerfile | 24 +-
.../docker-build/minimal/Dockerfile.snapshot | 2 +-
.../tika/server/core/IntegrationTestBase.java | 31 +
.../server/core/benchmark/TikaServerBenchmark.java | 2 +-
.../bin/install_tika_service.sh | 21 +-
tika-server/tika-server-standard/bin/tika | 10 +-
tika-server/tika-server-standard/bin/tika.in.sh | 2 +-
tika-server/tika-server-standard/pom.xml | 45 +
.../src/main/assembly/assembly.xml | 5 +-
tika-translate/pom.xml | 2 +-
175 files changed, 10725 insertions(+), 5903 deletions(-)
create mode 100644 .skills/tika-eval-h2-query.md
create mode 100644
docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc
create mode 100644 docs/modules/ROOT/pages/pipes/troubleshooting.adoc
create mode 100644
tika-core/src/main/java/org/apache/tika/sax/StrictXHTMLValidator.java
create mode 100644
tika-core/src/main/java/org/apache/tika/sax/XHTMLBalancingHandler.java
create mode 100644
tika-core/src/test/java/org/apache/tika/sax/XHTMLBalancingHandlerTest.java
create mode 100644
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java
create mode 100644
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/AdaptiveProbeTest.java
create mode 100644
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CalibrateTopK.java
create mode 100644
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CheckUtf8OnFile.java
create mode 100644
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/HtmlByteStripperTest.java
create mode 100644
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/InspectBigramContributions.java
create mode 100644
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/TraceMojibuster.java
create mode 100644
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/DiagnoseDiscrimination.java
create mode 100644
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/RebalanceCharsetTraining.java
rename tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/{V7Tables.java => BigramTables.java} (86%)
create mode 100644
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/HtmlContentCleaner.java
create mode 100644
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
delete mode 100644
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/AnalyzeHanByBlock.java
delete mode 100644
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/CountPerScriptBigrams.java
delete mode 100644
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java
delete mode 100644
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkOnCharsetDevtest.java
delete mode 100644
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java
delete mode 100644
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/EntityRefProbe.java
rename tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/{JunkDetectorV7Test.java => JunkDetectorRoundTripTest.java} (67%)
create mode 100644
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/LatinSiblingComparisonTest.java
create mode 100644
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/TextQualityFeaturesTest.java
create mode 100644
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/TraceJunkFilter.java
create mode 100644
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationData.java
create mode 100644
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java
create mode 100644
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java
create mode 100644
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_jsActionOnPage.pdf
copy tika-pipes/tika-async-cli/src/test/resources/configs/{config-template.json => config-content-only-default.json} (86%)
create mode 100644
tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/ServerProcessIO.java
create mode 100644
tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/testutil/AbstractConfigExamplesTest.java