This is an automated email from the ASF dual-hosted git repository.

ndipiazza pushed a commit to branch TIKA-4181-grpc
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 100ef9c3063e49106a2c7fbff4942bbe9edc7042 Merge: 322452021 941f8f26c Author: Nicholas DiPiazza <[email protected]> AuthorDate: Fri Mar 29 02:30:30 2024 -0500 Merge branch 'main' of github.com:apache/tika into TIKA-4181-grpc CHANGES.txt | 2 + tika-app/pom.xml | 1 + .../src/main/java/org/apache/tika/cli/TikaCLI.java | 2 +- .../java/org/apache/tika/cli/TikaCLIAsyncTest.java | 89 +++++++ .../test/java/org/apache/tika/cli/TikaCLITest.java | 59 +--- tika-batch/pom.xml | 3 + tika-core/src/main/java/org/apache/tika/Tika.java | 4 + .../org/apache/tika/detect/AutoDetectReader.java | 38 +-- .../tika/detect/CompositeEncodingDetector.java | 7 + .../AbstractEmbeddedDocumentBytesHandler.java | 69 +++++ .../tika/extractor/BasicEmbeddedBytesSelector.java | 77 ++++++ .../BasicEmbeddedDocumentBytesHandler.java | 58 ++++ .../tika/extractor/EmbeddedBytesSelector.java | 31 +-- .../EmbeddedDocumentByteStoreExtractorFactory.java | 36 +-- .../extractor/EmbeddedDocumentBytesHandler.java | 32 +-- .../ParsingEmbeddedDocumentExtractor.java | 10 +- .../apache/tika/extractor/RUnpackExtractor.java | 183 +++++++++++++ .../tika/extractor/RUnpackExtractorFactory.java | 111 ++++++++ .../org/apache/tika/io/BoundedInputStream.java | 31 ++- .../main/java/org/apache/tika/metadata/IPTC.java | 8 + .../main/java/org/apache/tika/metadata/PDF.java | 6 + .../apache/tika/metadata/TikaCoreProperties.java | 20 ++ .../main/java/org/apache/tika/mime/MimeTypes.java | 4 +- .../org/apache/tika/parser/AbstractParser.java | 1 + .../org/apache/tika/parser/AutoDetectParser.java | 11 +- .../apache/tika/parser/AutoDetectParserConfig.java | 4 +- .../org/apache/tika/parser/ParserDecorator.java | 1 + .../apache/tika/parser/RecursiveParserWrapper.java | 2 + .../parser/multiple/AbstractMultipleParser.java | 1 + .../java/org/apache/tika/pipes/FetchEmitTuple.java | 52 +++- .../java/org/apache/tika/pipes/PipesServer.java | 296 +++++++++++++++------ .../extractor/EmbeddedDocumentBytesConfig.java | 167 ++++++++++++ 
.../EmittingEmbeddedDocumentBytesHandler.java | 73 +++++ .../org/apache/tika/mime/tika-mimetypes.xml | 89 ++++++- .../java/org/apache/tika/TikaDetectionTest.java | 2 +- .../tika/parser/AutoDetectParserConfigTest.java | 72 +++++ .../org/apache/tika/parser/mock/MockParser.java | 26 +- .../org/apache/tika/pipes/PipesServerTest.java | 120 ++++++++- ...rocessorTest.java => AsyncChaosMonkeyTest.java} | 2 +- .../config/TIKA-4207-embedded-bytes-config.xml | 13 +- .../apache/tika/pipes/TIKA-4207-limit-bytes.xml | 19 +- .../resources/org/apache/tika/pipes/TIKA-4207.xml | 19 +- tika-eval/tika-eval-app/pom.xml | 7 +- .../org/apache/tika/eval/app/AbstractProfiler.java | 17 +- .../org/apache/tika/eval/app/ExtractProfiler.java | 4 + .../java/org/apache/tika/eval/app/db/Cols.java | 3 + tika-eval/tika-eval-core/pom.xml | 1 + .../eval/core/metadata/TikaEvalMetadataFilter.java | 4 + .../core/metadata/TikaEvalMetadataFilterTest.java | 1 + tika-fuzzing/pom.xml | 1 + tika-java7/pom.xml | 1 + tika-parent/pom.xml | 102 +++---- .../apache/tika/parser/geopkg/GeoPkgDBParser.java | 54 ++++ .../apache/tika/parser/geopkg/GeoPkgParser.java | 127 +++++++++ .../GeoPkgTableReader.java} | 59 ++-- .../tika/parser/sqlite3/SQLite3DBParser.java | 2 +- .../tika/parser/sqlite3/SQLite3TableReader.java | 2 +- .../services/org.apache.tika.parser.Parser | 1 + .../tika-parsers-ml/tika-age-recogniser/pom.xml | 2 +- .../tika/parser/iwork/IWorkPackageParser.java | 47 ++-- .../apache/tika/parser/html/HtmlParserTest.java | 2 +- .../detect/microsoft/ooxml/OPCPackageDetector.java | 47 ++-- .../apache/tika/parser/microsoft/WMFParser.java | 3 +- .../tika/parser/microsoft/chm/ChmCommons.java | 11 +- .../tika/parser/microsoft/chm/ChmExtractor.java | 3 +- .../tika/parser/microsoft/chm/ChmPmgiHeader.java | 2 +- .../ooxml/XSLFPowerPointExtractorDecorator.java | 3 +- .../tika/parser/microsoft/chm/TestChmLzxState.java | 3 +- .../apache/tika/detect/ole/MiscOLEDetector.java | 4 +- 
.../apache/tika/parser/epub/EncryptionParser.java | 88 ------ .../org/apache/tika/parser/epub/EpubParser.java | 193 +++++++++++--- .../apache/tika/parser/iptc/IptcAnpaParser.java | 1 + .../apache/tika/parser/ocr/TesseractOCRParser.java | 20 +- .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 6 + .../org/apache/tika/parser/pdf/OCRPageCounter.java | 31 +-- .../java/org/apache/tika/parser/pdf/PDFParser.java | 6 + .../org/apache/tika/parser/pdf/XFAExtractor.java | 3 + .../org/apache/tika/parser/pdf/PDFParserTest.java | 12 +- .../detect/gzip/GZipSpecializationDetector.java | 4 + .../org/apache/tika/parser/pkg/PackageParser.java | 7 +- .../org/apache/tika/parser/txt/BOMDetector.java | 93 +++++++ .../apache/tika/parser/txt/BOMDetectorTest.java | 91 +++++++ .../org/apache/tika/parser/txt/TXTParserTest.java | 2 + .../org/apache/tika/parser/warc/WARCParser.java | 14 +- .../apache/tika/parser/warc/WARCParserTest.java | 31 ++- .../test/resources/test-documents/example.arc.gz | Bin 0 -> 1027 bytes .../src/test/resources/test-documents/testARC.arc | 50 ++++ .../apache/tika/parser/xml/MetadataHandler.java | 4 + .../tika/detect/TestContainerAwareDetector.java | 5 + .../java/org/apache/tika/mime/TestMimeTypes.java | 6 + .../tika/parser/RecursiveParserWrapperTest.java | 5 +- .../parser/microsoft/ooxml/TruncatedOOXMLTest.java | 9 + .../tika/parser/ocr/TesseractOCRParserTest.java | 9 + .../apache/tika/parser/pkg/Seven7ParserTest.java | 12 +- .../resources/configs/tika-config-no-names.xml | 2 +- .../resources/configs/tika-config-with-names.xml | 2 +- .../src/test/resources/test-documents/test3mf.3mf | Bin 0 -> 28243 bytes .../resources/test-documents/testSTL-ascii.stl | 16 ++ .../resources/test-documents/testSTL-binary.stl | Bin 0 -> 160 bytes tika-pipes/tika-async-cli/pom.xml | 7 + .../apache/tika/async/cli/AsyncProcessorTest.java | 140 ++++++++++ .../apache/tika/async/cli/TikaAsyncCLITest.java | 2 +- .../test/resources/configs/TIKA-4207-emitter.xml | 28 +- .../resources/{ => 
configs}/tika-config-broken.xml | 0 .../basic_embedded.xml} | 29 +- tika-pipes/tika-pipes-iterators/pom.xml | 1 + .../tika-pipes-iterator-json}/pom.xml | 43 ++- .../pipesiterator/json/JsonPipesIterator.java | 65 +++++ .../pipesiterator/json/TestJsonPipesIterator.java | 85 ++++++ .../test-documents/test-with-embedded-bytes.json | 100 +++++++ .../src/test/resources/test-documents/test.json | 100 +++++++ .../pipes/reporters/jdbc/JDBCPipesReporter.java | 52 ++-- .../metadata/serialization/JsonFetchEmitTuple.java | 71 ++++- .../serialization/JsonFetchEmitTupleTest.java | 20 ++ tika-server/tika-server-core/pom.xml | 10 +- .../apache/tika/server/core/TikaServerProcess.java | 2 +- .../tika/server/core/resource/AsyncResource.java | 32 ++- .../tika/server/core/resource/TikaResource.java | 2 +- .../apache/tika/server/core/TikaVersionTest.java | 2 +- .../apache/tika/server/core/TikaWelcomeTest.java | 4 +- .../apache/tika/server/standard/TikaPipesTest.java | 93 +++++++ tika-translate/pom.xml | 1 + tika-xmp/pom.xml | 1 + 123 files changed, 3290 insertions(+), 686 deletions(-)
