This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit fc19baf5f1ac57084dc8915cd20bb36a65b3e275 Merge: 283aa339b 4b20970cb Author: tballison <[email protected]> AuthorDate: Thu Apr 21 07:31:58 2022 -0400 Merge remote-tracking branch 'origin/main' into main CHANGES.txt | 27 ++++- tika-bundles/tika-bundle-standard/pom.xml | 2 +- .../EmbeddedDocumentExtractorFactory.java | 18 +--- .../tika/extractor/EmbeddedDocumentUtil.java | 26 ++--- .../ParsingEmbeddedDocumentExtractor.java | 8 +- .../ParsingEmbeddedDocumentExtractorFactory.java | 28 +++--- .../java/org/apache/tika/metadata/Metadata.java | 4 - .../apache/tika/metadata/TikaCoreProperties.java | 16 +++ .../org/apache/tika/parser/AutoDetectParser.java | 30 +++--- .../apache/tika/parser/AutoDetectParserConfig.java | 41 +++++++- .../org/apache/tika/parser/CompositeParser.java | 39 +++++++- .../tika/sax/ContentHandlerDecoratorFactory.java | 18 +--- .../java/org/apache/tika/sax/StandardsText.java | 2 +- .../java/org/apache/tika/utils/ParserUtils.java | 15 ++- .../metadatafilter/OpenNLPMetadataFilter.java | 49 +++++++++ .../optimaize/OptimaizeLangDetector.java | 13 ++- .../metadatafilter/OptimaizeMetadataFilter.java | 49 +++++++++ tika-parent/pom.xml | 17 ++-- .../detect/microsoft/POIFSContainerDetector.java | 66 ++++++++++--- .../parser/microsoft/AbstractPOIFSExtractor.java | 33 ++++--- .../tika/parser/microsoft/ExcelExtractor.java | 2 +- .../tika/parser/microsoft/HSLFExtractor.java | 9 +- .../tika/parser/microsoft/JackcessExtractor.java | 2 +- .../apache/tika/parser/microsoft/OfficeParser.java | 24 ++++- .../tika/parser/microsoft/OutlookExtractor.java | 2 +- .../tika/parser/microsoft/WordExtractor.java | 2 +- .../microsoft/onenote/OneNoteTreeWalker.java | 6 +- .../microsoft/ooxml/AbstractOOXMLExtractor.java | 2 +- .../parser/microsoft/pst/OutlookPSTParser.java | 2 +- .../tika/parser/microsoft/xml/WordMLParser.java | 8 +- .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 13 +-- .../tika/parser/pdf/ImageGraphicsEngine.java | 4 +- .../org/apache/tika/parser/pdf/PDFParserTest.java | 2 +- .../tika/parser/AutoDetectParserConfigTest.java | 67 +++++++++++++ .../tika/parser/microsoft/XML2003ParserTest.java | 4 +- .../tika/parser/microsoft/rtf/RTFParserTest.java | 14 +++ .../tika/parser/ocr/TesseractOCRParserTest.java | 5 + .../UpcasingContentHandlerDecoratorFactory.java | 25 ++--- .../resources/configs/tika-config-no-names.xml | 33 +++++++ ...ka-config-upcasing-custom-handler-decorator.xml | 29 ++++++ .../resources/configs/tika-config-with-names.xml | 33 +++++++ tika-pipes/pom.xml | 4 +- .../org/apache/tika/client/HttpClientFactory.java | 15 +-- .../server/core/DefaultInputStreamFactory.java | 7 ++ .../tika/server/core/FetcherStreamFactory.java | 30 ++++-- .../tika/server/core/InputStreamFactory.java | 15 +++ .../tika/server/core/TikaServerWatchDog.java | 8 +- .../server/core/resource/DetectorResource.java | 5 +- .../server/core/resource/MetadataResource.java | 5 +- .../core/resource/RecursiveMetadataResource.java | 2 +- .../tika/server/core/resource/TikaResource.java | 15 +-- .../server/core/resource/UnpackerResource.java | 6 +- .../org/apache/tika/server/core/CXFTestBase.java | 9 +- .../org/apache/tika/server/core/TikaPipesTest.java | 3 +- .../tika/server/core/TikaResourceFetcherTest.java | 110 +++++++++++++++++++++ .../tika-config-server-fetcher-template.xml | 38 +++++++ tika-server/tika-server-eval/pom.xml | 1 + tika-server/tika-server-standard/pom.xml | 22 +++-- .../standard/resource/XMPMetadataResource.java | 3 +- .../apache/tika/server/standard/FetcherTest.java | 12 +-- ...herTest.java => OpenNLPMetadataFilterTest.java} | 74 +++++++------- ...rTest.java => OptimaizeMetadataFilterTest.java} | 73 +++++++------- .../apache/tika/server/standard/TikaPipesTest.java | 6 +- .../tika/server/standard/TikaResourceTest.java | 5 + .../tika-config-langdetect-opennlp-filter.xml | 32 ++++++ .../tika-config-langdetect-optimaize-filter.xml | 32 ++++++ 66 files changed, 1032 insertions(+), 289 deletions(-)
