This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4207 in repository https://gitbox.apache.org/repos/asf/tika.git
from 7ca6d1759 TIKA-4207 -- small improvements to AsyncResource and WMFParser add 36a0dca43 TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler (#1629) add 2bc0f9bdc TIKA-4202 -- add ocr page count to PDFs -- actually increment counter and move the location of the counter to before OCR is invoked (#1630) add 0be76cf28 Bump logback.version from 1.5.0 to 1.5.1 add 4a5a21ea1 Merge pull request #1632 from apache/dependabot/maven/logback.version-1.5.1 add 8ab8673ce Bump aws.version from 1.12.668 to 1.12.669 add 386a5934a Merge pull request #1631 from apache/dependabot/maven/aws.version-1.12.669 add 215b75b67 TIKA-4166: update puppycrawl add b3e4252b2 Bump aws.version from 1.12.669 to 1.12.670 add 1f9e773e8 Merge pull request #1634 from apache/dependabot/maven/aws.version-1.12.670 add 6b726fbe5 Bump jakarta.activation:jakarta.activation-api from 2.1.2 to 2.1.3 add 6a0a59d42 Merge pull request #1635 from apache/dependabot/maven/jakarta.activation-jakarta.activation-api-2.1.3 add ffc7df20f TIKA-4166: update aws, azure, mockito add b5023198b Bump logback.version from 1.5.1 to 1.5.2 add 86d1e897e Merge pull request #1637 from apache/dependabot/maven/logback.version-1.5.2 add 1a5f23ff4 Bump aws.version from 1.12.671 to 1.12.672 add e3bb8cfea Merge pull request #1638 from apache/dependabot/maven/aws.version-1.12.672 add c8097b6ad Bump logback.version from 1.5.2 to 1.5.3 add dc612a7b5 Merge pull request #1639 from apache/dependabot/maven/logback.version-1.5.3 add 32ef34ff4 TIKA-4199: add comment, print to stderr add 64c083d12 Bump aws.version from 1.12.672 to 1.12.673 add 2f6e4cd30 Merge pull request #1640 from apache/dependabot/maven/aws.version-1.12.673 add 36664ef41 Bump com.google.cloud:google-cloud-storage from 2.34.0 to 2.35.0 add 26c33d46c Merge pull request #1641 from apache/dependabot/maven/com.google.cloud-google-cloud-storage-2.35.0 add 6cf215017 Bump org.testcontainers:testcontainers-bom from 1.19.6 to 1.19.7 add 8b3230dff Merge pull request #1642 from apache/dependabot/maven/org.testcontainers-testcontainers-bom-1.19.7 add 5221d8874 Bump aws.version from 1.12.673 to 1.12.674 add 43a4e58cc Merge pull request #1643 from apache/dependabot/maven/aws.version-1.12.674 add b7c5d48ce Bump aws.version from 1.12.674 to 1.12.675 add 79b194a69 Merge pull request #1644 from apache/dependabot/maven/aws.version-1.12.675 add a89e9779f Bump jakarta.xml.bind:jakarta.xml.bind-api from 4.0.1 to 4.0.2 add 4af4be5be Merge pull request #1645 from apache/dependabot/maven/jakarta.xml.bind-jakarta.xml.bind-api-4.0.2 add 8b398201a TIKA-4199: revert "complete delegate class", field "in" is a dummy; remove workaround for commons-compress 1.26 add 5b259d60a TIKA-4199: adjust test results now that commons compress bug has been fixed add 4d6acfc10 TIKA-4199: update commons-compress add 1dd99bf45 TIKA-4166: update aws add 5f4e380ff TIKA-4166: update jaxb add d477bfd3b TIKA-4166: revert jaxb update add 0f077da2a TIKA-4166: update jaxb and prevent convergence problem add f0b76e503 Bump com.googlecode.plist:dd-plist from 1.27 to 1.28 add da3f8c970 Merge pull request #1649 from apache/dependabot/maven/com.googlecode.plist-dd-plist-1.28 add 67790a364 Bump org.apache.maven.plugins:maven-assembly-plugin from 3.6.0 to 3.7.0 add 418258161 Merge pull request #1646 from apache/dependabot/maven/org.apache.maven.plugins-maven-assembly-plugin-3.7.0 add bc2167a30 Bump log4j2.version from 2.23.0 to 2.23.1 add 17caf585d Merge pull request #1648 from apache/dependabot/maven/log4j2.version-2.23.1 add b980d9d86 Bump com.fasterxml.jackson:jackson-bom from 2.16.1 to 2.16.2 add bdb6a4656 Merge pull request #1647 from apache/dependabot/maven/com.fasterxml.jackson-jackson-bom-2.16.2 add 84f0a5b7f Bump aws.version from 1.12.676 to 1.12.677 add 3a7bbc50d Merge pull request #1651 from apache/dependabot/maven/aws.version-1.12.677 add 3ffadd5a3 Bump aws.version from 1.12.677 to 1.12.678 add 49064dbe2 Merge pull request #1652 from apache/dependabot/maven/aws.version-1.12.678 add e65d52cb5 Bump org.xerial:sqlite-jdbc from 3.45.1.0 to 3.45.2.0 add 846f3a080 Merge pull request #1655 from apache/dependabot/maven/org.xerial-sqlite-jdbc-3.45.2.0 add be7640d53 Bump com.fasterxml.jackson:jackson-bom from 2.16.2 to 2.17.0 add 7cd6ee86b Merge pull request #1653 from apache/dependabot/maven/com.fasterxml.jackson-jackson-bom-2.17.0 add 23d26d770 Bump reactor.netty.version from 1.1.15 to 1.1.17 add 18d9fd769 Merge pull request #1654 from apache/dependabot/maven/reactor.netty.version-1.1.17 add 1d666ea04 Bump io.projectreactor:reactor-core from 3.6.2 to 3.6.4 add 207594f9f Merge pull request #1656 from apache/dependabot/maven/io.projectreactor-reactor-core-3.6.4 add 533e056bb TIKA-4166: update puppycrawl, cxf add 8d5c3578a Bump aws.version from 1.12.678 to 1.12.679 add ef75d45aa Merge pull request #1658 from apache/dependabot/maven/aws.version-1.12.679 add df573d07c Bump com.google.guava:guava from 33.0.0-jre to 33.1.0-jre add 290742590 Merge pull request #1657 from apache/dependabot/maven/com.google.guava-guava-33.1.0-jre add 91820226e TIKA-4166: update mime4j add 3ccfcb485 Bump pdfbox.version from 3.0.1 to 3.0.2 add e9aa16994 Merge pull request #1660 from apache/dependabot/maven/pdfbox.version-3.0.2 add 3c131e76a Bump org.springframework:spring-context from 5.3.32 to 5.3.33 add d90a564ad Merge pull request #1662 from apache/dependabot/maven/org.springframework-spring-context-5.3.33 add 6d02aa2ed Bump aws.version from 1.12.679 to 1.12.680 add cf2073dda Merge pull request #1661 from apache/dependabot/maven/aws.version-1.12.680 add 2ec57fb14 Bump aws.version from 1.12.680 to 1.12.681 add 0a224b32d Merge pull request #1664 from apache/dependabot/maven/aws.version-1.12.681 add c963c51da Bump com.google.cloud:google-cloud-storage from 2.35.0 to 2.36.0 add 2e614b438 Merge pull request #1663 from apache/dependabot/maven/com.google.cloud-google-cloud-storage-2.36.0 add 67d593c27 TIKA-4166: update puppycrawl add 7735eeb16 Bump aws.version from 1.12.681 to 1.12.682 add f1b7f07b7 Merge pull request #1665 from apache/dependabot/maven/aws.version-1.12.682 add 0a9f17c2d TIKA-4166: update zookeeper add fcdff7cf7 Bump org.apache.maven.plugins:maven-assembly-plugin from 3.7.0 to 3.7.1 add b3c8c3e7e Merge pull request #1666 from apache/dependabot/maven/org.apache.maven.plugins-maven-assembly-plugin-3.7.1 add 2fa9ab30c Bump org.apache.maven.plugins:maven-compiler-plugin add 0e166b0d1 Merge pull request #1667 from apache/dependabot/maven/org.apache.maven.plugins-maven-compiler-plugin-3.13.0 add 880b34556 Bump aws.version from 1.12.682 to 1.12.683 add eac6f090b Merge pull request #1668 from apache/dependabot/maven/aws.version-1.12.683 add 9ea184af5 Bump aws.version from 1.12.683 to 1.12.684 add 96fd5fd6c Merge pull request #1671 from apache/dependabot/maven/aws.version-1.12.684 add e63730e12 TIKA-4213 -- improve jdbc pipes reporter (#1669) add 7dc3d28a5 TIKA-4211 -- first attempt (#1670) add 85d713a9a TIKA-4215 -- avoid loading all the tika resources just to get the version (#1672) add 237e73f18 TIKA-4216 (#1673) add 08727d522 TIKA-4217 -- require new line or white space as part of bitmap magic (#1674) new dae75c632 Merge remote-tracking branch 'origin/main' into TIKA-4207 new 9ffc4df4a TIKA-4207 -- allow users to configure include/exclude for attachment types and/or mime types The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: .../src/main/java/org/apache/tika/cli/TikaCLI.java | 2 +- tika-core/src/main/java/org/apache/tika/Tika.java | 4 ++ .../tika/extractor/BasicEmbeddedBytesSelector.java | 77 ++++++++++++++++++++++ ...ctorFactory.java => EmbeddedBytesSelector.java} | 16 +++-- .../ParsingEmbeddedDocumentExtractor.java | 28 +++++++- .../ParsingEmbeddedDocumentExtractorFactory.java | 56 ++++++++++++++-- .../main/java/org/apache/tika/metadata/PDF.java | 4 ++ .../apache/tika/metadata/TikaCoreProperties.java | 4 ++ .../org/apache/tika/mime/tika-mimetypes.xml | 53 ++++++++++++--- .../tika/parser/AutoDetectParserConfigTest.java | 72 ++++++++++++++++++++ .../config/TIKA-4207-embedded-bytes-config.xml | 11 +++- tika-eval/tika-eval-app/pom.xml | 2 - .../org/apache/tika/eval/app/AbstractProfiler.java | 17 ++++- .../org/apache/tika/eval/app/ExtractProfiler.java | 4 ++ .../java/org/apache/tika/eval/app/db/Cols.java | 3 + tika-parent/pom.xml | 60 +++++++++-------- .../ooxml/XSLFPowerPointExtractorDecorator.java | 3 +- .../apache/tika/parser/ocr/TesseractOCRParser.java | 20 +++++- .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 6 ++ .../org/apache/tika/parser/pdf/OCRPageCounter.java | 4 ++ .../org/apache/tika/parser/pdf/PDFParserTest.java | 8 +++ .../org/apache/tika/parser/pkg/PackageParser.java | 50 +------------- .../parser/microsoft/ooxml/TruncatedOOXMLTest.java | 4 +- .../tika/parser/ocr/TesseractOCRParserTest.java | 9 +++ .../apache/tika/parser/pkg/Seven7ParserTest.java | 3 +- .../pipes/reporters/jdbc/JDBCPipesReporter.java | 52 ++++++++------- .../apache/tika/server/core/TikaServerProcess.java | 2 +- .../tika/server/core/resource/TikaResource.java | 2 +- .../apache/tika/server/core/TikaVersionTest.java | 2 +- .../apache/tika/server/core/TikaWelcomeTest.java | 4 +- 30 files changed, 444 insertions(+), 138 deletions(-) create mode 100644 tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java copy tika-core/src/main/java/org/apache/tika/extractor/{EmbeddedDocumentExtractorFactory.java => EmbeddedBytesSelector.java} (74%) create mode 100644 tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java copy tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.xml => tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml (78%)