This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4207
in repository https://gitbox.apache.org/repos/asf/tika.git


    from 7ca6d1759 TIKA-4207 -- small improvements to AsyncResource and WMFParser
     add 36a0dca43 TIKA-4205 -- fix dependencies in tika-eval-app and add a few 
more columns to the ExtractProfiler (#1629)
     add 2bc0f9bdc TIKA-4202 -- add ocr page count to PDFs -- actually increment counter and move the location of the counter to before OCR is invoked (#1630)
     add 0be76cf28 Bump logback.version from 1.5.0 to 1.5.1
     add 4a5a21ea1 Merge pull request #1632 from 
apache/dependabot/maven/logback.version-1.5.1
     add 8ab8673ce Bump aws.version from 1.12.668 to 1.12.669
     add 386a5934a Merge pull request #1631 from 
apache/dependabot/maven/aws.version-1.12.669
     add 215b75b67 TIKA-4166: update puppycrawl
     add b3e4252b2 Bump aws.version from 1.12.669 to 1.12.670
     add 1f9e773e8 Merge pull request #1634 from 
apache/dependabot/maven/aws.version-1.12.670
     add 6b726fbe5 Bump jakarta.activation:jakarta.activation-api from 2.1.2 to 
2.1.3
     add 6a0a59d42 Merge pull request #1635 from 
apache/dependabot/maven/jakarta.activation-jakarta.activation-api-2.1.3
     add ffc7df20f TIKA-4166: update aws, azure, mockito
     add b5023198b Bump logback.version from 1.5.1 to 1.5.2
     add 86d1e897e Merge pull request #1637 from 
apache/dependabot/maven/logback.version-1.5.2
     add 1a5f23ff4 Bump aws.version from 1.12.671 to 1.12.672
     add e3bb8cfea Merge pull request #1638 from 
apache/dependabot/maven/aws.version-1.12.672
     add c8097b6ad Bump logback.version from 1.5.2 to 1.5.3
     add dc612a7b5 Merge pull request #1639 from 
apache/dependabot/maven/logback.version-1.5.3
     add 32ef34ff4 TIKA-4199: add comment, print to stderr
     add 64c083d12 Bump aws.version from 1.12.672 to 1.12.673
     add 2f6e4cd30 Merge pull request #1640 from 
apache/dependabot/maven/aws.version-1.12.673
     add 36664ef41 Bump com.google.cloud:google-cloud-storage from 2.34.0 to 
2.35.0
     add 26c33d46c Merge pull request #1641 from 
apache/dependabot/maven/com.google.cloud-google-cloud-storage-2.35.0
     add 6cf215017 Bump org.testcontainers:testcontainers-bom from 1.19.6 to 
1.19.7
     add 8b3230dff Merge pull request #1642 from 
apache/dependabot/maven/org.testcontainers-testcontainers-bom-1.19.7
     add 5221d8874 Bump aws.version from 1.12.673 to 1.12.674
     add 43a4e58cc Merge pull request #1643 from 
apache/dependabot/maven/aws.version-1.12.674
     add b7c5d48ce Bump aws.version from 1.12.674 to 1.12.675
     add 79b194a69 Merge pull request #1644 from 
apache/dependabot/maven/aws.version-1.12.675
     add a89e9779f Bump jakarta.xml.bind:jakarta.xml.bind-api from 4.0.1 to 
4.0.2
     add 4af4be5be Merge pull request #1645 from 
apache/dependabot/maven/jakarta.xml.bind-jakarta.xml.bind-api-4.0.2
     add 8b398201a TIKA-4199: revert "complete delegate class", field "in" is a dummy; remove workaround for commons-compress 1.26
     add 5b259d60a TIKA-4199: adjust test results now that commons compress bug has been fixed
     add 4d6acfc10 TIKA-4199: update commons-compress
     add 1dd99bf45 TIKA-4166: update aws
     add 5f4e380ff TIKA-4166: update jaxb
     add d477bfd3b TIKA-4166: revert jaxb update
     add 0f077da2a TIKA-4166: update jaxb and prevent convergence problem
     add f0b76e503 Bump com.googlecode.plist:dd-plist from 1.27 to 1.28
     add da3f8c970 Merge pull request #1649 from 
apache/dependabot/maven/com.googlecode.plist-dd-plist-1.28
     add 67790a364 Bump org.apache.maven.plugins:maven-assembly-plugin from 
3.6.0 to 3.7.0
     add 418258161 Merge pull request #1646 from 
apache/dependabot/maven/org.apache.maven.plugins-maven-assembly-plugin-3.7.0
     add bc2167a30 Bump log4j2.version from 2.23.0 to 2.23.1
     add 17caf585d Merge pull request #1648 from 
apache/dependabot/maven/log4j2.version-2.23.1
     add b980d9d86 Bump com.fasterxml.jackson:jackson-bom from 2.16.1 to 2.16.2
     add bdb6a4656 Merge pull request #1647 from 
apache/dependabot/maven/com.fasterxml.jackson-jackson-bom-2.16.2
     add 84f0a5b7f Bump aws.version from 1.12.676 to 1.12.677
     add 3a7bbc50d Merge pull request #1651 from 
apache/dependabot/maven/aws.version-1.12.677
     add 3ffadd5a3 Bump aws.version from 1.12.677 to 1.12.678
     add 49064dbe2 Merge pull request #1652 from 
apache/dependabot/maven/aws.version-1.12.678
     add e65d52cb5 Bump org.xerial:sqlite-jdbc from 3.45.1.0 to 3.45.2.0
     add 846f3a080 Merge pull request #1655 from 
apache/dependabot/maven/org.xerial-sqlite-jdbc-3.45.2.0
     add be7640d53 Bump com.fasterxml.jackson:jackson-bom from 2.16.2 to 2.17.0
     add 7cd6ee86b Merge pull request #1653 from 
apache/dependabot/maven/com.fasterxml.jackson-jackson-bom-2.17.0
     add 23d26d770 Bump reactor.netty.version from 1.1.15 to 1.1.17
     add 18d9fd769 Merge pull request #1654 from 
apache/dependabot/maven/reactor.netty.version-1.1.17
     add 1d666ea04 Bump io.projectreactor:reactor-core from 3.6.2 to 3.6.4
     add 207594f9f Merge pull request #1656 from 
apache/dependabot/maven/io.projectreactor-reactor-core-3.6.4
     add 533e056bb TIKA-4166: update puppycrawl, cxf
     add 8d5c3578a Bump aws.version from 1.12.678 to 1.12.679
     add ef75d45aa Merge pull request #1658 from 
apache/dependabot/maven/aws.version-1.12.679
     add df573d07c Bump com.google.guava:guava from 33.0.0-jre to 33.1.0-jre
     add 290742590 Merge pull request #1657 from 
apache/dependabot/maven/com.google.guava-guava-33.1.0-jre
     add 91820226e TIKA-4166: update mime4j
     add 3ccfcb485 Bump pdfbox.version from 3.0.1 to 3.0.2
     add e9aa16994 Merge pull request #1660 from 
apache/dependabot/maven/pdfbox.version-3.0.2
     add 3c131e76a Bump org.springframework:spring-context from 5.3.32 to 5.3.33
     add d90a564ad Merge pull request #1662 from 
apache/dependabot/maven/org.springframework-spring-context-5.3.33
     add 6d02aa2ed Bump aws.version from 1.12.679 to 1.12.680
     add cf2073dda Merge pull request #1661 from 
apache/dependabot/maven/aws.version-1.12.680
     add 2ec57fb14 Bump aws.version from 1.12.680 to 1.12.681
     add 0a224b32d Merge pull request #1664 from 
apache/dependabot/maven/aws.version-1.12.681
     add c963c51da Bump com.google.cloud:google-cloud-storage from 2.35.0 to 
2.36.0
     add 2e614b438 Merge pull request #1663 from 
apache/dependabot/maven/com.google.cloud-google-cloud-storage-2.36.0
     add 67d593c27 TIKA-4166: update puppycrawl
     add 7735eeb16 Bump aws.version from 1.12.681 to 1.12.682
     add f1b7f07b7 Merge pull request #1665 from 
apache/dependabot/maven/aws.version-1.12.682
     add 0a9f17c2d TIKA-4166: update zookeeper
     add fcdff7cf7 Bump org.apache.maven.plugins:maven-assembly-plugin from 
3.7.0 to 3.7.1
     add b3c8c3e7e Merge pull request #1666 from 
apache/dependabot/maven/org.apache.maven.plugins-maven-assembly-plugin-3.7.1
     add 2fa9ab30c Bump org.apache.maven.plugins:maven-compiler-plugin
     add 0e166b0d1 Merge pull request #1667 from 
apache/dependabot/maven/org.apache.maven.plugins-maven-compiler-plugin-3.13.0
     add 880b34556 Bump aws.version from 1.12.682 to 1.12.683
     add eac6f090b Merge pull request #1668 from 
apache/dependabot/maven/aws.version-1.12.683
     add 9ea184af5 Bump aws.version from 1.12.683 to 1.12.684
     add 96fd5fd6c Merge pull request #1671 from 
apache/dependabot/maven/aws.version-1.12.684
     add e63730e12 TIKA-4213 -- improve jdbc pipes reporter (#1669)
     add 7dc3d28a5 TIKA-4211 -- first attempt (#1670)
     add 85d713a9a TIKA-4215 -- avoid loading all the tika resources just to 
get the version (#1672)
     add 237e73f18 TIKA-4216 (#1673)
     add 08727d522 TIKA-4217 -- require new line or white space as part of 
bitmap magic (#1674)
     new dae75c632 Merge remote-tracking branch 'origin/main' into TIKA-4207
     new 9ffc4df4a TIKA-4207 -- allow users to configure include/exclude for attachment types and/or mime types

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |  2 +-
 tika-core/src/main/java/org/apache/tika/Tika.java  |  4 ++
 .../tika/extractor/BasicEmbeddedBytesSelector.java | 77 ++++++++++++++++++++++
 ...ctorFactory.java => EmbeddedBytesSelector.java} | 16 +++--
 .../ParsingEmbeddedDocumentExtractor.java          | 28 +++++++-
 .../ParsingEmbeddedDocumentExtractorFactory.java   | 56 ++++++++++++++--
 .../main/java/org/apache/tika/metadata/PDF.java    |  4 ++
 .../apache/tika/metadata/TikaCoreProperties.java   |  4 ++
 .../org/apache/tika/mime/tika-mimetypes.xml        | 53 ++++++++++++---
 .../tika/parser/AutoDetectParserConfigTest.java    | 72 ++++++++++++++++++++
 .../config/TIKA-4207-embedded-bytes-config.xml     | 11 +++-
 tika-eval/tika-eval-app/pom.xml                    |  2 -
 .../org/apache/tika/eval/app/AbstractProfiler.java | 17 ++++-
 .../org/apache/tika/eval/app/ExtractProfiler.java  |  4 ++
 .../java/org/apache/tika/eval/app/db/Cols.java     |  3 +
 tika-parent/pom.xml                                | 60 +++++++++--------
 .../ooxml/XSLFPowerPointExtractorDecorator.java    |  3 +-
 .../apache/tika/parser/ocr/TesseractOCRParser.java | 20 +++++-
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |  6 ++
 .../org/apache/tika/parser/pdf/OCRPageCounter.java |  4 ++
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  8 +++
 .../org/apache/tika/parser/pkg/PackageParser.java  | 50 +-------------
 .../parser/microsoft/ooxml/TruncatedOOXMLTest.java |  4 +-
 .../tika/parser/ocr/TesseractOCRParserTest.java    |  9 +++
 .../apache/tika/parser/pkg/Seven7ParserTest.java   |  3 +-
 .../pipes/reporters/jdbc/JDBCPipesReporter.java    | 52 ++++++++-------
 .../apache/tika/server/core/TikaServerProcess.java |  2 +-
 .../tika/server/core/resource/TikaResource.java    |  2 +-
 .../apache/tika/server/core/TikaVersionTest.java   |  2 +-
 .../apache/tika/server/core/TikaWelcomeTest.java   |  4 +-
 30 files changed, 444 insertions(+), 138 deletions(-)
 create mode 100644 tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java
 copy tika-core/src/main/java/org/apache/tika/extractor/{EmbeddedDocumentExtractorFactory.java => EmbeddedBytesSelector.java} (74%)
 create mode 100644 tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
 copy tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.xml => tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml (78%)

Reply via email to