This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new f6264c704 TIKA-3889 -- include counts of 3d objects
f6264c704 is described below
commit f6264c7044148f98dd733b9194a92918bb36bea7
Author: tballison <[email protected]>
AuthorDate: Wed Oct 19 13:54:17 2022 -0400
TIKA-3889 -- include counts of 3d objects
---
tika-core/src/main/java/org/apache/tika/metadata/PDF.java | 4 ++++
.../src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java | 4 ++++
2 files changed, 8 insertions(+)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index e683f321d..05dbdb869 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -170,4 +170,8 @@ public interface PDF {
Property ANNOTATION_SUBTYPES = Property.internalTextBag(PDF_PREFIX +
"annotationSubtypes");
+ /**
+ * Number of 3D annotations a PDF contains. This makes {@link PDF#HAS_3D} redundant.
+ */
+ Property NUM_3D_ANNOTATIONS = Property.internalInteger(PDF_PREFIX +
"num3DAnnotations");
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 58daefb72..c9cfc94c7 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -185,6 +185,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
//contains at least one broken font
boolean containsDamagedFont = false;
+ int num3DAnnotations = 0;
+
AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler,
ParseContext context,
Metadata metadata, PDFParserConfig config) throws
IOException {
this.pdDocument = pdDocument;
@@ -711,6 +713,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
//To make this stricter, we could get the 3DD stream object and see if the
//subtype is U3D or PRC or model/ (prefix for model mime type)
metadata.set(PDF.HAS_3D, true);
+ num3DAnnotations++;
}
for (COSDictionary fileSpec :
findFileSpecs(annotation.getCOSObject())) {
PDComplexFileSpecification cfs = new
PDComplexFileSpecification(fileSpec);
@@ -1012,6 +1015,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
metadata.set(PDF.CONTAINS_DAMAGED_FONT, containsDamagedFont);
metadata.set(PDF.CONTAINS_NON_EMBEDDED_FONT, containsNonEmbeddedFont);
+ metadata.set(PDF.NUM_3D_ANNOTATIONS, num3DAnnotations);
}
void extractBookmarkText() throws SAXException, IOException, TikaException
{