This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 2504756 TIKA-3217 -- add XMP pdf schema to metadata extraction for
PDFs.
2504756 is described below
commit 2504756b68223b429d481877f4977fc995f24724
Author: tallison <[email protected]>
AuthorDate: Fri Oct 30 14:35:56 2020 -0400
TIKA-3217 -- add XMP pdf schema to metadata extraction for PDFs.
---
.../main/java/org/apache/tika/metadata/PDF.java | 2 +
.../java/org/apache/tika/parser/pdf/PDFParser.java | 1 +
.../tika/parser/pdf/PDMetadataExtractor.java | 74 ++++++++++++++--------
.../org/apache/tika/parser/pdf/PDFParserTest.java | 10 ++-
4 files changed, 60 insertions(+), 27 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index b7ca989..cea40bd 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -66,6 +66,8 @@ public interface PDF {
Property IS_ENCRYPTED = Property.internalBoolean(PDF_PREFIX+"encrypted");
+ Property PRODUCER = Property.internalText(PDF_PREFIX+"producer");
+
/**
* This specifies where an action or destination would be found/triggered
* in the document: on document open, before close, etc.
diff --git
a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index f045123..dd18464 100644
---
a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++
b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -283,6 +283,7 @@ public class PDFParser extends AbstractParser implements
Initializable {
PDMetadataExtractor.addMetadata(metadata, Office.KEYWORDS,
info.getKeywords());
PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS,
info.getKeywords());
PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_PRODUCER,
info.getProducer());
+ PDMetadataExtractor.addMetadata(metadata, PDF.PRODUCER,
info.getProducer());
PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_SUBJECT,
info.getSubject());
diff --git
a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
index b288d5f..d771fd4 100644
---
a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
+++
b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
@@ -26,6 +26,7 @@ import org.apache.jempbox.xmp.XMPMetadata;
import org.apache.jempbox.xmp.XMPSchema;
import org.apache.jempbox.xmp.XMPSchemaBasic;
import org.apache.jempbox.xmp.XMPSchemaDublinCore;
+import org.apache.jempbox.xmp.XMPSchemaPDF;
import org.apache.jempbox.xmp.pdfa.XMPSchemaPDFAId;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
@@ -37,6 +38,7 @@ import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.PDF;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -63,34 +65,26 @@ class PDMetadataExtractor {
metadata.set(PDF.HAS_XMP, "true");
//now go for the XMP
Document dom = loadDOM(pdMetadata, metadata, context);
-
- XMPMetadata xmp = null;
- if (dom != null) {
- xmp = new XMPMetadata(dom);
+ if (dom == null) {
+ return;
}
+ XMPMetadata xmp = new XMPMetadata(dom);
XMPSchemaDublinCore dcSchema = null;
- XMPSchemaBasic basic = null;
- if (xmp != null) {
- try {
- dcSchema = xmp.getDublinCoreSchema();
- } catch (IOException e) {
- }
- try {
- basic = xmp.getBasicSchema();
- } catch (IOException e) {
- //swallow
- }
- JempboxExtractor.extractXMPMM(xmp, metadata);
+ try {
+ dcSchema = xmp.getDublinCoreSchema();
+ } catch (IOException e) {
}
- extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION,
null, dcSchema);
- extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR,
dcSchema);
- extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR,
dcSchema);
- extractMultilingualItems(metadata, TikaCoreProperties.TITLE, null,
dcSchema);
-
- extractBasic(basic, metadata);
+ if (dcSchema != null) {
+ extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION,
null, dcSchema);
+ extractDublinCoreListItems(metadata,
TikaCoreProperties.CONTRIBUTOR, dcSchema);
+ extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR,
dcSchema);
+ extractMultilingualItems(metadata, TikaCoreProperties.TITLE, null,
dcSchema);
+ }
+ extractBasic(xmp, metadata);
+ extractPDF(xmp, metadata);
+ JempboxExtractor.extractXMPMM(xmp, metadata);
try {
- if (xmp != null) {
xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE,
XMPSchemaPDFAId.class);
XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId)
xmp.getSchemaByClass(XMPSchemaPDFAId.class);
if (pdfaxmp != null) {
@@ -106,13 +100,41 @@ class PDMetadataExtractor {
}
}
// TODO WARN if this XMP version is inconsistent with document
header version?
- }
} catch (IOException e) {
metadata.set(TikaCoreProperties.TIKA_META_PREFIX +
"pdf:metadata-xmp-parse-failed", "" + e);
}
}
- private static void extractBasic(XMPSchemaBasic basic, Metadata metadata) {
+ private static void extractPDF(XMPMetadata xmp, Metadata metadata) {
+ if (xmp == null) {
+ return;
+ }
+
+ XMPSchemaPDF pdf = null;
+ try {
+ pdf = xmp.getPDFSchema();
+ } catch (IOException e) {
+ return;
+ }
+ if (pdf == null) {
+ return;
+ }
+ setNotNull(PDF.PRODUCER, pdf.getProducer(), metadata);
+ setNotNull(Office.KEYWORDS, pdf.getKeywords(), metadata);
+ setNotNull(PDF.PDF_VERSION, pdf.getPDFVersion(), metadata);
+ }
+
+ private static void extractBasic(XMPMetadata xmp, Metadata metadata) {
+ if (xmp == null) {
+ return;
+ }
+
+ XMPSchemaBasic basic = null;
+ try {
+ basic = xmp.getBasicSchema();
+ } catch (IOException e) {
+ return;
+ }
if (basic == null) {
return;
}
@@ -155,7 +177,7 @@ class PDMetadataExtractor {
private static void setNotNull(Property property, String value, Metadata
metadata) {
if (metadata.get(property) == null && value != null &&
value.trim().length() > 0) {
- metadata.set(property, value);
+ metadata.set(property, decode(value));
}
}
diff --git
a/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 6bb4bdf..0bdb249 100644
---
a/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -26,6 +26,7 @@ import static org.junit.Assert.fail;
import static org.junit.Assume.assumeTrue;
import java.io.InputStream;
+import java.nio.file.Path;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
@@ -1382,6 +1383,14 @@ public class PDFParserTest extends TikaTest {
}
@Test
+ public void testXMPPDFSchema() throws Exception {
+ //as of this writing, we don't currently have any pdfs in our
+ //test suite with data that is different btwn pdf doc info and xmp. :(
+ Metadata metadata = getXML("testPopupAnnotation.pdf").metadata;
+ assertEquals("IBM Lotus Symphony 3.0", metadata.get(PDF.PRODUCER));
+ }
+
+ @Test
public void testExtractInlineImageMetadata() throws Exception {
ParseContext context = new ParseContext();
PDFParserConfig config = new PDFParserConfig();
@@ -1424,5 +1433,4 @@ public class PDFParserTest extends TikaTest {
return true;
}
}
-
}