This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 380e9f75bcdd4d2f48ead8075849e41f51b4f38e Author: tallison <[email protected]> AuthorDate: Fri Oct 30 14:35:56 2020 -0400 TIKA-3217 -- add XMP pdf schema to metadata extraction for PDFs. # Conflicts: # tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java # tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java and various other conflict fixes --- .../apache/tika/detect/FileCommandDetector.java | 9 +-- .../main/java/org/apache/tika/metadata/PDF.java | 2 + .../java/org/apache/tika/eval/TikaEvalCLI.java | 1 + .../java/org/apache/tika/parser/pdf/PDFParser.java | 10 ++- .../tika/parser/pdf/PDMetadataExtractor.java | 78 ++++++++++++++-------- .../org/apache/tika/parser/pdf/PDFParserTest.java | 10 ++- 6 files changed, 73 insertions(+), 37 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java index fd851c7..6544e43 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java @@ -26,8 +26,6 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.utils.ProcessUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.BufferedInputStream; import java.io.BufferedReader; @@ -38,6 +36,8 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; +import java.util.logging.Level; +import java.util.logging.Logger; import static java.nio.charset.StandardCharsets.UTF_8; import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; @@ -57,7 +57,7 @@ public class FileCommandDetector implements Detector { //TODO: file has some diff mimes names for some very common mimes //should we map file mimes to Tika mimes, e.g. text/xml -> application/xml?? - private static final Logger LOGGER = LoggerFactory.getLogger(FileCommandDetector.class); + private static final Logger LOGGER = Logger.getLogger(FileCommandDetector.class.getName()); private static boolean HAS_WARNED = false; private static final long DEFAULT_TIMEOUT_MS = 6000; private static String DEFAULT_FILE_COMMAND_PATH = "file"; @@ -85,7 +85,8 @@ public class FileCommandDetector implements Detector { } if (!hasFileCommand) { if (! HAS_WARNED) { - LOGGER.warn("'file' command isn't working: '"+fileCommandPath+"'"); + LOGGER.log(Level.WARNING, + "'file' command isn't working: '"+fileCommandPath+"'"); HAS_WARNED = true; } return MediaType.OCTET_STREAM; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java index 608d5df..c678e92 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java @@ -67,6 +67,8 @@ public interface PDF { Property IS_ENCRYPTED = Property.internalBoolean(PDF_PREFIX+"encrypted"); + Property PRODUCER = Property.internalText(PDF_PREFIX+"producer"); + /** * This specifies where an action or destination would be found/triggered * in the document: on document open, before close, etc. diff --git a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java index 14faa6b..6b709b8 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java @@ -16,6 +16,7 @@ */ package org.apache.tika.eval; +import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index d610879..f66f086 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -50,8 +50,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.AccessPermissions; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Office; -import org.apache.tika.metadata.OfficeOpenXMLCore; + import org.apache.tika.metadata.PDF; import org.apache.tika.metadata.PagedText; import org.apache.tika.metadata.Property; @@ -462,7 +461,7 @@ public class PDFParser extends AbstractParser implements Initializable { * If true, sort text tokens by their x/y position * before extracting text. This may be necessary for * some PDFs (if the text tokens are not rendered "in - * order"), while for other PDFs it can produce the + * order"), while for othe6 -- Add FileProfilerr PDFs it can produce the * wrong result (for example if there are 2 columns, * the text will be interleaved). Default is false. */ @@ -600,6 +599,11 @@ public class PDFParser extends AbstractParser implements Initializable { } @Field + public void setDropThreshold(float dropThreshold) { + defaultConfig.setDropThreshold(dropThreshold); + } + + @Field public void setMaxMainMemoryBytes(long maxMainMemoryBytes) { defaultConfig.setMaxMainMemoryBytes(maxMainMemoryBytes); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java index 0d3f59d..714bed8 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java @@ -22,20 +22,20 @@ import java.util.Calendar; import java.util.List; import java.util.Locale; -import org.apache.commons.lang3.StringUtils; import org.apache.jempbox.xmp.XMPMetadata; import org.apache.jempbox.xmp.XMPSchema; import org.apache.jempbox.xmp.XMPSchemaBasic; import org.apache.jempbox.xmp.XMPSchemaDublinCore; +import org.apache.jempbox.xmp.XMPSchemaPDF; import org.apache.jempbox.xmp.pdfa.XMPSchemaPDFAId; import org.apache.pdfbox.cos.COSArray; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSString; import org.apache.pdfbox.pdmodel.common.PDMetadata; -import org.apache.poi.util.IOUtils; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.IOUtils; import org.apache.tika.metadata.DublinCore; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.PDF; @@ -64,34 +64,26 @@ class PDMetadataExtractor { metadata.set(PDF.HAS_XMP, "true"); //now go for the XMP Document dom = loadDOM(pdMetadata, metadata, context); - - XMPMetadata xmp = null; - if (dom != null) { - xmp = new XMPMetadata(dom); + if (dom == null) { + return; } + XMPMetadata xmp = new XMPMetadata(dom); XMPSchemaDublinCore dcSchema = null; - XMPSchemaBasic basic = null; - if (xmp != null) { - try { - dcSchema = xmp.getDublinCoreSchema(); - } catch (IOException e) { - } - try { - basic = xmp.getBasicSchema(); - } catch (IOException e) { - //swallow - } - JempboxExtractor.extractXMPMM(xmp, metadata); + try { + dcSchema = xmp.getDublinCoreSchema(); + } catch (IOException e) { } - extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); - extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, dcSchema); - extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, dcSchema); - extractMultilingualItems(metadata, TikaCoreProperties.TITLE, null, dcSchema); - - extractBasic(basic, metadata); + if (dcSchema != null) { + extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); + extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, dcSchema); + extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, dcSchema); + extractMultilingualItems(metadata, TikaCoreProperties.TITLE, null, dcSchema); + } + extractBasic(xmp, metadata); + extractPDF(xmp, metadata); + JempboxExtractor.extractXMPMM(xmp, metadata); try { - if (xmp != null) { xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); if (pdfaxmp != null) { @@ -107,13 +99,41 @@ class PDMetadataExtractor { } } // TODO WARN if this XMP version is inconsistent with document header version? - } } catch (IOException e) { metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); } } - private static void extractBasic(XMPSchemaBasic basic, Metadata metadata) { + private static void extractPDF(XMPMetadata xmp, Metadata metadata) { + if (xmp == null) { + return; + } + + XMPSchemaPDF pdf = null; + try { + pdf = xmp.getPDFSchema(); + } catch (IOException e) { + return; + } + if (pdf == null) { + return; + } + setNotNull(PDF.PRODUCER, pdf.getProducer(), metadata); + setNotNull(TikaCoreProperties.KEYWORDS, pdf.getKeywords(), metadata); + setNotNull(PDF.PDF_VERSION, pdf.getPDFVersion(), metadata); + } + + private static void extractBasic(XMPMetadata xmp, Metadata metadata) { + if (xmp == null) { + return; + } + + XMPSchemaBasic basic = null; + try { + basic = xmp.getBasicSchema(); + } catch (IOException e) { + return; + } if (basic == null) { return; } @@ -155,8 +175,8 @@ class PDMetadataExtractor { } private static void setNotNull(Property property, String value, Metadata metadata) { - if (metadata.get(property) == null && ! StringUtils.isEmpty(value)) { - metadata.set(property, value); + if (metadata.get(property) == null && value != null && value.trim().length() > 0) { + metadata.set(property, decode(value)); } } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 320689e..4ad2b12 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -26,6 +26,7 @@ import static org.junit.Assert.fail; import static org.junit.Assume.assumeTrue; import java.io.InputStream; +import java.nio.file.Path; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; @@ -1598,6 +1599,14 @@ public class PDFParserTest extends TikaTest { } @Test + public void testXMPPDFSchema() throws Exception { + //as of this writing, we don't currently have any pdfs in our + //test suite with data that is different btwn pdf doc info and xmp. :( + Metadata metadata = getXML("testPopupAnnotation.pdf").metadata; + assertEquals("IBM Lotus Symphony 3.0", metadata.get(PDF.PRODUCER)); + } + + @Test public void testExtractInlineImageMetadata() throws Exception { ParseContext context = new ParseContext(); PDFParserConfig config = new PDFParserConfig(); @@ -1640,5 +1649,4 @@ public class PDFParserTest extends TikaTest { return true; } } - }
