Repository: tika Updated Branches: refs/heads/master 1af1078ad -> 3e1450538
TIKA-1999 add limit to number of events extracted from the XMPMM section by the JempboxExtractor Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/3e145053 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/3e145053 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/3e145053 Branch: refs/heads/master Commit: 3e14505381eefa603adabe61171c0c19fc685b2f Parents: 1af1078 Author: tballison <[email protected]> Authored: Wed Jun 8 11:45:30 2016 -0400 Committer: tballison <[email protected]> Committed: Wed Jun 8 11:45:30 2016 -0400 ---------------------------------------------------------------------- .../tika/parser/image/xmp/JempboxExtractor.java | 31 ++++ .../parser/image/xmp/JempboxExtractorTest.java | 29 ++- .../test/resources/test-documents/testXMP.xmp | 178 +++++++++++++++++++ 3 files changed, 237 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/3e145053/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java index 0f326a8..d9ae71d 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java @@ -42,15 +42,21 @@ import org.xml.sax.SAXException; public class JempboxExtractor { + + private static int MAX_EVENT_HISTORY_IN_XMPMM = 1024; + // The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8" private static final String DEFAULT_XMP_CHARSET = UTF_8.name(); + private XMPPacketScanner scanner = new XMPPacketScanner(); private Metadata metadata; + private static int maxXMPMMHistory; public JempboxExtractor(Metadata metadata) { this.metadata = metadata; } + public void parse(InputStream file) throws IOException, TikaException { ByteArrayOutputStream xmpraw = new ByteArrayOutputStream(); if (!scanner.parse(file, xmpraw)) { @@ -160,7 +166,11 @@ public class JempboxExtractor { //in DerivedFrom section } if (mmSchema.getHistory() != null) { + int eventsAdded = 0; for (ResourceEvent stevt : mmSchema.getHistory()) { + if (eventsAdded >= MAX_EVENT_HISTORY_IN_XMPMM) { + break; + } String instanceId = null; String action = null; Calendar when = null; @@ -188,6 +198,7 @@ public class JempboxExtractor { metadata.add(XMPMM.HISTORY_ACTION, action); metadata.add(XMPMM.HISTORY_WHEN, dateString); metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT, softwareAgent); + eventsAdded++; } } } @@ -199,4 +210,24 @@ public class JempboxExtractor { m.add(p, value); } } + + /** + * Maximum number of events to extract from the + * event history in the XMP Media Management (XMPMM) section. + * The extractor will silently stop adding events after it + * has reached this threshold. + * <p> + * The default is 1024. + */ + public static void setMaxXMPMMHistory(int maxEvents) { + MAX_EVENT_HISTORY_IN_XMPMM = maxEvents; + } + + /** + * + * @return maximum number of events to extract from the XMPMM history. + */ + public static int getMaxXMPMMHistory() { + return maxXMPMMHistory; + } } http://git-wip-us.apache.org/repos/asf/tika/blob/3e145053/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java index 4718539..cdbf5eb 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java @@ -19,17 +19,24 @@ package org.apache.tika.parser.image.xmp; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; import java.util.Collection; +import org.apache.tika.TikaTest; +import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.XMPMM; +import org.apache.tika.parser.ParseContext; import org.junit.Test; -public class JempboxExtractorTest { +import javax.xml.parsers.DocumentBuilder; + +public class JempboxExtractorTest extends TikaTest { @Test public void testParseJpeg() throws IOException, TikaException { @@ -104,4 +111,24 @@ public class JempboxExtractorTest { Arrays.asList("Mr B", "Mr A"))); } + @Test + public void testMaxXMPMMHistory() throws Exception { + int maxHistory = JempboxExtractor.getMaxXMPMMHistory(); + try { + Metadata m = new Metadata(); + JempboxExtractor ex = new JempboxExtractor(m); + ex.parse(getResourceAsStream("/test-documents/testXMP.xmp")); + assertEquals(7, m.getValues(XMPMM.HISTORY_EVENT_INSTANCEID).length); + + JempboxExtractor.setMaxXMPMMHistory(5); + m = new Metadata(); + ex = new JempboxExtractor(m); + ex.parse(getResourceAsStream("/test-documents/testXMP.xmp")); + assertEquals(5, m.getValues(XMPMM.HISTORY_EVENT_INSTANCEID).length); + } finally { + //if something goes wrong, make sure to set this back to what it was + JempboxExtractor.setMaxXMPMMHistory(maxHistory); + } + } + } http://git-wip-us.apache.org/repos/asf/tika/blob/3e145053/tika-parsers/src/test/resources/test-documents/testXMP.xmp ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testXMP.xmp b/tika-parsers/src/test/resources/test-documents/testXMP.xmp new file mode 100644 index 0000000..00fe0f9 --- /dev/null +++ b/tika-parsers/src/test/resources/test-documents/testXMP.xmp @@ -0,0 +1,178 @@ +<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?> +<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="Adobe XMP Core 5.4-c005 78.147326, 2012/08/23-13:03:03 "> + <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> + <rdf:Description rdf:about="" + xmlns:xmp="http://ns.adobe.com/xap/1.0/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:xmpMM="http://ns.adobe.com/xap/1.0/mm/" + xmlns:stEvt="http://ns.adobe.com/xap/1.0/sType/ResourceEvent#" + xmlns:pdf="http://ns.adobe.com/pdf/1.3/" + xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/" + xmlns:pdfaExtension="http://www.aiim.org/pdfa/ns/extension/" + xmlns:pdfaSchema="http://www.aiim.org/pdfa/ns/schema#" + xmlns:pdfaProperty="http://www.aiim.org/pdfa/ns/property#"> + <xmp:CreateDate>2014-03-04T21:56:45+01:00</xmp:CreateDate> + <xmp:CreatorTool>Adobe Acrobat 10.0</xmp:CreatorTool> + <xmp:ModifyDate>2014-03-04T23:54:48+01:00</xmp:ModifyDate> + <xmp:MetadataDate>2014-03-04T23:54:48+01:00</xmp:MetadataDate> + <dc:format>application/pdf</dc:format> + <dc:title> + <rdf:Alt> + <rdf:li xml:lang="x-default">Sample Acrobat 4.x (PDF Version 1.3)</rdf:li> + </rdf:Alt> + </dc:title> + <dc:creator> + <rdf:Bag/> + </dc:creator> + <xmpMM:DocumentID>uuid:cccee1fc-51b3-4b52-ac86-672af3974d25</xmpMM:DocumentID> + <xmpMM:InstanceID>uuid:afa71b09-7cc5-48ac-8664-ac6dcf8b5ab4</xmpMM:InstanceID> + <xmpMM:RenditionClass>default</xmpMM:RenditionClass> + <xmpMM:VersionID>1</xmpMM:VersionID> + <xmpMM:History> + <rdf:Seq> + <rdf:li rdf:parseType="Resource"> + <stEvt:action>converted</stEvt:action> + <stEvt:instanceID>uuid:0313504b-a0b0-4dac-a9f0-357221f2eadf</stEvt:instanceID> + <stEvt:parameters>converted to PDF/A-1a</stEvt:parameters> + <stEvt:softwareAgent>Preflight</stEvt:softwareAgent> + <stEvt:when>2014-03-04T23:50:41+01:00</stEvt:when> + </rdf:li> + <rdf:li rdf:parseType="Resource"> + <stEvt:action>converted</stEvt:action> + <stEvt:instanceID>uuid:edc4279e-0d5f-465e-b13e-1298402fd11c</stEvt:instanceID> + <stEvt:parameters>PDF/A conversion failed; Version and conformance level identification removed</stEvt:parameters> + <stEvt:softwareAgent>Preflight</stEvt:softwareAgent> + <stEvt:when>2014-03-04T23:50:42+01:00</stEvt:when> + </rdf:li> + <rdf:li rdf:parseType="Resource"> + <stEvt:action>converted</stEvt:action> + <stEvt:instanceID>uuid:f565b775-43f3-4a9a-8541-e98c4115db6d</stEvt:instanceID> + <stEvt:parameters>converted to PDF/A-1a</stEvt:parameters> + <stEvt:softwareAgent>Preflight</stEvt:softwareAgent> + <stEvt:when>2014-03-04T23:51:34+01:00</stEvt:when> + </rdf:li> + <rdf:li rdf:parseType="Resource"> + <stEvt:action>converted</stEvt:action> + <stEvt:instanceID>uuid:9fd5e0a8-14a5-4920-ad7f-870c0b8ee65f</stEvt:instanceID> + <stEvt:parameters>converted to PDF/A-1a</stEvt:parameters> + <stEvt:softwareAgent>Preflight</stEvt:softwareAgent> + <stEvt:when>2014-03-04T23:51:36+01:00</stEvt:when> + </rdf:li> + <rdf:li rdf:parseType="Resource"> + <stEvt:action>converted</stEvt:action> + <stEvt:instanceID>uuid:09b6cfba-efde-4e07-a77f-70de858cc0aa</stEvt:instanceID> + <stEvt:parameters>PDF/A conversion failed; Version and conformance level identification removed</stEvt:parameters> + <stEvt:softwareAgent>Preflight</stEvt:softwareAgent> + <stEvt:when>2014-03-04T23:51:37+01:00</stEvt:when> + </rdf:li> + <rdf:li rdf:parseType="Resource"> + <stEvt:action>converted</stEvt:action> + <stEvt:instanceID>uuid:1e4ffbd7-dabc-4aae-801c-15b3404ade36</stEvt:instanceID> + <stEvt:parameters>converted to PDF/A-1b</stEvt:parameters> + <stEvt:softwareAgent>Preflight</stEvt:softwareAgent> + <stEvt:when>2014-03-04T23:52:22+01:00</stEvt:when> + </rdf:li> + <rdf:li rdf:parseType="Resource"> + <stEvt:action>converted</stEvt:action> + <stEvt:instanceID>uuid:c1669773-a6ca-4bdd-aade-519030d0af00</stEvt:instanceID> + <stEvt:parameters>converted to PDF/A-1b</stEvt:parameters> + <stEvt:softwareAgent>Preflight</stEvt:softwareAgent> + <stEvt:when>2014-03-04T23:54:48+01:00</stEvt:when> + </rdf:li> + </rdf:Seq> + </xmpMM:History> + <pdf:Producer>Acrobat Web Capture 10.0</pdf:Producer> + <pdfaid:part>1</pdfaid:part> + <pdfaid:conformance>B</pdfaid:conformance> + <pdfaExtension:schemas> + <rdf:Bag> + <rdf:li rdf:parseType="Resource"> + <pdfaSchema:namespaceURI>http://ns.adobe.com/pdf/1.3/</pdfaSchema:namespaceURI> + <pdfaSchema:prefix>pdf</pdfaSchema:prefix> + <pdfaSchema:schema>Adobe PDF Schema</pdfaSchema:schema> + <pdfaSchema:property> + <rdf:Seq> + <rdf:li rdf:parseType="Resource"> + <pdfaProperty:category>internal</pdfaProperty:category> + <pdfaProperty:description>A name object indicating whether the document has been modified to include trapping information</pdfaProperty:description> + <pdfaProperty:name>Trapped</pdfaProperty:name> + <pdfaProperty:valueType>Text</pdfaProperty:valueType> + </rdf:li> + </rdf:Seq> + </pdfaSchema:property> + </rdf:li> + <rdf:li rdf:parseType="Resource"> + <pdfaSchema:namespaceURI>http://ns.adobe.com/xap/1.0/mm/</pdfaSchema:namespaceURI> + <pdfaSchema:prefix>xmpMM</pdfaSchema:prefix> + <pdfaSchema:schema>XMP Media Management Schema</pdfaSchema:schema> + <pdfaSchema:property> + <rdf:Seq> + <rdf:li rdf:parseType="Resource"> + <pdfaProperty:category>internal</pdfaProperty:category> + <pdfaProperty:description>UUID based identifier for specific incarnation of a document</pdfaProperty:description> + <pdfaProperty:name>InstanceID</pdfaProperty:name> + <pdfaProperty:valueType>URI</pdfaProperty:valueType> + </rdf:li> + <rdf:li rdf:parseType="Resource"> + <pdfaProperty:category>internal</pdfaProperty:category> + <pdfaProperty:description>The common identifier for all versions and renditions of a document.</pdfaProperty:description> + <pdfaProperty:name>OriginalDocumentID</pdfaProperty:name> + <pdfaProperty:valueType>URI</pdfaProperty:valueType> + </rdf:li> + </rdf:Seq> + </pdfaSchema:property> + </rdf:li> + <rdf:li rdf:parseType="Resource"> + <pdfaSchema:namespaceURI>http://www.aiim.org/pdfa/ns/id/</pdfaSchema:namespaceURI> + <pdfaSchema:prefix>pdfaid</pdfaSchema:prefix> + <pdfaSchema:schema>PDF/A ID Schema</pdfaSchema:schema> + <pdfaSchema:property> + <rdf:Seq> + <rdf:li rdf:parseType="Resource"> + <pdfaProperty:category>internal</pdfaProperty:category> + <pdfaProperty:description>Part of PDF/A standard</pdfaProperty:description> + <pdfaProperty:name>part</pdfaProperty:name> + <pdfaProperty:valueType>Integer</pdfaProperty:valueType> + </rdf:li> + <rdf:li rdf:parseType="Resource"> + <pdfaProperty:category>internal</pdfaProperty:category> + <pdfaProperty:description>Amendment of PDF/A standard</pdfaProperty:description> + <pdfaProperty:name>amd</pdfaProperty:name> + <pdfaProperty:valueType>Text</pdfaProperty:valueType> + </rdf:li> + <rdf:li rdf:parseType="Resource"> + <pdfaProperty:category>internal</pdfaProperty:category> + <pdfaProperty:description>Conformance level of PDF/A standard</pdfaProperty:description> + <pdfaProperty:name>conformance</pdfaProperty:name> + <pdfaProperty:valueType>Text</pdfaProperty:valueType> + </rdf:li> + </rdf:Seq> + </pdfaSchema:property> + </rdf:li> + </rdf:Bag> + </pdfaExtension:schemas> + </rdf:Description> + </rdf:RDF> +</x:xmpmeta> + + + + + + + + + + + + + + + + + + + + + +<?xpacket end="w"?> \ No newline at end of file
