This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new ebbd895 TIKA-2950 -- add boolean metadata value to identify signed ooxml files
ebbd895 is described below
commit ebbd895749337f5bfaa4a653d245ba9356f3207c
Author: TALLISON <[email protected]>
AuthorDate: Fri Sep 27 11:31:32 2019 -0400
TIKA-2950 -- add boolean metadata value to identify signed ooxml files
# Conflicts:
# tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
---
.../org/apache/tika/metadata/TikaCoreProperties.java | 8 +++++---
.../parser/microsoft/ooxml/OOXMLExtractorFactory.java | 9 +++++++++
.../apache/tika/parser/microsoft/ooxml/OOXMLParser.java | 3 +++
.../tika/parser/microsoft/ooxml/OOXMLParserTest.java | 13 +++++++++++++
.../test/resources/test-documents/testEXCEL_signed.xlsx | Bin 0 -> 15221 bytes
.../test/resources/test-documents/testPPT_signed.pptx | Bin 0 -> 39761 bytes
.../test/resources/test-documents/testWord_signed.docx | Bin 0 -> 18245 bytes
7 files changed, 30 insertions(+), 3 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 24e3ae0..616e812 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -222,7 +222,7 @@ public interface TikaCoreProperties {
*/
Property CREATED = Property.composite(DublinCore.CREATED,
new Property[] {
- Office.CREATION_DATE,
+ Office.CREATION_DATE,
});
/**
@@ -230,7 +230,7 @@ public interface TikaCoreProperties {
* @see Office#SAVE_DATE
*/
Property MODIFIED = Property.composite(DublinCore.MODIFIED,
- new Property[] {
+ new Property[] {
Office.SAVE_DATE,
Property.internalText("Last-Modified")
});
@@ -284,5 +284,7 @@ public interface TikaCoreProperties {
EmbeddedResourceType.ATTACHMENT.toString(),
EmbeddedResourceType.INLINE.toString());
-
+
+
+ Property HAS_SIGNATURE = Property.internalBoolean("hasSignature");
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 4ac436c..141dee3 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -46,6 +46,7 @@ import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
@@ -120,6 +121,14 @@ public class OOXMLExtractorFactory {
}
}
+ if (pkg != null) {
+ PackageRelationshipCollection prc =
+ pkg.getRelationshipsByType(OOXMLParser.SIGNATURE_RELATIONSHIP);
+ if (prc != null && prc.size() > 0) {
+ metadata.set(TikaCoreProperties.HAS_SIGNATURE, "true");
+ }
+ }
+
MediaType type = null;
String mediaTypeString = metadata.get(Metadata.CONTENT_TYPE);
if (mediaTypeString != null) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
index 81ec4b6..c18e500 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
@@ -41,6 +41,9 @@ public class OOXMLParser extends AbstractOfficeParser {
ZipSecureFile.setMinInflateRatio(-1.0d);
}
+ protected static final String SIGNATURE_RELATIONSHIP =
+ "http://schemas.openxmlformats.org/package/2006/relationships/digital-signature/origin";
+
protected static final MediaType XPS =
MediaType.application("vnd.ms-xpsdocument");
protected static final Set<MediaType> SUPPORTED_TYPES =
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 1edd89b..542073c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1878,6 +1878,19 @@ public class OOXMLParserTest extends TikaTest {
//TIKA_2446
getRecursiveMetadata("testZIP_corrupted_oom.zip");
}
+
+ @Test
+ public void testSigned() throws Exception {
+ Metadata m = getXML("testWord_signed.docx").metadata; // name must match the added resource exactly; lookup is case-sensitive on Linux and in jars
+ assertEquals("true", m.get(TikaCoreProperties.HAS_SIGNATURE));
+
+ m = getXML("testEXCEL_signed.xlsx").metadata;
+ assertEquals("true", m.get(TikaCoreProperties.HAS_SIGNATURE));
+
+ m = getXML("testPPT_signed.pptx").metadata;
+ assertEquals("true", m.get(TikaCoreProperties.HAS_SIGNATURE));
+
+ }
}
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_signed.xlsx b/tika-parsers/src/test/resources/test-documents/testEXCEL_signed.xlsx
new file mode 100644
index 0000000..b7a0df2
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_signed.xlsx differ
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_signed.pptx b/tika-parsers/src/test/resources/test-documents/testPPT_signed.pptx
new file mode 100644
index 0000000..1de8f5d
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_signed.pptx differ
diff --git a/tika-parsers/src/test/resources/test-documents/testWord_signed.docx b/tika-parsers/src/test/resources/test-documents/testWord_signed.docx
new file mode 100644
index 0000000..cad54b3
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWord_signed.docx differ