This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 6f6e6d389 TIKA-4437 -- improve extraction of features from doc and
docx (#2262)
6f6e6d389 is described below
commit 6f6e6d389007c682cde51a95714e725895dd0c6e
Author: Tim Allison <[email protected]>
AuthorDate: Wed Jul 2 11:36:42 2025 -0400
TIKA-4437 -- improve extraction of features from doc and docx (#2262)
---
.../main/java/org/apache/tika/metadata/Office.java | 5 ++
.../tika/parser/microsoft/WordExtractor.java | 42 +++++++++
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 3 +
.../ooxml/XWPFWordExtractorDecorator.java | 7 ++
.../microsoft/ooxml/xwpf/XWPFFeatureExtractor.java | 94 +++++++++++++++++++++
.../tika/parser/microsoft/WordParserTest.java | 10 +++
.../parser/microsoft/ooxml/OOXMLParserTest.java | 10 +++
.../resources/test-documents/testWORD_features.doc | Bin 0 -> 11264 bytes
.../test-documents/testWORD_features.docx | Bin 0 -> 8169 bytes
9 files changed, 171 insertions(+)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
index 4f0146aeb..477ffef14 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
@@ -180,4 +180,9 @@ public interface Office {
Property NUM_HIDDEN_SLIDES =
Property.internalInteger("msoffice:ppt:num-hidden-slides");
Property HAS_ANIMATIONS =
Property.internalBoolean("msoffice:ppt:has-animations");
+
+ //w:vanish in docx; CharacterRun isVanished() or isFldVanished() in doc
+ Property HAS_HIDDEN_TEXT =
Property.internalBoolean("msoffice:doc:has-hidden-text");
+
+ Property HAS_TRACK_CHANGES =
Property.internalBoolean("msoffice:has-track-changes");
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 5207ec05b..505474f45 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -36,6 +36,7 @@ import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.model.FieldsDocumentPart;
import org.apache.poi.hwpf.model.PicturesTable;
+import org.apache.poi.hwpf.model.RevisionMarkAuthorTable;
import org.apache.poi.hwpf.model.SavedByEntry;
import org.apache.poi.hwpf.model.SavedByTable;
import org.apache.poi.hwpf.model.StyleDescription;
@@ -61,6 +62,7 @@ import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -160,6 +162,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
return;
}
+ boolean hasComments = false;
extractSavedByMetadata(document);
org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
@@ -200,6 +203,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
for (String paragraph : wordExtractor.getCommentsText()) {
xhtml.element("p", paragraph);
+ hasComments = true;
}
for (String paragraph : wordExtractor.getEndnoteText()) {
@@ -231,6 +235,44 @@ public class WordExtractor extends AbstractPOIFSExtractor {
} catch (FileNotFoundException e) {
//swallow
}
+ if (hasComments) {
+ parentMetadata.set(Office.HAS_COMMENTS, true);
+ }
+ extractFeatures(document, parentMetadata);
+ }
+
+    /**
+     * Records features of the binary Word document on the parent metadata: the revision-mark
+     * authors (as {@link Office#COMMENT_PERSONS}), and whether any character run is hidden
+     * or is marked as a tracked insertion/deletion.
+     *
+     * @param document the HWPF document to inspect
+     * @param parentMetadata metadata object that receives the detected features
+     */
+    private void extractFeatures(HWPFDocument document, Metadata parentMetadata) {
+        RevisionMarkAuthorTable revisionMarkAuthorTable = document.getRevisionMarkAuthorTable();
+        if (revisionMarkAuthorTable != null) {
+            //the set de-duplicates the names; null/blank names are skipped, as the
+            //docx feature extractor does for blank author attributes
+            Set<String> authors = new HashSet<>(revisionMarkAuthorTable.getEntries());
+            for (String author : authors) {
+                if (author != null && !author.trim().isEmpty()) {
+                    parentMetadata.add(Office.COMMENT_PERSONS, author);
+                }
+            }
+        }
+        Range documentRange = document.getRange();
+        int numRuns = documentRange.numCharacterRuns();
+        boolean hasHidden = false;
+        boolean hasTrackChanges = false;
+        //both flags only ever flip to true, so stop scanning as soon as both are set
+        for (int i = 0; i < numRuns && !(hasHidden && hasTrackChanges); i++) {
+            CharacterRun run = documentRange.getCharacterRun(i);
+            if (run.isVanished() || run.isFldVanished()) {
+                hasHidden = true;
+            }
+            if (run.isMarkedDeleted() || run.isMarkedInserted()) {
+                hasTrackChanges = true;
+            }
+        }
+        if (hasHidden) {
+            parentMetadata.set(Office.HAS_HIDDEN_TEXT, true);
+        }
+        if (hasTrackChanges) {
+            parentMetadata.set(Office.HAS_TRACK_CHANGES, true);
+        }
     }
private void extractSavedByMetadata(HWPFDocument document) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 1475b7838..5c9d2f62b 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -130,6 +130,9 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
return new MetadataExtractor(extractor);
}
+    /**
+     * Package-private accessor for the parse context this extractor works with.
+     *
+     * @return the current value of {@code context}
+     */
+    ParseContext getParseContext() {
+        return this.context;
+    }
/**
* @see
*
org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(ContentHandler,
Metadata,
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index 922cdbd01..8b925bf75 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -73,6 +73,7 @@ import org.apache.tika.parser.microsoft.EMFParser;
import org.apache.tika.parser.microsoft.FormattingUtils;
import org.apache.tika.parser.microsoft.WordExtractor;
import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFFeatureExtractor;
import org.apache.tika.sax.ToTextContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;
@@ -125,6 +126,7 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
// process text in the order that it occurs in
extractIBodyText(document, listManager, xhtml);
+ extractFeatures(document, metadata);
//handle the diagram data
handleGeneralTextContainingPart(RELATION_DIAGRAM_DATA, "diagram-data",
@@ -145,6 +147,11 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
}
}
+    /**
+     * Hands the document to a fresh {@link XWPFFeatureExtractor}, which writes whatever
+     * features it detects onto the supplied metadata.
+     *
+     * @param document the docx document to inspect
+     * @param metadata metadata object that receives the detected features
+     */
+    private void extractFeatures(XWPFDocument document, Metadata metadata) {
+        new XWPFFeatureExtractor().process(document, metadata, getParseContext());
+    }
+
@Override
protected Map<String, EmbeddedPartMetadata> getEmbeddedPartMetadataMap() {
return embeddedPartMetadataMap;
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFFeatureExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFFeatureExtractor.java
new file mode 100644
index 000000000..06e45afb7
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFFeatureExtractor.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.StringUtils;
+import org.apache.tika.utils.XMLReaderUtils;
+
+/**
+ * Extracts features that are useful for forensics, e-discovery and digital preservation;
+ * specifically, the presence of tracked changes, hidden text and comments, plus the authors
+ * of tracked changes (reported as {@link Office#COMMENT_PERSONS}). Because several of these
+ * features can be placed on run properties, which can be in lots of places, this class
+ * scrapes the document xml.
+ */
+public class XWPFFeatureExtractor {
+
+    /**
+     * SAX-parses the document's package part and records the detected features on the
+     * metadata. This is best effort: an exception thrown while reading or parsing is
+     * swallowed, and the metadata is only written after the parse has completed.
+     *
+     * @param xwpfDocument the docx document to inspect
+     * @param metadata metadata object that receives the detected features
+     * @param parseContext parse context passed through to the SAX parsing utility
+     */
+    public void process(XWPFDocument xwpfDocument, Metadata metadata,
+                        ParseContext parseContext) {
+        try (InputStream is = xwpfDocument.getPackagePart().getInputStream()) {
+            FeatureHandler featureHandler = new FeatureHandler();
+            XMLReaderUtils.parseSAX(is, featureHandler, parseContext);
+            if (featureHandler.hasComments) {
+                metadata.set(Office.HAS_COMMENTS, true);
+            }
+            if (featureHandler.hasHidden) {
+                metadata.set(Office.HAS_HIDDEN_TEXT, true);
+            }
+            if (featureHandler.hasTrackChanges) {
+                metadata.set(Office.HAS_TRACK_CHANGES, true);
+            }
+            for (String author : featureHandler.authors) {
+                metadata.add(Office.COMMENT_PERSONS, author);
+            }
+        } catch (IOException | TikaException | SAXException e) {
+            //deliberately swallowed: feature detection is supplementary and must not
+            //fail the extraction of the document itself
+        }
+    }
+
+    /**
+     * Flags features based on the local names of the elements seen during the parse.
+     */
+    private static class FeatureHandler extends DefaultHandler {
+        //see: https://www.ericwhite.com/blog/using-xml-dom-to-detect-tracked-revisions-in-an-open-xml-wordprocessingml-document/
+        private static final Set<String> TRACK_CHANGES =
+                Set.of("ins", "del", "moveFrom", "moveTo");
+        //w:val values that switch an on/off property such as w:vanish off
+        private static final Set<String> OFF_VALUES = Set.of("false", "0", "off");
+        private final Set<String> authors = new HashSet<>();
+        private boolean hasHidden = false;
+        private boolean hasTrackChanges = false;
+        private boolean hasComments = false;
+
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes atts)
+                throws SAXException {
+            //we could check to ensure that the vanish element actually surrounds text
+            //the current check could lead to false positives where <w:vanish/> is around
+            //a space or no text.
+            if ("vanish".equals(localName)) {
+                //<w:vanish w:val="false"/> explicitly turns hiding off; no val means on
+                String val = XMLReaderUtils.getAttrValue("val", atts);
+                if (val == null || !OFF_VALUES.contains(val)) {
+                    hasHidden = true;
+                }
+            } else if (TRACK_CHANGES.contains(localName)) {
+                String trackChangesAuthor = XMLReaderUtils.getAttrValue("author", atts);
+                if (!StringUtils.isBlank(trackChangesAuthor)) {
+                    authors.add(trackChangesAuthor);
+                }
+                hasTrackChanges = true;
+            } else if ("commentReference".equals(localName) ||
+                    "commentRangeStart".equals(localName)) {
+                hasComments = true;
+            }
+        }
+    }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 67f0d4e95..ecded8465 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -676,4 +676,14 @@ public class WordParserTest extends TikaTest {
getRecursiveMetadata("testWORD_protected_drm.doc");
});
}
+
+    /**
+     * Checks that comment persons, hidden text, tracked changes and comments are all
+     * reported in the first metadata object for the .doc feature test file.
+     */
+    @Test
+    public void testFeatureExtraction() throws Exception {
+        Metadata first = getRecursiveMetadata("testWORD_features.doc").get(0);
+        List<String> commentPersons = Arrays.asList(first.getValues(Office.COMMENT_PERSONS));
+        assertContains("Kyle Reese", commentPersons);
+        assertEquals("true", first.get(Office.HAS_HIDDEN_TEXT));
+        assertEquals("true", first.get(Office.HAS_TRACK_CHANGES));
+        assertEquals("true", first.get(Office.HAS_COMMENTS));
+    }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 9559e73c2..a8169e7d8 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1803,4 +1803,14 @@ public class OOXMLParserTest extends
MultiThreadedTikaTest {
});
}
+
+    /**
+     * Checks that comment persons, hidden text, tracked changes and comments are all
+     * reported in the first metadata object for the .docx feature test file.
+     */
+    @Test
+    public void testFeatureExtraction() throws Exception {
+        Metadata first = getRecursiveMetadata("testWORD_features.docx").get(0);
+        List<String> commentPersons = Arrays.asList(first.getValues(Office.COMMENT_PERSONS));
+        assertContains("Kyle Reese", commentPersons);
+        assertEquals("true", first.get(Office.HAS_HIDDEN_TEXT));
+        assertEquals("true", first.get(Office.HAS_TRACK_CHANGES));
+        assertEquals("true", first.get(Office.HAS_COMMENTS));
+    }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_features.doc
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_features.doc
new file mode 100644
index 000000000..d26c3d39b
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_features.doc
differ
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_features.docx
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_features.docx
new file mode 100644
index 000000000..0cc401f9a
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_features.docx
differ