This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4437 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 10fd38ac5df5f809f7660d17254b2cf71fa03fa5 Author: tallison <[email protected]> AuthorDate: Fri Jun 13 14:40:32 2025 -0400 TIKA-4437 WIP --- .../microsoft/ooxml/xwpf/XWPFFeatureExtractor.java | 50 ++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFFeatureExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFFeatureExtractor.java new file mode 100644 index 000000000..55be27a3b --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFFeatureExtractor.java @@ -0,0 +1,50 @@ +package org.apache.tika.parser.microsoft.ooxml.xwpf; + +import java.util.HashSet; +import java.util.Set; + +import org.apache.poi.openxml4j.opc.OPCPackage; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import org.apache.tika.utils.StringUtils; + +/** + * This is designed to extract features that are useful for forensics, e-discovery and digital preservation. + * Specifically, the presence of: tracked changes, hidden text, comments and comment authors. Because several of these + * features can be placed on run properties, which can be in lots of places, I found it simpler to scrape + * the document xml + */ +public class XWPFFeatureExtractor { + + public void process(OPCPackage opcPackage) { + } + + private static class FeatureHandler extends DefaultHandler { + //see: https://www.ericwhite.com/blog/using-xml-dom-to-detect-tracked-revisions-in-an-open-xml-wordprocessingml-document/ + private static final Set<String> TRACK_CHANGES = Set.of("ins", "del", "moveFrom", "moveTo"); + private Set<String> authors = new HashSet<>(); + private boolean hasHidden = false; + private boolean hasTrackChanges = false; + private boolean hasComments = false; + + @Override + public void startElement(String uri, String localName, String qName, Attributes atts) + throws SAXException { + //we could check to ensure that the vanish element actually surround text + //the current check could lead to false positives where <w:vanish/> is around a space or no text. + if ("vanish".equals(localName)) { + hasHidden = true; + } else if (TRACK_CHANGES.contains(localName)) { + String trackChangesAuthor = atts.getValue("author"); + if (!StringUtils.isBlank(trackChangesAuthor)) { + authors.add(trackChangesAuthor); + } + hasTrackChanges = true; + } else if ("commentReference".equals(localName) || "commentRangeStart".equals(localName)) { + hasComments = true; + } + } + } +}
