This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4437
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 10fd38ac5df5f809f7660d17254b2cf71fa03fa5
Author: tallison <[email protected]>
AuthorDate: Fri Jun 13 14:40:32 2025 -0400

    TIKA-4437 WIP
---
 .../microsoft/ooxml/xwpf/XWPFFeatureExtractor.java | 50 ++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFFeatureExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFFeatureExtractor.java
new file mode 100644
index 000000000..55be27a3b
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFFeatureExtractor.java
@@ -0,0 +1,50 @@
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * This is designed to extract features that are useful for forensics, e-discovery and digital
+ * preservation. Specifically, the presence of: tracked changes, hidden text, comments and comment
+ * authors. Because several of these features can be placed on run properties, which can be in
+ * lots of places, I found it simpler to scrape the document xml.
+ */
+public class XWPFFeatureExtractor {
+
+    /**
+     * Entry point for feature extraction. This is currently a placeholder: the package is
+     * accepted but nothing is read from it yet.
+     *
+     * @param opcPackage the OOXML package to inspect; currently unused
+     */
+    public void process(OPCPackage opcPackage) {
+    }
+
+    /**
+     * SAX handler that flags hidden text, tracked changes and comments, and collects the
+     * authors recorded on tracked-change elements. Elements are matched on their local name,
+     * so the handler expects to be driven by a namespace-aware parser.
+     */
+    private static class FeatureHandler extends DefaultHandler {
+        //see:
+        //https://www.ericwhite.com/blog/using-xml-dom-to-detect-tracked-revisions-in-an-open-xml-wordprocessingml-document/
+        private static final Set<String> TRACK_CHANGES =
+                Set.of("ins", "del", "moveFrom", "moveTo");
+        //values of the on/off "val" attribute that explicitly switch a toggle property off
+        private static final Set<String> OFF_VALUES = Set.of("false", "0", "off");
+        private final Set<String> authors = new HashSet<>();
+        private boolean hasHidden = false;
+        private boolean hasTrackChanges = false;
+        private boolean hasComments = false;
+
+        /**
+         * Updates the feature flags according to the local name of the element being opened.
+         *
+         * @param uri       namespace URI of the element (not consulted)
+         * @param localName local name of the element; drives all of the checks
+         * @param qName     qualified name of the element (not consulted)
+         * @param atts      attributes of the element; read for the author and the on/off value
+         * @throws SAXException declared by the overridden method; not thrown here
+         */
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes atts)
+                throws SAXException {
+            //we could check to ensure that the vanish element actually surrounds text
+            //the current check could lead to false positives where <w:vanish/> is around
+            //a space or no text.
+            if ("vanish".equals(localName)) {
+                //<w:vanish w:val="false"/> explicitly turns hiding off; don't count it
+                if (isOn(atts)) {
+                    hasHidden = true;
+                }
+            } else if (TRACK_CHANGES.contains(localName)) {
+                String trackChangesAuthor = getAttribute(atts, "author");
+                if (!StringUtils.isBlank(trackChangesAuthor)) {
+                    authors.add(trackChangesAuthor);
+                }
+                hasTrackChanges = true;
+            } else if ("commentReference".equals(localName) ||
+                    "commentRangeStart".equals(localName)) {
+                hasComments = true;
+            }
+        }
+
+        /**
+         * Returns whether an on/off element is switched on: true when there is no "val"
+         * attribute or when its value is not one of the explicit "off" values.
+         */
+        private static boolean isOn(Attributes atts) {
+            String val = getAttribute(atts, "val");
+            return val == null || !OFF_VALUES.contains(val);
+        }
+
+        /**
+         * Looks up an attribute by name, ignoring any namespace prefix.
+         * {@link Attributes#getValue(String)} matches on the qualified name, so a prefixed
+         * attribute such as {@code w:author} is not found by a plain {@code "author"} lookup.
+         * The direct lookup is tried first and the attributes are then scanned by local name.
+         *
+         * @param atts the attributes to search
+         * @param name the unprefixed attribute name
+         * @return the attribute value, or {@code null} if there is no such attribute
+         */
+        private static String getAttribute(Attributes atts, String name) {
+            String value = atts.getValue(name);
+            if (value != null) {
+                return value;
+            }
+            for (int i = 0; i < atts.getLength(); i++) {
+                if (name.equals(atts.getLocalName(i))) {
+                    return atts.getValue(i);
+                }
+            }
+            return null;
+        }
+    }
+}

Reply via email to