This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4437 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 9c8feb0be4c674b5e0434c64881924dcf6625504 Author: tallison <[email protected]> AuthorDate: Thu Jun 26 10:20:16 2025 -0400 TIKA-4437 -- mostly done. Need to add shareable test docs and unit tests --- .../main/java/org/apache/tika/metadata/Office.java | 5 +++ .../tika/parser/microsoft/WordExtractor.java | 38 ++++++++++++++++++++- .../microsoft/ooxml/AbstractOOXMLExtractor.java | 3 ++ .../ooxml/XWPFWordExtractorDecorator.java | 7 ++++ .../microsoft/ooxml/xwpf/XWPFFeatureExtractor.java | 39 +++++++++++++++++++--- 5 files changed, 86 insertions(+), 6 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java b/tika-core/src/main/java/org/apache/tika/metadata/Office.java index 4f0146aeb..477ffef14 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java @@ -180,4 +180,9 @@ public interface Office { Property NUM_HIDDEN_SLIDES = Property.internalInteger("msoffice:ppt:num-hidden-slides"); Property HAS_ANIMATIONS = Property.internalBoolean("msoffice:ppt:has-animations"); + + //w:vanish or isVanish or isFldVanish + Property HAS_HIDDEN_TEXT = Property.internalBoolean("msoffice:doc:has-hidden-text"); + + Property HAS_TRACK_CHANGES = Property.internalBoolean("msoffice:has-track-changes"); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java index 5207ec05b..a72d6383b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java @@ -36,6 +36,7 @@ import org.apache.poi.hwpf.OldWordFileFormatException; import org.apache.poi.hwpf.extractor.Word6Extractor; import org.apache.poi.hwpf.model.FieldsDocumentPart; import org.apache.poi.hwpf.model.PicturesTable; +import org.apache.poi.hwpf.model.RevisionMarkAuthorTable; import org.apache.poi.hwpf.model.SavedByEntry; import org.apache.poi.hwpf.model.SavedByTable; import org.apache.poi.hwpf.model.StyleDescription; @@ -61,6 +62,7 @@ import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; @@ -197,7 +199,6 @@ public class WordExtractor extends AbstractPOIFSExtractor { for (String paragraph : wordExtractor.getFootnoteText()) { xhtml.element("p", paragraph); } - for (String paragraph : wordExtractor.getCommentsText()) { xhtml.element("p", paragraph); } @@ -231,6 +232,41 @@ public class WordExtractor extends AbstractPOIFSExtractor { } catch (FileNotFoundException e) { //swallow } + extractFeatures(document, parentMetadata); + } + + private void extractFeatures(HWPFDocument document, Metadata parentMetadata) { + RevisionMarkAuthorTable revisionMarkAuthorTable = document.getRevisionMarkAuthorTable(); + if (revisionMarkAuthorTable != null) { + Set<String> authors = new HashSet<>(revisionMarkAuthorTable.getEntries()); + if (! authors.isEmpty()) { + for (String author : authors) { + parentMetadata.add(Office.COMMENT_PERSONS, author); + } + } + } + Range documentRange = document.getRange(); + int numRuns = documentRange.numCharacterRuns(); + boolean hasHidden = false; + boolean hasTrackChanges = false; + for (int i = 0; i < numRuns; i++) { + CharacterRun run = documentRange.getCharacterRun(i); + if (run.isVanished() || run.isFldVanished()) { + hasHidden = true; + } + if (run.isMarkedDeleted()) { + hasTrackChanges = true; + } + if (run.isMarkedInserted()) { + hasTrackChanges = true; + } + } + if (hasHidden) { + parentMetadata.set(Office.HAS_HIDDEN_TEXT, true); + } + if (hasTrackChanges) { + parentMetadata.set(Office.HAS_TRACK_CHANGES, true); + } } private void extractSavedByMetadata(HWPFDocument document) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java index 1475b7838..5c9d2f62b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java @@ -130,6 +130,9 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { return new MetadataExtractor(extractor); } + ParseContext getParseContext() { + return context; + } /** * @see * org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(ContentHandler, Metadata, diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java index 922cdbd01..8b925bf75 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java @@ -73,6 +73,7 @@ import org.apache.tika.parser.microsoft.EMFParser; import org.apache.tika.parser.microsoft.FormattingUtils; import org.apache.tika.parser.microsoft.WordExtractor; import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle; +import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFFeatureExtractor; import org.apache.tika.sax.ToTextContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.StringUtils; @@ -125,6 +126,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { // process text in the order that it occurs in extractIBodyText(document, listManager, xhtml); + extractFeatures(document, metadata); //handle the diagram data handleGeneralTextContainingPart(RELATION_DIAGRAM_DATA, "diagram-data", @@ -145,6 +147,11 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { } } + private void extractFeatures(XWPFDocument document, Metadata metadata) { + XWPFFeatureExtractor ex = new XWPFFeatureExtractor(); + ex.process(document, metadata, getParseContext()); + } + @Override protected Map<String, EmbeddedPartMetadata> getEmbeddedPartMetadataMap() { return embeddedPartMetadataMap; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFFeatureExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFFeatureExtractor.java index 55be27a3b..89270bb58 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFFeatureExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFFeatureExtractor.java @@ -1,30 +1,59 @@ package org.apache.tika.parser.microsoft.ooxml.xwpf; +import java.io.IOException; +import java.io.InputStream; import java.util.HashSet; import java.util.Set; import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; +import org.apache.tika.parser.ParseContext; import org.apache.tika.utils.StringUtils; +import org.apache.tika.utils.XMLReaderUtils; /** * This is designed to extract features that are useful for forensics, e-discovery and digital preservation. * Specifically, the presence of: tracked changes, hidden text, comments and comment authors. Because several of these - * features can be placed on run properties, which can be in lots of places, I found it simpler to scrape + * features can be placed on run properties, which can be in lots of places, we're scraping * the document xml */ public class XWPFFeatureExtractor { - public void process(OPCPackage opcPackage) { + public void process(XWPFDocument xwpfDocument, Metadata metadata, ParseContext parseContext) { + try (InputStream is = xwpfDocument.getPackagePart() + .getInputStream()) { + FeatureHandler featureHandler = new FeatureHandler(); + XMLReaderUtils.parseSAX(is, featureHandler, parseContext); + if (featureHandler.hasComments) { + metadata.set(Office.HAS_COMMENTS, true); + } + if (featureHandler.hasHidden) { + metadata.set(Office.HAS_HIDDEN_TEXT, true); + } + if (featureHandler.hasTrackChanges) { + metadata.set(Office.HAS_TRACK_CHANGES, true); + } + if (! featureHandler.authors.isEmpty()) { + for (String author : featureHandler.authors) { + metadata.add(Office.COMMENT_PERSONS, author); + } + } + } catch (IOException | TikaException | SAXException e) { + //swallow + } } private static class FeatureHandler extends DefaultHandler { //see: https://www.ericwhite.com/blog/using-xml-dom-to-detect-tracked-revisions-in-an-open-xml-wordprocessingml-document/ private static final Set<String> TRACK_CHANGES = Set.of("ins", "del", "moveFrom", "moveTo"); - private Set<String> authors = new HashSet<>(); + private final Set<String> authors = new HashSet<>(); private boolean hasHidden = false; private boolean hasTrackChanges = false; private boolean hasComments = false; @@ -32,12 +61,12 @@ public class XWPFFeatureExtractor { @Override public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { - //we could check to ensure that the vanish element actually surround text + //we could check to ensure that the vanish element actually surrounds text //the current check could lead to false positives where <w:vanish/> is around a space or no text. if ("vanish".equals(localName)) { hasHidden = true; } else if (TRACK_CHANGES.contains(localName)) { - String trackChangesAuthor = atts.getValue("author"); + String trackChangesAuthor = XMLReaderUtils.getAttrValue("author", atts); if (!StringUtils.isBlank(trackChangesAuthor)) { authors.add(trackChangesAuthor); }
