This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4434 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 17638630fdbc86a8c96bf70943df62999c26354c Author: tallison <[email protected]> AuthorDate: Thu Jun 5 16:01:30 2025 -0400 TIKA-4434 -- extract more info out of ppt/pptx --- .../main/java/org/apache/tika/metadata/Office.java | 5 + .../tika/parser/microsoft/HSLFExtractor.java | 35 +++- .../ooxml/OOXMLWordAndPowerPointTextHandler.java | 13 ++ .../ooxml/SXSLFPowerPointExtractorDecorator.java | 10 +- .../ooxml/XSLFPowerPointExtractorDecorator.java | 188 +++++++++++++-------- 5 files changed, 176 insertions(+), 75 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java b/tika-core/src/main/java/org/apache/tika/metadata/Office.java index 7883df999..4f0146aeb 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java @@ -175,4 +175,9 @@ public interface Office { Property COMMENT_PERSONS = Property.internalTextBag("msoffice:comment-person-display-name"); + Property HAS_HIDDEN_SLIDES = Property.internalBoolean("msoffice:ppt:has-hidden-slides"); + + Property NUM_HIDDEN_SLIDES = Property.internalInteger("msoffice:ppt:num-hidden-slides"); + + Property HAS_ANIMATIONS = Property.internalBoolean("msoffice:ppt:has-animations"); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java index 31588d1c9..7661c0d06 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java @@ -28,6 +28,8 @@ import org.apache.poi.common.usermodel.Hyperlink; import org.apache.poi.hslf.exceptions.EncryptedPowerPointFileException; import org.apache.poi.hslf.model.HeadersFooters; import org.apache.poi.hslf.record.DocInfoListContainer; +import org.apache.poi.hslf.record.Record; +import org.apache.poi.hslf.record.RecordContainer; import org.apache.poi.hslf.record.RecordTypes; import org.apache.poi.hslf.record.VBAInfoAtom; import org.apache.poi.hslf.record.VBAInfoContainer; @@ -59,6 +61,7 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; @@ -66,6 +69,15 @@ import org.apache.tika.utils.StringUtils; public class HSLFExtractor extends AbstractPOIFSExtractor { + //This is from Andreas: https://stackoverflow.com/a/45664920 + private static final int[] TIMING_RECORD_PATH = { + RecordTypes.ProgTags.typeID, + RecordTypes.ProgBinaryTag.typeID, + RecordTypes.BinaryTagData.typeID + }; + + private static final int EXT_TIME_NODE_CONTAINER = 0xf144; + public HSLFExtractor(ParseContext context, Metadata metadata) { super(context, metadata); } @@ -93,6 +105,7 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { xhtml.startElement("div", "class", "slideShow"); /* Iterate over slides and extract text */ + int hiddenSlides = 0; for (HSLFSlide slide : _slides) { xhtml.startElement("div", "class", "slide"); HeadersFooters slideHeaderFooters = @@ -152,8 +165,15 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { // Slide complete xhtml.endElement("div"); + if (slide.isHidden()) { + hiddenSlides++; + } + findAnimations(slide); + } + if (hiddenSlides > 0) { + parentMetadata.set(Office.NUM_HIDDEN_SLIDES, hiddenSlides); + parentMetadata.set(Office.HAS_HIDDEN_SLIDES, true); } - handleSlideEmbeddedPictures(ss, xhtml); handleShowEmbeddedResources(ss, xhtml, true); @@ -167,6 +187,19 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { xhtml.endElement("div"); } + private void findAnimations(HSLFSlide slide) { + if (parentMetadata.get(Office.HAS_ANIMATIONS) != null) { + return; + } + RecordContainer lastRecord = slide.getSheetContainer(); + for (int ri : TIMING_RECORD_PATH) { + lastRecord = (RecordContainer) lastRecord.findFirstOfType(ri); + } + if (lastRecord.findFirstOfType(EXT_TIME_NODE_CONTAINER) != null) { + parentMetadata.set(Office.HAS_ANIMATIONS, true); + } + } + /** * This is the catch-all for embedded objects. If we didn't come across * them in the shapes in the slides, headers/footers, etc, try to diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java index 77d088701..8137e6967 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java @@ -85,6 +85,8 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { private final static String RUBY = "ruby"; //phonetic section private final static String RT = "rt"; //phonetic run private static final String VAL = "val"; + private static final String SLIDE = "sld"; + private static final String SHOW = "show"; private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006"; private final static String O_NS = "urn:schemas-microsoft-com:office:office"; @@ -148,6 +150,8 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { OOXMLWordAndPowerPointTextHandler.EditType.NONE; private DateUtils dateUtils = new DateUtils(); + private boolean hiddenSlide = false; + public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler bodyContentsHandler, Map<String, String> hyperlinks) { this(bodyContentsHandler, hyperlinks, true, true); @@ -333,6 +337,11 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { inV = true; } else if (RT.equals(localName)) { inRt = true; + } else if (SLIDE.equals(localName)) { + String val = atts.getValue("show"); + if ("0".equals(val) || "false".equals(val)) { + hiddenSlide = true; + } } } @@ -571,4 +580,8 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { void endBookmark(String id) throws SAXException; } + + public boolean isHiddenSlide() { + return hiddenSlide; + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java index c036f086f..a95abf95f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java @@ -41,6 +41,7 @@ import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor; @@ -178,10 +179,13 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { // Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart); xhtml.startElement("div", "class", "slide-content"); try (InputStream stream = slidePart.getInputStream()) { + OOXMLWordAndPowerPointTextHandler wordAndPPTHandler = new OOXMLWordAndPowerPointTextHandler( + new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships); XMLReaderUtils.parseSAX(CloseShieldInputStream.wrap(stream), - new EmbeddedContentHandler(new OOXMLWordAndPowerPointTextHandler( - new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships)), context); - + new EmbeddedContentHandler(wordAndPPTHandler), context); + if (wordAndPPTHandler.isHiddenSlide()) { + metadata.set(Office.HAS_HIDDEN_SLIDES, true); + } } catch (TikaException | IOException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java index 38e9c8aac..d292f5c57 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java @@ -19,7 +19,10 @@ package org.apache.tika.parser.microsoft.ooxml; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.List; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; import javax.xml.namespace.QName; import org.apache.poi.common.usermodel.Hyperlink; @@ -55,7 +58,10 @@ import org.apache.poi.xslf.usermodel.XSLFTextRun; import org.apache.poi.xslf.usermodel.XSLFTextShape; import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlObject; +import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentAuthor; +import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentAuthorList; import org.openxmlformats.schemas.presentationml.x2006.main.CTPicture; +import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide; import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList; import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry; import org.xml.sax.SAXException; @@ -63,8 +69,10 @@ import org.xml.sax.helpers.AttributesImpl; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.StringUtils; public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { @@ -85,92 +93,130 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { */ protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException { XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument(); - XSLFCommentAuthors commentAuthors = slideShow.getCommentAuthors(); + handleCommentAuthors(slideShow); List<XSLFSlide> slides = slideShow.getSlides(); + AtomicInteger hiddenSlideCounter = new AtomicInteger(0); for (XSLFSlide slide : slides) { - String slideDesc; - if (slide.getPackagePart() != null && slide.getPackagePart().getPartName() != null) { - slideDesc = getJustFileName(slide.getPackagePart().getPartName().toString()); - slideDesc += "_"; - } else { - slideDesc = null; - } + handleSlide(slide, xhtml, hiddenSlideCounter); + } + if (hiddenSlideCounter.get() > 0) { + metadata.set(Office.HAS_HIDDEN_SLIDES, true); + metadata.set(Office.NUM_HIDDEN_SLIDES, hiddenSlideCounter.get()); + } + } + + private void handleSlide(XSLFSlide slide, XHTMLContentHandler xhtml, AtomicInteger hiddenSlideCounter) throws SAXException { + String slideDesc; + if (slide.getPackagePart() != null && slide.getPackagePart().getPartName() != null) { + slideDesc = getJustFileName(slide.getPackagePart().getPartName().toString()); + slideDesc += "_"; + } else { + slideDesc = null; + } + + if (slide.isHidden()) { + hiddenSlideCounter.incrementAndGet(); + } + + // slide content + xhtml.startElement("div", "class", "slide-content"); + extractContent(slide.getShapes(), false, xhtml, slideDesc); + xhtml.endElement("div"); - // slide content - xhtml.startElement("div", "class", "slide-content"); - extractContent(slide.getShapes(), false, xhtml, slideDesc); + if (config.isIncludeSlideMasterContent()) { + // slide layout which is the master sheet for this slide + xhtml.startElement("div", "class", "slide-master-content"); + XSLFSlideLayout slideLayout = slide.getMasterSheet(); + extractContent(slideLayout.getShapes(), true, xhtml, null); xhtml.endElement("div"); - if (config.isIncludeSlideMasterContent()) { - // slide layout which is the master sheet for this slide - xhtml.startElement("div", "class", "slide-master-content"); - XSLFSlideLayout slideLayout = slide.getMasterSheet(); - extractContent(slideLayout.getShapes(), true, xhtml, null); + // slide master which is the master sheet for all text layouts + XSLFSheet slideMaster = slideLayout.getMasterSheet(); + extractContent(slideMaster.getShapes(), true, xhtml, null); + } + if (config.isIncludeSlideNotes()) { + // notes (if present) + XSLFNotes slideNotes = slide.getNotes(); + if (slideNotes != null) { + xhtml.startElement("div", "class", "slide-notes"); + + extractContent(slideNotes.getShapes(), false, xhtml, slideDesc); + + // master sheet for this notes + XSLFNotesMaster notesMaster = slideNotes.getMasterSheet(); + if (notesMaster != null) { + extractContent(notesMaster.getShapes(), true, xhtml, null); + } xhtml.endElement("div"); - - // slide master which is the master sheet for all text layouts - XSLFSheet slideMaster = slideLayout.getMasterSheet(); - extractContent(slideMaster.getShapes(), true, xhtml, null); } - if (config.isIncludeSlideNotes()) { - // notes (if present) - XSLFNotes slideNotes = slide.getNotes(); - if (slideNotes != null) { - xhtml.startElement("div", "class", "slide-notes"); - - extractContent(slideNotes.getShapes(), false, xhtml, slideDesc); - - // master sheet for this notes - XSLFNotesMaster notesMaster = slideNotes.getMasterSheet(); - if (notesMaster != null) { - extractContent(notesMaster.getShapes(), true, xhtml, null); + } + + // comments (if present) + List<XSLFComment> comments = slide.getComments(); + if (comments != null) { + StringBuilder authorStringBuilder = new StringBuilder(); + for (XSLFComment comment : comments) { + authorStringBuilder.setLength(0); + xhtml.startElement("p", "class", "slide-comment"); + if (comment.getAuthor() != null) { + authorStringBuilder.append(comment.getAuthor()); + } + if (comment.getAuthorInitials() != null) { + if (authorStringBuilder.length() > 0) { + authorStringBuilder.append(" "); } - xhtml.endElement("div"); + authorStringBuilder.append("(").append(comment.getAuthorInitials()).append(")"); + } + if (comment.getText() != null && authorStringBuilder.length() > 0) { + authorStringBuilder.append(" - "); } + if (authorStringBuilder.length() > 0) { + xhtml.startElement("b"); + xhtml.characters(authorStringBuilder.toString()); + xhtml.endElement("b"); + } + + xhtml.characters(comment.getText()); + xhtml.endElement("p"); } + } + //now dump diagram data + handleGeneralTextContainingPart(RELATION_DIAGRAM_DATA, "diagram-data", + slide.getPackagePart(), metadata, + new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml), + new HashMap<>()//empty + )); + //now dump chart data + handleGeneralTextContainingPart(XSLFRelation.CHART.getRelation(), "chart", + slide.getPackagePart(), metadata, + new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml), + new HashMap<>()//empty + )); + + CTSlide ctSlide = slide.getXmlObject(); + if (ctSlide.isSetTiming()) { + //perhaps require more, like: ctSlide.getTiming()?.getTnLst()?.getParArray()?.length + metadata.set(Office.HAS_ANIMATIONS, true); + } + } - // comments (if present) - List<XSLFComment> comments = slide.getComments(); - if (comments != null) { - StringBuilder authorStringBuilder = new StringBuilder(); - for (XSLFComment comment : comments) { - authorStringBuilder.setLength(0); - xhtml.startElement("p", "class", "slide-comment"); - if (comment.getAuthor() != null) { - authorStringBuilder.append(comment.getAuthor()); - } - if (comment.getAuthorInitials() != null) { - if (authorStringBuilder.length() > 0) { - authorStringBuilder.append(" "); - } - authorStringBuilder.append("(").append(comment.getAuthorInitials()).append(")"); - } - if (comment.getText() != null && authorStringBuilder.length() > 0) { - authorStringBuilder.append(" - "); - } - if (authorStringBuilder.length() > 0) { - xhtml.startElement("b"); - xhtml.characters(authorStringBuilder.toString()); - xhtml.endElement("b"); + private void handleCommentAuthors(XMLSlideShow slideShow) { + XSLFCommentAuthors commentAuthors = slideShow.getCommentAuthors(); + if (commentAuthors != null) { + CTCommentAuthorList ctAuthorList = commentAuthors.getCTCommentAuthorsList(); + CTCommentAuthor[] ctAuthorArray = ctAuthorList.getCmAuthorArray(); + if (ctAuthorArray != null) { + Set<String> names = new HashSet<>(); + for (CTCommentAuthor ctCommentAuthor : ctAuthorArray) { + String n = ctCommentAuthor.getName(); + if (StringUtils.isBlank(n) || names.contains(n)) { + continue; } - - xhtml.characters(comment.getText()); - xhtml.endElement("p"); + metadata.add(Office.COMMENT_PERSONS, n); + names.add(n); } } - //now dump diagram data - handleGeneralTextContainingPart(RELATION_DIAGRAM_DATA, "diagram-data", - slide.getPackagePart(), metadata, - new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml), - new HashMap<>()//empty - )); - //now dump chart data - handleGeneralTextContainingPart(XSLFRelation.CHART.getRelation(), "chart", - slide.getPackagePart(), metadata, - new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml), - new HashMap<>()//empty - )); } }
