Author: nick
Date: Wed Sep 21 17:03:38 2011
New Revision: 1173761

URL: http://svn.apache.org/viewvc?rev=1173761&view=rev
Log:
TIKA-712 Fetch Master Slide text for PPT and PPTX text extraction

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1173761&r1=1173760&r2=1173761&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Wed Sep 21 17:03:38 2011 @@ -39,7 +39,12 @@ public class HSLFExtractor extends Abstr throws IOException, SAXException, TikaException { PowerPointExtractor powerPointExtractor = new PowerPointExtractor(filesystem); - xhtml.element("p", powerPointExtractor.getText(true, true)); + powerPointExtractor.setSlidesByDefault(true); + powerPointExtractor.setNotesByDefault(true); + powerPointExtractor.setCommentsByDefault(true); + powerPointExtractor.setMasterByDefault(true); + + xhtml.element("p", powerPointExtractor.getText()); List<OLEShape> shapeList = powerPointExtractor.getOLEShapes(); for (OLEShape shape : shapeList) { Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1173761&r1=1173760&r2=1173761&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (original) +++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Wed Sep 21 17:03:38 2011 @@ -33,6 +33,7 @@ import org.apache.poi.xslf.usermodel.XML import org.apache.poi.xslf.usermodel.XSLFCommonSlideData; import org.apache.poi.xslf.usermodel.XSLFRelation; import org.apache.poi.xslf.usermodel.XSLFSlide; +import org.apache.poi.xslf.usermodel.XSLFSlideMaster; import org.apache.tika.exception.TikaException; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; @@ -77,20 +78,30 @@ public class XSLFPowerPointExtractorDeco continue; } + XSLFSlideMaster master = slide.getMasterSheet(); CTNotesSlide notes = rawSlideShow.getNotes(slideId); CTCommentList comments = rawSlideShow.getSlideComments(slideId); + // TODO In POI 3.8 beta 5, improve how we get this xhtml.startElement("div"); XSLFCommonSlideData common = new XSLFCommonSlideData(slide.getXmlObject().getCSld()); extractShapeContent(common, xhtml); + // If there are comments, extract them if (comments != null) { for (CTComment comment : comments.getCmArray()) { xhtml.element("p", comment.getText()); } } + + // Get text from the master slide + if(master != null) { + // TODO In POI 3.8 beta 5, improve how we get this + extractShapeContent(new XSLFCommonSlideData(master.getXmlObject().getCSld()), xhtml); + } if (notes != null) { + // TODO In POI 3.8 beta 5, improve how we get this extractShapeContent(new XSLFCommonSlideData(notes.getCSld()), xhtml); } xhtml.endElement("div");