Author: nick
Date: Wed Sep 21 17:03:38 2011
New Revision: 1173761

URL: http://svn.apache.org/viewvc?rev=1173761&view=rev
Log:
TIKA-712 Fetch Master Slide text for PPT and PPTX text extraction

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1173761&r1=1173760&r2=1173761&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Wed Sep 21 17:03:38 2011
@@ -39,7 +39,12 @@ public class HSLFExtractor extends Abstr
             throws IOException, SAXException, TikaException {
         PowerPointExtractor powerPointExtractor =
             new PowerPointExtractor(filesystem);
-        xhtml.element("p", powerPointExtractor.getText(true, true));
+        powerPointExtractor.setSlidesByDefault(true);
+        powerPointExtractor.setNotesByDefault(true);
+        powerPointExtractor.setCommentsByDefault(true);
+        powerPointExtractor.setMasterByDefault(true);
+        
+        xhtml.element("p", powerPointExtractor.getText());
 
         List<OLEShape> shapeList = powerPointExtractor.getOLEShapes();
         for (OLEShape shape : shapeList) {

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1173761&r1=1173760&r2=1173761&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Wed Sep 21 17:03:38 2011
@@ -33,6 +33,7 @@ import org.apache.poi.xslf.usermodel.XML
 import org.apache.poi.xslf.usermodel.XSLFCommonSlideData;
 import org.apache.poi.xslf.usermodel.XSLFRelation;
 import org.apache.poi.xslf.usermodel.XSLFSlide;
+import org.apache.poi.xslf.usermodel.XSLFSlideMaster;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
@@ -77,20 +78,30 @@ public class XSLFPowerPointExtractorDeco
               continue;
            }
            
+            XSLFSlideMaster master = slide.getMasterSheet();
             CTNotesSlide notes = rawSlideShow.getNotes(slideId);
             CTCommentList comments = rawSlideShow.getSlideComments(slideId);
 
+            // TODO In POI 3.8 beta 5, improve how we get this
             xhtml.startElement("div");
             XSLFCommonSlideData common = new 
XSLFCommonSlideData(slide.getXmlObject().getCSld());
             extractShapeContent(common, xhtml);
 
+            // If there are comments, extract them
             if (comments != null) {
                 for (CTComment comment : comments.getCmArray()) {
                     xhtml.element("p", comment.getText());
                 }
             }
+            
+            // Get text from the master slide
+            if(master != null) {
+               // TODO In POI 3.8 beta 5, improve how we get this
+               extractShapeContent(new 
XSLFCommonSlideData(master.getXmlObject().getCSld()), xhtml);
+            }
 
             if (notes != null) {
+               // TODO In POI 3.8 beta 5, improve how we get this
                 extractShapeContent(new XSLFCommonSlideData(notes.getCSld()), 
xhtml);
             }
             xhtml.endElement("div");


Reply via email to