Author: tallison
Date: Fri Sep 27 18:55:31 2013
New Revision: 1527030

URL: http://svn.apache.org/r1527030
Log:
TIKA-1171 -- fix extra asterisks extracted from the master slide in PPT; added
assertions to the tests that use the TIKA-712 test files to show that TIKA-1171
is fixed.  Borrowed the extraction code from POI's PowerPointExtractor.

Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1527030&r1=1527029&r2=1527030&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
 Fri Sep 27 18:55:31 2013
@@ -68,19 +68,13 @@ public class HSLFExtractor extends Abstr
          }
 
          // Slide master, if present
-         // TODO: re-enable this once we fix TIKA-712
-         MasterSheet master = slide.getMasterSheet();
-         if(master != null) {
-            xhtml.startElement("p", "class", "slide-master-content");
-            textRunsToText(xhtml, master.getTextRuns(), true );
-            xhtml.endElement("p");
-         }
+         extractMaster(xhtml, slide.getMasterSheet());
 
          // Slide text
          {
             xhtml.startElement("p", "class", "slide-content");
 
-            textRunsToText(xhtml, slide.getTextRuns(), false );
+            textRunsToText(xhtml, slide.getTextRuns());
 
             xhtml.endElement("p");
          }
@@ -155,7 +149,7 @@ public class HSLFExtractor extends Abstr
          }
 
          // Notes text
-         textRunsToText(xhtml, notes.getTextRuns(), false);
+         textRunsToText(xhtml, notes.getTextRuns());
 
          // Repeat the notes footer, if set
          if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
@@ -170,6 +164,31 @@ public class HSLFExtractor extends Abstr
       xhtml.endElement("div");
    }
 
+   private void extractMaster(XHTMLContentHandler xhtml, MasterSheet master) throws SAXException {
+      if (master == null){
+         return;
+      }
+      Shape[] shapes = master.getShapes();
+      if (shapes == null || shapes.length == 0){
+         return;
+      }
+
+      xhtml.startElement("div", "class", "slide-master-content");
+      for (int i = 0; i < shapes.length; i++){
+         Shape sh = shapes[i];
+         if (sh != null && ! MasterSheet.isPlaceholder(sh)){
+            if (sh instanceof TextShape){
+               TextShape tsh = (TextShape)sh;
+               String text = tsh.getText();
+               if (text != null){
+                  xhtml.element("p", text);
+               }
+            }
+         }
+      }
+      xhtml.endElement("div");
+   }
+
    private void extractTableText(XHTMLContentHandler xhtml, Table shape) throws SAXException {
       xhtml.startElement("table");
       for (int row = 0; row < shape.getNumberOfRows(); row++){
@@ -188,17 +207,20 @@ public class HSLFExtractor extends Abstr
       xhtml.endElement("table");   
    }
 
-   private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs, boolean isMaster) throws SAXException {
+   private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs) throws SAXException {
       if (runs==null) {
          return;
       }
 
       for (TextRun run : runs) {
          if (run != null) {
+           // Leaving in wisdom from TIKA-712 for easy revert.
            // Avoid boiler-plate text on the master slide (0
            // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
-           if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
-               xhtml.characters(run.getText());
+           //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
+           String txt = run.getText();
+           if (txt != null){
+               xhtml.characters(txt);
                xhtml.startElement("br");
                xhtml.endElement("br");
            }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1527030&r1=1527029&r2=1527030&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
 Fri Sep 27 18:55:31 2013
@@ -141,6 +141,9 @@ public class PowerPointParserTest extend
 
         // Make sure boilerplate text didn't come through:
         assertEquals(-1, content.indexOf("Click to edit Master"));
+ 
+       //TIKA-1171
+       assertEquals(-1, content.indexOf("*"));
     }
 
     // TODO: once we fix TIKA-712, re-enable this
@@ -161,6 +164,9 @@ public class PowerPointParserTest extend
 
         // Make sure boilerplate text didn't come through:
         assertEquals(-1, content.indexOf("Click to edit Master"));
+
+        //TIKA-1171
+        assertEquals(-1, content.indexOf("*"));
     }
 
     // TODO: once we fix TIKA-712, re-enable this
@@ -181,6 +187,8 @@ public class PowerPointParserTest extend
 
         // Make sure boilerplate text didn't come through:
         assertEquals(-1, content.indexOf("Click to edit Master"));
+        //TIKA-1171
+        assertEquals(-1, content.indexOf("*"));
     }
 
     /**


Reply via email to