Author: mikemccand
Date: Sat Dec  1 18:00:49 2012
New Revision: 1416030

URL: http://svn.apache.org/viewvc?rev=1416030&view=rev
Log:
TIKA-712: extract master text, except for title/body

Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1416030&r1=1416029&r2=1416030&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sat Dec  1 18:00:49 2012
@@ -42,7 +42,8 @@ Release 1.3 - Current Development
   * MS PowerPoint (.ppt): When a PowerPoint (.ppt) document contains
     embedded files, Tika now places a <div class="embedded" id="XXX"/> into the
     XHTML so you can see where in the main text the embedded document
-    occurred (TIKA-1025).
+    occurred (TIKA-1025).  Text from the master slide is now extracted
+    (TIKA-712).
 
   * MHTML: fixed Null charset name exception when a mime part has an
     unrecognized charset (TIKA-1011).

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1416030&r1=1416029&r2=1416030&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Sat Dec  1 18:00:49 2012
@@ -69,20 +69,18 @@ public class HSLFExtractor extends Abstr
 
          // Slide master, if present
          // TODO: re-enable this once we fix TIKA-712
-         /*
          MasterSheet master = slide.getMasterSheet();
          if(master != null) {
             xhtml.startElement("p", "class", "slide-master-content");
-            textRunsToText(xhtml, master.getTextRuns() );
+            textRunsToText(xhtml, master.getTextRuns(), true );
             xhtml.endElement("p");
          }
-         */
 
          // Slide text
          {
             xhtml.startElement("p", "class", "slide-content");
 
-            textRunsToText(xhtml, slide.getTextRuns() );
+            textRunsToText(xhtml, slide.getTextRuns(), false );
 
             xhtml.endElement("p");
          }
@@ -150,7 +148,7 @@ public class HSLFExtractor extends Abstr
          }
 
          // Notes text
-         textRunsToText(xhtml, notes.getTextRuns());
+         textRunsToText(xhtml, notes.getTextRuns(), false);
 
          // Repeat the notes footer, if set
          if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
@@ -165,16 +163,20 @@ public class HSLFExtractor extends Abstr
       xhtml.endElement("div");
    }
 
-   private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs) throws SAXException {
+   private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs, boolean isMaster) throws SAXException {
       if (runs==null) {
          return;
       }
 
       for (TextRun run : runs) {
          if (run != null) {
-            xhtml.characters( run.getText() );
-            xhtml.startElement("br");
-            xhtml.endElement("br");
+           // Avoid boiler-plate text on the master slide (0
+           // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
+           if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
+               xhtml.characters(run.getText());
+               xhtml.startElement("br");
+               xhtml.endElement("br");
+           }
          }
       }
    }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1416030&r1=1416029&r2=1416030&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Sat Dec  1 18:00:49 2012
@@ -143,7 +143,6 @@ public class PowerPointParserTest extend
     }
 
     // TODO: once we fix TIKA-712, re-enable this
-    /*
     public void testMasterText() throws Exception {
         ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
@@ -162,10 +161,8 @@ public class PowerPointParserTest extend
         // Make sure boilerplate text didn't come through:
         assertEquals(-1, content.indexOf("Click to edit Master"));
     }
-    */
 
     // TODO: once we fix TIKA-712, re-enable this
-    /*
     public void testMasterText2() throws Exception {
         ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
@@ -184,7 +181,6 @@ public class PowerPointParserTest extend
         // Make sure boilerplate text didn't come through:
         assertEquals(-1, content.indexOf("Click to edit Master"));
     }
-    */
 
     /**
      * Ensures that custom OLE2 (HPSF) properties are extracted


Reply via email to