Author: tallison
Date: Thu Oct  8 02:23:46 2015
New Revision: 1707432

URL: http://svn.apache.org/viewvc?rev=1707432&view=rev
Log:
TIKA-1755 make div and other formatting more consistent between PPT and PPTX

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_comment.ppt   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_comment.pptx   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1707432&r1=1707431&r2=1707432&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu Oct  8 02:23:46 2015
@@ -1,10 +1,13 @@
 Release 1.11 - Current Development
+  
+  * Make div and other markup more consistent between PPT and 
+    PPTX (TIKA-1755).
 
   * Parse multiple authors from MSOffice's semi-colon delimited
     author field (TIKA-1765).
   
   * Include CTAKESConfig.properties within tika-parsers resources 
-    by default (TIKA-1741)
+    by default (TIKA-1741).
   
   * Prevent infinite recursion when processing inline images
     in PDF files by limiting extraction of duplicate images

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1707432&r1=1707431&r2=1707432&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Thu Oct  8 02:23:46 2015
@@ -106,16 +106,27 @@ public class HSLFExtractor extends Abstr
             }
 
             // Comments, if present
+            StringBuilder authorStringBuilder = new StringBuilder();
             for (Comment comment : slide.getComments()) {
+                authorStringBuilder.setLength(0);
                 xhtml.startElement("p", "class", "slide-comment");
-                if (comment.getAuthor() != null) {
-                    xhtml.startElement("b");
-                    xhtml.characters(comment.getAuthor());
-                    xhtml.endElement("b");
 
+                if (comment.getAuthor() != null) {
+                    authorStringBuilder.append(comment.getAuthor());
+                }
+                if (comment.getAuthorInitials() != null) {
+                    if (authorStringBuilder.length() > 0) {
+                        authorStringBuilder.append(" ");
+                    }
+                    authorStringBuilder.append("("+comment.getAuthorInitials()+")");
+                }
+                if (authorStringBuilder.length() > 0) {
                     if (comment.getText() != null) {
-                        xhtml.characters(" - ");
+                        authorStringBuilder.append(" - ");
                     }
+                    xhtml.startElement("b");
+                    xhtml.characters(authorStringBuilder.toString());
+                    xhtml.endElement("b");
                 }
                 if (comment.getText() != null) {
                     xhtml.characters(comment.getText());
@@ -136,7 +147,7 @@ public class HSLFExtractor extends Abstr
         xhtml.endElement("div");
 
       /* notes */
-        xhtml.startElement("div", "class", "slideNotes");
+        xhtml.startElement("div", "class", "slide-notes");
         HashSet<Integer> seenNotes = new HashSet<>();
         HeadersFooters hf = ss.getNotesHeadersFooters();
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1707432&r1=1707431&r2=1707432&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Thu Oct  8 02:23:46 2015
@@ -31,6 +31,7 @@ import org.apache.poi.xslf.XSLFSlideShow
 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
 import org.apache.poi.xslf.usermodel.Placeholder;
 import org.apache.poi.xslf.usermodel.XMLSlideShow;
+import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
 import org.apache.poi.xslf.usermodel.XSLFComments;
 import org.apache.poi.xslf.usermodel.XSLFGraphicFrame;
 import org.apache.poi.xslf.usermodel.XSLFGroupShape;
@@ -45,6 +46,7 @@ import org.apache.poi.xslf.usermodel.XSL
 import org.apache.poi.xslf.usermodel.XSLFTable;
 import org.apache.poi.xslf.usermodel.XSLFTableCell;
 import org.apache.poi.xslf.usermodel.XSLFTableRow;
+import org.apache.poi.xslf.usermodel.XSLFTextParagraph;
 import org.apache.poi.xslf.usermodel.XSLFTextShape;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.ParseContext;
@@ -52,6 +54,7 @@ import org.apache.tika.sax.XHTMLContentH
 import org.apache.xmlbeans.XmlException;
 import org.apache.xmlbeans.XmlObject;
 import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentAuthor;
 import org.openxmlformats.schemas.presentationml.x2006.main.CTPicture;
 import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList;
 import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
@@ -68,6 +71,7 @@ public class XSLFPowerPointExtractorDeco
      */
     protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
         XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();
+        XSLFCommentAuthors commentAuthors = slideShow.getCommentAuthors();
 
         List<XSLFSlide> slides = slideShow.getSlides();
         for (XSLFSlide slide : slides) {
@@ -79,12 +83,16 @@ public class XSLFPowerPointExtractorDeco
                 slideDesc = null;
             }
 
-            // slide
+            // slide content
+            xhtml.startElement("div", "class", "slide-content");
             extractContent(slide.getShapes(), false, xhtml, slideDesc);
+            xhtml.endElement("div");
 
             // slide layout which is the master sheet for this slide
+            xhtml.startElement("div", "class", "slide-master-content");
             XSLFSlideLayout slideLayout = slide.getMasterSheet();
             extractContent(slideLayout.getShapes(), true, xhtml, null);
+            xhtml.endElement("div");
 
             // slide master which is the master sheet for all text layouts
             XSLFSheet slideMaster = slideLayout.getMasterSheet();
@@ -93,19 +101,46 @@ public class XSLFPowerPointExtractorDeco
             // notes (if present)
             XSLFNotes slideNotes = slide.getNotes();
             if (slideNotes != null) {
+                xhtml.startElement("div", "class", "slide-notes");
+
                 extractContent(slideNotes.getShapes(), false, xhtml, slideDesc);
 
                 // master sheet for this notes
                 XSLFNotesMaster notesMaster = slideNotes.getMasterSheet();
                 extractContent(notesMaster.getShapes(), true, xhtml, null);
+                xhtml.endElement("div");
             }
 
             // comments (if present)
             XSLFComments comments = slide.getComments();
             if (comments != null) {
+                StringBuilder authorStringBuilder = new StringBuilder();
                 for (int i = 0; i < comments.getNumberOfComments(); i++) {
+                    authorStringBuilder.setLength(0);
                     CTComment comment = comments.getCommentAt(i);
-                    xhtml.element("p", comment.getText());
+                    xhtml.startElement("p", "class", "slide-comment");
+                    CTCommentAuthor cta = commentAuthors.getAuthorById(comment.getAuthorId());
+                    if (cta != null) {
+                        if (cta.getName() != null) {
+                            authorStringBuilder.append(cta.getName());
+                        }
+                        if (cta.getInitials() != null) {
+                            if (authorStringBuilder.length() > 0) {
+                                authorStringBuilder.append(" ");
+                            }
+                            authorStringBuilder.append("("+cta.getInitials()+")");
+                        }
+                        if (comment.getText() != null && authorStringBuilder.length() > 0) {
+                            authorStringBuilder.append(" - ");
+                        }
+                        if (authorStringBuilder.length() > 0) {
+                            xhtml.startElement("b");
+                            xhtml.characters(authorStringBuilder.toString());
+                            xhtml.endElement("b");
+                        }
+                    }
+                    xhtml.characters(comment.getText());
+                    xhtml.endElement("p");
                 }
             }
         }
@@ -120,17 +155,16 @@ public class XSLFPowerPointExtractorDeco
                 if (skipPlaceholders && ph != null) {
                     continue;
                 }
-                xhtml.element("p", txt.getText());
+                for (XSLFTextParagraph p : txt.getTextParagraphs()) {
+                    xhtml.element("p", p.getText());
+                }
             } else if (sh instanceof XSLFGroupShape) {
                 // recurse into groups of shapes
                 XSLFGroupShape group = (XSLFGroupShape) sh;
                 extractContent(group.getShapes(), skipPlaceholders, xhtml, slideDesc);
             } else if (sh instanceof XSLFTable) {
-                XSLFTable tbl = (XSLFTable) sh;
-                for (XSLFTableRow row : tbl) {
-                    List<XSLFTableCell> cells = row.getCells();
-                    extractContent(cells, skipPlaceholders, xhtml, slideDesc);
-                }
+                //unlike tables in Word, ppt/x can't have recursive tables...I don't think
+                extractTable((XSLFTable)sh, xhtml);
             } else if (sh instanceof XSLFGraphicFrame) {
                 XSLFGraphicFrame frame = (XSLFGraphicFrame) sh;
                 XmlObject[] sp = frame.getXmlObject().selectPath(
@@ -172,6 +206,22 @@ public class XSLFPowerPointExtractorDeco
         }
     }
 
+    private void extractTable(XSLFTable tbl, XHTMLContentHandler xhtml) throws SAXException {
+        xhtml.startElement("table");
+        for (XSLFTableRow row : tbl) {
+            xhtml.startElement("tr");
+            List<XSLFTableCell> cells = row.getCells();
+            for (XSLFTableCell c : row.getCells()) {
+                xhtml.startElement("td");
+                xhtml.characters(c.getText());
+                xhtml.endElement("td");
+            }
+            xhtml.endElement("tr");
+        }
+        xhtml.endElement("table");
+
+    }
+
     /**
      * In PowerPoint files, slides have things embedded in them,
      * and slide drawings which have the images

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1707432&r1=1707431&r2=1707432&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Thu Oct  8 02:23:46 2015
@@ -55,72 +55,66 @@ public class PowerPointParserTest extend
 
     @Test
     public void testVarious() throws Exception {
-        ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
-
-        try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
-                "/test-documents/testPPT_various.ppt")) {
-            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
-        }
-
-        String content = handler.toString();
-        //content = content.replaceAll("\\s+"," ");
-        assertContains("Footnote appears here", content);
-        assertContains("This is a footnote.", content);
-        assertContains("This is the header text.", content);
-        assertContains("This is the footer text.", content);
-        assertContains("Here is a text box", content);
-        assertContains("Bold", content);
-        assertContains("italic", content);
-        assertContains("underline", content);
-        assertContains("superscript", content);
-        assertContains("subscript", content);
-        assertContains("Here is a citation:", content);
-        assertContains("Figure 1 This is a caption for Figure 1", content);
-        assertContains("(Kramer)", content);
-        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
-        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
-        assertContains("This is a hyperlink", content);
-        assertContains("Here is a list:", content);
+        String xml = getXML("testPPT_various.ppt", metadata).xml;
+        assertContains("<p>Footnote appears here", xml);
+        assertContains("<p>[1]This is a footnote.", xml);
+        assertContains("<p>This is the header text.</p>", xml);
+        assertContains("<p>This is the footer text.</p>", xml);
+        assertContains("<p>Here is a text box</p>", xml);
+        //TODO: fix this spacing: assertContains("<p>Bold ", xml);
+        assertContains("italic", xml);
+        assertContains("underline", xml);
+        assertContains("superscript", xml);
+        assertContains("subscript", xml);
+        assertContains("<p>Here is a citation:", xml);
+        assertContains("Figure 1 This is a caption for Figure 1", xml);
+        assertContains("(Kramer)", xml);
+        assertContains("<table><tr>\t<td>Row 1 Col 1</td>", xml);
+        assertContains("<td>Row 2 Col 2</td>\t<td>Row 2 Col 3</td></tr>", xml);
+        assertContains("<p>Row 1 column 1</p>", xml);
+        assertContains("<p>Row 2 column 2</p>", xml);
+        assertContains("<p>This is a hyperlink", xml);
+        assertContains("<p>Here is a list:", xml);
         for(int row=1;row<=3;row++) {
             //assertContains("·\tBullet " + row, content);
             //assertContains("\u00b7\tBullet " + row, content);
-            // TODO OfficeParser fails to extract the bullet symbol
-            assertContains("Bullet " + row, content);
+            assertContains("<p>Bullet " + row, xml);
         }
-        assertContains("Here is a numbered list:", content);
+        assertContains("Here is a numbered list:", xml);
         for(int row=1;row<=3;row++) {
             //assertContains(row + ")\tNumber bullet " + row, content);
             //assertContains(row + ") Number bullet " + row, content);
-            // TODO: OfficeParser fails to number the bullets:
-            assertContains("Number bullet " + row, content);
+            // TODO: OOXMLExtractor fails to number the bullets:
+            assertContains("<p>Number bullet " + row, xml);
         }
 
         for(int row=1;row<=2;row++) {
             for(int col=1;col<=3;col++) {
-                assertContains("Row " + row + " Col " + col, content);
+                assertContains("Row " + row + " Col " + col, xml);
             }
         }
-
-        assertContains("Keyword1 Keyword2", content);
+        assertContains("Keyword1 Keyword2", xml);
         assertEquals("Keyword1 Keyword2",
                      metadata.get(TikaCoreProperties.KEYWORDS));
 
-        assertContains("Subject is here", content);
+        assertContains("Subject is here", xml);
         assertEquals("Subject is here",
                      metadata.get(OfficeOpenXMLCore.SUBJECT));
         // TODO: Remove subject in Tika 2.0
         assertEquals("Subject is here",
                      metadata.get(Metadata.SUBJECT));
 
-        assertContains("Suddenly some Japanese text:", content);
+        assertContains("Suddenly some Japanese text:", xml);
         // Special version of (GHQ)
-        assertContains("\uff08\uff27\uff28\uff31\uff09", content);
+        assertContains("\uff08\uff27\uff28\uff31\uff09", xml);
         // 6 other characters
-        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
+        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f",
+                xml);
 
-        assertContains("And then some Gothic text:", content);
-        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
+        assertContains("And then some Gothic text:", xml);
+        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A",
+                xml);
     }
 
     @Test
@@ -238,4 +232,10 @@ public class PowerPointParserTest extend
                 "<div class=\"slide-content\"><p>Now</p>",
                 result.xml);
     }
+
+    @Test
+    public void testCommentAuthorship() throws Exception {
+        XMLResult r = getXML("testPPT_comment.ppt");
+        assertContains("<p class=\"slide-comment\"><b>Allison, Timothy B. (ATB)", r.xml);
+    }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1707432&r1=1707431&r2=1707432&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Thu Oct  8 02:23:46 2015
@@ -621,71 +621,71 @@ public class OOXMLParserTest extends Tik
 
     @Test
     public void testVariousPPTX() throws Exception {
-        ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
-
-        try (InputStream stream = OOXMLParserTest.class.getResourceAsStream(
-                "/test-documents/testPPT_various.pptx")) {
-            new AutoDetectParser().parse(stream, handler, metadata, new ParseContext());
-        }
-
-        String content = handler.toString();
-        //content = content.replaceAll("\\s+"," ");
-        assertContains("Footnote appears here", content);
-        assertContains("This is a footnote.", content);
-        assertContains("This is the header text.", content);
-        assertContains("This is the footer text.", content);
-        assertContains("Here is a text box", content);
-        assertContains("Bold", content);
-        assertContains("italic", content);
-        assertContains("underline", content);
-        assertContains("superscript", content);
-        assertContains("subscript", content);
-        assertContains("Here is a citation:", content);
-        assertContains("Figure 1 This is a caption for Figure 1", content);
-        assertContains("(Kramer)", content);
-        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
-        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
-        assertContains("This is a hyperlink", content);
-        assertContains("Here is a list:", content);
+        String xml = getXML("testPPT_various.pptx", metadata).xml;
+        assertContains("<p>Footnote appears here", xml);
+        assertContains("<p>[1] This is a footnote.", xml);
+        assertContains("<p>This is the header text.</p>", xml);
+        assertContains("<p>This is the footer text.</p>", xml);
+        assertContains("<p>Here is a text box</p>", xml);
+        assertContains("<p>Bold", xml);
+        assertContains("italic", xml);
+        assertContains("underline", xml);
+        assertContains("superscript", xml);
+        assertContains("subscript", xml);
+        assertContains("<p>Here is a citation:", xml);
+        assertContains("Figure 1 This is a caption for Figure 1", xml);
+        assertContains("(Kramer)", xml);
+        assertContains("<table><tr>\t<td>Row 1 Col 1</td>", xml);
+        assertContains("<td>Row 2 Col 2</td>\t<td>Row 2 Col 3</td></tr>", xml);
+        assertContains("<p>Row 1 column 1</p>", xml);
+        assertContains("<p>Row 2 column 2</p>", xml);
+        assertContains("<p>This is a hyperlink", xml);
+        assertContains("<p>Here is a list:", xml);
         for(int row=1;row<=3;row++) {
             //assertContains("·\tBullet " + row, content);
             //assertContains("\u00b7\tBullet " + row, content);
-            assertContains("Bullet " + row, content);
+            assertContains("<p>Bullet " + row, xml);
         }
-        assertContains("Here is a numbered list:", content);
+        assertContains("Here is a numbered list:", xml);
         for(int row=1;row<=3;row++) {
             //assertContains(row + ")\tNumber bullet " + row, content);
             //assertContains(row + ") Number bullet " + row, content);
             // TODO: OOXMLExtractor fails to number the bullets:
-            assertContains("Number bullet " + row, content);
+            assertContains("<p>Number bullet " + row, xml);
         }
 
         for(int row=1;row<=2;row++) {
             for(int col=1;col<=3;col++) {
-                assertContains("Row " + row + " Col " + col, content);
+                assertContains("Row " + row + " Col " + col, xml);
             }
         }
 
-        assertContains("Keyword1 Keyword2", content);
+        assertContains("Keyword1 Keyword2", xml);
         assertEquals("Keyword1 Keyword2",
-                     metadata.get(Metadata.KEYWORDS));
+                metadata.get(Metadata.KEYWORDS));
 
-        assertContains("Subject is here", content);
+        assertContains("Subject is here", xml);
         // TODO: Remove subject in Tika 2.0
         assertEquals("Subject is here",
                      metadata.get(Metadata.SUBJECT));
         assertEquals("Subject is here",
-                     metadata.get(OfficeOpenXMLCore.SUBJECT));
+                metadata.get(OfficeOpenXMLCore.SUBJECT));
 
-        assertContains("Suddenly some Japanese text:", content);
+        assertContains("Suddenly some Japanese text:", xml);
         // Special version of (GHQ)
-        assertContains("\uff08\uff27\uff28\uff31\uff09", content);
+        assertContains("\uff08\uff27\uff28\uff31\uff09", xml);
         // 6 other characters
-        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
+        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", xml);
 
-        assertContains("And then some Gothic text:", content);
-        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
+        assertContains("And then some Gothic text:", xml);
+        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", xml);
+    }
+
+    @Test
+    public void testCommentPPTX() throws Exception {
+        XMLResult r = getXML("testPPT_comment.pptx");
+        assertContains("<p class=\"slide-comment\"><b>Allison, Timothy B. (ATB)", r.xml);
     }
 
     @Test
@@ -1046,9 +1046,8 @@ public class OOXMLParserTest extends Tik
     @Test
     public void testPPTXThumbnail() throws Exception {
         String xml = getXML("testPPTX_Thumbnail.pptx").xml;
-        int a = xml.indexOf("<body><p>This file contains an embedded thumbnail</p>");
+        int a = xml.indexOf("<body><div class=\"slide-content\"><p>This file contains an embedded thumbnail");
         int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.jpeg\" />");
-
         assertTrue(a != -1);
         assertTrue(b != -1);
         assertTrue(a < b);

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_comment.ppt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_comment.ppt?rev=1707432&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_comment.ppt
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_comment.pptx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_comment.pptx?rev=1707432&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_comment.pptx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream


Reply via email to