Author: mikemccand
Date: Fri Sep 28 12:47:09 2012
New Revision: 1391432

URL: http://svn.apache.org/viewvc?rev=1391432&view=rev
Log:
TIKA-997: leave placeholder at end of slide where embedded document appears in 
.pptx documents

Added:
    
tika/trunk/tika-parsers/src/test/resources/test-documents/test_embedded_zip.pptx
   (with props)
Modified:
    tika/trunk/CHANGES.txt
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1391432&r1=1391431&r2=1391432&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri Sep 28 12:47:09 2012
@@ -28,6 +28,14 @@ Release 1.3 - Current Development
   * RTF: Page, word and character count metadata are now extracted for
     RTF documents (TIKA-999).
 
+  * MS PowerPoint (.pptx): When a PowerPoint (.pptx) document contains
+    embedded files, Tika now places a <div class="embedded" id="XXX"/> into the
+    XHTML so you can see where in the main text the embedded document
+    occurred.  The id (rId) is included in the Metadata of each
+    embedded document as the new Metadata.EMBEDDED_RELATIONSHIP_ID
+    key, and TikaCLI prepends the rId (if present) onto the filename
+    it extracts (TIKA-997).
+
 Release 1.2 - 07/10/2012
 ---------------------------------
 

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1391432&r1=1391431&r2=1391432&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
 Fri Sep 28 12:47:09 2012
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
+import javax.xml.namespace.QName;
 
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.opc.PackagePart;
@@ -28,17 +29,15 @@ import org.apache.poi.openxml4j.opc.Pack
 import org.apache.poi.openxml4j.opc.TargetMode;
 import org.apache.poi.xslf.XSLFSlideShow;
 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
-import org.apache.poi.xslf.usermodel.DrawingParagraph;
 import org.apache.poi.xslf.usermodel.Placeholder;
 import org.apache.poi.xslf.usermodel.XMLSlideShow;
 import org.apache.poi.xslf.usermodel.XSLFComments;
-import org.apache.poi.xslf.usermodel.XSLFCommonSlideData;
+import org.apache.poi.xslf.usermodel.XSLFGraphicFrame;
 import org.apache.poi.xslf.usermodel.XSLFGroupShape;
 import org.apache.poi.xslf.usermodel.XSLFRelation;
 import org.apache.poi.xslf.usermodel.XSLFShape;
 import org.apache.poi.xslf.usermodel.XSLFSheet;
 import org.apache.poi.xslf.usermodel.XSLFSlide;
-import org.apache.poi.xslf.usermodel.XSLFSlideMaster;
 import org.apache.poi.xslf.usermodel.XSLFTable;
 import org.apache.poi.xslf.usermodel.XSLFTableCell;
 import org.apache.poi.xslf.usermodel.XSLFTableRow;
@@ -47,11 +46,11 @@ import org.apache.tika.exception.TikaExc
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.xmlbeans.XmlException;
+import org.apache.xmlbeans.XmlObject;
 import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentList;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
 import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
 
 public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
     public XSLFPowerPointExtractorDecorator(ParseContext context, 
XSLFPowerPointExtractor extractor) {
@@ -117,6 +116,23 @@ public class XSLFPowerPointExtractorDeco
                     List<XSLFTableCell> cells = row.getCells();
                     extractContent(cells.toArray(new 
XSLFTableCell[cells.size()]), skipPlaceholders, xhtml);
                 }
+            } else if (sh instanceof XSLFGraphicFrame) {
+                XSLFGraphicFrame frame = (XSLFGraphicFrame) sh;
+                XmlObject[] sp = frame.getXmlObject().selectPath(
+                                   "declare namespace p='http://schemas.openxmlformats.org/presentationml/2006/main' .//*/p:oleObj");
+                if (sp != null) {
+                    for(XmlObject emb : sp) {
+                        XmlObject relIDAtt = emb.selectAttribute(new 
QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", 
"id"));
+                        if (relIDAtt != null) {
+                            String relID = 
relIDAtt.getDomNode().getNodeValue();
+                            AttributesImpl attributes = new AttributesImpl();
+                            attributes.addAttribute("", "class", "class", 
"CDATA", "embedded");
+                            attributes.addAttribute("", "id", "id", "CDATA", 
relID);
+                            xhtml.startElement("div", attributes);
+                            xhtml.endElement("div");
+                        }
+                    }
+                }
             }
         }
     }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1391432&r1=1391431&r2=1391432&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 Fri Sep 28 12:47:09 2012
@@ -865,4 +865,33 @@ public class OOXMLParserTest extends Tik
        assertTrue(k < l);
        assertTrue(l < m);
     }
+
+    // TIKA-997:
+    public void testEmbeddedZipInPPTX() throws Exception {
+        InputStream input = OOXMLParserTest.class.getResourceAsStream(
+              "/test-documents/test_embedded_zip.pptx");
+        Metadata metadata = new Metadata();
+        StringWriter sw = new StringWriter();
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+                 SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+        handler.setResult(new StreamResult(sw));
+
+        try {
+            new OOXMLParser().parse(input, handler, metadata, new 
ParseContext());
+        } finally {
+            input.close();
+        }
+        String xml = sw.toString();
+        int i = xml.indexOf("Send me a note");
+        int j = xml.indexOf("<div class=\"embedded\" id=\"rId4\"/>");
+        int k = xml.indexOf("<p>No title</p>");
+        assertTrue(i != -1);
+        assertTrue(j != -1);
+        assertTrue(k != -1);
+        assertTrue(i < j);
+        assertTrue(j < k);
+    }
 }

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/test_embedded_zip.pptx
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/test_embedded_zip.pptx?rev=1391432&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/test_embedded_zip.pptx
------------------------------------------------------------------------------
    svn:executable = *

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/test_embedded_zip.pptx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream


Reply via email to