Author: mikemccand
Date: Fri Sep 28 12:47:09 2012
New Revision: 1391432
URL: http://svn.apache.org/viewvc?rev=1391432&view=rev
Log:
TIKA-997: leave placeholder at end of slide where embedded document appears in
.pptx documents
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/test_embedded_zip.pptx
(with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1391432&r1=1391431&r2=1391432&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri Sep 28 12:47:09 2012
@@ -28,6 +28,14 @@ Release 1.3 - Current Development
* RTF: Page, word and character count metadata are now extracted for
RTF documents (TIKA-999).
+ * MS PowerPoint (.pptx): When a PowerPoint (.pptx) document contains
+ embedded files, Tika now places a <div class="embedded" id="XXX"/> into the
+ XHTML so you can see where in the main text the embedded document
+ occurred. The id (rId) is included in the Metadata of each
+ embedded document as the new Metadata.EMBEDDED_RELATIONSHIP_ID
+ key, and TikaCLI prepends the rId (if present) onto the filename
+ it extracts (TIKA-997).
+
Release 1.2 - 07/10/2012
---------------------------------
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1391432&r1=1391431&r2=1391432&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Fri Sep 28 12:47:09 2012
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import javax.xml.namespace.QName;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.PackagePart;
@@ -28,17 +29,15 @@ import org.apache.poi.openxml4j.opc.Pack
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.xslf.XSLFSlideShow;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
-import org.apache.poi.xslf.usermodel.DrawingParagraph;
import org.apache.poi.xslf.usermodel.Placeholder;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFComments;
-import org.apache.poi.xslf.usermodel.XSLFCommonSlideData;
+import org.apache.poi.xslf.usermodel.XSLFGraphicFrame;
import org.apache.poi.xslf.usermodel.XSLFGroupShape;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xslf.usermodel.XSLFShape;
import org.apache.poi.xslf.usermodel.XSLFSheet;
import org.apache.poi.xslf.usermodel.XSLFSlide;
-import org.apache.poi.xslf.usermodel.XSLFSlideMaster;
import org.apache.poi.xslf.usermodel.XSLFTable;
import org.apache.poi.xslf.usermodel.XSLFTableCell;
import org.apache.poi.xslf.usermodel.XSLFTableRow;
@@ -47,11 +46,11 @@ import org.apache.tika.exception.TikaExc
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.xmlbeans.XmlException;
+import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentList;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
public XSLFPowerPointExtractorDecorator(ParseContext context,
XSLFPowerPointExtractor extractor) {
@@ -117,6 +116,23 @@ public class XSLFPowerPointExtractorDeco
List<XSLFTableCell> cells = row.getCells();
extractContent(cells.toArray(new
XSLFTableCell[cells.size()]), skipPlaceholders, xhtml);
}
+ } else if (sh instanceof XSLFGraphicFrame) {
+ XSLFGraphicFrame frame = (XSLFGraphicFrame) sh;
+ XmlObject[] sp = frame.getXmlObject().selectPath(
+ "declare namespace
p='http://schemas.openxmlformats.org/presentationml/2006/main' .//*/p:oleObj");
+ if (sp != null) {
+ for(XmlObject emb : sp) {
+ XmlObject relIDAtt = emb.selectAttribute(new
QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships",
"id"));
+ if (relIDAtt != null) {
+ String relID =
relIDAtt.getDomNode().getNodeValue();
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class",
"CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA",
relID);
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+ }
+ }
+ }
}
}
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1391432&r1=1391431&r2=1391432&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Fri Sep 28 12:47:09 2012
@@ -865,4 +865,33 @@ public class OOXMLParserTest extends Tik
assertTrue(k < l);
assertTrue(l < m);
}
+
+ // TIKA-997:
+ public void testEmbeddedZipInPPTX() throws Exception {
+ InputStream input = OOXMLParserTest.class.getResourceAsStream(
+ "/test-documents/test_embedded_zip.pptx");
+ Metadata metadata = new Metadata();
+ StringWriter sw = new StringWriter();
+ SAXTransformerFactory factory = (SAXTransformerFactory)
+ SAXTransformerFactory.newInstance();
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+ handler.setResult(new StreamResult(sw));
+
+ try {
+ new OOXMLParser().parse(input, handler, metadata, new
ParseContext());
+ } finally {
+ input.close();
+ }
+ String xml = sw.toString();
+ int i = xml.indexOf("Send me a note");
+ int j = xml.indexOf("<div class=\"embedded\" id=\"rId4\"/>");
+ int k = xml.indexOf("<p>No title</p>");
+ assertTrue(i != -1);
+ assertTrue(j != -1);
+ assertTrue(k != -1);
+ assertTrue(i < j);
+ assertTrue(j < k);
+ }
}
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/test_embedded_zip.pptx
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/test_embedded_zip.pptx?rev=1391432&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/test_embedded_zip.pptx
------------------------------------------------------------------------------
svn:executable = *
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/test_embedded_zip.pptx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream