Author: mikemccand
Date: Tue Sep 11 15:12:27 2012
New Revision: 1383443
URL: http://svn.apache.org/viewvc?rev=1383443&view=rev
Log:
TIKA-989: leave placeholder where embedded document appears in .docx files
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_embedded_pdf.docx
(with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1383443&r1=1383442&r2=1383443&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Tue Sep 11 15:12:27 2012
@@ -17,6 +17,14 @@ Release 1.3 - Current Development
* CLI: TikaCLI -m now handles multi-valued metadata keys correctly
(previously it only printed the first value). (TIKA-920)
+ * MS Word (.docx): When a .docx document contains embedded files,
+ Tika now emits a <div class="embedded" id="XXX"/> placeholder in
+ the XHTML so you can see where in the main text each embedded
+ document occurs. The id (the relationship id, rId) is also set in
+ the Metadata of each embedded document under the new
+ Metadata.EMBEDDED_RELATIONSHIP_ID key, and TikaCLI prepends the
+ rId (if present) to the filename it extracts (TIKA-989).
+
Release 1.2 - 07/10/2012
---------------------------------
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1383443&r1=1383442&r2=1383443&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
(original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Tue Sep
11 15:12:27 2012
@@ -704,6 +704,11 @@ public class TikaCLI {
}
}
+ String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
+ if (relID != null) {
+ name = relID + "_" + name;
+ }
+
File outputFile = new File(extractDir, name);
if (outputFile.exists()) {
System.err.println("File '"+name+"' already exists; skipping");
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java?rev=1383443&r1=1383442&r2=1383443&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java
Tue Sep 11 15:12:27 2012
@@ -24,4 +24,6 @@ public interface TikaMetadataKeys {
String RESOURCE_NAME_KEY = "resourceName";
String PROTECTED = "protected";
+
+ String EMBEDDED_RELATIONSHIP_ID = "embeddedRelationshipId";
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1383443&r1=1383442&r2=1383443&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
Tue Sep 11 15:12:27 2012
@@ -126,12 +126,12 @@ public abstract class AbstractOOXMLExtra
String type = rel.getRelationshipType();
if (RELATION_OLE_OBJECT.equals(type)
&&
TYPE_OLE_OBJECT.equals(target.getContentType())) {
- handleEmbeddedOLE(target, handler);
+ handleEmbeddedOLE(target, handler, rel.getId());
} else if (RELATION_AUDIO.equals(type)
|| RELATION_IMAGE.equals(type)
|| RELATION_PACKAGE.equals(type)
|| RELATION_OLE_OBJECT.equals(type)) {
- handleEmbeddedFile(target, handler);
+ handleEmbeddedFile(target, handler, rel.getId());
}
}
}
@@ -144,12 +144,13 @@ public abstract class AbstractOOXMLExtra
/**
* Handles an embedded OLE object in the document
*/
- private void handleEmbeddedOLE(PackagePart part, ContentHandler handler)
+ private void handleEmbeddedOLE(PackagePart part, ContentHandler handler,
String rel)
throws IOException, SAXException {
POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream());
try {
Metadata metadata = new Metadata();
TikaInputStream stream = null;
+ metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);
DirectoryNode root = fs.getRoot();
POIFSDocumentType type = POIFSDocumentType.detectType(root);
@@ -183,7 +184,7 @@ public abstract class AbstractOOXMLExtra
metadata, false);
}
} else {
- handleEmbeddedFile(part, handler);
+ handleEmbeddedFile(part, handler, rel);
}
} catch (FileNotFoundException e) {
// There was no CONTENTS entry, so skip this part
@@ -195,9 +196,10 @@ public abstract class AbstractOOXMLExtra
/**
* Handles an embedded file in the document
*/
- protected void handleEmbeddedFile(PackagePart part, ContentHandler handler)
+ protected void handleEmbeddedFile(PackagePart part, ContentHandler
handler, String rel)
throws SAXException, IOException {
Metadata metadata = new Metadata();
+ metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);
// Get the name
String name = part.getPartName().getName();
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1383443&r1=1383442&r2=1383443&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
Tue Sep 11 15:12:27 2012
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import javax.xml.namespace.QName;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
@@ -41,11 +42,14 @@ import org.apache.poi.xwpf.usermodel.XWP
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.microsoft.WordExtractor;
import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle;
+import org.apache.tika.parser.microsoft.WordExtractor;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlException;
+import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
@@ -132,6 +136,41 @@ public class XWPFWordExtractorDecorator
} else {
xhtml.startElement(tag, "class", styleClass);
}
+
+ // Output placeholder for any embedded docs:
+
+ // TODO: replace w/ XPath/XQuery:
+ for(XWPFRun run : paragraph.getRuns()) {
+ XmlCursor c = run.getCTR().newCursor();
+ c.selectPath("./*");
+ while (c.toNextSelection()) {
+ XmlObject o = c.getObject();
+ if (o instanceof CTObject) {
+ XmlCursor c2 = o.newCursor();
+ c2.selectPath("./*");
+ while (c2.toNextSelection()) {
+ XmlObject o2 = c2.getObject();
+
+ XmlObject embedAtt = o2.selectAttribute(new QName("Type"));
+ if (embedAtt != null &&
embedAtt.getDomNode().getNodeValue().equals("Embed")) {
+ // Type is "Embed"
+ XmlObject relIDAtt = o2.selectAttribute(new
QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships",
"id"));
+ if (relIDAtt != null) {
+ String relID = relIDAtt.getDomNode().getNodeValue();
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class",
"CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA",
relID);
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+ }
+ }
+ }
+ c2.dispose();
+ }
+ }
+
+ c.dispose();
+ }
// Attach bookmarks for the paragraph
// (In future, we might put them in the right place, for now
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1383443&r1=1383442&r2=1383443&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Tue Sep 11 15:12:27 2012
@@ -239,7 +239,6 @@ public class OOXMLParserTest extends Tik
for (int i=0; i<extensions.length; i++) {
String extension = extensions[i];
String filename = "testPPT." + extension;
- String mimetype = mimeTypes[i];
Parser parser = new AutoDetectParser();
Metadata metadata = new Metadata();
@@ -831,4 +830,39 @@ public class OOXMLParserTest extends Tik
assertEquals("2010-12-30T22:00:00Z",
metadata.get("custom:MyCustomDate"));
assertEquals("2010-12-29T22:00:00Z",
metadata.get("custom:myCustomSecondDate"));
}
+
+ // TIKA-989:
+ public void testEmbeddedPDF() throws Exception {
+ InputStream input = OOXMLParserTest.class.getResourceAsStream(
+ "/test-documents/testWORD_embedded_pdf.docx");
+ Metadata metadata = new Metadata();
+ StringWriter sw = new StringWriter();
+ SAXTransformerFactory factory = (SAXTransformerFactory)
+ SAXTransformerFactory.newInstance();
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+ handler.setResult(new StreamResult(sw));
+
+ try {
+ new OOXMLParser().parse(input, handler, metadata, new
ParseContext());
+ } finally {
+ input.close();
+ }
+ String xml = sw.toString();
+ int i = xml.indexOf("Here is the pdf file:");
+ int j = xml.indexOf("<div class=\"embedded\" id=\"rId5\"/>");
+ int k = xml.indexOf("Bye Bye");
+ int l = xml.indexOf("<div class=\"embedded\" id=\"rId6\"/>");
+ int m = xml.indexOf("Bye for real.");
+ assertTrue(i != -1);
+ assertTrue(j != -1);
+ assertTrue(k != -1);
+ assertTrue(l != -1);
+ assertTrue(m != -1);
+ assertTrue(i < j);
+ assertTrue(j < k);
+ assertTrue(k < l);
+ assertTrue(l < m);
+ }
}
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_embedded_pdf.docx
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_embedded_pdf.docx?rev=1383443&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_embedded_pdf.docx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream