Author: mikemccand
Date: Tue Dec 4 12:00:08 2012
New Revision: 1416902
URL: http://svn.apache.org/viewvc?rev=1416902&view=rev
Log:
TIKA-1036: also set EMBEDDED_RELATIONSHIP_ID in the Metadata when extracting
the embedded document
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=1416902&r1=1416901&r2=1416902&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Tue Dec 4 12:00:08 2012
@@ -157,6 +157,8 @@ public class PackageParser extends Abstr
attributes.addAttribute("", "id", "id", "CDATA", name);
xhtml.startElement("div", attributes);
xhtml.endElement("div");
+
+ entrydata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, name);
}
if (extractor.shouldParseEmbedded(entrydata)) {
// For detectors to work, we need a mark/reset supporting
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java?rev=1416902&r1=1416901&r2=1416902&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java Tue Dec 4 12:00:08 2012
@@ -17,10 +17,14 @@
package org.apache.tika.parser.pkg;
import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
import org.apache.tika.Tika;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
@@ -117,10 +121,45 @@ public class ZipParserTest extends Abstr
assertTrue(content.contains("README"));
}
+ private class GatherRelIDsDocumentExtractor implements EmbeddedDocumentExtractor {
+ public Set<String> allRelIDs = new HashSet<String>();
+ public boolean shouldParseEmbedded(Metadata metadata) {
+ String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
+ if (relID != null) {
+ allRelIDs.add(relID);
+ }
+ return false;
+ }
+
+ public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) {
+ throw new UnsupportedOperationException("should never be called");
+ }
+ }
+
// TIKA-1036
public void testPlaceholders() throws Exception {
String xml = getXML("testEmbedded.zip").xml;
assertContains("<div class=\"embedded\" id=\"test1.txt\"/>", xml);
assertContains("<div class=\"embedded\" id=\"test2.txt\"/>", xml);
+
+ // Also make sure EMBEDDED_RELATIONSHIP_ID was
+ // passed when parsing the embedded docs:
+ Parser parser = new AutoDetectParser();
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, parser);
+ GatherRelIDsDocumentExtractor relIDs = new GatherRelIDsDocumentExtractor();
+ context.set(EmbeddedDocumentExtractor.class, relIDs);
+ InputStream input = getResourceAsStream("/test-documents/testEmbedded.zip");
+ try {
+ parser.parse(input,
+ new BodyContentHandler(),
+ new Metadata(),
+ context);
+ } finally {
+ input.close();
+ }
+
+ assertTrue(relIDs.allRelIDs.contains("test1.txt"));
+ assertTrue(relIDs.allRelIDs.contains("test2.txt"));
}
}