Author: nick
Date: Thu Dec 16 07:39:21 2010
New Revision: 1049802
URL: http://svn.apache.org/viewvc?rev=1049802&view=rev
Log:
Inside ExtractorFactory, support finding embedded OOXML documents and providing
extractors for them
Added:
poi/trunk/test-data/document/word_with_embeded_ooxml.doc (with props)
Modified:
poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
Modified:
poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java?rev=1049802&r1=1049801&r2=1049802&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
(original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java Thu
Dec 16 07:39:21 2010
@@ -191,10 +191,11 @@ public class ExtractorFactory {
throw new IllegalArgumentException("No supported documents found in the
OOXML package (found "+corePart.getContentType()+")");
}
- public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs)
throws IOException {
- return createExtractor(fs.getRoot(), fs);
+ public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs)
throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
+ // Only ever an OLE2 one from the root of the FS
+ return (POIOLE2TextExtractor)createExtractor(fs.getRoot(), fs);
}
- public static POIOLE2TextExtractor createExtractor(DirectoryNode
poifsDir, POIFSFileSystem fs) throws IOException {
+ public static POITextExtractor createExtractor(DirectoryNode poifsDir,
POIFSFileSystem fs) throws IOException, InvalidFormatException,
OpenXML4JException, XmlException {
// Look for certain entries in the stream, to figure it
// out from
for(Iterator<Entry> entries = poifsDir.getEntries();
entries.hasNext(); ) {
@@ -234,6 +235,12 @@ public class ExtractorFactory {
) {
return new OutlookTextExtactor(poifsDir, fs);
}
+ if(entry.getName().equals("Package")) {
+ OPCPackage pkg = OPCPackage.open(
+
poifsDir.createDocumentInputStream(entry.getName())
+ );
+ return createExtractor(pkg);
+ }
}
throw new IllegalArgumentException("No supported documents
found in the OLE2 stream");
}
@@ -246,7 +253,7 @@ public class ExtractorFactory {
* empty array. Otherwise, you'll get one open
* {@link POITextExtractor} for each embeded file.
*/
- public static POITextExtractor[]
getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
+ public static POITextExtractor[]
getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException,
InvalidFormatException, OpenXML4JException, XmlException {
// All the embded directories we spotted
ArrayList<Entry> dirs = new ArrayList<Entry>();
// For anything else not directly held in as a POIFS directory
Modified:
poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java?rev=1049802&r1=1049801&r2=1049802&view=diff
==============================================================================
---
poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
(original)
+++
poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
Thu Dec 16 07:39:21 2010
@@ -60,6 +60,7 @@ public class TestExtractorFactory extend
private File docx;
private File dotx;
private File docEmb;
+ private File docEmbOOXML;
private File ppt;
private File pptx;
@@ -88,6 +89,7 @@ public class TestExtractorFactory extend
docx = wpTests.getFile("SampleDoc.docx");
dotx = wpTests.getFile("test.dotx");
docEmb = wpTests.getFile("word_with_embeded.doc");
+ docEmbOOXML = wpTests.getFile("word_with_embeded_ooxml.doc");
POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
ppt = slTests.getFile("SampleShow.ppt");
@@ -536,7 +538,7 @@ public class TestExtractorFactory extend
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
assertEquals(6, embeds.length);
- int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0;
+ int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
for(int i=0; i<embeds.length; i++) {
assertTrue(embeds[i].getText().length() > 20);
@@ -569,6 +571,27 @@ public class TestExtractorFactory extend
assertEquals(1, numWord);
assertEquals(0, numMsg);
+ // Word which contains an OOXML file
+ ext = (POIOLE2TextExtractor)
+ ExtractorFactory.createExtractor(docEmbOOXML);
+ embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+ numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
+ assertEquals(3, embeds.length);
+ for(int i=0; i<embeds.length; i++) {
+ assertTrue(embeds[i].getText().length() > 20);
+ if(embeds[i] instanceof PowerPointExtractor) numPpt++;
+ else if(embeds[i] instanceof ExcelExtractor) numXls++;
+ else if(embeds[i] instanceof WordExtractor) numWord++;
+ else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
+ else if(embeds[i] instanceof XWPFWordExtractor) numWordX++;
+ }
+ assertEquals(1, numPpt);
+ assertEquals(1, numXls);
+ assertEquals(0, numWord);
+ assertEquals(1, numWordX);
+ assertEquals(0, numMsg);
+
// Outlook
ext = (OutlookTextExtactor)
ExtractorFactory.createExtractor(msgEmb);
Added: poi/trunk/test-data/document/word_with_embeded_ooxml.doc
URL:
http://svn.apache.org/viewvc/poi/trunk/test-data/document/word_with_embeded_ooxml.doc?rev=1049802&view=auto
==============================================================================
Binary file - no diff available.
Propchange: poi/trunk/test-data/document/word_with_embeded_ooxml.doc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]