Author: nick
Date: Mon Jan 11 14:55:43 2010
New Revision: 897887
URL: http://svn.apache.org/viewvc?rev=897887&view=rev
Log:
Add PublisherTextExtractor support to ExtractorFactory
Modified:
poi/trunk/src/documentation/content/xdocs/status.xml
poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
poi/trunk/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java
Modified: poi/trunk/src/documentation/content/xdocs/status.xml
URL:
http://svn.apache.org/viewvc/poi/trunk/src/documentation/content/xdocs/status.xml?rev=897887&r1=897886&r2=897887&view=diff
==============================================================================
--- poi/trunk/src/documentation/content/xdocs/status.xml (original)
+++ poi/trunk/src/documentation/content/xdocs/status.xml Mon Jan 11 14:55:43 2010
@@ -34,6 +34,7 @@
<changes>
<release version="3.7-SNAPSHOT" date="2010-??-??">
+ <action dev="POI-DEVELOPERS" type="add">Add PublisherTextExtractor support to ExtractorFactory</action>
<action dev="POI-DEVELOPERS" type="add">Add XSLF support for text extraction from tables</action>
<action dev="POI-DEVELOPERS" type="add">Support attachments as embeded documents within the new OutlookTextExtractor</action>
<action dev="POI-DEVELOPERS" type="add">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
Modified:
poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java?rev=897887&r1=897886&r2=897887&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java Mon Jan 11 14:55:43 2010
@@ -31,6 +31,7 @@
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
+import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.MAPIMessage;
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
@@ -142,6 +143,9 @@
if(entry.getName().equals("VisioDocument")) {
return new VisioTextExtractor(poifsDir, fs);
}
+ if(entry.getName().equals("Quill")) {
+ return new PublisherTextExtractor(poifsDir, fs);
+ }
if(
entry.getName().equals("__substg1.0_1000001E") ||
entry.getName().equals("__substg1.0_1000001F") ||
Modified:
poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java?rev=897887&r1=897886&r2=897887&view=diff
==============================================================================
--- poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java Mon Jan 11 14:55:43 2010
@@ -24,6 +24,7 @@
import org.apache.poi.POITextExtractor;
import org.apache.poi.POIDataSamples;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
+import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
@@ -62,6 +63,8 @@
private File msgEmb;
private File vsd;
+
+ private File pub;
protected void setUp() throws Exception {
super.setUp();
@@ -86,6 +89,9 @@
POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd");
+ POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
+ pub = pubTests.getFile("Simple.pub");
+
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
msg = olTests.getFile("quick.msg");
msgEmb = olTests.getFile("attachment_test_msg.msg");
@@ -169,6 +175,15 @@
ExtractorFactory.createExtractor(vsd).getText().length() > 50
);
+ // Publisher
+ assertTrue(
+ ExtractorFactory.createExtractor(pub)
+ instanceof PublisherTextExtractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(pub).getText().length() > 50
+ );
+
// Outlook msg
assertTrue(
ExtractorFactory.createExtractor(msg)
@@ -248,6 +263,15 @@
ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
);
+ // Publisher
+ assertTrue(
+ ExtractorFactory.createExtractor(new FileInputStream(pub))
+ instanceof PublisherTextExtractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(new FileInputStream(pub)).getText().length() > 50
+ );
+
// Outlook msg
assertTrue(
ExtractorFactory.createExtractor(new FileInputStream(msg))
@@ -302,6 +326,15 @@
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
);
+
+ // Publisher
+ assertTrue(
+ ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
+ instanceof PublisherTextExtractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
+ );
// Outlook msg
assertTrue(
@@ -426,6 +459,7 @@
assertEquals(1, numWord);
// TODO - PowerPoint
+ // TODO - Publisher
// TODO - Visio
}
}
Modified:
poi/trunk/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java?rev=897887&r1=897886&r2=897887&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java Mon Jan 11 14:55:43 2010
@@ -26,6 +26,7 @@
import org.apache.poi.hpbf.model.qcbits.QCBit;
import org.apache.poi.hpbf.model.qcbits.QCTextBit;
import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
@@ -39,6 +40,9 @@
super(doc);
this.doc = doc;
}
+ public PublisherTextExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+ this(new HPBFDocument(dir, fs));
+ }
public PublisherTextExtractor(POIFSFileSystem fs) throws IOException {
this(new HPBFDocument(fs));
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]