Author: nick
Date: Fri Apr 1 15:02:14 2011
New Revision: 1087734
URL: http://svn.apache.org/viewvc?rev=1087734&view=rev
Log:
Update OutlookTextExtractor to request 7 bit encoding guessing
Modified:
poi/trunk/src/documentation/content/xdocs/status.xml
poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java
Modified: poi/trunk/src/documentation/content/xdocs/status.xml
URL:
http://svn.apache.org/viewvc/poi/trunk/src/documentation/content/xdocs/status.xml?rev=1087734&r1=1087733&r2=1087734&view=diff
==============================================================================
--- poi/trunk/src/documentation/content/xdocs/status.xml (original)
+++ poi/trunk/src/documentation/content/xdocs/status.xml Fri Apr 1 15:02:14
2011
@@ -34,6 +34,7 @@
<changes>
<release version="3.8-beta3" date="2011-??-??">
+ <action dev="poi-developers" type="fix">OutlookTextExtractor now
requests 7 bit encoding guessing</action>
<action dev="poi-developers" type="add">Improve HSMF encoding
guessing for 7 bit fields in MAPIMessage</action>
<action dev="poi-developers" type="add">Allow HSMF access to the
HTML body contents in MAPIMessage</action>
</release>
Modified:
poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java?rev=1087734&r1=1087733&r2=1087734&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
Fri Apr 1 15:02:14 2011
@@ -32,9 +32,8 @@ import org.apache.poi.util.StringUtil;
public class StringChunk extends Chunk {
private static final String DEFAULT_ENCODING = "CP1252";
private String encoding7Bit = DEFAULT_ENCODING;
- private String value;
- /** Only kept around for 7 bit strings */
private byte[] rawValue;
+ private String value;
/**
* Creates a String Chunk.
@@ -72,23 +71,22 @@ public class StringChunk extends Chunk {
// Re-read the String if we're a 7 bit one
if(type == Types.ASCII_STRING) {
- parseString(rawValue);
+ parseString();
}
}
public void readValue(InputStream value) throws IOException {
- byte[] data = IOUtils.toByteArray(value);
- parseString(data);
+ rawValue = IOUtils.toByteArray(value);
+ parseString();
}
- private void parseString(byte[] data) {
+ private void parseString() {
String tmpValue;
switch(type) {
case Types.ASCII_STRING:
- tmpValue = parseAs7BitData(data, encoding7Bit);
- this.rawValue = data;
+ tmpValue = parseAs7BitData(rawValue, encoding7Bit);
break;
case Types.UNICODE_STRING:
- tmpValue = StringUtil.getFromUnicodeLE(data);
+ tmpValue = StringUtil.getFromUnicodeLE(rawValue);
break;
default:
throw new IllegalArgumentException("Invalid type " + type + " for
String Chunk");
@@ -99,34 +97,46 @@ public class StringChunk extends Chunk {
}
public void writeValue(OutputStream out) throws IOException {
- byte[] data;
-
+ out.write(rawValue);
+ }
+ private void storeString() {
switch(type) {
case Types.ASCII_STRING:
try {
- data = value.getBytes(encoding7Bit);
+ rawValue = value.getBytes(encoding7Bit);
} catch (UnsupportedEncodingException e) {
throw new RuntimeException("Encoding not found - " + encoding7Bit,
e);
}
break;
case Types.UNICODE_STRING:
- data = new byte[value.length()*2];
- StringUtil.putUnicodeLE(value, data, 0);
+ rawValue = new byte[value.length()*2];
+ StringUtil.putUnicodeLE(value, rawValue, 0);
break;
default:
throw new IllegalArgumentException("Invalid type " + type + " for
String Chunk");
}
-
- out.write(data);
}
+ /**
+ * Returns the Text value of the chunk
+ */
public String getValue() {
return this.value;
}
- public String toString() {
- return this.value;
- }
+
+ public byte[] getRawValue() {
+ return this.rawValue;
+ }
+ public void setValue(String str) {
+ this.value = str;
+ storeString();
+ }
+
+ public String toString() {
+ return this.value;
+ }
+
/**
* Parses as non-unicode, supposedly 7 bit CP1252 data
* and returns the string that that yields.
Modified:
poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java?rev=1087734&r1=1087733&r2=1087734&view=diff
==============================================================================
---
poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
(original)
+++
poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
Fri Apr 1 15:02:14 2011
@@ -16,6 +16,7 @@
==================================================================== */
package org.apache.poi.hsmf.extractor;
+import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
@@ -56,6 +57,15 @@ public class OutlookTextExtactor extends
public OutlookTextExtactor(InputStream inp) throws IOException {
this(new MAPIMessage(inp));
}
+
+ public static void main(String[] args) throws Exception {
+ for(String filename : args) {
+ OutlookTextExtactor extractor = new OutlookTextExtactor(
+ new NPOIFSFileSystem(new File(filename))
+ );
+ System.out.println( extractor.getText() );
+ }
+ }
/**
* Returns the underlying MAPI message
@@ -71,6 +81,11 @@ public class OutlookTextExtactor extends
MAPIMessage msg = (MAPIMessage)document;
StringBuffer s = new StringBuffer();
+ // See if we can get a suitable encoding for any
+ // non unicode text in the file
+ msg.guess7BitEncoding();
+
+ // Off we go
StringsIterator emails;
try {
emails = new StringsIterator(
Modified:
poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java?rev=1087734&r1=1087733&r2=1087734&view=diff
==============================================================================
---
poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java
(original)
+++
poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java
Fri Apr 1 15:02:14 2011
@@ -199,4 +199,21 @@ public final class TestOutlookTextExtrac
// Embeded bits are checked in
// TestExtractorFactory
}
+
+ public void testEncodings() throws Exception {
+ POIFSFileSystem simple = new POIFSFileSystem(
+ new FileInputStream(samples.getFile("chinese-traditional.msg"))
+ );
+ MAPIMessage msg = new MAPIMessage(simple);
+ OutlookTextExtactor ext = new OutlookTextExtactor(msg);
+ String text = ext.getText();
+
+ // Check the english bits
+ assertContains(text, "From: Tests Chang@FT");
+ assertContains(text, "tests.chang@fengttt.com");
+
+ // And check some chinese bits
+ assertContains(text, "(\u5f35\u6bd3\u502b)");
+ assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )");
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@poi.apache.org
For additional commands, e-mail: dev-help@poi.apache.org