Author: tallison
Date: Tue Apr 11 01:30:02 2017
New Revision: 1790904
URL: http://svn.apache.org/viewvc?rev=1790904&view=rev
Log:
bug 50955 - try the originally guessed charset, back off to Windows-1252 if that fails
Modified:
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java?rev=1790904&r1=1790903&r2=1790904&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java Tue
Apr 11 01:30:02 2017
@@ -96,25 +96,17 @@ public class HWPFOldDocument extends HWP
} else {
// TODO Discover if these older documents can ever hold Unicode
Strings?
// (We think not, because they seem to lack a Piece table)
- // TODO Build the Piece Descriptor properly
- // (We have to fake it, as they don't seem to have a proper Piece
table)
- PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0,
0,0,0,127, 0,0}, 0, guessedCharset);
- pd.setFilePosition(_fib.getFibBase().getFcMin());
-
- // Generate a single Text Piece Table, with a single Text Piece
- // which covers all the (8 bit only) text in the file
- tpt = new OldTextPieceTable();
- byte[] textData = new
byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()];
- System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(),
textData, 0, textData.length);
-
- int numChars = textData.length;
- if (CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) {
- numChars /= 2;
+ //
+ // What we have here is a wretched hack. We need to figure out
+ // how to get the correct charset for the doc.
+ TextPiece tp = null;
+ try {
+ tp = buildTextPiece(guessedCharset);
+ } catch (IllegalStateException e) {
+ //if there was a problem with the guessed charset and the
length of the
+ //textpiece, back off to win1252. This is effectively what we
used to do.
+ tp = buildTextPiece(StringUtil.WIN_1252);
}
-
- TextPiece tp = new TextPiece(
- 0, numChars, textData, pd
- );
tpt.add(tp);
}
@@ -156,6 +148,33 @@ public class HWPFOldDocument extends HWP
}
}
+ /**
+ *
+ * @param guessedCharset charset that we think this is
+ * @return a new text piece
+ * @throws IllegalStateException if the length isn't correct
+ */
+ private TextPiece buildTextPiece(Charset guessedCharset) throws
IllegalStateException {
+ PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127,
0,0}, 0, guessedCharset);
+ pd.setFilePosition(_fib.getFibBase().getFcMin());
+
+ // Generate a single Text Piece Table, with a single Text Piece
+ // which covers all the (8 bit only) text in the file
+ tpt = new OldTextPieceTable();
+ byte[] textData = new
byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()];
+ System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData,
0, textData.length);
+
+ int numChars = textData.length;
+ if (CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) {
+ numChars /= 2;
+ }
+
+ return new TextPiece(
+ 0, numChars, textData, pd
+ );
+
+ }
+
/**
* Take the first codepage that is not default, ansi or symbol.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]