Author: tallison
Date: Tue Apr 4 02:06:46 2017
New Revision: 1790061
URL: http://svn.apache.org/viewvc?rev=1790061&view=rev
Log:
bug 50955 -- word 6.0 charset fix
Added:
poi/trunk/src/java/org/apache/poi/util/LittleEndianBig5Stream.java (with
props)
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java
- copied, changed from r1788131,
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java
- copied, changed from r1788131,
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/Ffn.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java
- copied, changed from r1788131,
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/FontTable.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java
- copied, changed from r1788131,
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java
- copied, changed from r1788131,
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
poi/trunk/test-data/document/Bug60936.doc (with props)
poi/trunk/test-data/document/Bug60942.doc (with props)
poi/trunk/test-data/document/Bug60942b.doc (with props)
Modified:
poi/site/src/documentation/content/xdocs/status.xml
poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java
poi/trunk/src/java/org/apache/poi/util/CodePageUtil.java
poi/trunk/src/java/org/apache/poi/util/StringUtil.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java
Modified: poi/site/src/documentation/content/xdocs/status.xml
URL:
http://svn.apache.org/viewvc/poi/site/src/documentation/content/xdocs/status.xml?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/site/src/documentation/content/xdocs/status.xml (original)
+++ poi/site/src/documentation/content/xdocs/status.xml Tue Apr 4 02:06:46 2017
@@ -58,6 +58,7 @@
<release version="3.16-beta3" date="2017-04-??">
<actions>
+ <action dev="PD" type="fix" fixes-bug="50955" module="HWPF">Fix
charset handling in HWPFOldDocument</action>
<action dev="PD" type="add" fixes-bug="60826" module="XSSF">Add
initial streaming, read-only support for xlsb files</action>
<action dev="PD" type="fix" fixes-bug="51519" module="XSSF">Allow user
to select or ignore phonetic strings in shared strings table</action>
<action dev="PD" type="fix" fixes-bug="60662" module="XSLF">Slide
import delete unrecognized elements in group shape</action>
Modified: poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java Tue Apr 4
02:06:46 2017
@@ -218,6 +218,9 @@ public class TestAllFiles {
"document/Word6_sections2.doc",
"document/Word95.doc",
"document/word95err.doc",
+ "document/Bug60936.doc",
+ "document/Bug60942.doc",
+ "document/Bug60942b.doc",
"hpsf/TestMickey.doc",
"document/52117.doc"
);
Modified: poi/trunk/src/java/org/apache/poi/util/CodePageUtil.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/util/CodePageUtil.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/util/CodePageUtil.java (original)
+++ poi/trunk/src/java/org/apache/poi/util/CodePageUtil.java Tue Apr 4
02:06:46 2017
@@ -18,6 +18,9 @@
package org.apache.poi.util;
import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
+import java.util.HashSet;
+import java.util.Set;
/**
* Utilities for working with Microsoft CodePages.
@@ -27,6 +30,13 @@ import java.io.UnsupportedEncodingExcept
*/
public class CodePageUtil
{
+
+ public static final Set<Charset> VARIABLE_BYTE_CHARSETS = new
HashSet<Charset>();
+ static {
+ //others?
+ VARIABLE_BYTE_CHARSETS.add(StringUtil.BIG5);
+ }
+
/** <p>Codepage 037, a special case</p> */
public static final int CP_037 = 37;
Added: poi/trunk/src/java/org/apache/poi/util/LittleEndianBig5Stream.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/util/LittleEndianBig5Stream.java?rev=1790061&view=auto
==============================================================================
--- poi/trunk/src/java/org/apache/poi/util/LittleEndianBig5Stream.java (added)
+++ poi/trunk/src/java/org/apache/poi/util/LittleEndianBig5Stream.java Tue Apr
4 02:06:46 2017
@@ -0,0 +1,107 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.util;
+
+import java.io.ByteArrayInputStream;
+
+/**
+ * Stream that converts MSOffice's way of storing Big5 (zero-byte padding
+ * for ASCII, bytes in little-endian order) into plain Big5 byte order.
+ */
+@Internal
+public class LittleEndianBig5Stream extends ByteArrayInputStream {
+ private static final int EOF = -1;
+ private static final int INVALID_PAIR = -2;
+ private static final int EMPTY_TRAILING = -3;
+
+ //the char that is logically trailing in Big5 encoding
+ //however in LittleEndian order, this is the first encountered.
+ int trailing = EMPTY_TRAILING;
+ public LittleEndianBig5Stream(byte[] buf) {
+ super(buf);
+ }
+
+ public LittleEndianBig5Stream(byte[] buf, int offset, int length) {
+ super(buf, offset, length);
+ }
+
+ @Override
+ public int read() {
+
+ if (trailing != EMPTY_TRAILING) {
+ int tmp = trailing;
+ trailing = EMPTY_TRAILING;
+ return tmp;
+ }
+ int leading = readNext();
+ while (leading == INVALID_PAIR) {
+ leading = readNext();
+ }
+
+ if (leading == EOF) {
+ return EOF;
+ }
+ return leading;
+ }
+
+ //returns leading, sets trailing appropriately
+ //returns -1 if it hits the end of the stream
+ //never returns -2 (INVALID_PAIR) at present: an invalid pair yields only its first byte
+ private final int readNext() {
+ trailing = super.read();
+ if (trailing == -1) {
+ return EOF;
+ }
+ int leading = super.read();
+ if (leading == EOF) {
+ return EOF;
+ }
+ int lead = leading&0xff;
+ if (lead > 0x80) {
+ return leading;
+ } else if (lead == 0) {
+ int ret = trailing;
+ trailing = EMPTY_TRAILING;
+ return ret;
+ } else {
+ int ret = trailing;
+ trailing = EMPTY_TRAILING;
+ return ret;
+ //return INVALID_PAIR;
+ }
+
+ }
+
+ @Override
+ public int read(byte[] buff, int off, int len) {
+ int bytesRead = 0;
+ for (int i = off; i < off+len; i++) {
+ int b = read();
+ if (b == -1) {
+ if (bytesRead == 0) {
+ return -1;
+ } else {
+ return bytesRead;
+ }
+ }
+ bytesRead++;
+ buff[i] = (byte)b;
+ }
+ return bytesRead;
+ }
+}
Propchange: poi/trunk/src/java/org/apache/poi/util/LittleEndianBig5Stream.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: poi/trunk/src/java/org/apache/poi/util/StringUtil.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/util/StringUtil.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/util/StringUtil.java (original)
+++ poi/trunk/src/java/org/apache/poi/util/StringUtil.java Tue Apr 4 02:06:46
2017
@@ -17,6 +17,8 @@
package org.apache.poi.util;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Iterator;
@@ -27,9 +29,14 @@ import java.util.Map;
*/
@Internal
public class StringUtil {
+
+ private static final POILogger logger = POILogFactory
+ .getLogger(StringUtil.class);
protected static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
- protected static final Charset UTF16LE = Charset.forName("UTF-16LE");
+ public static final Charset UTF16LE = Charset.forName("UTF-16LE");
public static final Charset UTF8 = Charset.forName("UTF-8");
+ public static final Charset WIN_1252 = Charset.forName("cp1252");
+ public static final Charset BIG5 = Charset.forName("Big5");
private static Map<Integer,Integer> msCodepointToUnicode;
@@ -573,7 +580,28 @@ public class StringUtil {
9133, // 0xf0fe bracerightbt
' ', // 0xf0ff not defined
};
-
+
+ /**
+ * This tries to convert a LE byte array in Big5 to a String.
+ * We know MS zero-pads ASCII characters, and we drop those padding bytes.
+ * However, there may be areas for improvement in this.
+ *
+ * @param data
+ * @param offset
+ * @param lengthInBytes
+ * @return
+ */
+ public static String littleEndianBig5Stream(byte[] data, int offset, int
lengthInBytes) {
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
+ try {
+ IOUtils.copy(new LittleEndianBig5Stream(data, offset,
lengthInBytes), os);
+ } catch (IOException e) {
+ logger.log(POILogger.WARN,
+ "IOException while copying a byte array stream to a byte
array stream?!");
+ }
+ return new String(os.toByteArray(), BIG5);
+ }
+
// Could be replaced with org.apache.commons.lang3.StringUtils#join
@Internal
public static String join(Object[] array, String separator) {
Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java Tue
Apr 4 02:06:46 2017
@@ -108,7 +108,7 @@ public class HwmfFont {
return charset;
}
- static WmfCharset valueOf(int flag) {
+ public static WmfCharset valueOf(int flag) {
for (WmfCharset cs : values()) {
if (cs.flag == flag) return cs;
}
Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java Tue
Apr 4 02:06:46 2017
@@ -19,27 +19,43 @@ package org.apache.poi.hwpf;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
+import java.nio.charset.Charset;
+import org.apache.poi.hwmf.record.HwmfFont;
import org.apache.poi.hwpf.model.ComplexFileTable;
+import org.apache.poi.hwpf.model.FontTable;
import org.apache.poi.hwpf.model.OldCHPBinTable;
+import org.apache.poi.hwpf.model.OldComplexFileTable;
+import org.apache.poi.hwpf.model.OldFfn;
+import org.apache.poi.hwpf.model.OldFontTable;
import org.apache.poi.hwpf.model.OldPAPBinTable;
import org.apache.poi.hwpf.model.OldSectionTable;
+import org.apache.poi.hwpf.model.OldTextPieceTable;
import org.apache.poi.hwpf.model.PieceDescriptor;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.model.TextPieceTable;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.CodePageUtil;
import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.NotImplemented;
+import org.apache.poi.util.StringUtil;
/**
* Provides very simple support for old (Word 6 / Word 95)
* files.
*/
public class HWPFOldDocument extends HWPFDocumentCore {
- private TextPieceTable tpt;
+
+ private final static Charset DEFAULT_CHARSET = StringUtil.WIN_1252;
+
+ private OldTextPieceTable tpt;
private StringBuilder _text;
+
+ private final OldFontTable fontTable;
+ private final Charset guessedCharset;
public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
this(fs.getRoot());
@@ -56,45 +72,52 @@ public class HWPFOldDocument extends HWP
int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0);
int papTableSize = LittleEndian.getInt(_mainStream, 0xc4);
- //int shfTableOffset = LittleEndian.getInt(_mainStream, 0x60);
- //int shfTableSize = LittleEndian.getInt(_mainStream, 0x64);
+ int fontTableOffset = LittleEndian.getInt(_mainStream, 0xd0);
+ int fontTableSize = LittleEndian.getInt(_mainStream, 0xd4);
+
+ fontTable = new OldFontTable(_mainStream, fontTableOffset,
fontTableSize);
+ //TODO: figure out how to map runs/text pieces to fonts
+ //for now, if there's a non standard codepage in one of the fonts
+ //assume that the doc is in that codepage.
+ guessedCharset = guessCodePage(fontTable);
+
int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
// We need to get hold of the text that makes up the
// document, which might be regular or fast-saved
ComplexFileTable cft = null;
- StringBuffer text = new StringBuffer();
if(_fib.getFibBase().isFComplex()) {
- cft = new ComplexFileTable(
+ cft = new OldComplexFileTable(
_mainStream, _mainStream,
- complexTableOffset, _fib.getFibBase().getFcMin()
+ complexTableOffset, _fib.getFibBase().getFcMin(),
guessedCharset
);
- tpt = cft.getTextPieceTable();
+ tpt = (OldTextPieceTable)cft.getTextPieceTable();
- for(TextPiece tp : tpt.getTextPieces()) {
- text.append( tp.getStringBuilder() );
- }
} else {
// TODO Discover if these older documents can ever hold Unicode
Strings?
// (We think not, because they seem to lack a Piece table)
// TODO Build the Piece Descriptor properly
// (We have to fake it, as they don't seem to have a proper Piece
table)
- PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0,
0,0,0,127, 0,0}, 0);
+ PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0,
0,0,0,127, 0,0}, 0, guessedCharset);
pd.setFilePosition(_fib.getFibBase().getFcMin());
// Generate a single Text Piece Table, with a single Text Piece
// which covers all the (8 bit only) text in the file
- tpt = new TextPieceTable();
+ tpt = new OldTextPieceTable();
byte[] textData = new
byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()];
System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(),
textData, 0, textData.length);
+
+ int numChars = textData.length;
+ if (CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(guessedCharset)) {
+ numChars /= 2;
+ }
+
TextPiece tp = new TextPiece(
- 0, textData.length, textData, pd
+ 0, numChars, textData, pd
);
tpt.add(tp);
- text.append(tp.getStringBuilder());
}
-
_text = tpt.getText();
// Now we can fetch the character and paragraph properties
@@ -133,12 +156,54 @@ public class HWPFOldDocument extends HWP
}
}
+
+ /**
+ * Take the first codepage that is not default, ansi or symbol.
+ * Ideally, we'd want to track fonts with runs, but we don't yet
+ * know how to do that.
+ *
+ * Consider throwing an exception if > 1 unique codepage that is not
default, symbol or ansi
+ * appears here.
+ *
+ * @param fontTable
+ * @return
+ */
+ private Charset guessCodePage(OldFontTable fontTable) {
+
+ for (OldFfn oldFfn : fontTable.getFontNames()) {
+ HwmfFont.WmfCharset wmfCharset =
HwmfFont.WmfCharset.valueOf(oldFfn.getChs()& 0xff);
+ if (wmfCharset != null &&
+ wmfCharset != HwmfFont.WmfCharset.ANSI_CHARSET &&
+ wmfCharset != HwmfFont.WmfCharset.DEFAULT_CHARSET &&
+ wmfCharset != HwmfFont.WmfCharset.SYMBOL_CHARSET ) {
+ return wmfCharset.getCharset();
+ }
+ }
+ return DEFAULT_CHARSET;
+ }
+
public Range getOverallRange()
{
// Life is easy when we have no footers, headers or unicode!
return new Range( 0, _fib.getFibBase().getFcMac() -
_fib.getFibBase().getFcMin(), this );
}
+ /**
+ * Use {@link #getOldFontTable()} instead!!!
+ * This always throws an UnsupportedOperationException.
+ *
+ * @return nothing
+ * @throws UnsupportedOperationException
+ */
+ @Override
+ @NotImplemented
+ public FontTable getFontTable() {
+ throw new UnsupportedOperationException("Use getOldFontTable
instead.");
+ }
+
+ public OldFontTable getOldFontTable() {
+ return fontTable;
+ }
public Range getRange()
{
return getOverallRange();
@@ -167,4 +232,19 @@ public class HWPFOldDocument extends HWP
public void write(OutputStream out) throws IOException {
throw new IllegalStateException("Writing is not available for the
older file formats");
}
+
+ /**
+ * As a rough heuristic (total hack), read through the font table
+ * and take the first non-default, non-ansi, non-symbol
+ * font's charset and return that.
+ *
+ * Once we figure out how to link a font to a text piece, we should
+ * use the font information per text piece.
+ *
+ * @return charset
+ */
+ public Charset getGuessedCharset() {
+ return guessedCharset;
+ }
+
}
Modified:
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
---
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java
(original)
+++
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java
Tue Apr 4 02:06:46 2017
@@ -18,6 +18,7 @@
package org.apache.poi.hwpf.model;
import java.io.IOException;
+import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;
@@ -26,9 +27,10 @@ import org.apache.poi.hwpf.model.io.HWPF
import org.apache.poi.hwpf.sprm.SprmBuffer;
import org.apache.poi.util.Internal;
import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.StringUtil;
@Internal
-public final class ComplexFileTable {
+public class ComplexFileTable {
private static final byte GRPPRL_TYPE = 1;
private static final byte TEXT_PIECE_TABLE_TYPE = 2;
@@ -40,7 +42,8 @@ public final class ComplexFileTable {
_tpt = new TextPieceTable();
}
- public ComplexFileTable(byte[] documentStream, byte[] tableStream, int
offset, int fcMin) throws IOException {
+ protected ComplexFileTable(byte[] documentStream, byte[] tableStream, int
offset, int fcMin,
+ Charset charset) throws IOException {
//skips through the prms before we reach the piece table. These
contain data
//for actual fast saved files
List<SprmBuffer> sprmBuffers = new LinkedList<SprmBuffer>();
@@ -61,7 +64,12 @@ public final class ComplexFileTable {
}
int pieceTableSize = LittleEndian.getInt(tableStream, ++offset);
offset += LittleEndian.INT_SIZE;
- _tpt = new TextPieceTable(documentStream, tableStream, offset,
pieceTableSize, fcMin);
+ _tpt = newTextPieceTable(documentStream, tableStream, offset,
pieceTableSize, fcMin, charset);
+
+ }
+
+ public ComplexFileTable(byte[] documentStream, byte[] tableStream, int
offset, int fcMin) throws IOException {
+ this(documentStream, tableStream, offset, fcMin, StringUtil.WIN_1252);
}
public TextPieceTable getTextPieceTable() {
@@ -92,4 +100,11 @@ public final class ComplexFileTable {
tableStream.write(table);
}
+ protected TextPieceTable newTextPieceTable(byte[] documentStream,
+ byte[] tableStream, int offset,
int pieceTableSize, int fcMin,
+ Charset charset) {
+ return new TextPieceTable(documentStream, tableStream, offset,
pieceTableSize, fcMin);
+ }
+
+
}
Modified:
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java
Tue Apr 4 02:06:46 2017
@@ -44,7 +44,7 @@ public final class OldCHPBinTable extend
* @param fcMin
*/
public OldCHPBinTable(byte[] documentStream, int offset,
- int size, int fcMin, TextPieceTable tpt)
+ int size, int fcMin, OldTextPieceTable tpt)
{
PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
Copied:
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java
(from r1788131,
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java)
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java?p2=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java&p1=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java&r1=1788131&r2=1790061&rev=1790061&view=diff
==============================================================================
---
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java
(original)
+++
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java
Tue Apr 4 02:06:46 2017
@@ -18,78 +18,25 @@
package org.apache.poi.hwpf.model;
import java.io.IOException;
-import java.util.LinkedList;
-import java.util.List;
+import java.nio.charset.Charset;
-import org.apache.poi.hwpf.model.io.HWPFFileSystem;
-import org.apache.poi.hwpf.model.io.HWPFOutputStream;
-import org.apache.poi.hwpf.sprm.SprmBuffer;
import org.apache.poi.util.Internal;
-import org.apache.poi.util.LittleEndian;
@Internal
-public final class ComplexFileTable {
- private static final byte GRPPRL_TYPE = 1;
- private static final byte TEXT_PIECE_TABLE_TYPE = 2;
+public final class OldComplexFileTable extends ComplexFileTable {
- protected TextPieceTable _tpt;
-
- private SprmBuffer[] _grpprls;
-
- public ComplexFileTable() {
- _tpt = new TextPieceTable();
+ public OldComplexFileTable(byte[] documentStream, byte[] tableStream,
+ int offset, int fcMin, Charset charset) throws
IOException {
+ super(documentStream, tableStream, offset, fcMin, charset);
}
- public ComplexFileTable(byte[] documentStream, byte[] tableStream, int
offset, int fcMin) throws IOException {
- //skips through the prms before we reach the piece table. These
contain data
- //for actual fast saved files
- List<SprmBuffer> sprmBuffers = new LinkedList<SprmBuffer>();
- while (tableStream[offset] == GRPPRL_TYPE) {
- offset++;
- int size = LittleEndian.getShort(tableStream, offset);
- offset += LittleEndian.SHORT_SIZE;
- byte[] bs = LittleEndian.getByteArray(tableStream, offset, size);
- offset += size;
-
- SprmBuffer sprmBuffer = new SprmBuffer(bs, false, 0);
- sprmBuffers.add(sprmBuffer);
- }
- this._grpprls = sprmBuffers.toArray(new
SprmBuffer[sprmBuffers.size()]);
-
- if (tableStream[offset] != TEXT_PIECE_TABLE_TYPE) {
- throw new IOException("The text piece table is corrupted");
- }
- int pieceTableSize = LittleEndian.getInt(tableStream, ++offset);
- offset += LittleEndian.INT_SIZE;
- _tpt = new TextPieceTable(documentStream, tableStream, offset,
pieceTableSize, fcMin);
- }
-
- public TextPieceTable getTextPieceTable() {
- return _tpt;
- }
- public SprmBuffer[] getGrpprls() {
- return _grpprls;
+ @Override
+ protected TextPieceTable newTextPieceTable(byte[] documentStream,
+ byte[] tableStream, int offset,
+ int pieceTableSize, int fcMin,
Charset charset) {
+ return new OldTextPieceTable(documentStream, tableStream, offset,
pieceTableSize, fcMin, charset);
}
- @Deprecated
- public void writeTo(HWPFFileSystem sys) throws IOException {
- HWPFOutputStream docStream = sys.getStream("WordDocument");
- HWPFOutputStream tableStream = sys.getStream("1Table");
-
- writeTo(docStream, tableStream);
- }
-
- public void writeTo(HWPFOutputStream wordDocumentStream,
- HWPFOutputStream tableStream) throws IOException {
- tableStream.write(TEXT_PIECE_TABLE_TYPE);
-
- byte[] table = _tpt.writeTo(wordDocumentStream);
-
- byte[] numHolder = new byte[LittleEndian.INT_SIZE];
- LittleEndian.putInt(numHolder, 0, table.length);
- tableStream.write(numHolder);
- tableStream.write(table);
- }
}
Copied: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java
(from r1788131, poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/Ffn.java)
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java?p2=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java&p1=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/Ffn.java&r1=1788131&r2=1790061&rev=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/Ffn.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java Tue Apr
4 02:06:46 2017
@@ -17,192 +17,144 @@
package org.apache.poi.hwpf.model;
-import java.util.Arrays;
+import java.nio.charset.Charset;
-import org.apache.poi.util.BitField;
-import org.apache.poi.util.BitFieldFactory;
+import org.apache.poi.hwmf.record.HwmfFont;
import org.apache.poi.util.Internal;
import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+import org.apache.poi.util.StringUtil;
/**
- * FFN - Font Family Name. FFN is a data structure that stores the names of
the Main
- * Font and that of Alternate font as an array of characters. It has also a
header
- * that stores info about the whole structure and the fonts
- *
- * @author Praveen Mathew
+ * Word 6.0 Font information
*/
@Internal
-public final class Ffn
-{
- private int _cbFfnM1;//total length of FFN - 1.
- private byte _info;
- private static BitField _prq = BitFieldFactory.getInstance(0x0003);//
pitch request
- private static BitField _fTrueType =
BitFieldFactory.getInstance(0x0004);// when 1, font is a TrueType font
- private static BitField _ff = BitFieldFactory.getInstance(0x0070);
- private short _wWeight;// base weight of font
- private byte _chs;// character set identifier
- private byte _ixchSzAlt; // index into ffn.szFfn to the name of
- // the alternate font
- private byte [] _panose = new byte[10];//????
- private byte [] _fontSig = new byte[24];//????
-
- // zero terminated string that records name of font, cuurently not
- // supporting Extended chars
- private char [] _xszFfn;
-
- // extra facilitator members
- private int _xszFfnLength;
-
- public Ffn(byte[] buf, int offset)
- {
- int offsetTmp = offset;
-
- _cbFfnM1 = LittleEndian.getUByte(buf,offset);
- offset += LittleEndian.BYTE_SIZE;
- _info = buf[offset];
- offset += LittleEndian.BYTE_SIZE;
- _wWeight = LittleEndian.getShort(buf, offset);
- offset += LittleEndian.SHORT_SIZE;
- _chs = buf[offset];
- offset += LittleEndian.BYTE_SIZE;
- _ixchSzAlt = buf[offset];
- offset += LittleEndian.BYTE_SIZE;
-
- // read panose and fs so we can write them back out.
- System.arraycopy(buf, offset, _panose, 0, _panose.length);
- offset += _panose.length;
- System.arraycopy(buf, offset, _fontSig, 0, _fontSig.length);
- offset += _fontSig.length;
-
- offsetTmp = offset - offsetTmp;
- _xszFfnLength = (this.getSize() - offsetTmp)/2;
- _xszFfn = new char[_xszFfnLength];
-
- for(int i = 0; i < _xszFfnLength; i++)
- {
- _xszFfn[i] = (char)LittleEndian.getShort(buf, offset);
- offset += LittleEndian.SHORT_SIZE;
- }
-
-
- }
-
- public int get_cbFfnM1()
- {
- return _cbFfnM1;
- }
-
- public short getWeight()
- {
- return _wWeight;
- }
-
- public byte getChs()
- {
- return _chs;
- }
-
- public byte [] getPanose()
- {
- return _panose;
- }
-
- public byte [] getFontSig()
- {
- return _fontSig;
- }
-
- public int getSize()
- {
- return (_cbFfnM1 + 1);
- }
-
- public String getMainFontName()
- {
- int index = 0;
- for (;index < _xszFfnLength; index++)
- {
- if (_xszFfn[index] == '\0')
- {
- break;
- }
- }
- return new String(_xszFfn, 0, index);
- }
-
- public String getAltFontName()
- {
- int index = _ixchSzAlt;
- for (;index < _xszFfnLength; index++)
- {
- if (_xszFfn[index] == '\0')
- {
- break;
- }
- }
- return new String(_xszFfn, _ixchSzAlt, index);
-
- }
-
- public void set_cbFfnM1(int _cbFfnM1)
- {
- this._cbFfnM1 = _cbFfnM1;
- }
-
- // changed protected to public
- public byte[] toByteArray()
- {
- int offset = 0;
- byte[] buf = new byte[this.getSize()];
-
- buf[offset] = (byte)_cbFfnM1;
- offset += LittleEndian.BYTE_SIZE;
- buf[offset] = _info;
- offset += LittleEndian.BYTE_SIZE;
- LittleEndian.putShort(buf, offset, _wWeight);
- offset += LittleEndian.SHORT_SIZE;
- buf[offset] = _chs;
- offset += LittleEndian.BYTE_SIZE;
- buf[offset] = _ixchSzAlt;
- offset += LittleEndian.BYTE_SIZE;
-
- System.arraycopy(_panose,0,buf, offset,_panose.length);
- offset += _panose.length;
- System.arraycopy(_fontSig,0,buf, offset, _fontSig.length);
- offset += _fontSig.length;
-
- for(int i = 0; i < _xszFfn.length; i++)
- {
- LittleEndian.putShort(buf, offset, (short)_xszFfn[i]);
- offset += LittleEndian.SHORT_SIZE;
- }
-
- return buf;
-
- }
-
- @Override
- public boolean equals(Object other) {
- if (!(other instanceof Ffn)) return false;
- Ffn o = (Ffn)other;
-
- return (
- o._cbFfnM1 == this._cbFfnM1
- && o._info == this._info
- && o._wWeight == _wWeight
- && o._chs == _chs
- && o._ixchSzAlt == _ixchSzAlt
- && Arrays.equals(o._panose,_panose)
- && Arrays.equals(o._fontSig,_fontSig)
- && Arrays.equals(o._xszFfn,_xszFfn)
- );
- }
+public final class OldFfn {
+ private static final POILogger logger =
POILogFactory.getLogger(OldFfn.class);
+
+ private byte _chs;// character set identifier
+
+ private final String fontName;
+ private final String altFontName;
+
+ private final int length; //length in bytes for this record
+
+ /**
+ * try to read an OldFfn starting at offset; read no farther than end
+ *
+ * @param buf buffer from which to read
+ * @param offset offset at which to start
+ * @param fontTableEnd read no farther than this
+ * @return an OldFfn or null if asked to read beyond end
+ */
+ static OldFfn build(byte[] buf, int offset, int fontTableEnd) {
+ int start = offset;
+ //preliminary bytes
+ if (offset + 6 > fontTableEnd) {
+ return null;
+ }
+ //first byte
+ short fontDescriptionLength = (short) buf[offset];
+ offset += 1;
+ if (offset + fontDescriptionLength > fontTableEnd) {
+ logger.log(POILogger.WARN, "Asked to read beyond font table end.
Skipping font");
+ return null;
+ }
+
+ //no idea what these 3 bytes do
+ offset += 3;
+ byte chs = buf[offset];
+ Charset charset = null;
+ HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(chs &
0xff);
+ if (wmfCharset == null) {
+ logger.log(POILogger.WARN, "Couldn't find font for type: " + (chs
& 0xff));
+ } else {
+ charset = wmfCharset.getCharset();
+ }
+ charset = charset == null ? StringUtil.WIN_1252 : charset;
+ offset += LittleEndian.BYTE_SIZE;
+ //if this byte here == 7, it _may_ signify existence of
+ //an alternate font name
+
+ //not sure what the byte after the _chs does
+ offset += LittleEndian.BYTE_SIZE;
+ int fontNameLength = -1;
+ for (int i = offset; i < fontTableEnd; i++) {
+ if (buf[i] == 0) {
+ fontNameLength = i - offset;
+ break;
+ }
+ }
+ if (fontNameLength == -1) {
+ logger.log(POILogger.WARN, "Couldn't find the zero-byte delimited
font name length");
+ return null;
+ }
+ String fontName = new String(buf, offset, fontNameLength, charset);
+ String altFontName = null;
+ int altFontNameLength = -1;
+ offset += fontNameLength + 1;
+ if (offset - start < fontDescriptionLength) {
+ for (int i = offset; i <= start + fontDescriptionLength; i++) {
+ if (buf[i] == 0) {
+ altFontNameLength = i - offset;
+ break;
+ }
+ }
+ if (altFontNameLength > -1) {
+ altFontName = new String(buf, offset, altFontNameLength,
charset);
+ }
+ }
+ //reset to 0 for length calculation
+ altFontNameLength = (altFontNameLength < 0) ? 0 : altFontNameLength +
1;//add one for zero byte
+
+ int len = LittleEndian.INT_SIZE + LittleEndian.BYTE_SIZE +
LittleEndian.BYTE_SIZE +//6 starting bytes
+ fontNameLength + altFontNameLength + 1;//+1 is for the zero
byte
+ //this len should == fontDescriptionLength
+
+ return new OldFfn(chs, fontName, altFontName, len);
+
+ }
+
+ public OldFfn(byte charsetIdentifier, String fontName, String altFontName, int length) {
+ this._chs = charsetIdentifier;
+ this.fontName = fontName;
+ this.altFontName = altFontName;
+ this.length = length;
+ }
+
+ public byte getChs() {
+ return _chs;
+ }
+
+ public String getMainFontName() {
+ return fontName;
+ }
+
+ /**
+ * @return altFontName if it exists, null otherwise
+ */
+ public String getAltFontName() {
+ return altFontName;
+ }
+
+
+ /**
+ * @return length in bytes for this record
+ */
+ public int getLength() {
+ return length;
+ }
@Override
- public int hashCode() {
- assert false : "hashCode not designed";
- return 42; // any arbitrary constant will do
+ public String toString() {
+ return "OldFfn{" +
+ "_chs=" + (_chs & 0xff) +
+ ", fontName='" + fontName + '\'' +
+ ", altFontName='" + altFontName + '\'' +
+ ", length=" + length +
+ '}';
}
}
Copied:
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java (from
r1788131, poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/FontTable.java)
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java?p2=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java&p1=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/FontTable.java&r1=1788131&r2=1790061&rev=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/FontTable.java
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java
Tue Apr 4 02:06:46 2017
@@ -17,81 +17,57 @@
package org.apache.poi.hwpf.model;
-import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
-import org.apache.poi.hwpf.model.io.HWPFFileSystem;
-import org.apache.poi.hwpf.model.io.HWPFOutputStream;
import org.apache.poi.util.Internal;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/**
- * FontTable or in MS terminology sttbfffn is a common data structure written
in all
- * Word files. The sttbfffn is an sttbf where each string is an FFN structure
instead
- * of pascal-style strings. An sttbf is a string Table stored in file. Thus
sttbffn
- * is like an Sttbf with an array of FFN structures that stores the font name
strings
- *
- * @author Praveen Mathew
+ * Font table for Word 6.0
*/
@Internal
-public final class FontTable
-{
- private final static POILogger _logger =
POILogFactory.getLogger(FontTable.class);
- private short _stringCount;// how many strings are included in the string
table
- private short _extraDataSz;// size in bytes of the extra data
+public final class OldFontTable {
+ private final static POILogger _logger = POILogFactory.getLogger(OldFontTable.class);
// added extra facilitator members
- private int lcbSttbfffn;// count of bytes in sttbfffn
- private int fcSttbfffn;// table stream offset for sttbfffn
-
// FFN structure containing strings of font names
- private Ffn[] _fontNames = null;
-
-
- public FontTable(byte[] buf, int offset, int lcbSttbfffn)
- {
- this.lcbSttbfffn = lcbSttbfffn;
- this.fcSttbfffn = offset;
-
- _stringCount = LittleEndian.getShort(buf, offset);
- offset += LittleEndian.SHORT_SIZE;
- _extraDataSz = LittleEndian.getShort(buf, offset);
- offset += LittleEndian.SHORT_SIZE;
+ private final OldFfn[] _fontNames;
- _fontNames = new Ffn[_stringCount]; //Ffn corresponds to a Pascal
style String in STTBF.
+ public OldFontTable(byte[] buf, int offset, int length) {
+ //length is stored at the index section in the table
+ //and it is recorded in the first short.
+
+
+ List<OldFfn> ffns = new ArrayList<OldFfn>();
+ int fontTableLength = LittleEndian.getShort(buf, offset);
+
+ int endOfTableOffset = offset + length;
+ int startOffset = offset + LittleEndian.SHORT_SIZE;//first short should == length!
+
+ while (true) {
+ OldFfn oldFfn = OldFfn.build(buf, startOffset, endOfTableOffset);
+ if (oldFfn == null) {
+ break;
+ }
+ ffns.add(oldFfn);
+ startOffset += oldFfn.getLength();
- for(int i = 0;i<_stringCount; i++)
- {
- _fontNames[i] = new Ffn(buf,offset);
- offset += _fontNames[i].getSize();
}
+ _fontNames = ffns.toArray(new OldFfn[ffns.size()]);
}
- public short getStringCount()
- {
- return _stringCount;
- }
-
- public short getExtraDataSz()
- {
- return _extraDataSz;
- }
- public Ffn[] getFontNames()
- {
+ public OldFfn[] getFontNames() {
return _fontNames;
}
- public int getSize()
- {
- return lcbSttbfffn;
- }
- public String getMainFont(int chpFtc )
- {
- if(chpFtc >= _stringCount)
- {
+ public String getMainFont(int chpFtc) {
+ if (chpFtc >= _fontNames.length) {
_logger.log(POILogger.INFO, "Mismatch in chpFtc with stringCount");
return null;
}
@@ -99,65 +75,10 @@ public final class FontTable
return _fontNames[chpFtc].getMainFontName();
}
- public String getAltFont(int chpFtc )
- {
- if(chpFtc >= _stringCount)
- {
- _logger.log(POILogger.INFO, "Mismatch in chpFtc with stringCount");
- return null;
- }
-
- return _fontNames[chpFtc].getAltFontName();
- }
-
- public void setStringCount(short stringCount)
- {
- this._stringCount = stringCount;
- }
-
- @Deprecated
- public void writeTo( HWPFFileSystem sys ) throws IOException
- {
- HWPFOutputStream tableStream = sys.getStream( "1Table" );
- writeTo( tableStream );
- }
-
- public void writeTo( HWPFOutputStream tableStream ) throws IOException
- {
- byte[] buf = new byte[LittleEndian.SHORT_SIZE];
- LittleEndian.putShort(buf, 0, _stringCount);
- tableStream.write(buf);
- LittleEndian.putShort(buf, 0, _extraDataSz);
- tableStream.write(buf);
-
- for(int i = 0; i < _fontNames.length; i++)
- {
- tableStream.write(_fontNames[i].toByteArray());
- }
-
- }
-
- @Override
- public boolean equals(Object other) {
- if (!(other instanceof FontTable)) return false;
- FontTable o = (FontTable)other;
-
- if (o._stringCount != this._stringCount
- || o._extraDataSz != this._extraDataSz
- || o._fontNames.length != this._fontNames.length
- ) return false;
-
- for (int i=0; i<o._fontNames.length; i++) {
- if (!o._fontNames[i].equals(this._fontNames[i])) return false;
- }
-
- return true;
- }
-
@Override
- public int hashCode() {
- assert false : "hashCode not designed";
- return 42; // any arbitrary constant will do
+ public String toString() {
+ return "OldFontTable{" +
+ "_fontNames=" + Arrays.toString(_fontNames) +
+ '}';
}
-
}
Copied:
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java (from
r1788131, poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java)
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java?p2=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java&p1=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java&r1=1788131&r2=1790061&rev=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java
Tue Apr 4 02:06:46 2017
@@ -18,89 +18,53 @@
package org.apache.poi.hwpf.model;
-import java.nio.charset.Charset;
-
import org.apache.poi.util.Internal;
+import org.apache.poi.util.NotImplemented;
/**
* Lightweight representation of a text piece.
* Works in the character domain, not the byte domain, so you
* need to have turned byte references into character
* references before getting here.
- *
- * @author Ryan Ackley
*/
@Internal
-public class TextPiece extends PropertyNode<TextPiece> {
- private boolean _usesUnicode;
-
- private PieceDescriptor _pd;
+public class OldTextPiece extends TextPiece {
- /**
- * @param start Beginning offset in main document stream, in characters.
- * @param end Ending offset in main document stream, in characters.
- * @param text The raw bytes of our text
- * @deprecated Use {@link #TextPiece(int, int, byte[], PieceDescriptor)}
- * instead
- */
- public TextPiece(int start, int end, byte[] text, PieceDescriptor pd,
- int cpStart) {
- this(start, end, text, pd);
- }
+ private final byte[] rawBytes;
/**
* @param start Beginning offset in main document stream, in characters.
* @param end Ending offset in main document stream, in characters.
* @param text The raw bytes of our text
*/
- public TextPiece(int start, int end, byte[] text, PieceDescriptor pd) {
- super(start, end, buildInitSB(text, pd));
- _usesUnicode = pd.isUnicode();
- _pd = pd;
-
- // Validate
- int textLength = ((CharSequence) _buf).length();
- if (end - start != textLength) {
- throw new IllegalStateException("Told we're for characters " +
start + " -> " + end + ", but actually covers " + textLength + " characters!");
- }
+ public OldTextPiece(int start, int end, byte[] text, PieceDescriptor pd) {
+ super(start, end, text, pd);
+ this.rawBytes = text;
if (end < start) {
throw new IllegalStateException("Told we're of negative size! start=" + start + " end=" + end);
}
}
/**
- * Create the StringBuilder from the text and unicode flag
- */
- private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) {
- String str = new String(text, Charset.forName(pd.isUnicode() ?
"UTF-16LE" : "Cp1252"));
-
- return new StringBuilder(str);
- }
-
- /**
- * @return If this text piece is unicode
+ * @return nothing, ever. Always throws an UnsupportedOperationException
+ * @throws UnsupportedOperationException
*/
+ @NotImplemented
+ @Override
public boolean isUnicode() {
- return _usesUnicode;
+ throw new UnsupportedOperationException();
}
- public PieceDescriptor getPieceDescriptor() {
- return _pd;
- }
-
- @Deprecated
- public StringBuffer getStringBuffer() {
- return new StringBuffer(getStringBuilder());
- }
public StringBuilder getStringBuilder() {
return (StringBuilder) _buf;
}
+ @Override
public byte[] getRawBytes() {
- return ((CharSequence) _buf).toString().getBytes(
- Charset.forName(_usesUnicode ? "UTF-16LE" : "Cp1252")
- );
+ byte[] buf = new byte[rawBytes.length];
+ System.arraycopy(rawBytes, 0, buf, 0, rawBytes.length);
+ return buf;
}
/**
@@ -109,84 +73,29 @@ public class TextPiece extends PropertyN
*
* @param start Local start position, in characters
* @param end Local end position, in characters
+ * @throws UnsupportedOperationException
*/
@Deprecated
+ @NotImplemented
public String substring(int start, int end) {
- StringBuilder buf = (StringBuilder) _buf;
-
- // Validate
- if (start < 0) {
- throw new StringIndexOutOfBoundsException("Can't request a
substring before 0 - asked for " + start);
- }
- if (end > buf.length()) {
- throw new StringIndexOutOfBoundsException("Index " + end + " out
of range 0 -> " + buf.length());
- }
- if (end < start) {
- throw new StringIndexOutOfBoundsException("Asked for text from " +
start + " to " + end + ", which has an end before the start!");
- }
- return buf.substring(start, end);
+ throw new UnsupportedOperationException();
}
/**
- * Adjusts the internal string for deletinging
- * some characters within this.
- *
- * @param start The start position for the delete, in characters
- * @param length The number of characters to delete
+ * Not implemented for OldTextPiece.
+ * Always throws UnsupportedOperationException
*/
@Deprecated
+ @NotImplemented
public void adjustForDelete(int start, int length) {
- int myStart = getStart();
- int myEnd = getEnd();
- int end = start + length;
-
- /* do we have to delete from this text piece? */
- if (start <= myEnd && end >= myStart) {
-
- /* find where the deleted area overlaps with this text piece
*/
- int overlapStart = Math.max(myStart, start);
- int overlapEnd = Math.min(myEnd, end);
-
- int bufStart = overlapStart - myStart;
- int bufEnd = overlapEnd - myStart;
- ((StringBuilder) _buf).delete(bufStart, bufEnd);
- }
-
- // We need to invoke this even if text from this piece is not being
- // deleted because the adjustment must propagate to all subsequent
- // text pieces i.e., if text from tp[n] is being deleted, then
- // tp[n + 1], tp[n + 2], etc. will need to be adjusted.
- // The superclass is expected to use a separate sentry for this.
- super.adjustForDelete(start, length);
- }
-
- /**
- * Returns the length, in characters
- */
- @Deprecated
- public int characterLength() {
- return (getEnd() - getStart());
+ throw new UnsupportedOperationException();
}
/**
* Returns the length, in bytes
*/
public int bytesLength() {
- return (getEnd() - getStart()) * (_usesUnicode ? 2 : 1);
- }
-
- @Override
- public boolean equals(Object o) {
- if (!(o instanceof TextPiece)) return false;
- TextPiece tp = (TextPiece) o;
- assert (_buf != null && tp._buf != null && _pd != null && tp._pd !=
null);
-
- return (
- limitsAreEqual(o)
- && tp._usesUnicode == this._usesUnicode
- && tp._buf.toString().equals(this._buf.toString())
- && tp._pd.equals(this._pd)
- );
+ return rawBytes.length;
}
@Override
@@ -204,7 +113,8 @@ public class TextPiece extends PropertyN
}
public String toString() {
- return "TextPiece from " + getStart() + " to " + getEnd() + " ("
+ return "OldTextPiece from " + getStart() + " to " + getEnd() + " ("
+ getPieceDescriptor() + ")";
}
+
}
Copied:
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java
(from r1788131,
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java)
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java?p2=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java&p1=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java&r1=1788131&r2=1790061&rev=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
(original)
+++
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java
Tue Apr 4 02:06:46 2017
@@ -16,42 +16,29 @@
==================================================================== */
package org.apache.poi.hwpf.model;
-import java.io.IOException;
-import java.io.Serializable;
+import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
-import java.util.Comparator;
-import java.util.LinkedList;
-import java.util.List;
-import org.apache.poi.hwpf.model.io.HWPFOutputStream;
-import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.util.CodePageUtil;
import org.apache.poi.util.Internal;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
-/**
- * The piece table for matching up character positions to bits of text. This
- * mostly works in bytes, but the TextPieces themselves work in characters.
This
- * does the icky convertion.
- *
- * @author Ryan Ackley
- */
+
@Internal
-public class TextPieceTable implements CharIndexTranslator {
- private static final POILogger logger = POILogFactory
- .getLogger(TextPieceTable.class);
+public class OldTextPieceTable extends TextPieceTable {
- // int _multiple;
- int _cpMin;
- protected ArrayList<TextPiece> _textPieces = new ArrayList<TextPiece>();
- protected ArrayList<TextPiece> _textPiecesFCOrder = new
ArrayList<TextPiece>();
+ private static final POILogger logger = POILogFactory
+ .getLogger(OldTextPieceTable.class);
- public TextPieceTable() {
+ public OldTextPieceTable() {
+ super();
}
- public TextPieceTable(byte[] documentStream, byte[] tableStream,
- int offset, int size, int fcMin) {
+ public OldTextPieceTable(byte[] documentStream, byte[] tableStream,
+ int offset, int size, int fcMin, Charset charset) {
+ //super(documentStream, tableStream, offset, size, fcMin, charset);
// get our plex of PieceDescriptors
PlexOfCps pieceTable = new PlexOfCps(tableStream, offset, size,
PieceDescriptor.getSizeInBytes());
@@ -63,7 +50,7 @@ public class TextPieceTable implements C
// PieceDescriptor objects
for (int x = 0; x < length; x++) {
GenericPropertyNode node = pieceTable.getProperty(x);
- pieces[x] = new PieceDescriptor(node.getBytes(), 0);
+ pieces[x] = new PieceDescriptor(node.getBytes(), 0, charset);
}
// Figure out the cp of the earliest text piece
@@ -88,7 +75,8 @@ public class TextPieceTable implements C
// What's the relationship between bytes and characters?
boolean unicode = pieces[x].isUnicode();
int multiple = 1;
- if (unicode) {
+ if (unicode ||
+ (charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset))) {
multiple = 2;
}
@@ -101,7 +89,7 @@ public class TextPieceTable implements C
System.arraycopy(documentStream, start, buf, 0, textSizeBytes);
// And now build the piece
- final TextPiece newTextPiece = new TextPiece(nodeStartChars, nodeEndChars, buf,
+ final TextPiece newTextPiece = newTextPiece(nodeStartChars, nodeEndChars, buf,
pieces[x]);
_textPieces.add(newTextPiece);
@@ -112,344 +100,20 @@ public class TextPieceTable implements C
Collections.sort(_textPieces);
_textPiecesFCOrder = new ArrayList<TextPiece>(_textPieces);
Collections.sort(_textPiecesFCOrder, new FCComparator());
- }
-
- public void add(TextPiece piece) {
- _textPieces.add(piece);
- _textPiecesFCOrder.add(piece);
- Collections.sort(_textPieces);
- Collections.sort(_textPiecesFCOrder, new FCComparator());
- }
-
- /**
- * Adjust all the text piece after inserting some text into one of them
- *
- * @param listIndex The TextPiece that had characters inserted into
- * @param length The number of characters inserted
- */
- public int adjustForInsert(int listIndex, int length) {
- int size = _textPieces.size();
-
- TextPiece tp = _textPieces.get(listIndex);
-
- // Update with the new end
- tp.setEnd(tp.getEnd() + length);
-
- // Now change all subsequent ones
- for (int x = listIndex + 1; x < size; x++) {
- tp = _textPieces.get(x);
- tp.setStart(tp.getStart() + length);
- tp.setEnd(tp.getEnd() + length);
- }
-
- // All done
- return length;
- }
-
- public boolean equals(Object o) {
- if (!(o instanceof TextPieceTable)) return false;
- TextPieceTable tpt = (TextPieceTable) o;
-
- int size = tpt._textPieces.size();
- if (size == _textPieces.size()) {
- for (int x = 0; x < size; x++) {
- if (!tpt._textPieces.get(x).equals(_textPieces.get(x))) {
- return false;
- }
- }
- return true;
- }
- return false;
- }
-
- public int getByteIndex(int charPos) {
- int byteCount = 0;
- for (TextPiece tp : _textPieces) {
- if (charPos >= tp.getEnd()) {
- byteCount = tp.getPieceDescriptor().getFilePosition()
- + (tp.getEnd() - tp.getStart())
- * (tp.isUnicode() ? 2 : 1);
-
- if (charPos == tp.getEnd())
- break;
-
- continue;
- }
- if (charPos < tp.getEnd()) {
- int left = charPos - tp.getStart();
- byteCount = tp.getPieceDescriptor().getFilePosition() + left
- * (tp.isUnicode() ? 2 : 1);
- break;
- }
- }
- return byteCount;
- }
- @Deprecated
- public int getCharIndex(int bytePos) {
- return getCharIndex(bytePos, 0);
- }
-
- @Deprecated
- public int getCharIndex(int startBytePos, int startCP) {
- int charCount = 0;
-
- int bytePos = lookIndexForward(startBytePos);
-
- for (TextPiece tp : _textPieces) {
- int pieceStart = tp.getPieceDescriptor().getFilePosition();
-
- int bytesLength = tp.bytesLength();
- int pieceEnd = pieceStart + bytesLength;
-
- int toAdd;
-
- if (bytePos < pieceStart || bytePos > pieceEnd) {
- toAdd = bytesLength;
- } else if (bytePos > pieceStart && bytePos < pieceEnd) {
- toAdd = (bytePos - pieceStart);
- } else {
- toAdd = bytesLength - (pieceEnd - bytePos);
- }
-
- if (tp.isUnicode()) {
- charCount += toAdd / 2;
- } else {
- charCount += toAdd;
- }
-
- if (bytePos >= pieceStart && bytePos <= pieceEnd
- && charCount >= startCP) {
- break;
- }
- }
-
- return charCount;
}
@Override
- public int[][] getCharIndexRanges(int startBytePosInclusive,
- int endBytePosExclusive) {
- List<int[]> result = new LinkedList<int[]>();
- for (TextPiece textPiece : _textPiecesFCOrder) {
- final int tpStart = textPiece.getPieceDescriptor()
- .getFilePosition();
- final int tpEnd = textPiece.getPieceDescriptor().getFilePosition()
- + textPiece.bytesLength();
- if (startBytePosInclusive > tpEnd)
- continue;
- if (endBytePosExclusive <= tpStart)
- break;
-
- final int rangeStartBytes = Math.max(tpStart,
- startBytePosInclusive);
- final int rangeEndBytes = Math.min(tpEnd, endBytePosExclusive);
- final int rangeLengthBytes = rangeEndBytes - rangeStartBytes;
-
- if (rangeStartBytes > rangeEndBytes)
- continue;
-
- final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1;
-
- final int rangeStartCp = textPiece.getStart()
- + (rangeStartBytes - tpStart) / encodingMultiplier;
- final int rangeEndCp = rangeStartCp + rangeLengthBytes
- / encodingMultiplier;
-
- result.add(new int[]{rangeStartCp, rangeEndCp});
- }
-
- return result.toArray(new int[result.size()][]);
- }
-
- public int getCpMin() {
- return _cpMin;
- }
-
- public StringBuilder getText() {
- final long start = System.currentTimeMillis();
-
- // rebuild document paragraphs structure
- StringBuilder docText = new StringBuilder();
- for (TextPiece textPiece : _textPieces) {
- String toAppend = textPiece.getStringBuilder().toString();
- int toAppendLength = toAppend.length();
-
- if (toAppendLength != textPiece.getEnd() - textPiece.getStart()) {
- logger.log(
- POILogger.WARN,
- "Text piece has boundaries [",
- Integer.valueOf(textPiece.getStart()),
- "; ",
- Integer.valueOf(textPiece.getEnd()),
- ") but length ",
- Integer.valueOf(textPiece.getEnd()
- - textPiece.getStart()));
- }
-
- docText.replace(textPiece.getStart(), textPiece.getStart()
- + toAppendLength, toAppend);
- }
-
- logger.log(POILogger.DEBUG, "Document text were rebuilded in ",
- Long.valueOf(System.currentTimeMillis() - start), " ms (",
- Integer.valueOf(docText.length()), " chars)");
-
- return docText;
- }
-
- public List<TextPiece> getTextPieces() {
- return _textPieces;
+ protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, byte[] buf, PieceDescriptor pd) {
+ return new OldTextPiece(nodeStartChars, nodeEndChars, buf, pd);
}
@Override
- public int hashCode() {
- return _textPieces.size();
- }
-
- public boolean isIndexInTable(int bytePos) {
- for (TextPiece tp : _textPiecesFCOrder) {
- int pieceStart = tp.getPieceDescriptor().getFilePosition();
-
- if (bytePos > pieceStart + tp.bytesLength()) {
- continue;
- }
-
- if (pieceStart > bytePos) {
- return false;
- }
-
- return true;
- }
-
- return false;
- }
-
- boolean isIndexInTable(int startBytePos, int endBytePos) {
- for (TextPiece tp : _textPiecesFCOrder) {
- int pieceStart = tp.getPieceDescriptor().getFilePosition();
-
- if (startBytePos >= pieceStart + tp.bytesLength()) {
- continue;
- }
-
- int left = Math.max(startBytePos, pieceStart);
- int right = Math.min(endBytePos, pieceStart + tp.bytesLength());
-
- if (left >= right)
- return false;
-
- return true;
- }
-
- return false;
- }
-
- public int lookIndexBackward(final int startBytePos) {
- int bytePos = startBytePos;
- int lastEnd = 0;
-
- for (TextPiece tp : _textPiecesFCOrder) {
- int pieceStart = tp.getPieceDescriptor().getFilePosition();
-
- if (bytePos > pieceStart + tp.bytesLength()) {
- lastEnd = pieceStart + tp.bytesLength();
- continue;
- }
-
- if (pieceStart > bytePos) {
- bytePos = lastEnd;
- }
-
- break;
- }
-
- return bytePos;
- }
-
- public int lookIndexForward(final int startBytePos) {
- if (_textPiecesFCOrder.isEmpty())
- throw new IllegalStateException("Text pieces table is empty");
-
- if (_textPiecesFCOrder.get(0).getPieceDescriptor().getFilePosition() >
startBytePos)
- return
_textPiecesFCOrder.get(0).getPieceDescriptor().getFilePosition();
-
- if (_textPiecesFCOrder.get(_textPiecesFCOrder.size() - 1)
- .getPieceDescriptor().getFilePosition() <= startBytePos)
- return startBytePos;
-
- int low = 0;
- int high = _textPiecesFCOrder.size() - 1;
-
- while (low <= high) {
- int mid = (low + high) >>> 1;
- final TextPiece textPiece = _textPiecesFCOrder.get(mid);
- int midVal = textPiece.getPieceDescriptor().getFilePosition();
-
- if (midVal < startBytePos)
- low = mid + 1;
- else if (midVal > startBytePos)
- high = mid - 1;
- else
- // found piece with exact start
- return textPiece.getPieceDescriptor().getFilePosition();
- }
- assert low == high;
- assert _textPiecesFCOrder.get(low).getPieceDescriptor()
- .getFilePosition() < startBytePos;
- // last line can't be current, can it?
- assert _textPiecesFCOrder.get(low + 1).getPieceDescriptor()
- .getFilePosition() > startBytePos;
-
- // shifting to next piece start
- return _textPiecesFCOrder.get(low +
1).getPieceDescriptor().getFilePosition();
- }
-
- public byte[] writeTo(HWPFOutputStream docStream) throws IOException {
- PlexOfCps textPlex = new PlexOfCps(PieceDescriptor.getSizeInBytes());
- // int fcMin = docStream.getOffset();
-
- for (TextPiece next : _textPieces) {
- PieceDescriptor pd = next.getPieceDescriptor();
-
- int offset = docStream.getOffset();
- int mod = (offset % POIFSConstants.SMALLER_BIG_BLOCK_SIZE);
- if (mod != 0) {
- mod = POIFSConstants.SMALLER_BIG_BLOCK_SIZE - mod;
- byte[] buf = new byte[mod];
- docStream.write(buf);
- }
-
- // set the text piece position to the current docStream offset.
- pd.setFilePosition(docStream.getOffset());
-
- // write the text to the docstream and save the piece descriptor to
- // the
- // plex which will be written later to the tableStream.
- docStream.write(next.getRawBytes());
-
- // The TextPiece is already in characters, which
- // makes our life much easier
- int nodeStart = next.getStart();
- int nodeEnd = next.getEnd();
- textPlex.addProperty(new GenericPropertyNode(nodeStart, nodeEnd,
- pd.toByteArray()));
- }
-
- return textPlex.toByteArray();
- }
-
- private static class FCComparator implements Comparator<TextPiece>,
Serializable {
- public int compare(TextPiece textPiece, TextPiece textPiece1) {
- if (textPiece.getPieceDescriptor().fc > textPiece1
- .getPieceDescriptor().fc) {
- return 1;
- } else if (textPiece.getPieceDescriptor().fc < textPiece1
- .getPieceDescriptor().fc) {
- return -1;
- } else {
- return 0;
- }
+ protected int getEncodingMultiplier(TextPiece textPiece) {
+ Charset charset = textPiece.getPieceDescriptor().getCharset();
+ if (charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset)) {
+ return 2;
}
+ return 1;
}
}
Modified:
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java Tue
Apr 4 02:06:46 2017
@@ -260,7 +260,7 @@ public class PAPBinTable
SprmBuffer sprmBuffer = null;
for ( PAPX papx : papxs )
{
- if ( papx.getGrpprl() == null || papx.getGrpprl().length == 0 )
+ if ( papx.getGrpprl() == null || papx.getGrpprl().length <= 2 )
continue;
if ( sprmBuffer == null ) {
Modified:
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java
Tue Apr 4 02:06:46 2017
@@ -17,10 +17,13 @@
package org.apache.poi.hwpf.model;
+import java.nio.charset.Charset;
+
import org.apache.poi.util.BitField;
import org.apache.poi.util.BitFieldFactory;
import org.apache.poi.util.Internal;
import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.StringUtil;
@Internal
public final class PieceDescriptor
@@ -32,29 +35,51 @@ public final class PieceDescriptor
private static BitField fCopied = BitFieldFactory.getInstance(0x04);
int fc;
PropertyModifier prm;
- boolean unicode;
+ boolean unicode = false;
+ private final Charset charset;
- public PieceDescriptor(byte[] buf, int offset)
- {
- descriptor = LittleEndian.getShort(buf, offset);
- offset += LittleEndian.SHORT_SIZE;
- fc = LittleEndian.getInt(buf, offset);
- offset += LittleEndian.INT_SIZE;
- prm = new PropertyModifier( LittleEndian.getShort(buf, offset));
-
- // see if this piece uses unicode.
- if ((fc & 0x40000000) == 0)
- {
- unicode = true;
- }
- else
- {
- unicode = false;
- fc &= ~(0x40000000);//gives me FC in doc stream
- fc /= 2;
+ public PieceDescriptor(byte[] buf, int offset) {
+ this(buf, offset, null);
}
+ /**
+ *
+ * This initializer should only be used for HWPFOldDocuments.
+ *
+ * @param buf
+ * @param offset
+ * @param charset which charset to use if this is not unicode
+ */
+ public PieceDescriptor(byte[] buf, int offset, Charset charset) {
+ descriptor = LittleEndian.getShort(buf, offset);
+ offset += LittleEndian.SHORT_SIZE;
+ fc = LittleEndian.getInt(buf, offset);
+ offset += LittleEndian.INT_SIZE;
+ prm = new PropertyModifier(LittleEndian.getShort(buf, offset));
+ if (charset == null) {
+ // see if this piece uses unicode.
+ //From the documentation: If the second most significant bit
+ //is clear, then this indicates the actual file offset of the Unicode character (two bytes). If the
+ //second most significant bit is set, then the actual address of the codepage-1252
+ //compressed version of the Unicode character (one byte), is actually at the offset indicated
+ //by clearing this bit and dividing by two.
+ if ((fc & 0x40000000) == 0) {
+ unicode = true;
+ this.charset = null;
+ } else {
+ unicode = false;
+ fc &= ~(0x40000000);//gives me FC in doc stream
+ fc /= 2;
+ this.charset = StringUtil.WIN_1252;
+ }
+ } else {
+ if (charset == StringUtil.UTF16LE) {
+ unicode = true;
+ }
+ this.charset = charset;
+ }
+
}
public int getFilePosition()
@@ -72,6 +97,15 @@ public final class PieceDescriptor
return unicode;
}
+ /**
+ *
+ * @return charset to use if this is not a Unicode PieceDescriptor
+ * this can be <code>null</code>
+ */
+ public Charset getCharset() {
+ return charset;
+ }
+
public PropertyModifier getPrm()
{
return prm;
Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java Tue
Apr 4 02:06:46 2017
@@ -21,6 +21,7 @@ package org.apache.poi.hwpf.model;
import java.nio.charset.Charset;
import org.apache.poi.util.Internal;
+import org.apache.poi.util.StringUtil;
/**
* Lightweight representation of a text piece.
@@ -40,7 +41,6 @@ public class TextPiece extends PropertyN
* @param start Beginning offset in main document stream, in characters.
* @param end Ending offset in main document stream, in characters.
* @param text The raw bytes of our text
- * @deprecated Use {@link #TextPiece(int, int, byte[], PieceDescriptor)}
* instead
*/
public TextPiece(int start, int end, byte[] text, PieceDescriptor pd,
@@ -72,8 +72,13 @@ public class TextPiece extends PropertyN
* Create the StringBuilder from the text and unicode flag
*/
private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) {
- String str = new String(text, Charset.forName(pd.isUnicode() ?
"UTF-16LE" : "Cp1252"));
+ byte[] textBuffer = text;
+ if (StringUtil.BIG5.equals(pd.getCharset())) {
+ String txt = new StringBuilder(StringUtil.littleEndianBig5Stream(text, 0, text.length)).toString();
+ return new StringBuilder(txt);
+ }
+ String str = new String(textBuffer, 0, textBuffer.length, (pd.isUnicode()) ? StringUtil.UTF16LE : pd.getCharset());
return new StringBuilder(str);
}
@@ -207,4 +212,5 @@ public class TextPiece extends PropertyN
return "TextPiece from " + getStart() + " to " + getEnd() + " ("
+ getPieceDescriptor() + ")";
}
+
}
Modified:
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
Tue Apr 4 02:06:46 2017
@@ -101,7 +101,7 @@ public class TextPieceTable implements C
System.arraycopy(documentStream, start, buf, 0, textSizeBytes);
// And now build the piece
- final TextPiece newTextPiece = new TextPiece(nodeStartChars,
nodeEndChars, buf,
+ final TextPiece newTextPiece = newTextPiece(nodeStartChars,
nodeEndChars, buf,
pieces[x]);
_textPieces.add(newTextPiece);
@@ -114,6 +114,10 @@ public class TextPieceTable implements C
Collections.sort(_textPiecesFCOrder, new FCComparator());
}
+ protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars,
byte[] buf, PieceDescriptor pd) {
+ return new TextPiece(nodeStartChars, nodeEndChars, buf, pd);
+ }
+
public void add(TextPiece piece) {
_textPieces.add(piece);
_textPiecesFCOrder.add(piece);
@@ -249,7 +253,7 @@ public class TextPieceTable implements C
if (rangeStartBytes > rangeEndBytes)
continue;
- final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1;
+ final int encodingMultiplier = getEncodingMultiplier(textPiece);
final int rangeStartCp = textPiece.getStart()
+ (rangeStartBytes - tpStart) / encodingMultiplier;
@@ -262,6 +266,10 @@ public class TextPieceTable implements C
return result.toArray(new int[result.size()][]);
}
+ protected int getEncodingMultiplier(TextPiece textPiece) {
+ return textPiece.isUnicode() ? 2 : 1;
+ }
+
public int getCpMin() {
return _cpMin;
}
@@ -439,7 +447,7 @@ public class TextPieceTable implements C
return textPlex.toByteArray();
}
- private static class FCComparator implements Comparator<TextPiece>,
Serializable {
+ protected static class FCComparator implements Comparator<TextPiece>,
Serializable {
public int compare(TextPiece textPiece, TextPiece textPiece1) {
if (textPiece.getPieceDescriptor().fc > textPiece1
.getPieceDescriptor().fc) {
Modified:
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
---
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java
(original)
+++
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java
Tue Apr 4 02:06:46 2017
@@ -18,6 +18,7 @@
package org.apache.poi.hwpf.usermodel;
import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFOldDocument;
import org.apache.poi.hwpf.model.CHPX;
import org.apache.poi.hwpf.model.FFData;
import org.apache.poi.hwpf.model.Ffn;
@@ -438,6 +439,10 @@ public final class CharacterRun extends
public String getFontName()
{
+ if (_doc instanceof HWPFOldDocument) {
+ return ((HWPFOldDocument)
_doc).getOldFontTable().getMainFont(_props.getFtcAscii());
+ }
+
if (_doc.getFontTable() == null)
// old word format
return null;
Modified:
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
---
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java
(original)
+++
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java
Tue Apr 4 02:06:46 2017
@@ -16,18 +16,19 @@
==================================================================== */
package org.apache.poi.hwpf.converter;
-import java.io.File;
-import java.io.FilenameFilter;
-import java.io.StringWriter;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
+import static org.junit.Assert.assertNotNull;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
import org.apache.poi.POIDataSamples;
import org.apache.poi.hwpf.HWPFDocumentCore;
@@ -36,8 +37,6 @@ import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
-import static org.junit.Assert.assertNotNull;
-
@RunWith(Parameterized.class)
public class TestWordToConverterSuite
{
@@ -45,7 +44,11 @@ public class TestWordToConverterSuite
* YK: a quick hack to exclude failing documents from the suite.
*/
private static List<String> failingFiles = Arrays
- .asList( "ProblemExtracting.doc" );
+ .asList( "ProblemExtracting.doc",
+ "Bug50955.doc" //basic extraction works,
+ // but these extractors modify the
document,
+ // which is a no-go for this Word 6.0 file
+ );
@Parameterized.Parameters(name="{index}: {0}")
public static Iterable<Object[]> files() {
Modified:
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
---
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
(original)
+++
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
Tue Apr 4 02:06:46 2017
@@ -57,6 +57,7 @@ import junit.framework.TestCase;
* against HWPF
*/
public class TestBugs{
+
private static final POILogger logger =
POILogFactory.getLogger(TestBugs.class);
public static void assertEqualsIgnoreNewline(String expected, String
actual )
@@ -536,13 +537,6 @@ public class TestBugs{
hwpfDocument.getPicturesTable().getAllPictures();
}
- /**
- * [FAILING] Bug 50955 - error while retrieving the text file
- */
- @Test(expected=IllegalStateException.class)
- public void test50955() throws IOException {
- getTextOldFile("Bug50955.doc");
- }
/**
* [RESOLVED FIXED] Bug 51604 - replace text fails for doc (poi 3.8 beta
Modified:
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java
URL:
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
---
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java
(original)
+++
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java
Tue Apr 4 02:06:46 2017
@@ -17,14 +17,19 @@
package org.apache.poi.hwpf.usermodel;
+import static org.apache.poi.POITestCase.assertContains;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
+import java.nio.charset.Charset;
import org.apache.poi.OldFileFormatException;
+import org.apache.poi.hwmf.record.HwmfFont;
import org.apache.poi.hwpf.HWPFOldDocument;
import org.apache.poi.hwpf.HWPFTestCase;
import org.apache.poi.hwpf.HWPFTestDataSamples;
+import org.apache.poi.hwpf.extractor.Word6Extractor;
+import org.apache.poi.hwpf.model.OldFontTable;
import org.junit.Test;
/**
@@ -98,7 +103,7 @@ public final class TestHWPFOldDocument e
assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns());
// Normal, superscript for 4th, normal
assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns());
-
+
doc.close();
}
@@ -143,4 +148,87 @@ public final class TestHWPFOldDocument e
doc.getRange().getParagraph(1).text());
doc.close();
}
+
+ @Test
+ public void testDefaultCodePageEncoding() throws IOException {
+ HWPFOldDocument doc =
HWPFTestDataSamples.openOldSampleFile("Bug60942.doc");
+ Word6Extractor ex = new Word6Extractor(doc);
+ String txt = ex.getText();
+ assertContains(txt, "BERTHOD");
+ assertContains(txt, "APPLICOLOR");
+ assertContains(txt, "les meilleurs");
+ assertContains(txt, "GUY LECOLE");
+ }
+
+
+ @Test
+ public void testCodePageBug50955() throws IOException {
+ //windows 1251
+ HWPFOldDocument doc =
HWPFTestDataSamples.openOldSampleFile("Bug50955.doc");
+ Word6Extractor ex = new Word6Extractor(doc);
+
+ StringBuilder sb = new StringBuilder();
+ for (String p : ex.getParagraphText()) {
+ sb.append(p);
+ }
+ assertContains(sb.toString(),
"\u043F\u0440\u0438\u0432\u0435\u0442");//Greetings!
+ }
+
+ @Test
+ public void testCodePageBug60936() throws IOException {
+ //windows 1250 -- this test file was generated with OpenOffice
+ //see https://bz.apache.org/ooo/show_bug.cgi?id=12445 for the
inspiration
+
+
+ HWPFOldDocument doc =
HWPFTestDataSamples.openOldSampleFile("Bug60936.doc");
+ Word6Extractor ex = new Word6Extractor(doc);
+ StringBuilder sb = new StringBuilder();
+ for (String p : ex.getParagraphText()) {
+ sb.append(p);
+ }
+ assertContains(sb.toString(), "4 sk\u00f3re a p\u0159ed 7
lety");//Czech sample text
+ }
+
+ @Test
+ public void testOldFontTableEncoding() throws IOException {
+ HWPFOldDocument doc =
HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
+ OldFontTable oldFontTable = doc.getOldFontTable();
+ assertEquals(5, oldFontTable.getFontNames().length);
+ assertEquals("\u7D30\u660E\u9AD4",
oldFontTable.getFontNames()[0].getMainFontName());
+ assertEquals(HwmfFont.WmfCharset.CHINESEBIG5_CHARSET.getCharset(),
Charset.forName("Big5"));
+ assertEquals("Times New Roman",
oldFontTable.getFontNames()[1].getMainFontName());
+ doc.close();
+
+ }
+
+ @Test
+ public void testOldFontTableAltName() throws IOException {
+ HWPFOldDocument doc =
HWPFTestDataSamples.openOldSampleFile("Bug60942b.doc");
+ OldFontTable oldFontTable = doc.getOldFontTable();
+ assertEquals(5, oldFontTable.getFontNames().length);
+ assertEquals("Roboto",
oldFontTable.getFontNames()[3].getMainFontName());
+ assertEquals("arial", oldFontTable.getFontNames()[3].getAltFontName());
+ assertEquals("Roboto",
oldFontTable.getFontNames()[4].getMainFontName());
+ assertEquals("arial", oldFontTable.getFontNames()[4].getAltFontName());
+ }
+
+
+ @Test
+ public void test51944() throws IOException {
+ HWPFOldDocument doc =
HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
+ Word6Extractor ex = new Word6Extractor(doc);
+ StringBuilder sb = new StringBuilder();
+ for (String p : ex.getParagraphText()) {
+ sb.append(p.replaceAll("[\r\n]+", "\n"));
+ }
+ String txt = sb.toString();
+ assertContains(txt, "Post and Fax");
+ assertContains(txt, "also maintain");//this is at a critical juncture
+ assertContains(txt, "which are available for");//this too
+
+ //TODO: figure out why these two aren't passing
+// assertContains(txt, "\u2019\u0078 block2");//make sure smart quote
is extracted correctly
+// assertContains(txt, "We are able to");//not sure if we can get this
easily?
+ }
+
}
Added: poi/trunk/test-data/document/Bug60936.doc
URL:
http://svn.apache.org/viewvc/poi/trunk/test-data/document/Bug60936.doc?rev=1790061&view=auto
==============================================================================
Binary file - no diff available.
Propchange: poi/trunk/test-data/document/Bug60936.doc
------------------------------------------------------------------------------
svn:mime-type = application/msword
Added: poi/trunk/test-data/document/Bug60942.doc
URL:
http://svn.apache.org/viewvc/poi/trunk/test-data/document/Bug60942.doc?rev=1790061&view=auto
==============================================================================
Binary file - no diff available.
Propchange: poi/trunk/test-data/document/Bug60942.doc
------------------------------------------------------------------------------
svn:mime-type = application/msword
Added: poi/trunk/test-data/document/Bug60942b.doc
URL:
http://svn.apache.org/viewvc/poi/trunk/test-data/document/Bug60942b.doc?rev=1790061&view=auto
==============================================================================
Binary file - no diff available.
Propchange: poi/trunk/test-data/document/Bug60942b.doc
------------------------------------------------------------------------------
svn:mime-type = application/msword
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]