Author: tallison
Date: Tue Apr  4 02:06:46 2017
New Revision: 1790061

URL: http://svn.apache.org/viewvc?rev=1790061&view=rev
Log:
bug 50955 -- word 6.0 charset fix

Added:
    poi/trunk/src/java/org/apache/poi/util/LittleEndianBig5Stream.java   (with 
props)
    
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java
      - copied, changed from r1788131, 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java
      - copied, changed from r1788131, 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/Ffn.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java
      - copied, changed from r1788131, 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/FontTable.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java
      - copied, changed from r1788131, 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
    
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java
      - copied, changed from r1788131, 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
    poi/trunk/test-data/document/Bug60936.doc   (with props)
    poi/trunk/test-data/document/Bug60942.doc   (with props)
    poi/trunk/test-data/document/Bug60942b.doc   (with props)
Modified:
    poi/site/src/documentation/content/xdocs/status.xml
    poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java
    poi/trunk/src/java/org/apache/poi/util/CodePageUtil.java
    poi/trunk/src/java/org/apache/poi/util/StringUtil.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java
    
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java
    
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
    
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java

Modified: poi/site/src/documentation/content/xdocs/status.xml
URL: 
http://svn.apache.org/viewvc/poi/site/src/documentation/content/xdocs/status.xml?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/site/src/documentation/content/xdocs/status.xml (original)
+++ poi/site/src/documentation/content/xdocs/status.xml Tue Apr  4 02:06:46 2017
@@ -58,6 +58,7 @@
 
     <release version="3.16-beta3" date="2017-04-??">
     <actions>
+        <action dev="PD" type="fix" fixes-bug="50955" module="HWPF">Fix 
charset handling in HWPFOldDocument</action>
         <action dev="PD" type="add" fixes-bug="60826" module="XSSF">Add 
initial streaming, read-only support for xlsb files</action>
         <action dev="PD" type="fix" fixes-bug="51519" module="XSSF">Allow user 
to select or ignore phonetic strings in shared strings table</action>
         <action dev="PD" type="fix" fixes-bug="60662" module="XSLF">Slide 
import delete unrecognized elements in group shape</action>

Modified: poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java Tue Apr  4 
02:06:46 2017
@@ -218,6 +218,9 @@ public class TestAllFiles {
         "document/Word6_sections2.doc",
         "document/Word95.doc",
         "document/word95err.doc",
+        "document/Bug60936.doc",
+        "document/Bug60942.doc",
+        "document/Bug60942b.doc",
         "hpsf/TestMickey.doc",
         "document/52117.doc"
     );

Modified: poi/trunk/src/java/org/apache/poi/util/CodePageUtil.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/util/CodePageUtil.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/util/CodePageUtil.java (original)
+++ poi/trunk/src/java/org/apache/poi/util/CodePageUtil.java Tue Apr  4 
02:06:46 2017
@@ -18,6 +18,9 @@
 package org.apache.poi.util;
 
 import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
+import java.util.HashSet;
+import java.util.Set;
 
 /**
  * Utilities for working with Microsoft CodePages.
@@ -27,6 +30,13 @@ import java.io.UnsupportedEncodingExcept
  */
 public class CodePageUtil
 {
+
+    public static final Set<Charset> VARIABLE_BYTE_CHARSETS = new 
HashSet<Charset>();
+    static {
+        //others?
+        VARIABLE_BYTE_CHARSETS.add(StringUtil.BIG5);
+    }
+
     /** <p>Codepage 037, a special case</p> */
     public static final int CP_037 = 37;
 

Added: poi/trunk/src/java/org/apache/poi/util/LittleEndianBig5Stream.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/util/LittleEndianBig5Stream.java?rev=1790061&view=auto
==============================================================================
--- poi/trunk/src/java/org/apache/poi/util/LittleEndianBig5Stream.java (added)
+++ poi/trunk/src/java/org/apache/poi/util/LittleEndianBig5Stream.java Tue Apr  
4 02:06:46 2017
@@ -0,0 +1,107 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.util;
+
+import java.io.ByteArrayInputStream;
+
+/**
+ * Stream that converts MSOffice's way of storing Big5 (little-endian byte
+ * order, with zero-byte padding for ASCII) into standard Big5 byte order.
+ */
+@Internal
+public class LittleEndianBig5Stream extends ByteArrayInputStream {
+    private static final int EOF = -1;
+    private static final int INVALID_PAIR = -2;
+    private static final int EMPTY_TRAILING = -3;
+
+    //the byte that is logically trailing in Big5 encoding;
+    //in little-endian order, however, it is the first one encountered.
+    int trailing = EMPTY_TRAILING;
+    public LittleEndianBig5Stream(byte[] buf) {
+        super(buf);
+    }
+
+    public LittleEndianBig5Stream(byte[] buf, int offset, int length) {
+        super(buf, offset, length);
+    }
+
+    @Override
+    public int read() {
+
+        if (trailing != EMPTY_TRAILING) {
+            int tmp = trailing;
+            trailing = EMPTY_TRAILING;
+            return tmp;
+        }
+        int leading = readNext();
+        while (leading == INVALID_PAIR) {
+            leading = readNext();
+        }
+
+        if (leading == EOF) {
+            return EOF;
+        }
+        return leading;
+    }
+
+    //returns leading, sets trailing appropriately
+    //returns -1 if it hits the end of the stream
+    //would return -2 for an invalid big5 code pair, but that check is currently disabled
+    private final int readNext() {
+        trailing = super.read();
+        if (trailing == -1) {
+            return EOF;
+        }
+        int leading = super.read();
+        if (leading == EOF) {
+            return EOF;
+        }
+        int lead = leading&0xff;
+        if (lead > 0x80) {
+            return leading;
+        } else if (lead == 0) {
+            int ret = trailing;
+            trailing = EMPTY_TRAILING;
+            return ret;
+        } else {
+            int ret = trailing;
+            trailing = EMPTY_TRAILING;
+            return ret;
+            //return INVALID_PAIR;
+        }
+
+    }
+
+    @Override
+    public int read(byte[] buff, int off, int len) {
+        int bytesRead = 0;
+        for (int i = off; i < off+len; i++) {
+            int b = read();
+            if (b == -1) {
+                if (bytesRead == 0) {
+                    return -1;
+                } else {
+                    return bytesRead;
+                }
+            }
+            bytesRead++;
+            buff[i] = (byte)b;
+        }
+        return bytesRead;
+    }
+}

Propchange: poi/trunk/src/java/org/apache/poi/util/LittleEndianBig5Stream.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: poi/trunk/src/java/org/apache/poi/util/StringUtil.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/util/StringUtil.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/util/StringUtil.java (original)
+++ poi/trunk/src/java/org/apache/poi/util/StringUtil.java Tue Apr  4 02:06:46 
2017
@@ -17,6 +17,8 @@
 
 package org.apache.poi.util;
 
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
 import java.nio.charset.Charset;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -27,9 +29,14 @@ import java.util.Map;
  */
 @Internal
 public class StringUtil {
+
+    private static final POILogger logger = POILogFactory
+            .getLogger(StringUtil.class);
     protected static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
-    protected static final Charset UTF16LE = Charset.forName("UTF-16LE");
+    public static final Charset UTF16LE = Charset.forName("UTF-16LE");
     public static final Charset UTF8 = Charset.forName("UTF-8");
+    public static final Charset WIN_1252 = Charset.forName("cp1252");
+    public static final Charset BIG5 = Charset.forName("Big5");
 
     private static Map<Integer,Integer> msCodepointToUnicode;
 
@@ -573,7 +580,28 @@ public class StringUtil {
        9133, // 0xf0fe bracerightbt
        ' ', // 0xf0ff not defined
    };
-   
+
+    /**
+     * This tries to convert a LE byte array in Big5 to a String.
+     * We know MS zero-padded ascii, and we drop those.
+     * However, there may be areas for improvement in this.
+     *
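+     * @param data bytes containing the little-endian Big5 text
+     * @param offset index in data of the first byte to convert
+     * @param lengthInBytes number of bytes to convert, starting at offset
+     * @return the decoded String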
+     */
+   public static String littleEndianBig5Stream(byte[] data, int offset, int 
lengthInBytes) {
+       ByteArrayOutputStream os = new ByteArrayOutputStream();
+       try {
+           IOUtils.copy(new LittleEndianBig5Stream(data, offset, 
lengthInBytes), os);
+       } catch (IOException e) {
+           logger.log(POILogger.WARN,
+                   "IOException while copying a byte array stream to a byte 
array stream?!");
+       }
+       return new String(os.toByteArray(), BIG5);
+   }
+
    // Could be replaced with org.apache.commons.lang3.StringUtils#join
    @Internal
    public static String join(Object[] array, String separator) {

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java 
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java Tue 
Apr  4 02:06:46 2017
@@ -108,7 +108,7 @@ public class HwmfFont {
             return charset;
         }
 
-        static WmfCharset valueOf(int flag) {
+        public static WmfCharset valueOf(int flag) {
             for (WmfCharset cs : values()) {
                 if (cs.flag == flag) return cs;
             }

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java 
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java Tue 
Apr  4 02:06:46 2017
@@ -19,27 +19,43 @@ package org.apache.poi.hwpf;
 import java.io.File;
 import java.io.IOException;
 import java.io.OutputStream;
+import java.nio.charset.Charset;
 
+import org.apache.poi.hwmf.record.HwmfFont;
 import org.apache.poi.hwpf.model.ComplexFileTable;
+import org.apache.poi.hwpf.model.FontTable;
 import org.apache.poi.hwpf.model.OldCHPBinTable;
+import org.apache.poi.hwpf.model.OldComplexFileTable;
+import org.apache.poi.hwpf.model.OldFfn;
+import org.apache.poi.hwpf.model.OldFontTable;
 import org.apache.poi.hwpf.model.OldPAPBinTable;
 import org.apache.poi.hwpf.model.OldSectionTable;
+import org.apache.poi.hwpf.model.OldTextPieceTable;
 import org.apache.poi.hwpf.model.PieceDescriptor;
 import org.apache.poi.hwpf.model.TextPiece;
 import org.apache.poi.hwpf.model.TextPieceTable;
 import org.apache.poi.hwpf.usermodel.Range;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.CodePageUtil;
 import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.NotImplemented;
+import org.apache.poi.util.StringUtil;
 
 /**
  * Provides very simple support for old (Word 6 / Word 95)
  *  files.
  */
 public class HWPFOldDocument extends HWPFDocumentCore {
-    private TextPieceTable tpt;
+
+    private final static Charset DEFAULT_CHARSET = StringUtil.WIN_1252;
+
+    private OldTextPieceTable tpt;
     
     private StringBuilder _text;
+
+    private final OldFontTable fontTable;
+    private final Charset guessedCharset;
     
     public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
         this(fs.getRoot());
@@ -56,45 +72,52 @@ public class HWPFOldDocument extends HWP
         int chpTableSize   = LittleEndian.getInt(_mainStream, 0xbc);
         int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0);
         int papTableSize   = LittleEndian.getInt(_mainStream, 0xc4);
-        //int shfTableOffset = LittleEndian.getInt(_mainStream, 0x60);
-        //int shfTableSize   = LittleEndian.getInt(_mainStream, 0x64);
+        int fontTableOffset = LittleEndian.getInt(_mainStream, 0xd0);
+        int fontTableSize = LittleEndian.getInt(_mainStream, 0xd4);
+
+        fontTable = new OldFontTable(_mainStream, fontTableOffset, 
fontTableSize);
+        //TODO: figure out how to map runs/text pieces to fonts
+        //for now, if there's a non standard codepage in one of the fonts
+        //assume that the doc is in that codepage.
+        guessedCharset = guessCodePage(fontTable);
+
         int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
         
         // We need to get hold of the text that makes up the
         //  document, which might be regular or fast-saved
         ComplexFileTable cft = null;
-        StringBuffer text = new StringBuffer();
         if(_fib.getFibBase().isFComplex()) {
-            cft = new ComplexFileTable(
+            cft = new OldComplexFileTable(
                     _mainStream, _mainStream,
-                    complexTableOffset, _fib.getFibBase().getFcMin()
+                    complexTableOffset, _fib.getFibBase().getFcMin(), 
guessedCharset
             );
-            tpt = cft.getTextPieceTable();
+            tpt = (OldTextPieceTable)cft.getTextPieceTable();
             
-            for(TextPiece tp : tpt.getTextPieces()) {
-                text.append( tp.getStringBuilder() );
-            }
         } else {
             // TODO Discover if these older documents can ever hold Unicode 
Strings?
             //  (We think not, because they seem to lack a Piece table)
             // TODO Build the Piece Descriptor properly
             //  (We have to fake it, as they don't seem to have a proper Piece 
table)
-            PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 
0,0,0,127, 0,0}, 0);
+            PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 
0,0,0,127, 0,0}, 0, guessedCharset);
             pd.setFilePosition(_fib.getFibBase().getFcMin());
 
             // Generate a single Text Piece Table, with a single Text Piece
             //  which covers all the (8 bit only) text in the file
-            tpt = new TextPieceTable();
+            tpt = new OldTextPieceTable();
             byte[] textData = new 
byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()];
             System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), 
textData, 0, textData.length);
+
+            int numChars = textData.length;
+            if (CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(guessedCharset)) {
+                numChars /= 2;
+            }
+
             TextPiece tp = new TextPiece(
-                    0, textData.length, textData, pd
+                    0, numChars, textData, pd
             );
             tpt.add(tp);
             
-            text.append(tp.getStringBuilder());
         }
-        
         _text = tpt.getText();
 
         // Now we can fetch the character and paragraph properties
@@ -133,12 +156,54 @@ public class HWPFOldDocument extends HWP
         }
     }
 
+
+    /**
+     * Take the first codepage that is not default, ansi or symbol.
+     * Ideally, we'd want to track fonts with runs, but we don't yet
+     * know how to do that.
+     *
+     * Consider throwing an exception if > 1 unique codepage that is not 
default, symbol or ansi
+     * appears here.
+     *
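+     * @param fontTable font table whose fonts' charsets are inspected
+     * @return the first such charset found, or windows-1252 if there is none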
+     */
+    private Charset guessCodePage(OldFontTable fontTable) {
+
+        for (OldFfn oldFfn : fontTable.getFontNames()) {
+            HwmfFont.WmfCharset wmfCharset = 
HwmfFont.WmfCharset.valueOf(oldFfn.getChs()& 0xff);
+            if (wmfCharset != null &&
+                    wmfCharset != HwmfFont.WmfCharset.ANSI_CHARSET &&
+                    wmfCharset != HwmfFont.WmfCharset.DEFAULT_CHARSET &&
+                    wmfCharset != HwmfFont.WmfCharset.SYMBOL_CHARSET ) {
+                return wmfCharset.getCharset();
+            }
+        }
+        return DEFAULT_CHARSET;
+    }
+
     public Range getOverallRange()
     {
         // Life is easy when we have no footers, headers or unicode!
         return new Range( 0, _fib.getFibBase().getFcMac() - 
_fib.getFibBase().getFcMin(), this );
     }
 
+    /**
+     * Use {@link #getOldFontTable()} instead!!!
+     * This always throws an UnsupportedOperationException.
+     *
+     * @return nothing
+     * @throws UnsupportedOperationException
+     */
+    @Override
+    @NotImplemented
+    public FontTable getFontTable() {
+        throw new UnsupportedOperationException("Use getOldFontTable 
instead.");
+    }
+
+    public OldFontTable getOldFontTable() {
+        return fontTable;
+    }
     public Range getRange()
     {
         return getOverallRange();
@@ -167,4 +232,19 @@ public class HWPFOldDocument extends HWP
     public void write(OutputStream out) throws IOException {
         throw new IllegalStateException("Writing is not available for the 
older file formats");
     }
+
+    /**
+     * As a rough heuristic (total hack), read through the font table
+     * and take the first non-default, non-ansi, non-symbol
+     * font's charset and return that.
+     *
+     * Once we figure out how to link a font to a text piece, we should
+     * use the font information per text piece.
+     *
+     * @return charset
+     */
+    public Charset getGuessedCharset() {
+        return guessedCharset;
+    }
+
 }

Modified: 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java 
(original)
+++ 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java 
Tue Apr  4 02:06:46 2017
@@ -18,6 +18,7 @@
 package org.apache.poi.hwpf.model;
 
 import java.io.IOException;
+import java.nio.charset.Charset;
 import java.util.LinkedList;
 import java.util.List;
 
@@ -26,9 +27,10 @@ import org.apache.poi.hwpf.model.io.HWPF
 import org.apache.poi.hwpf.sprm.SprmBuffer;
 import org.apache.poi.util.Internal;
 import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.StringUtil;
 
 @Internal
-public final class ComplexFileTable {
+public class ComplexFileTable {
     private static final byte GRPPRL_TYPE = 1;
     private static final byte TEXT_PIECE_TABLE_TYPE = 2;
 
@@ -40,7 +42,8 @@ public final class ComplexFileTable {
         _tpt = new TextPieceTable();
     }
 
-    public ComplexFileTable(byte[] documentStream, byte[] tableStream, int 
offset, int fcMin) throws IOException {
+    protected ComplexFileTable(byte[] documentStream, byte[] tableStream, int 
offset, int fcMin,
+                               Charset charset) throws IOException {
         //skips through the prms before we reach the piece table. These 
contain data
         //for actual fast saved files
         List<SprmBuffer> sprmBuffers = new LinkedList<SprmBuffer>();
@@ -61,7 +64,12 @@ public final class ComplexFileTable {
         }
         int pieceTableSize = LittleEndian.getInt(tableStream, ++offset);
         offset += LittleEndian.INT_SIZE;
-        _tpt = new TextPieceTable(documentStream, tableStream, offset, 
pieceTableSize, fcMin);
+        _tpt = newTextPieceTable(documentStream, tableStream, offset, 
pieceTableSize, fcMin, charset);
+
+    }
+
+    public ComplexFileTable(byte[] documentStream, byte[] tableStream, int 
offset, int fcMin) throws IOException {
+        this(documentStream, tableStream, offset, fcMin, StringUtil.WIN_1252);
     }
 
     public TextPieceTable getTextPieceTable() {
@@ -92,4 +100,11 @@ public final class ComplexFileTable {
         tableStream.write(table);
     }
 
+    protected TextPieceTable newTextPieceTable(byte[] documentStream,
+                                               byte[] tableStream, int offset, 
int pieceTableSize, int fcMin,
+                                               Charset charset) {
+        return new TextPieceTable(documentStream, tableStream, offset, 
pieceTableSize, fcMin);
+    }
+
+
 }

Modified: 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java 
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java 
Tue Apr  4 02:06:46 2017
@@ -44,7 +44,7 @@ public final class OldCHPBinTable extend
    * @param fcMin
    */
   public OldCHPBinTable(byte[] documentStream, int offset,
-                     int size, int fcMin, TextPieceTable tpt)
+                     int size, int fcMin, OldTextPieceTable tpt)
   {
     PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
 

Copied: 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java 
(from r1788131, 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java)
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java?p2=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java&p1=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java&r1=1788131&r2=1790061&rev=1790061&view=diff
==============================================================================
--- 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java 
(original)
+++ 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java 
Tue Apr  4 02:06:46 2017
@@ -18,78 +18,25 @@
 package org.apache.poi.hwpf.model;
 
 import java.io.IOException;
-import java.util.LinkedList;
-import java.util.List;
+import java.nio.charset.Charset;
 
-import org.apache.poi.hwpf.model.io.HWPFFileSystem;
-import org.apache.poi.hwpf.model.io.HWPFOutputStream;
-import org.apache.poi.hwpf.sprm.SprmBuffer;
 import org.apache.poi.util.Internal;
-import org.apache.poi.util.LittleEndian;
 
 @Internal
-public final class ComplexFileTable {
-    private static final byte GRPPRL_TYPE = 1;
-    private static final byte TEXT_PIECE_TABLE_TYPE = 2;
+public final class OldComplexFileTable extends ComplexFileTable {
 
-    protected TextPieceTable _tpt;
-
-    private SprmBuffer[] _grpprls;
-
-    public ComplexFileTable() {
-        _tpt = new TextPieceTable();
+    public OldComplexFileTable(byte[] documentStream, byte[] tableStream,
+                               int offset, int fcMin, Charset charset) throws 
IOException {
+        super(documentStream, tableStream, offset, fcMin, charset);
     }
 
-    public ComplexFileTable(byte[] documentStream, byte[] tableStream, int 
offset, int fcMin) throws IOException {
-        //skips through the prms before we reach the piece table. These 
contain data
-        //for actual fast saved files
-        List<SprmBuffer> sprmBuffers = new LinkedList<SprmBuffer>();
-        while (tableStream[offset] == GRPPRL_TYPE) {
-            offset++;
-            int size = LittleEndian.getShort(tableStream, offset);
-            offset += LittleEndian.SHORT_SIZE;
-            byte[] bs = LittleEndian.getByteArray(tableStream, offset, size);
-            offset += size;
-
-            SprmBuffer sprmBuffer = new SprmBuffer(bs, false, 0);
-            sprmBuffers.add(sprmBuffer);
-        }
-        this._grpprls = sprmBuffers.toArray(new 
SprmBuffer[sprmBuffers.size()]);
-
-        if (tableStream[offset] != TEXT_PIECE_TABLE_TYPE) {
-            throw new IOException("The text piece table is corrupted");
-        }
-        int pieceTableSize = LittleEndian.getInt(tableStream, ++offset);
-        offset += LittleEndian.INT_SIZE;
-        _tpt = new TextPieceTable(documentStream, tableStream, offset, 
pieceTableSize, fcMin);
-    }
-
-    public TextPieceTable getTextPieceTable() {
-        return _tpt;
-    }
 
-    public SprmBuffer[] getGrpprls() {
-        return _grpprls;
+    @Override
+    protected TextPieceTable newTextPieceTable(byte[] documentStream,
+                                               byte[] tableStream, int offset,
+                                               int pieceTableSize, int fcMin, 
Charset charset) {
+        return new OldTextPieceTable(documentStream, tableStream, offset, 
pieceTableSize, fcMin, charset);
     }
 
-    @Deprecated
-    public void writeTo(HWPFFileSystem sys) throws IOException {
-        HWPFOutputStream docStream = sys.getStream("WordDocument");
-        HWPFOutputStream tableStream = sys.getStream("1Table");
-
-        writeTo(docStream, tableStream);
-    }
-
-    public void writeTo(HWPFOutputStream wordDocumentStream,
-                        HWPFOutputStream tableStream) throws IOException {
-        tableStream.write(TEXT_PIECE_TABLE_TYPE);
-
-        byte[] table = _tpt.writeTo(wordDocumentStream);
-
-        byte[] numHolder = new byte[LittleEndian.INT_SIZE];
-        LittleEndian.putInt(numHolder, 0, table.length);
-        tableStream.write(numHolder);
-        tableStream.write(table);
-    }
 
 }

Copied: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java 
(from r1788131, poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/Ffn.java)
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java?p2=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java&p1=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/Ffn.java&r1=1788131&r2=1790061&rev=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/Ffn.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java Tue Apr  
4 02:06:46 2017
@@ -17,192 +17,144 @@
 
 package org.apache.poi.hwpf.model;
 
-import java.util.Arrays;
+import java.nio.charset.Charset;
 
-import org.apache.poi.util.BitField;
-import org.apache.poi.util.BitFieldFactory;
+import org.apache.poi.hwmf.record.HwmfFont;
 import org.apache.poi.util.Internal;
 import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+import org.apache.poi.util.StringUtil;
 
 /**
- * FFN - Font Family Name. FFN is a data structure that stores the names of 
the Main
- * Font and that of Alternate font as an array of characters. It has also a 
header
- * that stores info about the whole structure and the fonts
- *
- * @author Praveen Mathew
+ * Word 6.0 Font information
  */
 @Internal
-public final class Ffn
-{
-  private int _cbFfnM1;//total length of FFN - 1.
-  private byte _info;
-    private static BitField _prq = BitFieldFactory.getInstance(0x0003);// 
pitch request
-    private static BitField _fTrueType = 
BitFieldFactory.getInstance(0x0004);// when 1, font is a TrueType font
-    private static BitField _ff = BitFieldFactory.getInstance(0x0070);
-  private short _wWeight;// base weight of font
-  private byte _chs;// character set identifier
-  private byte _ixchSzAlt;  // index into ffn.szFfn to the name of
-                                  // the alternate font
-  private byte [] _panose = new byte[10];//????
-  private byte [] _fontSig = new byte[24];//????
-
-  // zero terminated string that records name of font, cuurently not
-  // supporting Extended chars
-  private char [] _xszFfn;
-
-  // extra facilitator members
-  private int _xszFfnLength;
-
-  public Ffn(byte[] buf, int offset)
-  {
-    int offsetTmp = offset;
-
-    _cbFfnM1 = LittleEndian.getUByte(buf,offset);
-    offset += LittleEndian.BYTE_SIZE;
-    _info = buf[offset];
-    offset += LittleEndian.BYTE_SIZE;
-    _wWeight = LittleEndian.getShort(buf, offset);
-    offset += LittleEndian.SHORT_SIZE;
-    _chs = buf[offset];
-    offset += LittleEndian.BYTE_SIZE;
-    _ixchSzAlt = buf[offset];
-    offset += LittleEndian.BYTE_SIZE;
-
-    // read panose and fs so we can write them back out.
-    System.arraycopy(buf, offset, _panose, 0, _panose.length);
-    offset += _panose.length;
-    System.arraycopy(buf, offset, _fontSig, 0, _fontSig.length);
-    offset += _fontSig.length;
-
-    offsetTmp = offset - offsetTmp;
-    _xszFfnLength = (this.getSize() - offsetTmp)/2;
-    _xszFfn = new char[_xszFfnLength];
-
-    for(int i = 0; i < _xszFfnLength; i++)
-    {
-      _xszFfn[i] = (char)LittleEndian.getShort(buf, offset);
-      offset += LittleEndian.SHORT_SIZE;
-    }
-
-
-  }
-
-  public int get_cbFfnM1()
-  {
-    return  _cbFfnM1;
-  }
-
-  public short getWeight()
-  {
-         return  _wWeight;
-  }
-
-  public byte getChs()
-  {
-         return  _chs;
-  }
-
-  public byte [] getPanose()
-  {
-         return  _panose;
-  }
-
-  public byte [] getFontSig()
-  {
-         return  _fontSig;
-  }
-
-  public int getSize()
-  {
-    return (_cbFfnM1 + 1);
-  }
-
-  public String getMainFontName()
-  {
-    int index = 0;
-    for (;index < _xszFfnLength; index++)
-    {
-      if (_xszFfn[index] == '\0')
-      {
-        break;
-      }
-    }
-    return new String(_xszFfn, 0, index);
-  }
-
-  public String getAltFontName()
-  {
-    int index = _ixchSzAlt;
-    for (;index < _xszFfnLength; index++)
-    {
-      if (_xszFfn[index] == '\0')
-      {
-        break;
-      }
-    }
-    return new String(_xszFfn, _ixchSzAlt, index);
-
-  }
-
-  public void set_cbFfnM1(int _cbFfnM1)
-  {
-    this._cbFfnM1 = _cbFfnM1;
-  }
-
-  // changed protected to public
-  public byte[] toByteArray()
-  {
-    int offset = 0;
-    byte[] buf = new byte[this.getSize()];
-
-    buf[offset] = (byte)_cbFfnM1;
-    offset += LittleEndian.BYTE_SIZE;
-    buf[offset] = _info;
-    offset += LittleEndian.BYTE_SIZE;
-    LittleEndian.putShort(buf, offset, _wWeight);
-    offset += LittleEndian.SHORT_SIZE;
-    buf[offset] = _chs;
-    offset += LittleEndian.BYTE_SIZE;
-    buf[offset] = _ixchSzAlt;
-    offset += LittleEndian.BYTE_SIZE;
-
-    System.arraycopy(_panose,0,buf, offset,_panose.length);
-    offset += _panose.length;
-    System.arraycopy(_fontSig,0,buf, offset, _fontSig.length);
-    offset += _fontSig.length;
-
-    for(int i = 0; i < _xszFfn.length; i++)
-    {
-      LittleEndian.putShort(buf, offset, (short)_xszFfn[i]);
-      offset += LittleEndian.SHORT_SIZE;
-    }
-
-    return buf;
-
-  }
-
-  @Override
-  public boolean equals(Object other) {
-      if (!(other instanceof Ffn)) return false;
-      Ffn o = (Ffn)other;
-      
-      return (
-             o._cbFfnM1 == this._cbFfnM1
-          && o._info == this._info
-          && o._wWeight == _wWeight
-          && o._chs == _chs
-          && o._ixchSzAlt == _ixchSzAlt
-          && Arrays.equals(o._panose,_panose)
-          && Arrays.equals(o._fontSig,_fontSig)
-          && Arrays.equals(o._xszFfn,_xszFfn)
-      );
-  }
+public final class OldFfn {
 
+    private static final POILogger logger = POILogFactory.getLogger(OldFfn.class);
+
+    private final byte _chs;// character set identifier
+
+    private final String fontName;
+    //null when the record carries no alternate font name
+    private final String altFontName;
+
+    private final int length; //length in bytes for this record
+
+    /**
+     * Try to read an OldFfn starting at offset; read no farther than
+     * fontTableEnd and never past the end of buf.
+     * <p>
+     * Record layout as parsed here: one length byte, three skipped bytes,
+     * the charset identifier, one more skipped byte, then the zero-terminated
+     * font name, optionally followed by a zero-terminated alternate name.
+     *
+     * @param buf          buffer from which to read
+     * @param offset       offset at which to start
+     * @param fontTableEnd read no farther than this
+     * @return an OldFfn, or null if the record would extend beyond the end
+     *         or no zero-terminated font name is found
+     */
+    static OldFfn build(byte[] buf, int offset, int fontTableEnd) {
+        int start = offset;
+        //a corrupt table may claim to extend past the buffer; clamp so that
+        //the scans below can never index outside buf
+        int end = Math.min(fontTableEnd, buf.length);
+        //preliminary bytes
+        if (offset + 6 > end) {
+            return null;
+        }
+        //first byte: the record length. Mask it, because a plain byte->int
+        //widening turns lengths above 127 into negative numbers, which would
+        //defeat both the bounds check and the alt-name lookup below.
+        int fontDescriptionLength = buf[offset] & 0xff;
+        offset += 1;
+        if (offset + fontDescriptionLength > end) {
+            logger.log(POILogger.WARN, "Asked to read beyond font table end. Skipping font");
+            return null;
+        }
+
+        //no idea what these 3 bytes do
+        offset += 3;
+        byte chs = buf[offset];
+        Charset charset = null;
+        HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(chs & 0xff);
+        if (wmfCharset == null) {
+            logger.log(POILogger.WARN, "Couldn't find font for type: " + (chs & 0xff));
+        } else {
+            charset = wmfCharset.getCharset();
+        }
+        //unknown identifiers and identifiers without a charset fall back
+        //to StringUtil.WIN_1252
+        charset = charset == null ? StringUtil.WIN_1252 : charset;
+        offset += LittleEndian.BYTE_SIZE;
+        //if this byte here == 7, it _may_ signify existence of
+        //an alternate font name
+
+        //not sure what the byte after the _chs does
+        offset += LittleEndian.BYTE_SIZE;
+        int fontNameLength = -1;
+        for (int i = offset; i < end; i++) {
+            if (buf[i] == 0) {
+                fontNameLength = i - offset;
+                break;
+            }
+        }
+        if (fontNameLength == -1) {
+            logger.log(POILogger.WARN, "Couldn't find the zero-byte delimited font name length");
+            return null;
+        }
+        String fontName = new String(buf, offset, fontNameLength, charset);
+        String altFontName = null;
+        int altFontNameLength = -1;
+        offset += fontNameLength + 1;
+        if (offset - start < fontDescriptionLength) {
+            //the last index scanned is start + fontDescriptionLength, which
+            //the bounds check above guarantees lies inside the clamped table
+            for (int i = offset; i <= start + fontDescriptionLength; i++) {
+                if (buf[i] == 0) {
+                    altFontNameLength = i - offset;
+                    break;
+                }
+            }
+            if (altFontNameLength > -1) {
+                altFontName = new String(buf, offset, altFontNameLength, charset);
+            }
+        }
+        //reset to 0 for length calculation
+        altFontNameLength = (altFontNameLength < 0) ? 0 : altFontNameLength + 1;//add one for zero byte
+
+        int len = LittleEndian.INT_SIZE + LittleEndian.BYTE_SIZE + LittleEndian.BYTE_SIZE +//6 starting bytes
+                fontNameLength + altFontNameLength + 1;//+1 is for the zero byte
+        //NOTE(review): this len is expected to agree with the length announced
+        //by the first byte -- confirm whether that byte stores length or length - 1
+
+        return new OldFfn(chs, fontName, altFontName, len);
+
+    }
+
+    /**
+     * @param charsetIdentifier raw character set identifier byte
+     * @param fontName          main font name
+     * @param altFontName       alternate font name, or null if there is none
+     * @param length            length in bytes of the record
+     */
+    public OldFfn(byte charsetIdentifier, String fontName, String altFontName, int length) {
+        this._chs = charsetIdentifier;
+        this.fontName = fontName;
+        this.altFontName = altFontName;
+        this.length = length;
+    }
+
+    /**
+     * @return the raw (signed) character set identifier byte
+     */
+    public byte getChs() {
+        return _chs;
+    }
+
+    /**
+     * @return the main font name
+     */
+    public String getMainFontName() {
+        return fontName;
+    }
+
+    /**
+     * @return altFontName if it exists, null otherwise
+     */
+    public String getAltFontName() {
+        return altFontName;
+    }
+
+
+    /**
+     * @return length in bytes for this record
+     */
+    public int getLength() {
+        return length;
+    }
 
     @Override
-    public int hashCode() {
-        assert false : "hashCode not designed";
-        return 42; // any arbitrary constant will do
+    public String toString() {
+        return "OldFfn{" +
+                "_chs=" + (_chs & 0xff) +
+                ", fontName='" + fontName + '\'' +
+                ", altFontName='" + altFontName + '\'' +
+                ", length=" + length +
+                '}';
     }
 }
 

Copied: 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java (from 
r1788131, poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/FontTable.java)
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java?p2=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java&p1=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/FontTable.java&r1=1788131&r2=1790061&rev=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/FontTable.java 
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java 
Tue Apr  4 02:06:46 2017
@@ -17,81 +17,57 @@
 
 package org.apache.poi.hwpf.model;
 
-import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
 
-import org.apache.poi.hwpf.model.io.HWPFFileSystem;
-import org.apache.poi.hwpf.model.io.HWPFOutputStream;
 import org.apache.poi.util.Internal;
 import org.apache.poi.util.LittleEndian;
 import org.apache.poi.util.POILogFactory;
 import org.apache.poi.util.POILogger;
 
 /**
- * FontTable or in MS terminology sttbfffn is a common data structure written 
in all
- * Word files. The sttbfffn is an sttbf where each string is an FFN structure 
instead
- * of pascal-style strings. An sttbf is a string Table stored in file. Thus 
sttbffn
- * is like an Sttbf with an array of FFN structures that stores the font name 
strings
- *
- * @author Praveen Mathew
+ * Font table for Word 6.0
  */
 @Internal
-public final class FontTable
-{
-    private final static POILogger _logger = 
POILogFactory.getLogger(FontTable.class);
-    private short _stringCount;// how many strings are included in the string 
table
-    private short _extraDataSz;// size in bytes of the extra data
+public final class OldFontTable {
+    private final static POILogger _logger = 
POILogFactory.getLogger(OldFontTable.class);
 
     // added extra facilitator members
-    private int lcbSttbfffn;// count of bytes in sttbfffn
-    private int fcSttbfffn;// table stream offset for sttbfffn
-
     // FFN structure containing strings of font names
-    private Ffn[] _fontNames = null;
-
-
-    public FontTable(byte[] buf, int offset, int lcbSttbfffn)
-    {
-        this.lcbSttbfffn = lcbSttbfffn;
-        this.fcSttbfffn = offset;
-
-        _stringCount = LittleEndian.getShort(buf, offset);
-        offset += LittleEndian.SHORT_SIZE;
-        _extraDataSz = LittleEndian.getShort(buf, offset);
-        offset += LittleEndian.SHORT_SIZE;
+    private final OldFfn[] _fontNames;
 
-        _fontNames = new Ffn[_stringCount]; //Ffn corresponds to a Pascal 
style String in STTBF.
+    /**
+     * Reads Word 6.0 font records out of buf.
+     * <p>
+     * The short at offset is read first; the records are then parsed one
+     * after another, starting right after that short, by calling
+     * OldFfn.build until it returns null. Each parsed record advances the
+     * read position by its own getLength().
+     *
+     * @param buf    buffer holding the font table
+     * @param offset position in buf of the table's leading short
+     * @param length byte count used to compute the read limit
+     *               (offset + length) that is handed to OldFfn.build
+     */
+    public OldFontTable(byte[] buf, int offset, int length) {
+        //length is stored at the index section in the table
+        //and it is recorded in the first short.
+
+
+        List<OldFfn> ffns = new ArrayList<OldFfn>();
+        //NOTE(review): fontTableLength is read here but not used afterwards
+        //in this constructor
+        int fontTableLength = LittleEndian.getShort(buf, offset);
+
+        int endOfTableOffset = offset + length;
+        int startOffset = offset + LittleEndian.SHORT_SIZE;//first short should == length!
+
+        //keep parsing until build reports that no further record can be read
+        while (true) {
+            OldFfn oldFfn = OldFfn.build(buf, startOffset, endOfTableOffset);
+            if (oldFfn == null) {
+                break;
+            }
+            ffns.add(oldFfn);
+            startOffset += oldFfn.getLength();
 
-        for(int i = 0;i<_stringCount; i++)
-        {
-            _fontNames[i] = new Ffn(buf,offset);
-            offset += _fontNames[i].getSize();
         }
+        _fontNames = ffns.toArray(new OldFfn[ffns.size()]);
     }
 
-    public short getStringCount()
-    {
-        return _stringCount;
-    }
-
-    public short getExtraDataSz()
-    {
-        return _extraDataSz;
-    }
 
-    public Ffn[] getFontNames()
-    {
+    public OldFfn[] getFontNames() {
         return _fontNames;
     }
 
-    public int getSize()
-    {
-        return lcbSttbfffn;
-    }
 
-    public String getMainFont(int chpFtc )
-    {
-        if(chpFtc >= _stringCount)
-        {
+    public String getMainFont(int chpFtc) {
+        if (chpFtc >= _fontNames.length) {
             _logger.log(POILogger.INFO, "Mismatch in chpFtc with stringCount");
             return null;
         }
@@ -99,65 +75,10 @@ public final class FontTable
         return _fontNames[chpFtc].getMainFontName();
     }
 
-    public String getAltFont(int chpFtc )
-    {
-        if(chpFtc >= _stringCount)
-        {
-            _logger.log(POILogger.INFO, "Mismatch in chpFtc with stringCount");
-            return null;
-        }
-
-        return _fontNames[chpFtc].getAltFontName();
-    }
-
-    public void setStringCount(short stringCount)
-    {
-        this._stringCount = stringCount;
-    }
-
-    @Deprecated
-    public void writeTo( HWPFFileSystem sys ) throws IOException
-    {
-        HWPFOutputStream tableStream = sys.getStream( "1Table" );
-        writeTo( tableStream );
-    }
-
-    public void writeTo( HWPFOutputStream tableStream ) throws IOException
-    {
-        byte[] buf = new byte[LittleEndian.SHORT_SIZE];
-        LittleEndian.putShort(buf, 0, _stringCount);
-        tableStream.write(buf);
-        LittleEndian.putShort(buf, 0, _extraDataSz);
-        tableStream.write(buf);
-
-        for(int i = 0; i < _fontNames.length; i++)
-        {
-            tableStream.write(_fontNames[i].toByteArray());
-        }
-
-    }
-
-    @Override
-    public boolean equals(Object other) {
-        if (!(other instanceof FontTable)) return false;
-        FontTable o = (FontTable)other;
-
-        if (o._stringCount != this._stringCount
-                || o._extraDataSz != this._extraDataSz
-                || o._fontNames.length != this._fontNames.length
-        ) return false;
-        
-        for (int i=0; i<o._fontNames.length; i++) {
-            if (!o._fontNames[i].equals(this._fontNames[i])) return false;
-        }
-        
-        return true;
-    }
-
     @Override
-    public int hashCode() {
-        assert false : "hashCode not designed";
-        return 42; // any arbitrary constant will do
+    public String toString() {
+        return "OldFontTable{" +
+                "_fontNames=" + Arrays.toString(_fontNames) +
+                '}';
     }
-
 }

Copied: 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java (from 
r1788131, poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java)
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java?p2=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java&p1=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java&r1=1788131&r2=1790061&rev=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java 
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java 
Tue Apr  4 02:06:46 2017
@@ -18,89 +18,53 @@
 package org.apache.poi.hwpf.model;
 
 
-import java.nio.charset.Charset;
-
 import org.apache.poi.util.Internal;
+import org.apache.poi.util.NotImplemented;
 
 /**
  * Lightweight representation of a text piece.
  * Works in the character domain, not the byte domain, so you
  * need to have turned byte references into character
  * references before getting here.
- *
- * @author Ryan Ackley
  */
 @Internal
-public class TextPiece extends PropertyNode<TextPiece> {
-    private boolean _usesUnicode;
-
-    private PieceDescriptor _pd;
+public class OldTextPiece extends TextPiece {
 
-    /**
-     * @param start Beginning offset in main document stream, in characters.
-     * @param end   Ending offset in main document stream, in characters.
-     * @param text  The raw bytes of our text
-     * @deprecated Use {@link #TextPiece(int, int, byte[], PieceDescriptor)}
-     * instead
-     */
-    public TextPiece(int start, int end, byte[] text, PieceDescriptor pd,
-                     int cpStart) {
-        this(start, end, text, pd);
-    }
+    private final byte[] rawBytes;
 
     /**
      * @param start Beginning offset in main document stream, in characters.
      * @param end   Ending offset in main document stream, in characters.
      * @param text  The raw bytes of our text
      */
-    public TextPiece(int start, int end, byte[] text, PieceDescriptor pd) {
-        super(start, end, buildInitSB(text, pd));
-        _usesUnicode = pd.isUnicode();
-        _pd = pd;
-
-        // Validate
-        int textLength = ((CharSequence) _buf).length();
-        if (end - start != textLength) {
-            throw new IllegalStateException("Told we're for characters " + 
start + " -> " + end + ", but actually covers " + textLength + " characters!");
-        }
+    public OldTextPiece(int start, int end, byte[] text, PieceDescriptor pd) {
+        super(start, end, text, pd);
+        this.rawBytes = text;
         if (end < start) {
             throw new IllegalStateException("Told we're of negative size! 
start=" + start + " end=" + end);
         }
     }
 
     /**
-     * Create the StringBuilder from the text and unicode flag
-     */
-    private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) {
-        String str = new String(text, Charset.forName(pd.isUnicode() ? 
"UTF-16LE" : "Cp1252"));
-
-        return new StringBuilder(str);
-    }
-
-    /**
-     * @return If this text piece is unicode
+     * @return nothing, ever. Always throws an UnsupportedOperationException
+     * @throws UnsupportedOperationException
      */
+    @NotImplemented
+    @Override
     public boolean isUnicode() {
-        return _usesUnicode;
+        throw new UnsupportedOperationException();
     }
 
-    public PieceDescriptor getPieceDescriptor() {
-        return _pd;
-    }
-
-    @Deprecated
-    public StringBuffer getStringBuffer() {
-        return new StringBuffer(getStringBuilder());
-    }
 
     public StringBuilder getStringBuilder() {
         return (StringBuilder) _buf;
     }
 
+    @Override
     public byte[] getRawBytes() {
-        return ((CharSequence) _buf).toString().getBytes(
-                Charset.forName(_usesUnicode ? "UTF-16LE" : "Cp1252")
-        );
+        byte[] buf = new byte[rawBytes.length];
+        System.arraycopy(rawBytes, 0, buf, 0, rawBytes.length);
+        return buf;
     }
 
     /**
@@ -109,84 +73,29 @@ public class TextPiece extends PropertyN
      *
      * @param start Local start position, in characters
      * @param end   Local end position, in characters
+     * @throws UnsupportedOperationException
      */
     @Deprecated
+    @NotImplemented
     public String substring(int start, int end) {
-        StringBuilder buf = (StringBuilder) _buf;
-
-        // Validate
-        if (start < 0) {
-            throw new StringIndexOutOfBoundsException("Can't request a 
substring before 0 - asked for " + start);
-        }
-        if (end > buf.length()) {
-            throw new StringIndexOutOfBoundsException("Index " + end + " out 
of range 0 -> " + buf.length());
-        }
-        if (end < start) {
-            throw new StringIndexOutOfBoundsException("Asked for text from " + 
start + " to " + end + ", which has an end before the start!");
-        }
-        return buf.substring(start, end);
+        throw new UnsupportedOperationException();
     }
 
     /**
-     * Adjusts the internal string for deletinging
-     * some characters within this.
-     *
-     * @param start  The start position for the delete, in characters
-     * @param length The number of characters to delete
+     * Not implemented for OldTextPiece.
+     * Always throws UnsupportedOperationException
      */
     @Deprecated
+    @NotImplemented
     public void adjustForDelete(int start, int length) {
-        int myStart = getStart();
-        int myEnd = getEnd();
-        int end = start + length;
-
-          /* do we have to delete from this text piece? */
-        if (start <= myEnd && end >= myStart) {
-
-                  /* find where the deleted area overlaps with this text piece 
*/
-            int overlapStart = Math.max(myStart, start);
-            int overlapEnd = Math.min(myEnd, end);
-
-            int bufStart = overlapStart - myStart;
-            int bufEnd = overlapEnd - myStart;
-            ((StringBuilder) _buf).delete(bufStart, bufEnd);
-        }
-
-        // We need to invoke this even if text from this piece is not being
-        // deleted because the adjustment must propagate to all subsequent
-        // text pieces i.e., if text from tp[n] is being deleted, then
-        // tp[n + 1], tp[n + 2], etc. will need to be adjusted.
-        // The superclass is expected to use a separate sentry for this.
-        super.adjustForDelete(start, length);
-    }
-
-    /**
-     * Returns the length, in characters
-     */
-    @Deprecated
-    public int characterLength() {
-        return (getEnd() - getStart());
+        throw new UnsupportedOperationException();
     }
 
     /**
      * Returns the length, in bytes
      */
     public int bytesLength() {
-        return (getEnd() - getStart()) * (_usesUnicode ? 2 : 1);
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (!(o instanceof TextPiece)) return false;
-        TextPiece tp = (TextPiece) o;
-        assert (_buf != null && tp._buf != null && _pd != null && tp._pd != 
null);
-
-        return (
-                limitsAreEqual(o)
-                        && tp._usesUnicode == this._usesUnicode
-                        && tp._buf.toString().equals(this._buf.toString())
-                        && tp._pd.equals(this._pd)
-        );
+        return rawBytes.length;
     }
 
     @Override
@@ -204,7 +113,8 @@ public class TextPiece extends PropertyN
     }
 
     public String toString() {
-        return "TextPiece from " + getStart() + " to " + getEnd() + " ("
+        return "OldTextPiece from " + getStart() + " to " + getEnd() + " ("
                 + getPieceDescriptor() + ")";
     }
+
 }

Copied: 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java 
(from r1788131, 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java)
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java?p2=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java&p1=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java&r1=1788131&r2=1790061&rev=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java 
(original)
+++ 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java 
Tue Apr  4 02:06:46 2017
@@ -16,42 +16,29 @@
 ==================================================================== */
 package org.apache.poi.hwpf.model;
 
-import java.io.IOException;
-import java.io.Serializable;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Collections;
-import java.util.Comparator;
-import java.util.LinkedList;
-import java.util.List;
 
-import org.apache.poi.hwpf.model.io.HWPFOutputStream;
-import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.util.CodePageUtil;
 import org.apache.poi.util.Internal;
 import org.apache.poi.util.POILogFactory;
 import org.apache.poi.util.POILogger;
 
-/**
- * The piece table for matching up character positions to bits of text. This
- * mostly works in bytes, but the TextPieces themselves work in characters. 
This
- * does the icky convertion.
- *
- * @author Ryan Ackley
- */
+
 @Internal
-public class TextPieceTable implements CharIndexTranslator {
-    private static final POILogger logger = POILogFactory
-            .getLogger(TextPieceTable.class);
+public class OldTextPieceTable extends TextPieceTable {
 
-    // int _multiple;
-    int _cpMin;
-    protected ArrayList<TextPiece> _textPieces = new ArrayList<TextPiece>();
-    protected ArrayList<TextPiece> _textPiecesFCOrder = new 
ArrayList<TextPiece>();
+    private static final POILogger logger = POILogFactory
+            .getLogger(OldTextPieceTable.class);
 
-    public TextPieceTable() {
+    public OldTextPieceTable() {
+        super();
     }
 
-    public TextPieceTable(byte[] documentStream, byte[] tableStream,
-                          int offset, int size, int fcMin) {
+    public OldTextPieceTable(byte[] documentStream, byte[] tableStream,
+                             int offset, int size, int fcMin, Charset charset) 
{
+        //super(documentStream, tableStream, offset, size, fcMin, charset);
         // get our plex of PieceDescriptors
         PlexOfCps pieceTable = new PlexOfCps(tableStream, offset, size,
                 PieceDescriptor.getSizeInBytes());
@@ -63,7 +50,7 @@ public class TextPieceTable implements C
         // PieceDescriptor objects
         for (int x = 0; x < length; x++) {
             GenericPropertyNode node = pieceTable.getProperty(x);
-            pieces[x] = new PieceDescriptor(node.getBytes(), 0);
+            pieces[x] = new PieceDescriptor(node.getBytes(), 0, charset);
         }
 
         // Figure out the cp of the earliest text piece
@@ -88,7 +75,8 @@ public class TextPieceTable implements C
             // What's the relationship between bytes and characters?
             boolean unicode = pieces[x].isUnicode();
             int multiple = 1;
-            if (unicode) {
+            if (unicode ||
+                    (charset != null && 
CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset))) {
                 multiple = 2;
             }
 
@@ -101,7 +89,7 @@ public class TextPieceTable implements C
             System.arraycopy(documentStream, start, buf, 0, textSizeBytes);
 
             // And now build the piece
-            final TextPiece newTextPiece = new TextPiece(nodeStartChars, 
nodeEndChars, buf,
+            final TextPiece newTextPiece = newTextPiece(nodeStartChars, 
nodeEndChars, buf,
                     pieces[x]);
 
             _textPieces.add(newTextPiece);
@@ -112,344 +100,20 @@ public class TextPieceTable implements C
         Collections.sort(_textPieces);
         _textPiecesFCOrder = new ArrayList<TextPiece>(_textPieces);
         Collections.sort(_textPiecesFCOrder, new FCComparator());
-    }
-
-    public void add(TextPiece piece) {
-        _textPieces.add(piece);
-        _textPiecesFCOrder.add(piece);
-        Collections.sort(_textPieces);
-        Collections.sort(_textPiecesFCOrder, new FCComparator());
-    }
-
-    /**
-     * Adjust all the text piece after inserting some text into one of them
-     *
-     * @param listIndex The TextPiece that had characters inserted into
-     * @param length    The number of characters inserted
-     */
-    public int adjustForInsert(int listIndex, int length) {
-        int size = _textPieces.size();
-
-        TextPiece tp = _textPieces.get(listIndex);
-
-        // Update with the new end
-        tp.setEnd(tp.getEnd() + length);
-
-        // Now change all subsequent ones
-        for (int x = listIndex + 1; x < size; x++) {
-            tp = _textPieces.get(x);
-            tp.setStart(tp.getStart() + length);
-            tp.setEnd(tp.getEnd() + length);
-        }
-
-        // All done
-        return length;
-    }
-
-    public boolean equals(Object o) {
-        if (!(o instanceof TextPieceTable)) return false;
-        TextPieceTable tpt = (TextPieceTable) o;
-
-        int size = tpt._textPieces.size();
-        if (size == _textPieces.size()) {
-            for (int x = 0; x < size; x++) {
-                if (!tpt._textPieces.get(x).equals(_textPieces.get(x))) {
-                    return false;
-                }
-            }
-            return true;
-        }
-        return false;
-    }
-
-    public int getByteIndex(int charPos) {
-        int byteCount = 0;
-        for (TextPiece tp : _textPieces) {
-            if (charPos >= tp.getEnd()) {
-                byteCount = tp.getPieceDescriptor().getFilePosition()
-                        + (tp.getEnd() - tp.getStart())
-                        * (tp.isUnicode() ? 2 : 1);
-
-                if (charPos == tp.getEnd())
-                    break;
-
-                continue;
-            }
-            if (charPos < tp.getEnd()) {
-                int left = charPos - tp.getStart();
-                byteCount = tp.getPieceDescriptor().getFilePosition() + left
-                        * (tp.isUnicode() ? 2 : 1);
-                break;
-            }
-        }
-        return byteCount;
-    }
 
-    @Deprecated
-    public int getCharIndex(int bytePos) {
-        return getCharIndex(bytePos, 0);
-    }
-
-    @Deprecated
-    public int getCharIndex(int startBytePos, int startCP) {
-        int charCount = 0;
-
-        int bytePos = lookIndexForward(startBytePos);
-
-        for (TextPiece tp : _textPieces) {
-            int pieceStart = tp.getPieceDescriptor().getFilePosition();
-
-            int bytesLength = tp.bytesLength();
-            int pieceEnd = pieceStart + bytesLength;
-
-            int toAdd;
-
-            if (bytePos < pieceStart || bytePos > pieceEnd) {
-                toAdd = bytesLength;
-            } else if (bytePos > pieceStart && bytePos < pieceEnd) {
-                toAdd = (bytePos - pieceStart);
-            } else {
-                toAdd = bytesLength - (pieceEnd - bytePos);
-            }
-
-            if (tp.isUnicode()) {
-                charCount += toAdd / 2;
-            } else {
-                charCount += toAdd;
-            }
-
-            if (bytePos >= pieceStart && bytePos <= pieceEnd
-                    && charCount >= startCP) {
-                break;
-            }
-        }
-
-        return charCount;
     }
 
     @Override
-    public int[][] getCharIndexRanges(int startBytePosInclusive,
-                                      int endBytePosExclusive) {
-        List<int[]> result = new LinkedList<int[]>();
-        for (TextPiece textPiece : _textPiecesFCOrder) {
-            final int tpStart = textPiece.getPieceDescriptor()
-                    .getFilePosition();
-            final int tpEnd = textPiece.getPieceDescriptor().getFilePosition()
-                    + textPiece.bytesLength();
-            if (startBytePosInclusive > tpEnd)
-                continue;
-            if (endBytePosExclusive <= tpStart)
-                break;
-
-            final int rangeStartBytes = Math.max(tpStart,
-                    startBytePosInclusive);
-            final int rangeEndBytes = Math.min(tpEnd, endBytePosExclusive);
-            final int rangeLengthBytes = rangeEndBytes - rangeStartBytes;
-
-            if (rangeStartBytes > rangeEndBytes)
-                continue;
-
-            final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1;
-
-            final int rangeStartCp = textPiece.getStart()
-                    + (rangeStartBytes - tpStart) / encodingMultiplier;
-            final int rangeEndCp = rangeStartCp + rangeLengthBytes
-                    / encodingMultiplier;
-
-            result.add(new int[]{rangeStartCp, rangeEndCp});
-        }
-
-        return result.toArray(new int[result.size()][]);
-    }
-
-    public int getCpMin() {
-        return _cpMin;
-    }
-
-    public StringBuilder getText() {
-        final long start = System.currentTimeMillis();
-
-        // rebuild document paragraphs structure
-        StringBuilder docText = new StringBuilder();
-        for (TextPiece textPiece : _textPieces) {
-            String toAppend = textPiece.getStringBuilder().toString();
-            int toAppendLength = toAppend.length();
-
-            if (toAppendLength != textPiece.getEnd() - textPiece.getStart()) {
-                logger.log(
-                        POILogger.WARN,
-                        "Text piece has boundaries [",
-                        Integer.valueOf(textPiece.getStart()),
-                        "; ",
-                        Integer.valueOf(textPiece.getEnd()),
-                        ") but length ",
-                        Integer.valueOf(textPiece.getEnd()
-                                - textPiece.getStart()));
-            }
-
-            docText.replace(textPiece.getStart(), textPiece.getStart()
-                    + toAppendLength, toAppend);
-        }
-
-        logger.log(POILogger.DEBUG, "Document text were rebuilded in ",
-                Long.valueOf(System.currentTimeMillis() - start), " ms (",
-                Integer.valueOf(docText.length()), " chars)");
-
-        return docText;
-    }
-
-    public List<TextPiece> getTextPieces() {
-        return _textPieces;
+    protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, 
byte[] buf, PieceDescriptor pd) {
+        return new OldTextPiece(nodeStartChars, nodeEndChars, buf, pd);
     }
 
     @Override
-    public int hashCode() {
-        return _textPieces.size();
-    }
-
-    public boolean isIndexInTable(int bytePos) {
-        for (TextPiece tp : _textPiecesFCOrder) {
-            int pieceStart = tp.getPieceDescriptor().getFilePosition();
-
-            if (bytePos > pieceStart + tp.bytesLength()) {
-                continue;
-            }
-
-            if (pieceStart > bytePos) {
-                return false;
-            }
-
-            return true;
-        }
-
-        return false;
-    }
-
-    boolean isIndexInTable(int startBytePos, int endBytePos) {
-        for (TextPiece tp : _textPiecesFCOrder) {
-            int pieceStart = tp.getPieceDescriptor().getFilePosition();
-
-            if (startBytePos >= pieceStart + tp.bytesLength()) {
-                continue;
-            }
-
-            int left = Math.max(startBytePos, pieceStart);
-            int right = Math.min(endBytePos, pieceStart + tp.bytesLength());
-
-            if (left >= right)
-                return false;
-
-            return true;
-        }
-
-        return false;
-    }
-
-    public int lookIndexBackward(final int startBytePos) {
-        int bytePos = startBytePos;
-        int lastEnd = 0;
-
-        for (TextPiece tp : _textPiecesFCOrder) {
-            int pieceStart = tp.getPieceDescriptor().getFilePosition();
-
-            if (bytePos > pieceStart + tp.bytesLength()) {
-                lastEnd = pieceStart + tp.bytesLength();
-                continue;
-            }
-
-            if (pieceStart > bytePos) {
-                bytePos = lastEnd;
-            }
-
-            break;
-        }
-
-        return bytePos;
-    }
-
-    public int lookIndexForward(final int startBytePos) {
-        if (_textPiecesFCOrder.isEmpty())
-            throw new IllegalStateException("Text pieces table is empty");
-
-        if (_textPiecesFCOrder.get(0).getPieceDescriptor().getFilePosition() > 
startBytePos)
-            return 
_textPiecesFCOrder.get(0).getPieceDescriptor().getFilePosition();
-
-        if (_textPiecesFCOrder.get(_textPiecesFCOrder.size() - 1)
-                .getPieceDescriptor().getFilePosition() <= startBytePos)
-            return startBytePos;
-
-        int low = 0;
-        int high = _textPiecesFCOrder.size() - 1;
-
-        while (low <= high) {
-            int mid = (low + high) >>> 1;
-            final TextPiece textPiece = _textPiecesFCOrder.get(mid);
-            int midVal = textPiece.getPieceDescriptor().getFilePosition();
-
-            if (midVal < startBytePos)
-                low = mid + 1;
-            else if (midVal > startBytePos)
-                high = mid - 1;
-            else
-                // found piece with exact start
-                return textPiece.getPieceDescriptor().getFilePosition();
-        }
-        assert low == high;
-        assert _textPiecesFCOrder.get(low).getPieceDescriptor()
-                .getFilePosition() < startBytePos;
-        // last line can't be current, can it?
-        assert _textPiecesFCOrder.get(low + 1).getPieceDescriptor()
-                .getFilePosition() > startBytePos;
-
-        // shifting to next piece start
-        return _textPiecesFCOrder.get(low + 
1).getPieceDescriptor().getFilePosition();
-    }
-
-    public byte[] writeTo(HWPFOutputStream docStream) throws IOException {
-        PlexOfCps textPlex = new PlexOfCps(PieceDescriptor.getSizeInBytes());
-        // int fcMin = docStream.getOffset();
-
-        for (TextPiece next : _textPieces) {
-            PieceDescriptor pd = next.getPieceDescriptor();
-
-            int offset = docStream.getOffset();
-            int mod = (offset % POIFSConstants.SMALLER_BIG_BLOCK_SIZE);
-            if (mod != 0) {
-                mod = POIFSConstants.SMALLER_BIG_BLOCK_SIZE - mod;
-                byte[] buf = new byte[mod];
-                docStream.write(buf);
-            }
-
-            // set the text piece position to the current docStream offset.
-            pd.setFilePosition(docStream.getOffset());
-
-            // write the text to the docstream and save the piece descriptor to
-            // the
-            // plex which will be written later to the tableStream.
-            docStream.write(next.getRawBytes());
-
-            // The TextPiece is already in characters, which
-            // makes our life much easier
-            int nodeStart = next.getStart();
-            int nodeEnd = next.getEnd();
-            textPlex.addProperty(new GenericPropertyNode(nodeStart, nodeEnd,
-                    pd.toByteArray()));
-        }
-
-        return textPlex.toByteArray();
-    }
-
-    private static class FCComparator implements Comparator<TextPiece>, 
Serializable {
-        public int compare(TextPiece textPiece, TextPiece textPiece1) {
-            if (textPiece.getPieceDescriptor().fc > textPiece1
-                    .getPieceDescriptor().fc) {
-                return 1;
-            } else if (textPiece.getPieceDescriptor().fc < textPiece1
-                    .getPieceDescriptor().fc) {
-                return -1;
-            } else {
-                return 0;
-            }
+    protected int getEncodingMultiplier(TextPiece textPiece) {
+        Charset charset = textPiece.getPieceDescriptor().getCharset();
+        if (charset != null && 
CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset)) {
+            return 2;
         }
+        return 1;
     }
 }

Modified: 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java 
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java Tue 
Apr  4 02:06:46 2017
@@ -260,7 +260,7 @@ public class PAPBinTable
             SprmBuffer sprmBuffer = null;
             for ( PAPX papx : papxs )
             {
-                if ( papx.getGrpprl() == null || papx.getGrpprl().length == 0 )
+                if ( papx.getGrpprl() == null || papx.getGrpprl().length <= 2 )
                     continue;
 
                 if ( sprmBuffer == null ) {

Modified: 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java 
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java 
Tue Apr  4 02:06:46 2017
@@ -17,10 +17,13 @@
 
 package org.apache.poi.hwpf.model;
 
+import java.nio.charset.Charset;
+
 import org.apache.poi.util.BitField;
 import org.apache.poi.util.BitFieldFactory;
 import org.apache.poi.util.Internal;
 import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.StringUtil;
 
 @Internal
 public final class PieceDescriptor
@@ -32,29 +35,51 @@ public final class PieceDescriptor
    private static BitField fCopied = BitFieldFactory.getInstance(0x04);
   int fc;
   PropertyModifier prm;
-  boolean unicode;
+  boolean unicode = false;
+  private final Charset charset;
 
 
-  public PieceDescriptor(byte[] buf, int offset)
-  {
-    descriptor = LittleEndian.getShort(buf, offset);
-    offset += LittleEndian.SHORT_SIZE;
-    fc = LittleEndian.getInt(buf, offset);
-    offset += LittleEndian.INT_SIZE;
-    prm = new PropertyModifier( LittleEndian.getShort(buf, offset));
-
-    // see if this piece uses unicode.
-    if ((fc & 0x40000000) == 0)
-    {
-        unicode = true;
-    }
-    else
-    {
-        unicode = false;
-        fc &= ~(0x40000000);//gives me FC in doc stream
-        fc /= 2;
+    public PieceDescriptor(byte[] buf, int offset) {
+        this(buf, offset, null);
     }
 
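+    /**
+     * Parses a piece descriptor, using the given charset for non-unicode text.
+     * This constructor should only be used for HWPFOldDocuments.
+     *
+     * @param buf bytes that hold the piece descriptor
+     * @param offset position in buf at which the descriptor starts
+     * @param charset which charset to use if this is not unicode
+     */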
+  public PieceDescriptor(byte[] buf, int offset, Charset charset) {
+      descriptor = LittleEndian.getShort(buf, offset);
+      offset += LittleEndian.SHORT_SIZE;
+      fc = LittleEndian.getInt(buf, offset);
+      offset += LittleEndian.INT_SIZE;
+      prm = new PropertyModifier(LittleEndian.getShort(buf, offset));
+      if (charset == null) {
+        // see if this piece uses unicode.
+        //From the documentation: If the second most significant bit
+          //is clear, then this indicates the actual file offset of the Unicode character (two bytes). If the
+          //second most significant bit is set, then the actual address of the codepage-1252
+          //compressed version of the Unicode character (one byte), is actually at the offset indicated
+          //by clearing this bit and dividing by two.
+        if ((fc & 0x40000000) == 0) {
+          unicode = true;
+          this.charset = null;
+        } else {
+          unicode = false;
+          fc &= ~(0x40000000);//gives me FC in doc stream
+          fc /= 2;
+          this.charset = StringUtil.WIN_1252;
+        }
+      } else {
+          if (charset == StringUtil.UTF16LE) {
+              unicode = true;
+          }
+          this.charset = charset;
+      }
+
   }
 
   public int getFilePosition()
@@ -72,6 +97,15 @@ public final class PieceDescriptor
     return unicode;
   }
 
+    /**
+     *
+     * @return charset to use if this is not a Unicode PieceDescriptor
+     * this can be <code>null</code>
+     */
+  public Charset getCharset() {
+    return charset;
+  }
+
     public PropertyModifier getPrm()
     {
         return prm;

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java 
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java Tue 
Apr  4 02:06:46 2017
@@ -21,6 +21,7 @@ package org.apache.poi.hwpf.model;
 import java.nio.charset.Charset;
 
 import org.apache.poi.util.Internal;
+import org.apache.poi.util.StringUtil;
 
 /**
  * Lightweight representation of a text piece.
@@ -40,7 +41,6 @@ public class TextPiece extends PropertyN
      * @param start Beginning offset in main document stream, in characters.
      * @param end   Ending offset in main document stream, in characters.
      * @param text  The raw bytes of our text
-     * @deprecated Use {@link #TextPiece(int, int, byte[], PieceDescriptor)}
      * instead
      */
     public TextPiece(int start, int end, byte[] text, PieceDescriptor pd,
@@ -72,8 +72,13 @@ public class TextPiece extends PropertyN
      * Create the StringBuilder from the text and unicode flag
      */
     private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) {
-        String str = new String(text, Charset.forName(pd.isUnicode() ? 
"UTF-16LE" : "Cp1252"));
+        byte[] textBuffer = text;
+        if (StringUtil.BIG5.equals(pd.getCharset())) {
+            String txt = new 
StringBuilder(StringUtil.littleEndianBig5Stream(text, 0, 
text.length)).toString();
+            return new StringBuilder(txt);
+        }
 
+        String str = new String(textBuffer, 0, textBuffer.length, 
(pd.isUnicode()) ? StringUtil.UTF16LE : pd.getCharset());
         return new StringBuilder(str);
     }
 
@@ -207,4 +212,5 @@ public class TextPiece extends PropertyN
         return "TextPiece from " + getStart() + " to " + getEnd() + " ("
                 + getPieceDescriptor() + ")";
     }
+
 }

Modified: 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java 
(original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java 
Tue Apr  4 02:06:46 2017
@@ -101,7 +101,7 @@ public class TextPieceTable implements C
             System.arraycopy(documentStream, start, buf, 0, textSizeBytes);
 
             // And now build the piece
-            final TextPiece newTextPiece = new TextPiece(nodeStartChars, 
nodeEndChars, buf,
+            final TextPiece newTextPiece = newTextPiece(nodeStartChars, 
nodeEndChars, buf,
                     pieces[x]);
 
             _textPieces.add(newTextPiece);
@@ -114,6 +114,10 @@ public class TextPieceTable implements C
         Collections.sort(_textPiecesFCOrder, new FCComparator());
     }
 
+    protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, 
byte[] buf, PieceDescriptor pd) {
+        return new TextPiece(nodeStartChars, nodeEndChars, buf, pd);
+    }
+
     public void add(TextPiece piece) {
         _textPieces.add(piece);
         _textPiecesFCOrder.add(piece);
@@ -249,7 +253,7 @@ public class TextPieceTable implements C
             if (rangeStartBytes > rangeEndBytes)
                 continue;
 
-            final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1;
+            final int encodingMultiplier = getEncodingMultiplier(textPiece);
 
             final int rangeStartCp = textPiece.getStart()
                     + (rangeStartBytes - tpStart) / encodingMultiplier;
@@ -262,6 +266,10 @@ public class TextPieceTable implements C
         return result.toArray(new int[result.size()][]);
     }
 
+    protected int getEncodingMultiplier(TextPiece textPiece) {
+        return textPiece.isUnicode() ? 2 : 1;
+    }
+
     public int getCpMin() {
         return _cpMin;
     }
@@ -439,7 +447,7 @@ public class TextPieceTable implements C
         return textPlex.toByteArray();
     }
 
-    private static class FCComparator implements Comparator<TextPiece>, 
Serializable {
+    protected static class FCComparator implements Comparator<TextPiece>, 
Serializable {
         public int compare(TextPiece textPiece, TextPiece textPiece1) {
             if (textPiece.getPieceDescriptor().fc > textPiece1
                     .getPieceDescriptor().fc) {

Modified: 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java 
(original)
+++ 
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java 
Tue Apr  4 02:06:46 2017
@@ -18,6 +18,7 @@
 package org.apache.poi.hwpf.usermodel;
 
 import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFOldDocument;
 import org.apache.poi.hwpf.model.CHPX;
 import org.apache.poi.hwpf.model.FFData;
 import org.apache.poi.hwpf.model.Ffn;
@@ -438,6 +439,10 @@ public final class CharacterRun extends
 
   public String getFontName()
   {
+    if (_doc instanceof HWPFOldDocument) {
+      return ((HWPFOldDocument) 
_doc).getOldFontTable().getMainFont(_props.getFtcAscii());
+    }
+
     if (_doc.getFontTable() == null)
       // old word format
       return null;

Modified: 
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- 
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java
 (original)
+++ 
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java
 Tue Apr  4 02:06:46 2017
@@ -16,18 +16,19 @@
 ==================================================================== */
 package org.apache.poi.hwpf.converter;
 
-import java.io.File;
-import java.io.FilenameFilter;
-import java.io.StringWriter;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
+import static org.junit.Assert.assertNotNull;
 
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
 
 import org.apache.poi.POIDataSamples;
 import org.apache.poi.hwpf.HWPFDocumentCore;
@@ -36,8 +37,6 @@ import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
 
-import static org.junit.Assert.assertNotNull;
-
 @RunWith(Parameterized.class)
 public class TestWordToConverterSuite
 {
@@ -45,7 +44,11 @@ public class TestWordToConverterSuite
      * YK: a quick hack to exclude failing documents from the suite.
      */
     private static List<String> failingFiles = Arrays
-            .asList( "ProblemExtracting.doc" );
+            .asList( "ProblemExtracting.doc",
+                    "Bug50955.doc" //basic extraction works,
+                                    // but these extractors modify the document,
+                                    // which is a no-go for this Word 6.0 file
+            );
 
     @Parameterized.Parameters(name="{index}: {0}")
     public static Iterable<Object[]> files() {

Modified: 
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- 
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java 
(original)
+++ 
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java 
Tue Apr  4 02:06:46 2017
@@ -57,6 +57,7 @@ import junit.framework.TestCase;
  *  against HWPF
  */
 public class TestBugs{
+
     private static final POILogger logger = 
POILogFactory.getLogger(TestBugs.class);
 
     public static void assertEqualsIgnoreNewline(String expected, String 
actual )
@@ -536,13 +537,6 @@ public class TestBugs{
         hwpfDocument.getPicturesTable().getAllPictures();
     }
 
-    /**
-     * [FAILING] Bug 50955 - error while retrieving the text file
-     */
-    @Test(expected=IllegalStateException.class)
-    public void test50955() throws IOException {
-        getTextOldFile("Bug50955.doc");
-    }
 
     /**
      * [RESOLVED FIXED] Bug 51604 - replace text fails for doc (poi 3.8 beta

Modified: 
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java?rev=1790061&r1=1790060&r2=1790061&view=diff
==============================================================================
--- 
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java
 (original)
+++ 
poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java
 Tue Apr  4 02:06:46 2017
@@ -17,14 +17,19 @@
 
 package org.apache.poi.hwpf.usermodel;
 
+import static org.apache.poi.POITestCase.assertContains;
 import static org.junit.Assert.assertEquals;
 
 import java.io.IOException;
+import java.nio.charset.Charset;
 
 import org.apache.poi.OldFileFormatException;
+import org.apache.poi.hwmf.record.HwmfFont;
 import org.apache.poi.hwpf.HWPFOldDocument;
 import org.apache.poi.hwpf.HWPFTestCase;
 import org.apache.poi.hwpf.HWPFTestDataSamples;
+import org.apache.poi.hwpf.extractor.Word6Extractor;
+import org.apache.poi.hwpf.model.OldFontTable;
 import org.junit.Test;
 
 /**
@@ -98,7 +103,7 @@ public final class TestHWPFOldDocument e
         assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns());
         // Normal, superscript for 4th, normal
         assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns());
-        
+
         doc.close();
     }
 
@@ -143,4 +148,87 @@ public final class TestHWPFOldDocument e
                 doc.getRange().getParagraph(1).text());
         doc.close();
     }
+
+    @Test
+    public void testDefaultCodePageEncoding() throws IOException {
+        HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942.doc");
+        Word6Extractor ex = new Word6Extractor(doc);
+        String txt = ex.getText();
+        assertContains(txt, "BERTHOD");
+        assertContains(txt, "APPLICOLOR");
+        assertContains(txt, "les meilleurs");
+        assertContains(txt, "GUY LECOLE");
+    }
+
+
+    @Test
+    public void testCodePageBug50955() throws IOException {
+        //windows 1251
+        HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug50955.doc");
+        Word6Extractor ex = new Word6Extractor(doc);
+
+        StringBuilder sb = new StringBuilder();
+        for (String p : ex.getParagraphText()) {
+            sb.append(p);
+        }
+        assertContains(sb.toString(), "\u043F\u0440\u0438\u0432\u0435\u0442");//Greetings!
+    }
+
+    @Test
+    public void testCodePageBug60936() throws IOException {
+        //windows 1250 -- this test file was generated with OpenOffice
+        //see https://bz.apache.org/ooo/show_bug.cgi?id=12445 for the inspiration
+
+
+        HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60936.doc");
+        Word6Extractor ex = new Word6Extractor(doc);
+        StringBuilder sb = new StringBuilder();
+        for (String p : ex.getParagraphText()) {
+            sb.append(p);
+        }
+        assertContains(sb.toString(), "4 sk\u00f3re a p\u0159ed 7 lety");//Greetings!
+    }
+
+    @Test
+    public void testOldFontTableEncoding() throws IOException {
+        HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
+        OldFontTable oldFontTable = doc.getOldFontTable();
+        assertEquals(5, oldFontTable.getFontNames().length);
+        assertEquals("\u7D30\u660E\u9AD4", oldFontTable.getFontNames()[0].getMainFontName());
+        assertEquals(HwmfFont.WmfCharset.CHINESEBIG5_CHARSET.getCharset(), Charset.forName("Big5"));
+        assertEquals("Times New Roman", oldFontTable.getFontNames()[1].getMainFontName());
+        doc.close();
+
+    }
+
+    @Test
+    public void testOldFontTableAltName() throws IOException {
+        HWPFOldDocument doc  = HWPFTestDataSamples.openOldSampleFile("Bug60942b.doc");
+        OldFontTable oldFontTable = doc.getOldFontTable();
+        assertEquals(5, oldFontTable.getFontNames().length);
+        assertEquals("Roboto", oldFontTable.getFontNames()[3].getMainFontName());
+        assertEquals("arial", oldFontTable.getFontNames()[3].getAltFontName());
+        assertEquals("Roboto", oldFontTable.getFontNames()[4].getMainFontName());
+        assertEquals("arial", oldFontTable.getFontNames()[4].getAltFontName());
+    }
+
+
+    @Test
+    public void test51944() throws IOException {
+        HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
+        Word6Extractor ex = new Word6Extractor(doc);
+        StringBuilder sb = new StringBuilder();
+        for (String p : ex.getParagraphText()) {
+            sb.append(p.replaceAll("[\r\n]+", "\n"));
+        }
+        String txt = sb.toString();
+        assertContains(txt, "Post and Fax");
+        assertContains(txt, "also maintain");//this is at a critical juncture
+        assertContains(txt, "which are available for");//this too
+
+        //TODO: figure out why these two aren't passing
+//        assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly
+//        assertContains(txt, "We are able to");//not sure if we can get this easily?
+    }
+
 }

Added: poi/trunk/test-data/document/Bug60936.doc
URL: 
http://svn.apache.org/viewvc/poi/trunk/test-data/document/Bug60936.doc?rev=1790061&view=auto
==============================================================================
Binary file - no diff available.

Propchange: poi/trunk/test-data/document/Bug60936.doc
------------------------------------------------------------------------------
    svn:mime-type = application/msword

Added: poi/trunk/test-data/document/Bug60942.doc
URL: 
http://svn.apache.org/viewvc/poi/trunk/test-data/document/Bug60942.doc?rev=1790061&view=auto
==============================================================================
Binary file - no diff available.

Propchange: poi/trunk/test-data/document/Bug60942.doc
------------------------------------------------------------------------------
    svn:mime-type = application/msword

Added: poi/trunk/test-data/document/Bug60942b.doc
URL: 
http://svn.apache.org/viewvc/poi/trunk/test-data/document/Bug60942b.doc?rev=1790061&view=auto
==============================================================================
Binary file - no diff available.

Propchange: poi/trunk/test-data/document/Bug60942b.doc
------------------------------------------------------------------------------
    svn:mime-type = application/msword



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to