tika-advanced-parser-mo...

bob Sat, 16 Jan 2016 10:23:27 -0800

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,913 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.math.BigInteger;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
+import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Decompresses a CHM block. Depending on the block type, the most suitable
+ * decompression method is chosen. A CHM block type can be one of the
+ * following:
+ * <ul>
+ * <li>UNDEFINED - no action is taken, i.e. the block is skipped</li>
+ * <li>VERBATIM</li>
+ * <li>ALIGNED_OFFSET</li>
+ * <li>UNCOMPRESSED - the simplest type</li>
+ * </ul>
+ * In addition, there are unknown types (4-7). Currently such a type is
+ * replaced by the type of the previous CHM block. We need to find a more
+ * appropriate way to handle such types.
+ */
+public class ChmLzxBlock {
+    // presumably backs setBlockNumber(), which the constructor calls (accessor not visible here)
+    private int block_number;
+    // presumably backs getBlockLength()/setBlockLength() (accessors not visible here)
+    private long block_length;
+    // decoder state read through getState(): block type, remaining length, tree tables, R0-R2
+    private ChmLzxState state;
+    // output buffer the decompress* methods write decoded bytes into
+    private byte[] content = null;
+    // wraps the input data segment and, when available, the previous block's content
+    private ChmSection chmSection = null;
+    // offset up to which content has been filled so far
+    private int contentLength = 0;
+
+    // trying to find solution for bad blocks ...
+    // block type of the previous block, or -1 if there was none; used as a
+    // replacement when this block reports a type greater than 3
+    private int previousBlockType = -1;
+
+    public ChmLzxBlock(int blockNumber, byte[] dataSegment, long blockLength,
+            ChmLzxBlock prevBlock) throws TikaException {
+        try {
+            if (validateConstructorParams(blockNumber, dataSegment, 
blockLength)) {
+                setBlockNumber(blockNumber);
+
+                if (prevBlock != null
+                        && prevBlock.getState().getBlockLength() > prevBlock
+                                .getState().getBlockRemaining())
+                    setChmSection(new ChmSection(dataSegment, 
prevBlock.getContent()));
+                else
+                    setChmSection(new ChmSection(dataSegment));
+
+                setBlockLength(blockLength);
+
+                // ============================================
+                // we need to take care of previous context
+                // ============================================
+                checkLzxBlock(prevBlock);
+                if (prevBlock == null
+                        || blockLength < (int) getBlockLength()) {
+                    setContent((int) getBlockLength());
+                }
+                else {
+                    setContent((int) blockLength);
+                }
+
+                if (prevBlock != null && prevBlock.getState() != null)
+                    previousBlockType = prevBlock.getState().getBlockType();
+
+                extractContent();
+            } else
+                throw new TikaException("Check your chm lzx block parameters");
+        } catch (TikaException e) {
+            throw e;
+        }
+    }
+
+    /** Returns the current content length. */
+    protected int getContentLength() {
+        return this.contentLength;
+    }
+
+    protected void setContentLength(int contentLength) {
+        this.contentLength = contentLength;
+    }
+
+    /** Returns the section wrapping this block's input data. */
+    private ChmSection getChmSection() {
+        return this.chmSection;
+    }
+
+    private void setChmSection(ChmSection chmSection) {
+        this.chmSection = chmSection;
+    }
+
+    /**
+     * Fails fast when no decoder state is available.
+     *
+     * @throws TikaException if {@code getState()} returns {@code null}
+     */
+    private void assertStateNotNull() throws TikaException {
+        if (null != getState()) {
+            return;
+        }
+        throw new ChmParsingException("state is null");
+    }
+
+    /**
+     * Decodes the data held by the chm section into {@code content}.
+     * <p>
+     * The loop runs while the previous pass produced output and
+     * {@code getContentLength() < getBlockLength()}. Whenever the state reports
+     * no remaining bytes, a new block header is read (type, length and, on the
+     * very first pass, an optional Intel file size), the tables needed by that
+     * block type are built, and then the matching decompress method is called.
+     *
+     * @throws TikaException if the state is null or one of the helpers fails
+     */
+    private void extractContent() throws TikaException {
+        assertStateNotNull();
+        if (getChmSection().getData() != null) {
+            // cleared as soon as a pass adds nothing to content, which ends the loop
+            boolean continueLoop = true;
+            while (continueLoop && getContentLength() < getBlockLength()) {
+                if (getState() != null && getState().getBlockRemaining() == 0) 
{
+                    // only before decoding has started: a 1-bit flag, optionally
+                    // followed by a size assembled from two 16-bit reads
+                    if (getState().getHadStarted() == 
LzxState.NOT_STARTED_DECODING) {
+                        getState().setHadStarted(LzxState.STARTED_DECODING);
+                        if (getChmSection().getSyncBits(1) == 1) {
+                            int intelSizeTemp = (getChmSection()
+                                    .getSyncBits(16) << 16)
+                                    + getChmSection().getSyncBits(16);
+                            if (intelSizeTemp >= 0)
+                                getState().setIntelFileSize(intelSizeTemp);
+                            else
+                                getState().setIntelFileSize(0);
+                        }
+                    }
+                    // block header: 3-bit type, then a 24-bit length (16 bits << 8, plus 8 bits)
+                    getState().setBlockType(getChmSection().getSyncBits(3));
+                    getState().setBlockLength(
+                            (getChmSection().getSyncBits(16) << 8)
+                                    + getChmSection().getSyncBits(8));
+                    getState().setBlockRemaining(getState().getBlockLength());
+
+                    // ----------------------------------------
+                    // Trying to handle 3 - 7 block types
+                    // ----------------------------------------
+                    // a type above 3 is replaced by the previous block's type when that type is 0, 1 or 2
+                    if (getState().getBlockType() > 3) {
+                        if (previousBlockType >= 0 && previousBlockType < 3)
+                            getState().setBlockType(previousBlockType);
+                    }
+
+                    switch (getState().getBlockType()) {
+                        case ChmCommons.ALIGNED_OFFSET:
+                            createAlignedTreeTable();
+                            //fall through
+                        case ChmCommons.VERBATIM:
+                            /* Creates mainTreeTable */
+                            createMainTreeTable();
+                            createLengthTreeTable();
+                            if (getState().getMainTreeLengtsTable()[0xe8] != 0)
+                                getState().setIntelState(IntelState.STARTED);
+                            break;
+                        case ChmCommons.UNCOMPRESSED:
+                            getState().setIntelState(IntelState.STARTED);
+                            if (getChmSection().getTotal() > 16)
+                                getChmSection().setSwath(
+                                        getChmSection().getSwath() - 1);
+                            // R0, R1 and R2 are each read as 4 bytes in reversed byte order
+                            getState().setR0(
+                                    (new BigInteger(getChmSection()
+                                            .reverseByteOrder(
+                                                    
getChmSection().unmarshalBytes(
+                                                            4))).longValue()));
+                            getState().setR1(
+                                    (new BigInteger(getChmSection()
+                                            .reverseByteOrder(
+                                                    
getChmSection().unmarshalBytes(
+                                                            4))).longValue()));
+                            getState().setR2(
+                                    (new BigInteger(getChmSection()
+                                            .reverseByteOrder(
+                                                    
getChmSection().unmarshalBytes(
+                                                            4))).longValue()));
+                            break;
+                        default:
+                            break;
+                    }
+                } //end of if BlockRemaining == 0
+
+                // tempLen is the content offset this pass decodes up to, capped at the
+                // block length; whatever does not fit stays in blockRemaining
+                int tempLen;
+
+                if (getContentLength() + getState().getBlockRemaining() > 
getBlockLength()) {
+                    getState().setBlockRemaining(
+                            getContentLength() + getState().getBlockRemaining()
+                                    - (int) getBlockLength());
+                    tempLen = (int) getBlockLength();
+                } else {
+                    tempLen = getContentLength()
+                            + getState().getBlockRemaining();
+                    getState().setBlockRemaining(0);
+                }
+
+                // remembered so that a pass producing no output can be detected below;
+                // block types without a case label here produce nothing
+                int lastLength = getContentLength();
+                switch (getState().getBlockType()) {
+                case ChmCommons.ALIGNED_OFFSET:
+                    // 
if(prevblock.lzxState.length>prevblock.lzxState.remaining)
+                    decompressAlignedBlock(tempLen, 
getChmSection().getPrevContent() == null ? getChmSection().getData() : 
getChmSection().getPrevContent());// prevcontext
+                    break;
+                case ChmCommons.VERBATIM:
+                    decompressVerbatimBlock(tempLen, 
getChmSection().getPrevContent() == null ? getChmSection().getData() : 
getChmSection().getPrevContent());
+                    break;
+                case ChmCommons.UNCOMPRESSED:
+                    decompressUncompressedBlock(tempLen, 
getChmSection().getPrevContent() == null ? getChmSection().getData() : 
getChmSection().getPrevContent());
+                    break;
+                }
+                getState().increaseFramesRead();
+                if ((getState().getFramesRead() < 32768)
+                        && getState().getIntelFileSize() != 0)
+                    intelE8Decoding();
+
+                continueLoop = getContentLength() > lastLength;
+            }
+        }
+    }
+
+    /**
+     * Post-processes {@code content} by rewriting 4-byte offsets that follow
+     * {@code 0xe8} markers, and reduces the state's remaining length by the
+     * block length in either branch.
+     * <p>
+     * NOTE(review): {@code content[i]} is a signed {@code byte} (-128..127) and
+     * is compared with the {@code int} literal {@code 0xe8} (232), so the
+     * comparison {@code content[i] != 0xe8} is always true and the rewriting
+     * code below it is never reached. Before enabling it (for example with
+     * {@code (content[i] & 0xff)}), confirm the intended layout: the offset is
+     * read from {@code i + 0 .. i + 3}, i.e. including the marker byte itself,
+     * and {@code curpos} starts from the remaining block length and is not
+     * advanced for non-marker bytes.
+     */
+    protected void intelE8Decoding() {
+        if (getBlockLength() <= ChmConstants.LZX_PRETREE_TABLEBITS
+                || (getState().getIntelState() == IntelState.NOT_STARTED)) {
+            getState().setBlockRemaining(
+                    getState().getBlockRemaining() - (int) getBlockLength());
+        } else {
+            long curpos = getState().getBlockRemaining();
+            getState().setBlockRemaining(
+                    getState().getBlockRemaining() - (int) getBlockLength());
+            int i = 0;
+            // the last 10 bytes of the block are never examined
+            while (i < getBlockLength() - 10) {
+                // NOTE(review): always true for a signed byte, see the method comment
+                if (content[i] != 0xe8) {
+                    i++;
+                    continue;
+                }
+                // assemble the 4 bytes at i..i+3 (lowest byte first) into a signed value
+                byte[] b = new byte[4];
+                b[0] = getContent()[i + 3];
+                b[1] = getContent()[i + 2];
+                b[2] = getContent()[i + 1];
+                b[3] = getContent()[i + 0];
+                long absoff = (new BigInteger(b)).longValue();
+                if ((absoff >= -curpos)
+                        && (absoff < getState().getIntelFileSize())) {
+                    long reloff = (absoff >= 0) ? absoff - curpos : absoff
+                            + getState().getIntelFileSize();
+                    // write the converted offset back, lowest byte first
+                    getContent()[i + 0] = (byte) reloff;
+                    getContent()[i + 1] = (byte) (reloff >>> 8);
+                    getContent()[i + 2] = (byte) (reloff >>> 16);
+                    getContent()[i + 3] = (byte) (reloff >>> 24);
+                }
+                i += 4;
+                curpos += 5;
+            }
+        }
+    }
+
+    private short[] createPreLenTable() {
+        short[] tmp = new short[ChmConstants.LZX_PRETREE_MAXSYMBOLS];
+        for (int i = 0; i < ChmConstants.LZX_PRETREE_MAXSYMBOLS; i++) {
+            tmp[i] = (short) getChmSection().getSyncBits(
+                    ChmConstants.LZX_PRETREE_NUM_ELEMENTS_BITS);
+        }
+        return tmp;
+    }
+
+    /**
+     * Builds the length tree: reads the pre-tree lengths, builds the pre-tree
+     * decoding table, decodes the length-tree code lengths with it and finally
+     * stores the resulting length tree table in the state.
+     *
+     * @throws TikaException if one of the intermediate tables is null or a
+     *                       helper fails
+     */
+    private void createLengthTreeTable() throws TikaException {
+        //Read Pre Tree Table
+        short[] prelentable = createPreLenTable();
+
+        if (prelentable == null) {
+            // the message used to name pretreetable, which hid which table was missing
+            throw new ChmParsingException("prelentable is null");
+        }
+
+        short[] pretreetable = createTreeTable2(prelentable,
+                (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+                        + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+                ChmConstants.LZX_PRETREE_TABLEBITS,
+                ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+
+        if (pretreetable == null) {
+            throw new ChmParsingException("pretreetable is null");
+        }
+
+        //Build Length Tree
+        createLengthTreeLenTable(0, ChmConstants.LZX_NUM_SECONDARY_LENGTHS,
+                pretreetable, prelentable);
+
+        getState().setLengthTreeTable(
+                createTreeTable2(getState().getLengthTreeLengtsTable(),
+                        (1 << ChmConstants.LZX_LENGTH_TABLEBITS)
+                                + (ChmConstants.LZX_LENGTH_MAXSYMBOLS << 1),
+                        ChmConstants.LZX_LENGTH_TABLEBITS,
+                        ChmConstants.LZX_NUM_SECONDARY_LENGTHS));
+    }
+
+    private void decompressUncompressedBlock(int len, byte[] prevcontent) {
+        if (getContentLength() + getState().getBlockRemaining() <= 
getBlockLength()) {
+            for (int i = getContentLength(); i < (getContentLength() + 
getState()
+                    .getBlockRemaining()); i++)
+                content[i] = getChmSection().getByte();
+
+            setContentLength(getContentLength()
+                    + getState().getBlockRemaining());
+            getState().setBlockRemaining(0);
+        } else {
+            for (int i = getContentLength(); i < getBlockLength(); i++)
+                content[i] = getChmSection().getByte();
+            getState().setBlockRemaining(
+                    (int) getBlockLength() - getContentLength());// = blockLen 
-
+                                                                 // contentlen;
+            setContentLength((int) getBlockLength());
+        }
+    }
+
+    /**
+     * Decodes an ALIGNED_OFFSET block into {@code content}, from the current
+     * content length up to offset {@code len}.
+     * <p>
+     * Each iteration decodes one main-tree symbol. Symbols below
+     * {@code LZX_NUM_CHARS} are stored as literal bytes. Other symbols describe
+     * a match: the length comes from the low bits of the symbol (plus an
+     * optional length-tree symbol), the offset from {@code symbol >>> 3}, where
+     * values 0-2 reuse R0-R2 and larger values read extra bits and, for 3 or
+     * more extra bits, an aligned-tree symbol.
+     *
+     * @param len         content offset to decode up to; stored as the new
+     *                    content length on return
+     * @param prevcontent buffer that matches reaching before offset 0 are
+     *                    copied from (indexed from its end)
+     * @throws TikaException if the section, state or main tree table is
+     *                       missing, or the peeked main-tree index is out of range
+     */
+    private void decompressAlignedBlock(int len, byte[] prevcontent) throws 
TikaException {
+
+        // NOTE(review): one message covers three different null conditions
+        if ((getChmSection() == null) || (getState() == null)
+                || (getState().getMainTreeTable() == null))
+            throw new ChmParsingException("chm section is null");
+
+        short s;
+        int x, i, border;
+        int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
+        int matchoffset = 0;
+        for (i = getContentLength(); i < len; i++) {
+            /* new code */
+            //read huffman tree from main tree
+            border = getChmSection().peekBits(
+                    ChmConstants.LZX_MAINTREE_TABLEBITS);
+            if (border >= getState().mainTreeTable.length)
+                throw new ChmParsingException("error decompressing aligned 
block.");
+                //break;
+            /* end new code */
+            s = getState().mainTreeTable[getChmSection().peekBits(
+                    ChmConstants.LZX_MAINTREE_TABLEBITS)];
+            // entries at or above the main tree element count are not symbols
+            // (presumably internal tree nodes): keep following the table bit by bit
+            if (s >= getState().getMainTreeElements()) {
+                x = ChmConstants.LZX_MAINTREE_TABLEBITS;
+                do {
+                    x++;
+                    s <<= 1;
+                    s += getChmSection().checkBit(x);
+                } while ((s = getState().mainTreeTable[s]) >= getState()
+                        .getMainTreeElements());
+            }
+            //System.out.printf("%d,", s);
+            //?getChmSection().getSyncBits(getState().mainTreeTable[s]);
+            
getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]);
+            // literal byte versus match
+            if (s < ChmConstants.LZX_NUM_CHARS) {
+                content[i] = (byte) s;
+            } else {
+                s -= ChmConstants.LZX_NUM_CHARS;
+                matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
+                // all primary length bits set: the rest of the length is a length-tree symbol
+                if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
+                    matchfooter = getState().lengthTreeTable[getChmSection()
+                            
.peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)];//.LZX_MAINTREE_TABLEBITS)];
+                    if (matchfooter >= 
ChmConstants.LZX_LENGTH_MAXSYMBOLS/*?LZX_LENGTH_TABLEBITS*/) {
+                        x = ChmConstants.LZX_LENGTH_TABLEBITS;
+                        do {
+                            x++;
+                            matchfooter <<= 1;
+                            matchfooter += getChmSection().checkBit(x);
+                        } while ((matchfooter = 
getState().lengthTreeTable[matchfooter]) >= 
ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
+                    }
+                    getChmSection().getSyncBits(
+                            getState().lengthTreeLengtsTable[matchfooter]);
+                    matchlen += matchfooter;
+                }
+                matchlen += ChmConstants.LZX_MIN_MATCH;
+                matchoffset = s >>> 3;
+                // values above 2 carry an explicit offset; 0, 1 and 2 reuse R0, R1 and R2
+                if (matchoffset > 2) {
+                    extra = ChmConstants.EXTRA_BITS[matchoffset];
+                    matchoffset = (ChmConstants.POSITION_BASE[matchoffset] - 
2);
+                    // NOTE(review): both aligned-tree lookups below compare against
+                    // getMainTreeElements() and peek LZX_NUM_PRIMARY_LENGTHS bits;
+                    // confirm these are the intended aligned-tree constants
+                    if (extra > 3) {
+                        extra -= 3;
+                        long verbatim_bits = 
getChmSection().getSyncBits(extra);
+                        matchoffset += (verbatim_bits << 3);
+                        //READ HUFF SYM in Aligned Tree
+                        int aligned_bits = getChmSection().peekBits(
+                                ChmConstants.LZX_NUM_PRIMARY_LENGTHS);
+                        int t = getState().getAlignedTreeTable()[aligned_bits];
+                        if (t >= getState().getMainTreeElements()) {
+                            x = ChmConstants.LZX_ALIGNED_TABLEBITS; 
//?LZX_MAINTREE_TABLEBITS; //?LZX_ALIGNED_TABLEBITS
+                            do {
+                                x++;
+                                t <<= 1;
+                                t += getChmSection().checkBit(x);
+                            } while ((t = getState().getAlignedTreeTable()[t]) 
>= getState()
+                                    .getMainTreeElements());
+                        }
+                        getChmSection().getSyncBits(
+                                getState().getAlignedLenTable()[t]);
+                        matchoffset += t;
+                    } else if (extra == 3) {
+                        int g = getChmSection().peekBits(
+                                ChmConstants.LZX_NUM_PRIMARY_LENGTHS);
+                        int t = getState().getAlignedTreeTable()[g];
+                        if (t >= getState().getMainTreeElements()) {
+                            x = ChmConstants.LZX_ALIGNED_TABLEBITS; 
//?LZX_MAINTREE_TABLEBITS;
+                            do {
+                                x++;
+                                t <<= 1;
+                                t += getChmSection().checkBit(x);
+                            } while ((t = getState().getAlignedTreeTable()[t]) 
>= getState()
+                                    .getMainTreeElements());
+                        }
+                        getChmSection().getSyncBits(
+                                getState().getAlignedLenTable()[t]);
+                        matchoffset += t;
+                    } else if (extra > 0) {
+                        long l = getChmSection().getSyncBits(extra);
+                        matchoffset += l;
+                    } else
+                        matchoffset = 1;
+                    // the new offset becomes R0; the older ones shift down
+                    getState().setR2(getState().getR1());
+                    getState().setR1(getState().getR0());
+                    getState().setR0(matchoffset);
+                } else if (matchoffset == 0) {
+                    matchoffset = (int) getState().getR0();
+                } else if (matchoffset == 1) {
+                    matchoffset = (int) getState().getR1();
+                    getState().setR1(getState().getR0());
+                    getState().setR0(matchoffset);
+                } else /** match_offset == 2 */
+                {
+                    matchoffset = (int) getState().getR2();
+                    getState().setR2(getState().getR0());
+                    getState().setR0(matchoffset);
+                }
+                rundest = i;
+                runsrc = rundest - matchoffset;
+                i += (matchlen - 1);
+                // a match that would run past len is dropped and decoding stops
+                if (i > len)
+                    break;
+
+                // a negative source index means the match starts in prevcontent,
+                // counted back from its end
+                if (runsrc < 0) {
+                    if (matchlen + runsrc <= 0) {
+                        // the whole match lies inside prevcontent
+                        runsrc = prevcontent.length + runsrc;
+                        while (matchlen-- > 0)
+                            content[rundest++] = prevcontent[runsrc++];
+                    } else {
+                        // the match starts in prevcontent and continues at the start of content
+                        runsrc = prevcontent.length + runsrc;
+                        while (runsrc < prevcontent.length)
+                            content[rundest++] = prevcontent[runsrc++];
+                        matchlen = matchlen + runsrc - prevcontent.length;
+                        runsrc = 0;
+                        while (matchlen-- > 0)
+                            content[rundest++] = content[runsrc++];
+                    }
+
+                } else {
+                    /* copies any wrappes around source data */
+                    // NOTE(review): runsrc >= 0 in this branch, so this loop never runs
+                    while ((runsrc < 0) && (matchlen-- > 0)) {
+                        content[rundest++] = content[(int) (runsrc + 
getBlockLength())];
+                        runsrc++;
+                    }
+                    /* copies match data - no worries about destination wraps 
*/
+                    while (matchlen-- > 0)
+                        content[rundest++] = content[runsrc++];
+                }
+            }
+        }
+        setContentLength(len);
+    }
+
+    /**
+     * Rejects a missing table.
+     *
+     * @param array table to check
+     * @throws TikaException if {@code array} is {@code null}
+     */
+    private void assertShortArrayNotNull(short[] array) throws TikaException {
+        if (array != null) {
+            return;
+        }
+        throw new ChmParsingException("short[] is null");
+    }
+
+    /**
+     * Decodes a VERBATIM block into {@code content}, from the current content
+     * length up to offset {@code len}.
+     * <p>
+     * Each iteration decodes one main-tree symbol. Symbols below
+     * {@code LZX_NUM_CHARS} are stored as literal bytes. Other symbols describe
+     * a match: the length comes from the low bits of the symbol (plus an
+     * optional length-tree symbol), the offset from {@code symbol >>> 3}, where
+     * values 0-2 reuse R0-R2 and larger values read extra bits.
+     *
+     * @param len         content offset to decode up to; stored as the new
+     *                    content length on return
+     * @param prevcontent buffer that matches reaching before offset 0 are
+     *                    copied from (indexed from its end)
+     * @throws TikaException if the main tree table is missing, or a match that
+     *                       spans {@code prevcontent} and {@code content}
+     *                       points outside the content buffer
+     */
+    private void decompressVerbatimBlock(int len, byte[] prevcontent) throws TikaException {
+        short s;
+        int x, i;
+        int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
+        int matchoffset = 0;
+        for (i = getContentLength(); i < len; i++) {
+            int f = getChmSection().peekBits(ChmConstants.LZX_MAINTREE_TABLEBITS);
+            assertShortArrayNotNull(getState().getMainTreeTable());
+            s = getState().getMainTreeTable()[f];
+            // entries at or above LZX_MAIN_MAXSYMBOLS are not symbols: keep
+            // following the table one bit at a time
+            if (s >= ChmConstants.LZX_MAIN_MAXSYMBOLS) {
+                x = ChmConstants.LZX_MAINTREE_TABLEBITS;
+                do {
+                    x++;
+                    s <<= 1;
+                    s += getChmSection().checkBit(x);
+                } while ((s = getState().getMainTreeTable()[s])
+                        >= ChmConstants.LZX_MAIN_MAXSYMBOLS);
+            }
+            getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]);
+            if (s < ChmConstants.LZX_NUM_CHARS) {
+                content[i] = (byte) s;
+            } else {
+                s -= ChmConstants.LZX_NUM_CHARS;
+                matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
+                // all primary length bits set: the rest of the length is a length-tree symbol
+                if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
+                    matchfooter = getState().getLengthTreeTable()[getChmSection()
+                            .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)];
+                    if (matchfooter >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS) {
+                        x = ChmConstants.LZX_LENGTH_TABLEBITS;
+                        do {
+                            x++;
+                            matchfooter <<= 1;
+                            matchfooter += getChmSection().checkBit(x);
+                        } while ((matchfooter = getState().getLengthTreeTable()[matchfooter])
+                                >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
+                    }
+                    getChmSection().getSyncBits(
+                            getState().getLengthTreeLengtsTable()[matchfooter]);
+                    matchlen += matchfooter;
+                }
+                matchlen += ChmConstants.LZX_MIN_MATCH;
+                // shorter than 2
+                matchoffset = s >>> 3;
+                if (matchoffset > 2) {
+                    if (matchoffset != 3) { // should get other bits to retrieve offset
+                        extra = ChmConstants.EXTRA_BITS[matchoffset];
+                        long l = getChmSection().getSyncBits(extra);
+                        matchoffset = (int) (ChmConstants.POSITION_BASE[matchoffset] - 2 + l);
+                    } else {
+                        matchoffset = 1;
+                    }
+                    // the new offset becomes R0; the older ones shift down
+                    getState().setR2(getState().getR1());
+                    getState().setR1(getState().getR0());
+                    getState().setR0(matchoffset);
+                } else if (matchoffset == 0) {
+                    matchoffset = (int) getState().getR0();
+                } else if (matchoffset == 1) {
+                    matchoffset = (int) getState().getR1();
+                    getState().setR1(getState().getR0());
+                    getState().setR0(matchoffset);
+                } else /* match_offset == 2 */
+                {
+                    matchoffset = (int) getState().getR2();
+                    getState().setR2(getState().getR0());
+                    getState().setR0(matchoffset);
+                }
+                rundest = i;
+                runsrc = rundest - matchoffset;
+                i += (matchlen - 1);
+                // a match that would run past len is dropped and decoding stops
+                if (i > len)
+                    break;
+                // a negative source index means the match starts in prevcontent,
+                // counted back from its end
+                if (runsrc < 0) {
+                    if (matchlen + runsrc <= 0) {
+                        // the whole match lies inside prevcontent; out-of-range
+                        // positions are skipped while matchlen still counts down
+                        runsrc = prevcontent.length + runsrc;
+                        while ((matchlen-- > 0) && (prevcontent != null)
+                                && ((runsrc + 1) > 0))
+                            if ((rundest < content.length)
+                                    && (runsrc < content.length))
+                                content[rundest++] = prevcontent[runsrc++];
+                    } else {
+                        // the match starts in prevcontent and continues at the start of content
+                        runsrc = prevcontent.length + runsrc;
+                        while (runsrc < prevcontent.length) {
+                            if ((rundest >= content.length)
+                                    || (runsrc >= content.length)) {
+                                // This case used to skip the copy without advancing
+                                // runsrc, so the loop never terminated on malformed
+                                // input. Fail the parse instead of hanging.
+                                throw new ChmParsingException(
+                                        "error decompressing verbatim block.");
+                            }
+                            content[rundest++] = prevcontent[runsrc++];
+                        }
+                        matchlen = matchlen + runsrc - prevcontent.length;
+                        runsrc = 0;
+                        while (matchlen-- > 0)
+                            content[rundest++] = content[runsrc++];
+                    }
+
+                } else {
+                    /* copies any wrapped source data */
+                    // runsrc >= 0 in this branch, so this loop does not run; kept as is
+                    while ((runsrc < 0) && (matchlen-- > 0)) {
+                        content[rundest++] = content[(int) (runsrc + getBlockLength())];
+                        runsrc++;
+                    }
+                    /* copies match data - no worries about destination wraps */
+                    while (matchlen-- > 0) {
+                        if ((rundest < content.length)
+                                && (runsrc < content.length))
+                            content[rundest++] = content[runsrc++];
+                    }
+                }
+            }
+        }
+        setContentLength(len);
+    }
+
+    /**
+     * Decodes the code lengths of the length tree into
+     * {@code getState().getLengthTreeLengtsTable()} for indices
+     * {@code offset} (inclusive) to {@code tablelen} (exclusive).
+     * <p>
+     * Each pre-tree symbol {@code z} is handled as follows: below 17 it is a
+     * delta against the stored length (modulo 17); 17 writes 4 plus a 4-bit
+     * count of zeros; 18 writes 20 plus a 5-bit count of zeros; 19 writes 4
+     * plus a 1-bit count of copies of one further decoded delta value.
+     * <p>
+     * NOTE(review): the runs for 18 and 19 do not check {@code i} against the
+     * table length, unlike the run for 17.
+     *
+     * @param offset       first table index to fill
+     * @param tablelen     index at which decoding stops
+     * @param pretreetable decoding table of the pre-tree
+     * @param prelentable  code lengths of the pre-tree symbols
+     * @throws TikaException if a table or the chm section is null
+     */
+    private void createLengthTreeLenTable(int offset, int tablelen,
+            short[] pretreetable, short[] prelentable) throws TikaException {
+        // NOTE(review): prelentable is tested twice in this condition
+        if (prelentable == null || getChmSection() == null
+                || pretreetable == null || prelentable == null)
+            throw new ChmParsingException("is null");
+
+        int i = offset; // represents offset
+        int z, y, x;// local counters
+        while (i < tablelen) {
+            //Read HUFF sym to z
+            z = pretreetable[getChmSection().peekBits(
+                    ChmConstants.LZX_PRETREE_TABLEBITS)];
+            if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 1 bug, should 
be
+                                                             // 20
+                x = ChmConstants.LZX_PRETREE_TABLEBITS;
+                do {
+                    x++;
+                    z <<= 1;
+                    z += getChmSection().checkBit(x);
+                } while ((z = pretreetable[z]) >= 
ChmConstants.LZX_PRETREE_NUM_ELEMENTS);
+            }
+            // consume as many bits as the decoded symbol's code length
+            getChmSection().getSyncBits(prelentable[z]);
+            
+            if (z < 17) {
+                // delta against the stored length, wrapped into 0..16
+                z = getState().getLengthTreeLengtsTable()[i] - z;
+                if (z < 0)
+                    z = z + 17;
+                getState().getLengthTreeLengtsTable()[i] = (short) z;
+                i++;
+            } else if (z == 17) {
+                // run of zeros: 4 + 4-bit count, clipped at the table end
+                y = getChmSection().getSyncBits(4);
+                y += 4;
+                for (int j = 0; j < y; j++)
+                    if (i < getState().getLengthTreeLengtsTable().length)
+                        getState().getLengthTreeLengtsTable()[i++] = 0;
+            } else if (z == 18) {
+                // run of zeros: 20 + 5-bit count
+                y = getChmSection().getSyncBits(5);
+                y += 20;
+                for (int j = 0; j < y; j++)
+                    //no tolerate //if (i < 
getState().getLengthTreeLengtsTable().length)
+                        getState().getLengthTreeLengtsTable()[i++] = 0;
+            } else if (z == 19) {
+                // run of one value: 4 + 1-bit count, value decoded from the next symbol
+                y = getChmSection().getSyncBits(1);
+                y += 4;
+                z = pretreetable[getChmSection().peekBits(
+                        ChmConstants.LZX_PRETREE_TABLEBITS)];
+                if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 20
+                    x = ChmConstants.LZX_PRETREE_TABLEBITS;// 6
+                    do {
+                        x++;
+                        z <<= 1;
+                        z += getChmSection().checkBit(x);
+                    } while ((z = pretreetable[z]) >= 
ChmConstants.LZX_PRETREE_NUM_ELEMENTS);//LZX_MAINTREE_TABLEBITS);
+                }
+                getChmSection().getSyncBits(prelentable[z]);
+                z = getState().getLengthTreeLengtsTable()[i] - z;
+                if (z < 0)
+                    z = z + 17;
+                for (int j = 0; j < y; j++)
+                    getState().getLengthTreeLengtsTable()[i++] = (short) z;
+            }
+        }
+    }
+
+    private void createMainTreeTable() throws TikaException {
+        //Read Pre Tree Table
+        short[] prelentable = createPreLenTable();
+        short[] pretreetable = createTreeTable2(prelentable,
+                (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+                        + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+                ChmConstants.LZX_PRETREE_TABLEBITS,
+                ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+
+        createMainTreeLenTable(0, ChmConstants.LZX_NUM_CHARS, pretreetable,
+                prelentable);
+        
+        //Read Pre Tree Table
+        prelentable = createPreLenTable();
+        pretreetable = createTreeTable2(prelentable,
+                (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+                        + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+                ChmConstants.LZX_PRETREE_TABLEBITS,
+                ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+
+        createMainTreeLenTable(ChmConstants.LZX_NUM_CHARS,
+                getState().mainTreeLengtsTable.length, pretreetable,
+                prelentable);
+
+        getState().setMainTreeTable(
+                createTreeTable2(getState().mainTreeLengtsTable,
+                        (1 << ChmConstants.LZX_MAINTREE_TABLEBITS)
+                                + (ChmConstants.LZX_MAINTREE_MAXSYMBOLS << 1),
+                        ChmConstants.LZX_MAINTREE_TABLEBITS, getState()
+                                .getMainTreeElements()));
+    }
+
+    /**
+     * Reads the code lengths of the main tree entries {@code offset}
+     * (inclusive) up to {@code tablelen} (exclusive) from the bit stream
+     * and stores them in the state's main tree lengths table.
+     * <p>
+     * Every step decodes one pre-tree symbol. Symbols below 17 are a delta
+     * (modulo 17) against the length currently stored for the entry;
+     * 17 and 18 start runs of zero lengths; 19 repeats one delta-coded
+     * length for a short run. Any other symbol stores nothing.
+     *
+     * @param offset       index of the first entry to fill
+     * @param tablelen     index one past the last entry to fill
+     * @param pretreetable decode table of the pre-tree
+     * @param prelentable  code length of each pre-tree symbol
+     * @throws TikaException if {@code pretreetable} is null or a zero run
+     *                       reaches past the end of the lengths table
+     */
+    private void createMainTreeLenTable(int offset, int tablelen,
+            short[] pretreetable, short[] prelentable) throws TikaException {
+        if (pretreetable == null) {
+            throw new ChmParsingException("pretreetable is null");
+        }
+        int index = offset;
+        while (index < tablelen) {
+            int symbol = decodePreTreeSymbol(pretreetable);
+            // the decode only peeked; now drop the bits of the code itself
+            getChmSection().getSyncBits(prelentable[symbol]);
+
+            if (symbol < 17) {
+                short[] lengths = getState().getMainTreeLengtsTable();
+                int delta = lengths[index] - symbol;
+                lengths[index] = (short) (delta < 0 ? delta + 17 : delta);
+                index++;
+                continue;
+            }
+
+            switch (symbol) {
+                case 17: {
+                    // short run of zeros: 4 bits of count, biased by 4
+                    int zeros = getChmSection().getSyncBits(4) + 4;
+                    for (int n = 0; n < zeros; n++) {
+                        short[] lengths = getState().getMainTreeLengtsTable();
+                        assertInRange(lengths, index);
+                        lengths[index++] = 0;
+                    }
+                    break;
+                }
+                case 18: {
+                    // long run of zeros: 5 bits of count, biased by 20
+                    int zeros = getChmSection().getSyncBits(5) + 20;
+                    for (int n = 0; n < zeros; n++) {
+                        short[] lengths = getState().getMainTreeLengtsTable();
+                        assertInRange(lengths, index);
+                        lengths[index++] = 0;
+                    }
+                    break;
+                }
+                case 19: {
+                    // run of one repeated length: 1 bit of count, biased by 4
+                    int repeats = getChmSection().getSyncBits(1) + 4;
+                    int coded = decodePreTreeSymbol(pretreetable);
+                    getChmSection().getSyncBits(prelentable[coded]);
+                    int value = getState().getMainTreeLengtsTable()[index]
+                            - coded;
+                    if (value < 0) {
+                        value += 17;
+                    }
+                    for (int n = 0; n < repeats; n++) {
+                        short[] lengths = getState().getMainTreeLengtsTable();
+                        // unlike the zero runs, an overlong run is cut off
+                        // silently instead of raising an error
+                        if (index < lengths.length) {
+                            lengths[index++] = (short) value;
+                        }
+                    }
+                    break;
+                }
+                default:
+                    // unknown symbol: nothing is stored for it
+                    break;
+            }
+        }
+    }
+
+    /**
+     * Decodes the next pre-tree symbol without consuming any bits.
+     * <p>
+     * The first {@link ChmConstants#LZX_PRETREE_TABLEBITS} bits index the
+     * table directly. While the entry found is not a symbol, one more bit
+     * is inspected and used to pick the next entry.
+     *
+     * @param pretreetable decode table of the pre-tree
+     * @return the decoded symbol
+     */
+    private int decodePreTreeSymbol(short[] pretreetable) {
+        int symbol = pretreetable[getChmSection().peekBits(
+                ChmConstants.LZX_PRETREE_TABLEBITS)];
+        if (symbol >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
+            int bitIndex = ChmConstants.LZX_PRETREE_TABLEBITS;
+            do {
+                bitIndex++;
+                symbol <<= 1;
+                symbol += getChmSection().checkBit(bitIndex);
+                symbol = pretreetable[symbol];
+            } while (symbol >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+        }
+        return symbol;
+    }
+
+    private void assertInRange(short[] array, int index) throws 
ChmParsingException {
+        if (index >= array.length)
+            throw new ChmParsingException(index + " is bigger than "
+                    + array.length);
+    }
+
+    private short[] createAlignedLenTable() {
+        int tablelen = 
ChmConstants.LZX_ALIGNED_NUM_ELEMENTS;//LZX_BLOCKTYPE_UNCOMPRESSED;//
+        int bits = ChmConstants.LZX_BLOCKTYPE_UNCOMPRESSED;
+        short[] tmp = new short[tablelen];
+        for (int i = 0; i < tablelen; i++) {
+            tmp[i] = (short) getChmSection().getSyncBits(bits);
+        }
+        return tmp;
+    }
+
+    private void createAlignedTreeTable() throws ChmParsingException {
+        getState().setAlignedLenTable(createAlignedLenTable());
+        getState().setAlignedTreeTable(//setAlignedLenTable(
+                createTreeTable2(getState().getAlignedLenTable(),
+                        (1 << ChmConstants.LZX_NUM_PRIMARY_LENGTHS)
+                                + (ChmConstants.LZX_ALIGNED_MAXSYMBOLS << 1),
+                        ChmConstants.LZX_NUM_PRIMARY_LENGTHS,
+                        ChmConstants.LZX_ALIGNED_MAXSYMBOLS));
+    }
+
+    /**
+     * Builds a decode table from a table of code lengths.
+     * <p>
+     * Symbols whose code length is at most {@code bits} are mapped directly:
+     * each one fills a run of consecutive entries in the first
+     * {@code 1 << bits} slots. Symbols with longer codes (up to 16 bits) are
+     * reached through two-entry nodes that are allocated in the part of the
+     * table behind the direct mapping.
+     *
+     * @param lentable  code length per symbol; symbols beyond its end are
+     *                  skipped
+     * @param tablelen  number of entries of the decode table to allocate
+     * @param bits      number of bits resolved by the direct mapping
+     * @param maxsymbol number of symbols to consider
+     * @return the decode table; it is returned whether or not the codes
+     *         filled it completely
+     * @throws ChmParsingException if the codes need more room than the
+     *                             table offers ("Table overflow")
+     */
+    private short[] createTreeTable2(short[] lentable, int tablelen, int bits,
+            int maxsymbol) throws ChmParsingException {
+        short[] tmp = new short[tablelen];
+        short sym;
+        int leaf;
+        int bit_num = 1;
+        long fill;
+        int pos = 0;
+        /* the current position in the decode table */
+        long table_mask = (1 << bits);
+        long bit_mask = (table_mask >> 1);
+        long next_symbol = bit_mask;
+
+        /* fills entries for short codes for a direct mapping */
+        while (bit_num <= bits) {
+            for (sym = 0; sym < maxsymbol; sym++) {
+                if (lentable.length > sym && lentable[sym] == bit_num) {
+                    leaf = pos;
+
+                    if ((pos += bit_mask) > table_mask) {
+                        /* table overflow */
+                        throw new ChmParsingException("Table overflow");
+                    }
+
+                    /* a code of bit_num bits owns bit_mask table entries */
+                    fill = bit_mask;
+                    while (fill-- > 0)
+                        tmp[leaf++] = sym;
+                }
+            }
+            /* each extra code bit halves the run of entries per symbol */
+            bit_mask >>= 1;
+            bit_num++;
+        }
+
+        /* if there are any codes longer than nbits */
+        if (pos != table_mask) {
+            /* clears the remainder of the table */
+            for (leaf = pos; leaf < table_mask; leaf++)
+                tmp[leaf] = 0;
+
+            /* gives ourselves room for codes to grow by up to 16 more bits */
+            pos <<= 16;
+            table_mask <<= 16;
+            bit_mask = 1 << 15;
+
+            while (bit_num <= 16) {
+                for (sym = 0; sym < maxsymbol; sym++) {
+                    if ((lentable.length > sym) && (lentable[sym] == bit_num)) 
{
+                        /* start at the direct entry for the code's prefix */
+                        leaf = pos >> 16;
+                        for (fill = 0; fill < bit_num - bits; fill++) {
+                            /*
+                             * if this path hasn't been taken yet, 'allocate'
+                             * two entries
+                             */
+                            if (tmp[leaf] == 0) {
+                                /* allocation is skipped when the pair would
+                                 * not fit into the table any more */
+                                if (((next_symbol << 1) + 1) < tmp.length) {
+                                    tmp[(int) (next_symbol << 1)] = 0;
+                                    tmp[(int) (next_symbol << 1) + 1] = 0;
+                                    tmp[leaf] = (short) next_symbol++;
+                                }
+
+                            }
+                            /*
+                             * follows the path and select either left or right
+                             * for next bit
+                             */
+                            leaf = tmp[leaf] << 1;
+                            if (((pos >> (15 - fill)) & 1) != 0)
+                                leaf++;
+                        }
+                        tmp[leaf] = sym;
+
+                        if ((pos += bit_mask) > table_mask) {
+                            /* table overflow */
+                            throw new ChmParsingException("Table overflow");
+                        }
+                    }
+                }
+                bit_mask >>= 1;
+                bit_num++;
+            }
+        }
+
+        /* is it full table? */
+        /* NOTE(review): both outcomes return the table, so an incomplete
+         * code set is not reported to the caller */
+        if (pos == table_mask)
+            return tmp;
+
+        return tmp;
+    }
+
+    /**
+     * Returns the content buffer of this block.
+     *
+     * @return the internal array itself, not a copy; may be null
+     */
+    public byte[] getContent() {
+        return this.content;
+    }
+
+    /**
+     * Copies part of the content buffer.
+     *
+     * @param startOffset start of the range, passed to
+     *                    {@code ChmCommons.copyOfRange}
+     * @param endOffset   end of the range, passed to
+     *                    {@code ChmCommons.copyOfRange}
+     * @return the copied range, or a one-byte array when there is no
+     *         content
+     */
+    public byte[] getContent(int startOffset, int endOffset) {
+        if (getContent() == null) {
+            return new byte[1];
+        }
+        return ChmCommons.copyOfRange(getContent(), startOffset, endOffset);
+    }
+
+    /**
+     * Copies the content buffer from {@code start} up to its end.
+     *
+     * @param start start of the range, passed to
+     *              {@code ChmCommons.copyOfRange}
+     * @return the copied tail, or a one-byte array when there is no content
+     */
+    public byte[] getContent(int start) {
+        if (getContent() == null) {
+            return new byte[1];
+        }
+        return ChmCommons.copyOfRange(getContent(), start,
+                getContent().length);
+    }
+
+    private void setContent(int contentLength) {
+        this.content = new byte[contentLength];
+    }
+
+    /**
+     * Initialises the decoder state of this block.
+     * <p>
+     * When a previous block is given, its state is cloned so that decoding
+     * this block does not modify a cached (or to be cached) block. Without
+     * a previous block a fresh state is created from the block length.
+     *
+     * @param chmPrevLzxBlock the preceding block, or null for the first one
+     * @throws TikaException if there is no previous block and the block
+     *                       length does not fit into an int, or if the new
+     *                       state cannot be created
+     */
+    private void checkLzxBlock(ChmLzxBlock chmPrevLzxBlock)
+            throws TikaException {
+        if (chmPrevLzxBlock != null) {
+            //use clone to avoid changing a cached or to be cached block
+            setState(chmPrevLzxBlock.getState().clone());
+            return;
+        }
+        // Previously this case fell through to the clone branch and failed
+        // with a NullPointerException on the missing previous block.
+        if (getBlockLength() >= Integer.MAX_VALUE) {
+            throw new ChmParsingException("block length is too large: "
+                    + getBlockLength());
+        }
+        setState(new ChmLzxState((int) getBlockLength()));
+    }
+
+    /**
+     * Validates the constructor arguments.
+     *
+     * @param blockNumber number of the block, must not be negative
+     * @param dataSegment compressed data, must not be null or empty
+     * @param blockLength length of the block, must be greater than zero
+     * @return always {@code true}; every invalid argument is reported by an
+     *         exception instead
+     * @throws TikaException if one of the arguments is invalid
+     */
+    private boolean validateConstructorParams(int blockNumber,
+            byte[] dataSegment, long blockLength) throws TikaException {
+        if (blockNumber < 0) {
+            throw new ChmParsingException("block number should be positive");
+        }
+        if (dataSegment == null || dataSegment.length == 0) {
+            throw new ChmParsingException(
+                    "data segment should not be null or empty");
+        }
+        if (blockLength <= 0) {
+            throw new ChmParsingException(
+                    "block length should be more than zero");
+        }
+        return true;
+    }
+
+    /**
+     * @return the number of this block
+     */
+    public int getBlockNumber() {
+        return this.block_number;
+    }
+
+    private void setBlockNumber(int block_number) {
+        this.block_number = block_number;
+    }
+
+    /**
+     * @return the length stored for this block
+     */
+    private long getBlockLength() {
+        return this.block_length;
+    }
+
+    private void setBlockLength(long block_length) {
+        this.block_length = block_length;
+    }
+
+    /**
+     * @return the decoder state attached to this block (the instance
+     *         itself, not a copy)
+     */
+    public ChmLzxState getState() {
+        return this.state;
+    }
+
+    private void setState(ChmLzxState state) {
+        this.state = state;
+    }
+}


Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,327 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.util.concurrent.CancellationException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
+import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+public class ChmLzxState implements Cloneable {
+    /* Class' members */
+    private int window; /* the actual decoding window */
+    private long window_size; /* window size (32Kb through 2Mb) */
+    private int window_position; /* current offset within the window */
+    private int main_tree_elements; /* number of main tree elements */
+    private LzxState hadStarted; /* have we started decoding at all yet? */
+    private int block_type; /* type of this block */
+    private int block_length; /* uncompressed length of this block */
+    private int block_remaining; /* uncompressed bytes still left to decode */
+    private int frames_read; /* the number of CFDATA blocks processed */
+    private int intel_file_size; /* magic header value used for transform */
+    private long intel_current_possition; /* current offset in transform space 
*/
+    private IntelState intel_state; /* have we seen any translatable data yet? 
*/
+    private long R0; /* for the LRU offset system */
+    private long R1; /* for the LRU offset system */
+    private long R2; /* for the LRU offset system */
+
+    // Trees - PRETREE, MAINTREE, LENGTH, ALIGNED
+    protected short[] mainTreeLengtsTable;
+    protected short[] mainTreeTable;
+
+    protected short[] lengthTreeTable;
+    protected short[] lengthTreeLengtsTable;
+
+    protected short[] alignedLenTable;
+    protected short[] alignedTreeTable;
+
+    @Override
+    public ChmLzxState clone() {
+        try {
+          ChmLzxState clone = (ChmLzxState)super.clone();
+          clone.mainTreeLengtsTable = arrayClone(mainTreeLengtsTable);
+          clone.mainTreeTable = arrayClone(mainTreeTable);
+          clone.lengthTreeTable = arrayClone(lengthTreeTable);
+          clone.lengthTreeLengtsTable = arrayClone(lengthTreeLengtsTable);
+          clone.alignedLenTable = arrayClone(alignedLenTable);
+          clone.alignedTreeTable = arrayClone(alignedTreeTable);
+          return clone;
+        } catch (CloneNotSupportedException ex) {
+           return null;
+        }
+    }
+    
+    protected short[] getMainTreeTable() {
+        return mainTreeTable;
+    }
+
+    protected short[] getAlignedTreeTable() {
+        return alignedTreeTable;
+    }
+
+    protected void setAlignedTreeTable(short[] alignedTreeTable) {
+        this.alignedTreeTable = alignedTreeTable;
+    }
+
+    protected short[] getLengthTreeTable() throws TikaException {
+        if (lengthTreeTable != null)
+            return this.lengthTreeTable;
+        else
+            throw new ChmParsingException("lengthTreeTable is null");
+    }
+
+    protected void setLengthTreeTable(short[] lengthTreeTable) {
+        this.lengthTreeTable = lengthTreeTable;
+    }
+
+    protected void setMainTreeTable(short[] mainTreeTable) {
+        this.mainTreeTable = mainTreeTable;
+    }
+
+    protected short[] getAlignedLenTable() {
+        return this.alignedLenTable;
+    }
+
+    protected void setAlignedLenTable(short[] alignedLenTable) {
+        this.alignedLenTable = alignedLenTable;
+    }
+
+    /**
+     * It suits for informative outlook
+     */
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("actual decoding window:=" + getWindow()
+                + System.getProperty("line.separator"));
+        sb.append("window size (32Kb through 2Mb):=" + getWindowSize()
+                + System.getProperty("line.separator"));
+        sb.append("current offset within the window:=" + getWindowPosition()
+                + System.getProperty("line.separator"));
+        sb.append("number of main tree elements:=" + getMainTreeElements()
+                + System.getProperty("line.separator"));
+        sb.append("have we started decoding at all yet?:=" + getHadStarted()
+                + System.getProperty("line.separator"));
+        sb.append("type of this block:=" + getBlockType()
+                + System.getProperty("line.separator"));
+        sb.append("uncompressed length of this block:=" + getBlockLength()
+                + System.getProperty("line.separator"));
+        sb.append("uncompressed bytes still left to decode:="
+                + getBlockRemaining() + System.getProperty("line.separator"));
+        sb.append("the number of CFDATA blocks processed:=" + getFramesRead()
+                + System.getProperty("line.separator"));
+        sb.append("magic header value used for transform:="
+                + getIntelFileSize() + System.getProperty("line.separator"));
+        sb.append("current offset in transform space:="
+                + getIntelCurrentPossition()
+                + System.getProperty("line.separator"));
+        sb.append("have we seen any translatable data yet?:=" + getIntelState()
+                + System.getProperty("line.separator"));
+        sb.append("R0 for the LRU offset system:=" + getR0()
+                + System.getProperty("line.separator"));
+        sb.append("R1 for the LRU offset system:=" + getR1()
+                + System.getProperty("line.separator"));
+        sb.append("R2 for the LRU offset system:=" + getR2()
+                + System.getProperty("line.separator"));
+        sb.append("main tree length:=" + getMainTreeLengtsTable().length
+                + System.getProperty("line.separator"));
+        sb.append("secondary tree length:=" + getLengthTreeLengtsTable().length
+                + System.getProperty("line.separator"));
+        return sb.toString();
+    }
+
+    public ChmLzxState(int window) throws TikaException {
+        if (window >= 0) {
+            int position_slots;
+            int win = ChmCommons.getWindowSize(window);
+            setWindowSize(1 << win);
+            /* LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) */
+            if (win < 15 || win > 21)
+                throw new ChmParsingException("window less than 15 or window 
greater than 21");
+
+            /* Calculates required position slots */
+            if (win == 20)
+                position_slots = 42;
+            else if (win == 21)
+                position_slots = 50;
+            else
+                position_slots = win << 1;
+            //TODO: position_slots is not used ?
+            setR0(1);
+            setR1(1);
+            setR2(1);
+            setMainTreeElements(512);
+            setHadStarted(LzxState.NOT_STARTED_DECODING);
+            setFramesRead(0);
+            setBlockRemaining(0);
+            setBlockType(ChmConstants.LZX_BLOCKTYPE_INVALID);
+            setIntelCurrentPossition(0);
+            setIntelState(IntelState.NOT_STARTED);
+            setWindowPosition(0);
+            setMainTreeLengtsTable(new short[getMainTreeElements()]);
+            setLengthTreeLengtsTable(new 
short[ChmConstants.LZX_NUM_SECONDARY_LENGTHS]);
+        } else
+            throw new CancellationException(
+                    "window size should be more than zero");
+    }
+
+    protected void setWindow(int window) {
+        this.window = window;
+    }
+
+    protected int getWindow() {
+        return window;
+    }
+
+    protected void setWindowSize(long window_size) {
+        this.window_size = window_size;
+    }
+
+    protected long getWindowSize() {
+        return window_size;
+    }
+
+    protected void setWindowPosition(int window_position) {
+        this.window_position = window_position;
+    }
+
+    protected int getWindowPosition() {
+        return window_position;
+    }
+
+    protected void setMainTreeElements(int main_tree_elements) {
+        this.main_tree_elements = main_tree_elements;
+    }
+
+    protected int getMainTreeElements() {
+        return main_tree_elements;
+    }
+
+    protected void setHadStarted(LzxState hadStarted) {
+        this.hadStarted = hadStarted;
+    }
+
+    protected LzxState getHadStarted() {
+        return hadStarted;
+    }
+
+    protected void setBlockType(int block_type) {
+        this.block_type = block_type;
+    }
+
+    public int getBlockType() {
+        return block_type;
+    }
+
+    protected void setBlockLength(int block_length) {
+        this.block_length = block_length;
+    }
+
+    protected int getBlockLength() {
+        return block_length;
+    }
+
+    protected void setBlockRemaining(int block_remaining) {
+        this.block_remaining = block_remaining;
+    }
+
+    protected int getBlockRemaining() {
+        return block_remaining;
+    }
+
+    protected void setFramesRead(int frames_read) {
+        this.frames_read = frames_read;
+    }
+
+    protected void increaseFramesRead() {
+        this.frames_read = getFramesRead() + 1;
+    }
+
+    protected int getFramesRead() {
+        return frames_read;
+    }
+
+    protected void setIntelFileSize(int intel_file_size) {
+        this.intel_file_size = intel_file_size;
+    }
+
+    protected int getIntelFileSize() {
+        return intel_file_size;
+    }
+
+    protected void setIntelCurrentPossition(long intel_current_possition) {
+        this.intel_current_possition = intel_current_possition;
+    }
+
+    protected long getIntelCurrentPossition() {
+        return intel_current_possition;
+    }
+
+    protected void setIntelState(IntelState intel_state) {
+        this.intel_state = intel_state;
+    }
+
+    protected IntelState getIntelState() {
+        return intel_state;
+    }
+
+    protected void setR0(long r0) {
+        R0 = r0;
+    }
+
+    protected long getR0() {
+        return R0;
+    }
+
+    protected void setR1(long r1) {
+        R1 = r1;
+    }
+
+    protected long getR1() {
+        return R1;
+    }
+
+    protected void setR2(long r2) {
+        R2 = r2;
+    }
+
+    protected long getR2() {
+        return R2;
+    }
+
+    public void setMainTreeLengtsTable(short[] mainTreeLengtsTable) {
+        this.mainTreeLengtsTable = mainTreeLengtsTable;
+    }
+
+    public short[] getMainTreeLengtsTable() {
+        return mainTreeLengtsTable;
+    }
+
+    public void setLengthTreeLengtsTable(short[] lengthTreeLengtsTable) {
+        this.lengthTreeLengtsTable = lengthTreeLengtsTable;
+    }
+
+    public short[] getLengthTreeLengtsTable() {
+        return lengthTreeLengtsTable;
+    }
+    
+    private static short[] arrayClone(short[] a) {
+        return a==null ? null : (short[]) a.clone();
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+
+public class ChmSection {
+    final private byte[] data;
+    final private byte[] prevcontent;
+    private int swath;// kiks
+    private int total;// remains
+    private int buffer;// val
+
+    public ChmSection(byte[] data) throws TikaException {
+        this(data, null);
+    }
+
+    public ChmSection(byte[] data, byte[] prevconent) throws TikaException {
+        ChmCommons.assertByteArrayNotNull(data);
+        this.data = data;
+        this.prevcontent = prevconent;
+        //setData(data);
+    }
+    
+    /* Utilities */
+    public byte[] reverseByteOrder(byte[] toBeReversed) throws TikaException {
+        ChmCommons.assertByteArrayNotNull(toBeReversed);
+        ChmCommons.reverse(toBeReversed);
+        return toBeReversed;
+    }
+
+    public int checkBit(int i) {
+        return ((getBuffer() & (1 << (getTotal() - i))) == 0) ? 0 : 1;
+    }
+
+    public int getSyncBits(int bit) {
+        return getDesyncBits(bit, bit);
+    }
+
+    public int peekBits(int bit) {
+        return getDesyncBits(bit, 0);
+    }
+    
+    private int getDesyncBits(int bit, int removeBit) {
+        while (getTotal() < 16) {
+            setBuffer((getBuffer() << 16) + unmarshalUByte()
+                    + (unmarshalUByte() << 8));
+            setTotal(getTotal() + 16);
+        }
+        int tmp = (getBuffer() >>> (getTotal() - bit));
+        setTotal(getTotal() - removeBit);
+        setBuffer(getBuffer() - ((getBuffer() >>> getTotal()) << getTotal()));
+        return tmp;
+    }
+
+    public int unmarshalUByte() {
+        return getByte() & 255;
+    }
+
+    public byte getByte() {
+        if (getSwath() < getData().length) {
+            setSwath(getSwath() + 1);
+            return getData()[getSwath() - 1];
+        } else
+            return 0;
+    }
+
+    public int getLeft() {
+        return (getData().length - getSwath());
+    }
+
+    public byte[] getData() {
+        return data;
+    }
+
+    public byte[] getPrevContent() {
+        return prevcontent;
+    }
+    
+    public BigInteger getBigInteger(int i) {
+        if (getData() == null)
+            return BigInteger.ZERO;
+        if (getData().length - getSwath() < i)
+            i = getData().length - getSwath();
+        byte[] tmp = new byte[i];
+        for (int j = i - 1; j >= 0; j--) {
+            tmp[i - j - 1] = getData()[getSwath() + j];
+        }
+        setSwath(getSwath() + i);
+        return new BigInteger(tmp);
+    }
+
+    public byte[] stringToAsciiBytes(String s) {
+        char[] c = s.toCharArray();
+        byte[] byteval = new byte[c.length];
+        for (int i = 0; i < c.length; i++)
+            byteval[i] = (byte) c[i];
+        return byteval;
+    }
+
+    public BigInteger unmarshalUlong() {
+        return getBigInteger(8);
+    }
+
+    public long unmarshalUInt() {
+        return getBigInteger(4).longValue();
+    }
+
+    public int unmarshalInt() {
+        return getBigInteger(4).intValue();
+    }
+
+    public byte[] unmarshalBytes(int i) {
+        if (i == 0)
+            return new byte[1];
+        byte[] t = new byte[i];
+        for (int j = 0; j < i; j++)
+            t[j] = getData()[j + getSwath()];
+        setSwath(getSwath() + i);
+        return t;
+    }
+
+    public BigInteger getEncint() {
+        byte ob;
+        BigInteger bi = BigInteger.ZERO;
+        byte[] nb = new byte[1];
+        while ((ob = this.getByte()) < 0) {
+            nb[0] = (byte) ((ob & 0x7f));
+            bi = bi.shiftLeft(7).add(new BigInteger(nb));
+        }
+        nb[0] = (byte) ((ob & 0x7f));
+        bi = bi.shiftLeft(7).add(new BigInteger(nb));
+        return bi;
+    }
+
+    public char unmarshalUtfChar() {
+        byte ob;
+        int i = 1;
+        byte[] ba;
+        ob = this.getByte();
+        if (ob < 0) {
+            i = 2;
+            while ((ob << (24 + i)) < 0)
+                i++;
+        }
+        ba = new byte[i];
+        ba[0] = ob;
+        int j = 1;
+        while (j < i) {
+            ba[j] = this.getByte();
+            j++;
+        }
+        i = ba.length;
+        if (i == 1)
+            return (char) ba[0];
+        else {
+            int n;
+            n = ba[0] & 15; // 00001111b, gets last 4 bits
+            j = 1;
+            while (j < i)
+                n = (n << 6) + (ba[j++] & 63);// 00111111b,gets last 6 bits
+            return (char) n;
+        }
+    }
+
+//    private void setData(byte[] data) {
+//        this.data = data;
+//    }
+
+    public int getSwath() {
+        return swath;
+    }
+
+    public void setSwath(int swath) {
+        this.swath = swath;
+    }
+
+    public int getTotal() {
+        return total;
+    }
+
+    public void setTotal(int total) {
+        this.total = total;
+    }
+
+    private int getBuffer() {
+        return buffer;
+    }
+
+    private void setBuffer(int buffer) {
+        this.buffer = buffer;
+    }
+
+    /**
+     * @param args
+     * @throws TikaException 
+     */
+    public static void main(String[] args) throws TikaException {
+        byte[] array = { 4, 78, -67, 90, 1, -33 };
+        ChmSection chmSection = new ChmSection(array);
+        System.out.println("before " + Arrays.toString(array));
+        System.out.println("after " + 
Arrays.toString(chmSection.reverseByteOrder(array)));
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Queue;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Mbox (mailbox) parser. Splits the stream into individual mails at every line
+ * that starts with {@link #MBOX_RECORD_DIVIDER} and hands each mail to the
+ * {@link EmbeddedDocumentExtractor} taken from the {@link ParseContext}
+ * (a {@link ParsingEmbeddedDocumentExtractor} when the context supplies none).
+ */
+public class MboxParser extends AbstractParser {
+
+    public static final String MBOX_MIME_TYPE = "application/mbox";
+    public static final String MBOX_RECORD_DIVIDER = "From ";
+    public static final int MAIL_MAX_SIZE = 50000000;
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -1762689436731160661L;
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("mbox"));
+    private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
+    private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>");
+
+    private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-";
+    private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";
+    private final Map<Integer, Metadata> trackingMetadata = new HashMap<Integer, Metadata>();
+    private boolean tracking = false;
+
+    /**
+     * Parses a mail date header value using the pattern
+     * {@code EEE, d MMM yyyy HH:mm:ss Z} with the US locale.
+     *
+     * @param headerContent the raw value of the date header
+     * @return the parsed date
+     * @throws ParseException if the value does not match the pattern
+     */
+    public static Date parseDate(String headerContent) throws ParseException {
+        // A fresh formatter per call: SimpleDateFormat instances must not be shared between threads.
+        SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
+        return dateFormat.parse(headerContent);
+    }
+
+    /**
+     * Returns the single media type handled by this parser, {@code application/mbox}.
+     *
+     * @param context the parse context; not consulted
+     * @return an immutable singleton set
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Reads the stream line by line as windows-1252 text. Each line starting with
+     * {@link #MBOX_RECORD_DIVIDER} opens a new mail; the lines that follow are
+     * collected until the next divider, the end of the stream, or until
+     * {@link #MAIL_MAX_SIZE} bytes have been gathered. The collected mail is then
+     * passed to the embedded document extractor together with the metadata
+     * derived from its lines.
+     * <p>
+     * An empty stream, or a divider that is the last line of the stream, is
+     * handled without error (the latter yields a mail with an empty body).
+     *
+     * @param stream   the mbox content; closed when parsing finishes
+     * @param handler  receives the XHTML SAX events
+     * @param metadata receives the container content type and encoding
+     * @param context  source of the {@link EmbeddedDocumentExtractor}
+     * @throws IOException   if the stream cannot be read
+     * @throws TikaException if the embedded extractor fails
+     * @throws SAXException  if the handler rejects an event
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, TikaException, SAXException {
+
+        EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class,
+                new ParsingEmbeddedDocumentExtractor(context));
+
+        String charsetName = "windows-1252";
+
+        metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
+        metadata.set(Metadata.CONTENT_ENCODING, charsetName);
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        InputStreamReader isr = new InputStreamReader(stream, charsetName);
+        try (BufferedReader reader = new BufferedReader(isr)) {
+            String curLine = reader.readLine();
+            int mailItem = 0;
+            // Checking for null up front keeps an empty stream from being dereferenced.
+            while (curLine != null && !Thread.currentThread().isInterrupted()) {
+                if (!curLine.startsWith(MBOX_RECORD_DIVIDER)) {
+                    // Not inside a mail (leading junk, or the tail of an over-long mail): skip.
+                    curLine = reader.readLine();
+                    continue;
+                }
+
+                Metadata mailMetadata = new Metadata();
+                LinkedList<String> multiline = new LinkedList<String>();
+                mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
+                mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
+                curLine = reader.readLine();
+
+                ByteArrayOutputStream message = new ByteArrayOutputStream(100000);
+                // The divider may be the very last line of the stream; then there is nothing to collect.
+                if (curLine != null) {
+                    do {
+                        boolean continuation = curLine.startsWith(" ") || curLine.startsWith("\t");
+                        if (continuation && !multiline.isEmpty()) {
+                            // A folded line belongs to the line collected immediately before it.
+                            multiline.add(multiline.removeLast() + " " + curLine.trim());
+                        } else {
+                            multiline.add(curLine);
+                        }
+
+                        message.write(curLine.getBytes(charsetName));
+                        message.write(0x0A);
+                        curLine = reader.readLine();
+                    }
+                    while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER)
+                            && message.size() < MAIL_MAX_SIZE);
+                }
+
+                // NOTE(review): every collected line, body lines included, is offered to the
+                // header matcher; lines that do not look like "name: value" are ignored there.
+                for (String item : multiline) {
+                    saveHeaderInMetadata(mailMetadata, item);
+                }
+
+                ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray());
+                // toByteArray() copied the data; drop the buffer so it can be collected during parsing.
+                message = null;
+
+                if (extractor.shouldParseEmbedded(mailMetadata)) {
+                    extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true);
+                }
+
+                if (tracking) {
+                    getTrackingMetadata().put(mailItem++, mailMetadata);
+                }
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Tells whether the metadata of each parsed mail is being recorded.
+     *
+     * @return {@code true} if tracking is enabled
+     */
+    public boolean isTracking() {
+        return tracking;
+    }
+
+    /**
+     * Enables or disables recording of per-mail metadata during {@code parse}.
+     *
+     * @param tracking {@code true} to record the metadata of every mail
+     */
+    public void setTracking(boolean tracking) {
+        this.tracking = tracking;
+    }
+
+    /**
+     * Returns the live map of recorded mail metadata, keyed by the zero-based
+     * position of the mail within a parsed stream. Entries are only added while
+     * tracking is enabled.
+     *
+     * @return the internal tracking map (not a copy)
+     */
+    public Map<Integer, Metadata> getTrackingMetadata() {
+        return trackingMetadata;
+    }
+
+    /**
+     * Interprets one (already unfolded) line as a {@code name: value} header and
+     * stores it in the given metadata. Well-known headers (From, To, Cc, Bcc,
+     * Subject, Date, Message-Id, In-Reply-To, Content-Type) are mapped to their
+     * dedicated properties; any other header is stored under
+     * {@code "MboxParser-" + lower-cased name}. Lines that do not match the
+     * header pattern are ignored.
+     *
+     * @param metadata the mail metadata to update
+     * @param curLine  the line to interpret
+     */
+    private void saveHeaderInMetadata(Metadata metadata, String curLine) {
+        Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
+        if (!headerMatcher.matches()) {
+            return; // ignore malformed header lines
+        }
+
+        String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT);
+        String headerContent = headerMatcher.group(2);
+
+        if (headerTag.equalsIgnoreCase("From")) {
+            metadata.set(TikaCoreProperties.CREATOR, headerContent);
+        } else if (headerTag.equalsIgnoreCase("To") || headerTag.equalsIgnoreCase("Cc")
+                || headerTag.equalsIgnoreCase("Bcc")) {
+            // Prefer the address inside angle brackets; fall back to the raw value if it has an '@'.
+            Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent);
+            if (address.find()) {
+                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1));
+            } else if (headerContent.indexOf('@') > -1) {
+                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent);
+            }
+
+            String property = Metadata.MESSAGE_TO;
+            if (headerTag.equalsIgnoreCase("Cc")) {
+                property = Metadata.MESSAGE_CC;
+            } else if (headerTag.equalsIgnoreCase("Bcc")) {
+                property = Metadata.MESSAGE_BCC;
+            }
+            metadata.add(property, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Subject")) {
+            metadata.add(Metadata.SUBJECT, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Date")) {
+            try {
+                Date date = parseDate(headerContent);
+                metadata.set(TikaCoreProperties.CREATED, date);
+            } catch (ParseException e) {
+                // ignoring date because format was not understood
+            }
+        } else if (headerTag.equalsIgnoreCase("Message-Id")) {
+            metadata.set(TikaCoreProperties.IDENTIFIER, headerContent);
+        } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
+            metadata.set(TikaCoreProperties.RELATION, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Content-Type")) {
+            // TODO - key off content-type in headers to
+            // set mapping to use for content and convert if necessary.
+
+            metadata.add(Metadata.CONTENT_TYPE, headerContent);
+            metadata.set(TikaCoreProperties.FORMAT, headerContent);
+        } else {
+            metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
+        }
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static java.lang.String.valueOf;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static java.util.Collections.singleton;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Set;
+
+import com.pff.PSTAttachment;
+import com.pff.PSTFile;
+import com.pff.PSTFolder;
+import com.pff.PSTMessage;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Parser for MS Outlook PST email storage files
+ */
+public class OutlookPSTParser extends AbstractParser {
+
+    private static final long serialVersionUID = 620998217748364063L;
+
+    public static final MediaType MS_OUTLOOK_PST_MIMETYPE = 
MediaType.application("vnd.ms-outlook-pst");
+    private static final Set<MediaType> SUPPORTED_TYPES = 
singleton(MS_OUTLOOK_PST_MIMETYPE);
+
+    private static AttributesImpl createAttribute(String attName, String 
attValue) {
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute("", attName, attName, "CDATA", attValue);
+        return attributes;
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        // Use the delegate parser to parse the contained document
+        EmbeddedDocumentExtractor embeddedExtractor = 
context.get(EmbeddedDocumentExtractor.class,
+                new ParsingEmbeddedDocumentExtractor(context));
+
+        metadata.set(Metadata.CONTENT_TYPE, 
MS_OUTLOOK_PST_MIMETYPE.toString());
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        TikaInputStream in = TikaInputStream.get(stream);
+        PSTFile pstFile = null;
+        try {
+            pstFile = new PSTFile(in.getFile().getPath());
+            metadata.set(Metadata.CONTENT_LENGTH, 
valueOf(pstFile.getFileHandle().length()));
+            boolean isValid = pstFile.getFileHandle().getFD().valid();
+            metadata.set("isValid", valueOf(isValid));
+            if (isValid) {
+                parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor);
+            }
+        } catch (Exception e) {
+            throw new TikaException(e.getMessage(), e);
+        } finally {
+            if (pstFile != null && pstFile.getFileHandle() != null) {
+                try {
+                    pstFile.getFileHandle().close();
+                } catch (IOException e) {
+                    //swallow closing exception
+                }
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+    private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, 
EmbeddedDocumentExtractor embeddedExtractor)
+            throws Exception {
+        if (pstFolder.getContentCount() > 0) {
+            PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
+            while (pstMail != null) {
+                AttributesImpl attributes = new AttributesImpl();
+                attributes.addAttribute("", "class", "class", "CDATA", 
"embedded");
+                attributes.addAttribute("", "id", "id", "CDATA", 
pstMail.getInternetMessageId());
+                handler.startElement("div", attributes);
+                handler.element("h1", pstMail.getSubject());
+
+                parserMailItem(handler, pstMail, embeddedExtractor);
+                parseMailAttachments(handler, pstMail, embeddedExtractor);
+
+                handler.endElement("div");
+
+                pstMail = (PSTMessage) pstFolder.getNextChild();
+            }
+        }
+
+        if (pstFolder.hasSubfolders()) {
+            for (PSTFolder pstSubFolder : pstFolder.getSubFolders()) {
+                handler.startElement("div", createAttribute("class", 
"email-folder"));
+                handler.element("h1", pstSubFolder.getDisplayName());
+                parseFolder(handler, pstSubFolder, embeddedExtractor);
+                handler.endElement("div");
+            }
+        }
+    }
+
+    private void parserMailItem(XHTMLContentHandler handler, PSTMessage 
pstMail, EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, 
IOException {
+        Metadata mailMetadata = new Metadata();
+        mailMetadata.set(Metadata.RESOURCE_NAME_KEY, 
pstMail.getInternetMessageId());
+        mailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, 
pstMail.getInternetMessageId());
+        mailMetadata.set(TikaCoreProperties.IDENTIFIER, 
pstMail.getInternetMessageId());
+        mailMetadata.set(TikaCoreProperties.TITLE, pstMail.getSubject());
+        mailMetadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName());
+        mailMetadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName());
+        mailMetadata.set(TikaCoreProperties.CREATED, 
pstMail.getCreationTime());
+        mailMetadata.set(TikaCoreProperties.MODIFIED, 
pstMail.getLastModificationTime());
+        mailMetadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment());
+        mailMetadata.set("descriptorNodeId", 
valueOf(pstMail.getDescriptorNodeId()));
+        mailMetadata.set("senderEmailAddress", 
pstMail.getSenderEmailAddress());
+        mailMetadata.set("recipients", pstMail.getRecipientsString());
+        mailMetadata.set("displayTo", pstMail.getDisplayTo());
+        mailMetadata.set("displayCC", pstMail.getDisplayCC());
+        mailMetadata.set("displayBCC", pstMail.getDisplayBCC());
+        mailMetadata.set("importance", valueOf(pstMail.getImportance()));
+        mailMetadata.set("priority", valueOf(pstMail.getPriority()));
+        mailMetadata.set("flagged", valueOf(pstMail.isFlagged()));
+
+        byte[] mailContent = pstMail.getBody().getBytes(UTF_8);
+        embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), 
handler, mailMetadata, true);
+    }
+
+    private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage 
email, EmbeddedDocumentExtractor embeddedExtractor)
+            throws TikaException {
+        int numberOfAttachments = email.getNumberOfAttachments();
+        for (int i = 0; i < numberOfAttachments; i++) {
+            File tempFile = null;
+            try {
+                PSTAttachment attach = email.getAttachment(i);
+
+                // Get the filename; both long and short filenames can be used 
for attachments
+                String filename = attach.getLongFilename();
+                if (filename.isEmpty()) {
+                    filename = attach.getFilename();
+                }
+
+                xhtml.element("p", filename);
+
+                Metadata attachMeta = new Metadata();
+                attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename);
+                attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename);
+                AttributesImpl attributes = new AttributesImpl();
+                attributes.addAttribute("", "class", "class", "CDATA", 
"embedded");
+                attributes.addAttribute("", "id", "id", "CDATA", filename);
+                xhtml.startElement("div", attributes);
+                if (embeddedExtractor.shouldParseEmbedded(attachMeta)) {
+                    TemporaryResources tmp = new TemporaryResources();
+                    try {
+                        TikaInputStream tis = 
TikaInputStream.get(attach.getFileInputStream(), tmp);
+                        embeddedExtractor.parseEmbedded(tis, xhtml, 
attachMeta, true);
+                    } finally {
+                        tmp.dispose();
+                    }
+                }
+                xhtml.endElement("div");
+
+            } catch (Exception e) {
+                throw new TikaException("Unable to unpack document stream", e);
+            } finally {
+                if (tempFile != null)
+                    tempFile.delete();
+            }
+        }
+    }
+
+}

svn commit: r1725014 [7/28] - in /tika/branches/2.x: tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-parser-module/ tika-parser-modules/tika-advanced-parser-mo...

Reply via email to