Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,913 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.chm.lzx; + +import java.math.BigInteger; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.chm.core.ChmCommons; +import org.apache.tika.parser.chm.core.ChmCommons.IntelState; +import org.apache.tika.parser.chm.core.ChmCommons.LzxState; +import org.apache.tika.parser.chm.core.ChmConstants; +import org.apache.tika.parser.chm.exception.ChmParsingException; + +/** + * Decompresses a chm block. Depending on chm block type chooses most relevant + * decompressing method. A chm block type can be as follows:</br> <li>UNDEFINED + * - no action taken, i.e. skipping the block <li>VERBATIM <li>ALIGNED_OFFSET + * <li>UNCOMPRESSED the most simplest In addition there are unknown types (4-7). + * Currently relying on previous chm block these types changing according to the + * previous chm block type. We need to invent more appropriate way to handle + * such types. + * + */ +public class ChmLzxBlock { + private int block_number; + private long block_length; + private ChmLzxState state; + private byte[] content = null; + private ChmSection chmSection = null; + private int contentLength = 0; + + // trying to find solution for bad blocks ... + private int previousBlockType = -1; + + public ChmLzxBlock(int blockNumber, byte[] dataSegment, long blockLength, + ChmLzxBlock prevBlock) throws TikaException { + try { + if (validateConstructorParams(blockNumber, dataSegment, blockLength)) { + setBlockNumber(blockNumber); + + if (prevBlock != null + && prevBlock.getState().getBlockLength() > prevBlock + .getState().getBlockRemaining()) + setChmSection(new ChmSection(dataSegment, prevBlock.getContent())); + else + setChmSection(new ChmSection(dataSegment)); + + setBlockLength(blockLength); + + // ============================================ + // we need to take care of previous context + // ============================================ + checkLzxBlock(prevBlock); + if (prevBlock == null + || blockLength < (int) getBlockLength()) { + setContent((int) getBlockLength()); + } + else { + setContent((int) blockLength); + } + + if (prevBlock != null && prevBlock.getState() != null) + previousBlockType = prevBlock.getState().getBlockType(); + + extractContent(); + } else + throw new TikaException("Check your chm lzx block parameters"); + } catch (TikaException e) { + throw e; + } + } + + protected int getContentLength() { + return contentLength; + } + + protected void setContentLength(int contentLength) { + this.contentLength = contentLength; + } + + private ChmSection getChmSection() { + return chmSection; + } + + private void setChmSection(ChmSection chmSection) { + this.chmSection = chmSection; + } + + private void assertStateNotNull() throws TikaException { + if (getState() == null) + throw new ChmParsingException("state is null"); + } + + private void extractContent() throws TikaException { + assertStateNotNull(); + if (getChmSection().getData() != null) { + boolean continueLoop = true; + while (continueLoop && getContentLength() < getBlockLength()) { + if (getState() != null && getState().getBlockRemaining() == 0) { + if (getState().getHadStarted() == LzxState.NOT_STARTED_DECODING) { + getState().setHadStarted(LzxState.STARTED_DECODING); + if (getChmSection().getSyncBits(1) == 1) { + int intelSizeTemp = (getChmSection() + .getSyncBits(16) << 16) + + getChmSection().getSyncBits(16); + if (intelSizeTemp >= 0) + getState().setIntelFileSize(intelSizeTemp); + else + getState().setIntelFileSize(0); + } + } + getState().setBlockType(getChmSection().getSyncBits(3)); + getState().setBlockLength( + (getChmSection().getSyncBits(16) << 8) + + getChmSection().getSyncBits(8)); + getState().setBlockRemaining(getState().getBlockLength()); + + // ---------------------------------------- + // Trying to handle 3 - 7 block types + // ---------------------------------------- + if (getState().getBlockType() > 3) { + if (previousBlockType >= 0 && previousBlockType < 3) + getState().setBlockType(previousBlockType); + } + + switch (getState().getBlockType()) { + case ChmCommons.ALIGNED_OFFSET: + createAlignedTreeTable(); + //fall through + case ChmCommons.VERBATIM: + /* Creates mainTreeTable */ + createMainTreeTable(); + createLengthTreeTable(); + if (getState().getMainTreeLengtsTable()[0xe8] != 0) + getState().setIntelState(IntelState.STARTED); + break; + case ChmCommons.UNCOMPRESSED: + getState().setIntelState(IntelState.STARTED); + if (getChmSection().getTotal() > 16) + getChmSection().setSwath( + getChmSection().getSwath() - 1); + getState().setR0( + (new BigInteger(getChmSection() + .reverseByteOrder( + getChmSection().unmarshalBytes( + 4))).longValue())); + getState().setR1( + (new BigInteger(getChmSection() + .reverseByteOrder( + getChmSection().unmarshalBytes( + 4))).longValue())); + getState().setR2( + (new BigInteger(getChmSection() + .reverseByteOrder( + getChmSection().unmarshalBytes( + 4))).longValue())); + break; + default: + break; + } + } //end of if BlockRemaining == 0 + + int tempLen; + + if (getContentLength() + getState().getBlockRemaining() > getBlockLength()) { + getState().setBlockRemaining( + getContentLength() + getState().getBlockRemaining() + - (int) getBlockLength()); + tempLen = (int) getBlockLength(); + } else { + tempLen = getContentLength() + + getState().getBlockRemaining(); + getState().setBlockRemaining(0); + } + + int lastLength = getContentLength(); + switch (getState().getBlockType()) { + case ChmCommons.ALIGNED_OFFSET: + // if(prevblock.lzxState.length>prevblock.lzxState.remaining) + decompressAlignedBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());// prevcontext + break; + case ChmCommons.VERBATIM: + decompressVerbatimBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent()); + break; + case ChmCommons.UNCOMPRESSED: + decompressUncompressedBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent()); + break; + } + getState().increaseFramesRead(); + if ((getState().getFramesRead() < 32768) + && getState().getIntelFileSize() != 0) + intelE8Decoding(); + + continueLoop = getContentLength() > lastLength; + } + } + } + + protected void intelE8Decoding() { + if (getBlockLength() <= ChmConstants.LZX_PRETREE_TABLEBITS + || (getState().getIntelState() == IntelState.NOT_STARTED)) { + getState().setBlockRemaining( + getState().getBlockRemaining() - (int) getBlockLength()); + } else { + long curpos = getState().getBlockRemaining(); + getState().setBlockRemaining( + getState().getBlockRemaining() - (int) getBlockLength()); + int i = 0; + while (i < getBlockLength() - 10) { + if (content[i] != 0xe8) { + i++; + continue; + } + byte[] b = new byte[4]; + b[0] = getContent()[i + 3]; + b[1] = getContent()[i + 2]; + b[2] = getContent()[i + 1]; + b[3] = getContent()[i + 0]; + long absoff = (new BigInteger(b)).longValue(); + if ((absoff >= -curpos) + && (absoff < getState().getIntelFileSize())) { + long reloff = (absoff >= 0) ? absoff - curpos : absoff + + getState().getIntelFileSize(); + getContent()[i + 0] = (byte) reloff; + getContent()[i + 1] = (byte) (reloff >>> 8); + getContent()[i + 2] = (byte) (reloff >>> 16); + getContent()[i + 3] = (byte) (reloff >>> 24); + } + i += 4; + curpos += 5; + } + } + } + + private short[] createPreLenTable() { + short[] tmp = new short[ChmConstants.LZX_PRETREE_MAXSYMBOLS]; + for (int i = 0; i < ChmConstants.LZX_PRETREE_MAXSYMBOLS; i++) { + tmp[i] = (short) getChmSection().getSyncBits( + ChmConstants.LZX_PRETREE_NUM_ELEMENTS_BITS); + } + return tmp; + } + + private void createLengthTreeTable() throws TikaException { + //Read Pre Tree Table + short[] prelentable = createPreLenTable(); + + if (prelentable == null) { + throw new ChmParsingException("pretreetable is null"); + } + + short[] pretreetable = createTreeTable2(prelentable, + (1 << ChmConstants.LZX_PRETREE_TABLEBITS) + + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1), + ChmConstants.LZX_PRETREE_TABLEBITS, + ChmConstants.LZX_PRETREE_MAXSYMBOLS); + + if (pretreetable == null) { + throw new ChmParsingException("pretreetable is null"); + } + + //Build Length Tree + createLengthTreeLenTable(0, ChmConstants.LZX_NUM_SECONDARY_LENGTHS, + pretreetable, prelentable); + + getState().setLengthTreeTable( + createTreeTable2(getState().getLengthTreeLengtsTable(), + (1 << ChmConstants.LZX_LENGTH_TABLEBITS) + + (ChmConstants.LZX_LENGTH_MAXSYMBOLS << 1), + ChmConstants.LZX_LENGTH_TABLEBITS, + ChmConstants.LZX_NUM_SECONDARY_LENGTHS)); + } + + private void decompressUncompressedBlock(int len, byte[] prevcontent) { + if (getContentLength() + getState().getBlockRemaining() <= getBlockLength()) { + for (int i = getContentLength(); i < (getContentLength() + getState() + .getBlockRemaining()); i++) + content[i] = getChmSection().getByte(); + + setContentLength(getContentLength() + + getState().getBlockRemaining()); + getState().setBlockRemaining(0); + } else { + for (int i = getContentLength(); i < getBlockLength(); i++) + content[i] = getChmSection().getByte(); + getState().setBlockRemaining( + (int) getBlockLength() - getContentLength());// = blockLen - + // contentlen; + setContentLength((int) getBlockLength()); + } + } + + private void decompressAlignedBlock(int len, byte[] prevcontent) throws TikaException { + + if ((getChmSection() == null) || (getState() == null) + || (getState().getMainTreeTable() == null)) + throw new ChmParsingException("chm section is null"); + + short s; + int x, i, border; + int matchlen = 0, matchfooter = 0, extra, rundest, runsrc; + int matchoffset = 0; + for (i = getContentLength(); i < len; i++) { + /* new code */ + //read huffman tree from main tree + border = getChmSection().peekBits( + ChmConstants.LZX_MAINTREE_TABLEBITS); + if (border >= getState().mainTreeTable.length) + throw new ChmParsingException("error decompressing aligned block."); + //break; + /* end new code */ + s = getState().mainTreeTable[getChmSection().peekBits( + ChmConstants.LZX_MAINTREE_TABLEBITS)]; + if (s >= getState().getMainTreeElements()) { + x = ChmConstants.LZX_MAINTREE_TABLEBITS; + do { + x++; + s <<= 1; + s += getChmSection().checkBit(x); + } while ((s = getState().mainTreeTable[s]) >= getState() + .getMainTreeElements()); + } + //System.out.printf("%d,", s); + //?getChmSection().getSyncBits(getState().mainTreeTable[s]); + getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]); + if (s < ChmConstants.LZX_NUM_CHARS) { + content[i] = (byte) s; + } else { + s -= ChmConstants.LZX_NUM_CHARS; + matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS; + if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) { + matchfooter = getState().lengthTreeTable[getChmSection() + .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)];//.LZX_MAINTREE_TABLEBITS)]; + if (matchfooter >= ChmConstants.LZX_LENGTH_MAXSYMBOLS/*?LZX_LENGTH_TABLEBITS*/) { + x = ChmConstants.LZX_LENGTH_TABLEBITS; + do { + x++; + matchfooter <<= 1; + matchfooter += getChmSection().checkBit(x); + } while ((matchfooter = getState().lengthTreeTable[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS); + } + getChmSection().getSyncBits( + getState().lengthTreeLengtsTable[matchfooter]); + matchlen += matchfooter; + } + matchlen += ChmConstants.LZX_MIN_MATCH; + matchoffset = s >>> 3; + if (matchoffset > 2) { + extra = ChmConstants.EXTRA_BITS[matchoffset]; + matchoffset = (ChmConstants.POSITION_BASE[matchoffset] - 2); + if (extra > 3) { + extra -= 3; + long verbatim_bits = getChmSection().getSyncBits(extra); + matchoffset += (verbatim_bits << 3); + //READ HUFF SYM in Aligned Tree + int aligned_bits = getChmSection().peekBits( + ChmConstants.LZX_NUM_PRIMARY_LENGTHS); + int t = getState().getAlignedTreeTable()[aligned_bits]; + if (t >= getState().getMainTreeElements()) { + x = ChmConstants.LZX_ALIGNED_TABLEBITS; //?LZX_MAINTREE_TABLEBITS; //?LZX_ALIGNED_TABLEBITS + do { + x++; + t <<= 1; + t += getChmSection().checkBit(x); + } while ((t = getState().getAlignedTreeTable()[t]) >= getState() + .getMainTreeElements()); + } + getChmSection().getSyncBits( + getState().getAlignedLenTable()[t]); + matchoffset += t; + } else if (extra == 3) { + int g = getChmSection().peekBits( + ChmConstants.LZX_NUM_PRIMARY_LENGTHS); + int t = getState().getAlignedTreeTable()[g]; + if (t >= getState().getMainTreeElements()) { + x = ChmConstants.LZX_ALIGNED_TABLEBITS; //?LZX_MAINTREE_TABLEBITS; + do { + x++; + t <<= 1; + t += getChmSection().checkBit(x); + } while ((t = getState().getAlignedTreeTable()[t]) >= getState() + .getMainTreeElements()); + } + getChmSection().getSyncBits( + getState().getAlignedLenTable()[t]); + matchoffset += t; + } else if (extra > 0) { + long l = getChmSection().getSyncBits(extra); + matchoffset += l; + } else + matchoffset = 1; + getState().setR2(getState().getR1()); + getState().setR1(getState().getR0()); + getState().setR0(matchoffset); + } else if (matchoffset == 0) { + matchoffset = (int) getState().getR0(); + } else if (matchoffset == 1) { + matchoffset = (int) getState().getR1(); + getState().setR1(getState().getR0()); + getState().setR0(matchoffset); + } else /** match_offset == 2 */ + { + matchoffset = (int) getState().getR2(); + getState().setR2(getState().getR0()); + getState().setR0(matchoffset); + } + rundest = i; + runsrc = rundest - matchoffset; + i += (matchlen - 1); + if (i > len) + break; + + if (runsrc < 0) { + if (matchlen + runsrc <= 0) { + runsrc = prevcontent.length + runsrc; + while (matchlen-- > 0) + content[rundest++] = prevcontent[runsrc++]; + } else { + runsrc = prevcontent.length + runsrc; + while (runsrc < prevcontent.length) + content[rundest++] = prevcontent[runsrc++]; + matchlen = matchlen + runsrc - prevcontent.length; + runsrc = 0; + while (matchlen-- > 0) + content[rundest++] = content[runsrc++]; + } + + } else { + /* copies any wrappes around source data */ + while ((runsrc < 0) && (matchlen-- > 0)) { + content[rundest++] = content[(int) (runsrc + getBlockLength())]; + runsrc++; + } + /* copies match data - no worries about destination wraps */ + while (matchlen-- > 0) + content[rundest++] = content[runsrc++]; + } + } + } + setContentLength(len); + } + + private void assertShortArrayNotNull(short[] array) throws TikaException { + if (array == null) + throw new ChmParsingException("short[] is null"); + } + + private void decompressVerbatimBlock(int len, byte[] prevcontent) throws TikaException { + short s; + int x, i; + int matchlen = 0, matchfooter = 0, extra, rundest, runsrc; + int matchoffset = 0; + for (i = getContentLength(); i < len; i++) { + int f = getChmSection().peekBits( + ChmConstants.LZX_MAINTREE_TABLEBITS); + assertShortArrayNotNull(getState().getMainTreeTable()); + s = getState().getMainTreeTable()[f]; + if (s >= ChmConstants.LZX_MAIN_MAXSYMBOLS) { + x = ChmConstants.LZX_MAINTREE_TABLEBITS; + do { + x++; + s <<= 1; + s += getChmSection().checkBit(x); + } while ((s = getState().getMainTreeTable()[s]) >= ChmConstants.LZX_MAIN_MAXSYMBOLS); + } + getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]); + if (s < ChmConstants.LZX_NUM_CHARS) { + content[i] = (byte) s; + } else { + s -= ChmConstants.LZX_NUM_CHARS; + matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS; + if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) { + matchfooter = getState().getLengthTreeTable()[getChmSection() + .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)]; + if (matchfooter >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS) { + x = ChmConstants.LZX_LENGTH_TABLEBITS; + do { + x++; + matchfooter <<= 1; + matchfooter += getChmSection().checkBit(x); + } while ((matchfooter = getState().getLengthTreeTable()[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS); + } + getChmSection().getSyncBits( + getState().getLengthTreeLengtsTable()[matchfooter]); + matchlen += matchfooter; + } + matchlen += ChmConstants.LZX_MIN_MATCH; + // shorter than 2 + matchoffset = s >>> 3; + if (matchoffset > 2) { + if (matchoffset != 3) { // should get other bits to retrieve + // offset + extra = ChmConstants.EXTRA_BITS[matchoffset]; + long l = getChmSection().getSyncBits(extra); + matchoffset = (int) (ChmConstants.POSITION_BASE[matchoffset] - 2 + l); + } else { + matchoffset = 1; + } + getState().setR2(getState().getR1()); + getState().setR1(getState().getR0()); + getState().setR0(matchoffset); + } else if (matchoffset == 0) { + matchoffset = (int) getState().getR0(); + } else if (matchoffset == 1) { + matchoffset = (int) getState().getR1(); + getState().setR1(getState().getR0()); + getState().setR0(matchoffset); + } else /* match_offset == 2 */ + { + matchoffset = (int) getState().getR2(); + getState().setR2(getState().getR0()); + getState().setR0(matchoffset); + } + rundest = i; + runsrc = rundest - matchoffset; + i += (matchlen - 1); + if (i > len) + break; + if (runsrc < 0) { + if (matchlen + runsrc <= 0) { + runsrc = prevcontent.length + runsrc; + while ((matchlen-- > 0) && (prevcontent != null) + && ((runsrc + 1) > 0)) + if ((rundest < content.length) + && (runsrc < content.length)) + content[rundest++] = prevcontent[runsrc++]; + } else { + runsrc = prevcontent.length + runsrc; + while (runsrc < prevcontent.length) + if ((rundest < content.length) + && (runsrc < content.length)) + content[rundest++] = prevcontent[runsrc++]; + matchlen = matchlen + runsrc - prevcontent.length; + runsrc = 0; + while (matchlen-- > 0) + content[rundest++] = content[runsrc++]; + } + + } else { + /* copies any wrapped source data */ + while ((runsrc < 0) && (matchlen-- > 0)) { + content[rundest++] = content[(int) (runsrc + getBlockLength())]; + runsrc++; + } + /* copies match data - no worries about destination wraps */ + while (matchlen-- > 0) { + if ((rundest < content.length) + && (runsrc < content.length)) + content[rundest++] = content[runsrc++]; + } + } + } + } + setContentLength(len); + } + + private void createLengthTreeLenTable(int offset, int tablelen, + short[] pretreetable, short[] prelentable) throws TikaException { + if (prelentable == null || getChmSection() == null + || pretreetable == null || prelentable == null) + throw new ChmParsingException("is null"); + + int i = offset; // represents offset + int z, y, x;// local counters + while (i < tablelen) { + //Read HUFF sym to z + z = pretreetable[getChmSection().peekBits( + ChmConstants.LZX_PRETREE_TABLEBITS)]; + if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 1 bug, should be + // 20 + x = ChmConstants.LZX_PRETREE_TABLEBITS; + do { + x++; + z <<= 1; + z += getChmSection().checkBit(x); + } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS); + } + getChmSection().getSyncBits(prelentable[z]); + + if (z < 17) { + z = getState().getLengthTreeLengtsTable()[i] - z; + if (z < 0) + z = z + 17; + getState().getLengthTreeLengtsTable()[i] = (short) z; + i++; + } else if (z == 17) { + y = getChmSection().getSyncBits(4); + y += 4; + for (int j = 0; j < y; j++) + if (i < getState().getLengthTreeLengtsTable().length) + getState().getLengthTreeLengtsTable()[i++] = 0; + } else if (z == 18) { + y = getChmSection().getSyncBits(5); + y += 20; + for (int j = 0; j < y; j++) + //no tolerate //if (i < getState().getLengthTreeLengtsTable().length) + getState().getLengthTreeLengtsTable()[i++] = 0; + } else if (z == 19) { + y = getChmSection().getSyncBits(1); + y += 4; + z = pretreetable[getChmSection().peekBits( + ChmConstants.LZX_PRETREE_TABLEBITS)]; + if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 20 + x = ChmConstants.LZX_PRETREE_TABLEBITS;// 6 + do { + x++; + z <<= 1; + z += getChmSection().checkBit(x); + } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS);//LZX_MAINTREE_TABLEBITS); + } + getChmSection().getSyncBits(prelentable[z]); + z = getState().getLengthTreeLengtsTable()[i] - z; + if (z < 0) + z = z + 17; + for (int j = 0; j < y; j++) + getState().getLengthTreeLengtsTable()[i++] = (short) z; + } + } + } + + private void createMainTreeTable() throws TikaException { + //Read Pre Tree Table + short[] prelentable = createPreLenTable(); + short[] pretreetable = createTreeTable2(prelentable, + (1 << ChmConstants.LZX_PRETREE_TABLEBITS) + + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1), + ChmConstants.LZX_PRETREE_TABLEBITS, + ChmConstants.LZX_PRETREE_MAXSYMBOLS); + + createMainTreeLenTable(0, ChmConstants.LZX_NUM_CHARS, pretreetable, + prelentable); + + //Read Pre Tree Table + prelentable = createPreLenTable(); + pretreetable = createTreeTable2(prelentable, + (1 << ChmConstants.LZX_PRETREE_TABLEBITS) + + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1), + ChmConstants.LZX_PRETREE_TABLEBITS, + ChmConstants.LZX_PRETREE_MAXSYMBOLS); + + createMainTreeLenTable(ChmConstants.LZX_NUM_CHARS, + getState().mainTreeLengtsTable.length, pretreetable, + prelentable); + + getState().setMainTreeTable( + createTreeTable2(getState().mainTreeLengtsTable, + (1 << ChmConstants.LZX_MAINTREE_TABLEBITS) + + (ChmConstants.LZX_MAINTREE_MAXSYMBOLS << 1), + ChmConstants.LZX_MAINTREE_TABLEBITS, getState() + .getMainTreeElements())); + } + + private void createMainTreeLenTable(int offset, int tablelen, + short[] pretreetable, short[] prelentable) throws TikaException { + if (pretreetable == null) + throw new ChmParsingException("pretreetable is null"); + int i = offset; + int z, y, x; + while (i < tablelen) { + int f = getChmSection().peekBits( + ChmConstants.LZX_PRETREE_TABLEBITS); + z = pretreetable[f]; + if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) { + x = ChmConstants.LZX_PRETREE_TABLEBITS; + do { + x++; + z <<= 1; + z += getChmSection().checkBit(x); + } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS); + } + getChmSection().getSyncBits(prelentable[z]); + if (z < 17) { + z = getState().getMainTreeLengtsTable()[i] - z; + if (z < 0) + z = z + 17; + getState().mainTreeLengtsTable[i] = (short) z; + i++; + } else if (z == 17) { + y = getChmSection().getSyncBits(4); + y += 4; + for (int j = 0; j < y; j++) { + assertInRange(getState().getMainTreeLengtsTable(), i); + getState().mainTreeLengtsTable[i++] = 0; + } + } else if (z == 18) { + y = getChmSection().getSyncBits(5); + y += 20; + for (int j = 0; j < y; j++) { + assertInRange(getState().getMainTreeLengtsTable(), i); + getState().mainTreeLengtsTable[i++] = 0; + } + } else if (z == 19) { + y = getChmSection().getSyncBits(1); + y += 4; + z = pretreetable[getChmSection().peekBits( + ChmConstants.LZX_PRETREE_TABLEBITS)]; + if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) { + x = ChmConstants.LZX_PRETREE_TABLEBITS; + do { + x++; + z <<= 1; + z += getChmSection().checkBit(x); + } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS); + } + getChmSection().getSyncBits(prelentable[z]); + z = getState().mainTreeLengtsTable[i] - z; + if (z < 0) + z = z + 17; + for (int j = 0; j < y; j++) + if (i < getState().getMainTreeLengtsTable().length) + getState().mainTreeLengtsTable[i++] = (short) z; + } + } + } + + private void assertInRange(short[] array, int index) throws ChmParsingException { + if (index >= array.length) + throw new ChmParsingException(index + " is bigger than " + + array.length); + } + + private short[] createAlignedLenTable() { + int tablelen = ChmConstants.LZX_ALIGNED_NUM_ELEMENTS;//LZX_BLOCKTYPE_UNCOMPRESSED;// + int bits = ChmConstants.LZX_BLOCKTYPE_UNCOMPRESSED; + short[] tmp = new short[tablelen]; + for (int i = 0; i < tablelen; i++) { + tmp[i] = (short) getChmSection().getSyncBits(bits); + } + return tmp; + } + + private void createAlignedTreeTable() throws ChmParsingException { + getState().setAlignedLenTable(createAlignedLenTable()); + getState().setAlignedTreeTable(//setAlignedLenTable( + createTreeTable2(getState().getAlignedLenTable(), + (1 << ChmConstants.LZX_NUM_PRIMARY_LENGTHS) + + (ChmConstants.LZX_ALIGNED_MAXSYMBOLS << 1), + ChmConstants.LZX_NUM_PRIMARY_LENGTHS, + ChmConstants.LZX_ALIGNED_MAXSYMBOLS)); + } + + private short[] createTreeTable2(short[] lentable, int tablelen, int bits, + int maxsymbol) throws ChmParsingException { + short[] tmp = new short[tablelen]; + short sym; + int leaf; + int bit_num = 1; + long fill; + int pos = 0; + /* the current position in the decode table */ + long table_mask = (1 << bits); + long bit_mask = (table_mask >> 1); + long next_symbol = bit_mask; + + /* fills entries for short codes for a direct mapping */ + while (bit_num <= bits) { + for (sym = 0; sym < maxsymbol; sym++) { + if (lentable.length > sym && lentable[sym] == bit_num) { + leaf = pos; + + if ((pos += bit_mask) > table_mask) { + /* table overflow */ + throw new ChmParsingException("Table overflow"); + } + + fill = bit_mask; + while (fill-- > 0) + tmp[leaf++] = sym; + } + } + bit_mask >>= 1; + bit_num++; + } + + /* if there are any codes longer than nbits */ + if (pos != table_mask) { + /* clears the remainder of the table */ + for (leaf = pos; leaf < table_mask; leaf++) + tmp[leaf] = 0; + + /* gives ourselves room for codes to grow by up to 16 more bits */ + pos <<= 16; + table_mask <<= 16; + bit_mask = 1 << 15; + + while (bit_num <= 16) { + for (sym = 0; sym < maxsymbol; sym++) { + if ((lentable.length > sym) && (lentable[sym] == bit_num)) { + leaf = pos >> 16; + for (fill = 0; fill < bit_num - bits; fill++) { + /* + * if this path hasn't been taken yet, 'allocate' + * two entries + */ + if (tmp[leaf] == 0) { + if (((next_symbol << 1) + 1) < tmp.length) { + tmp[(int) (next_symbol << 1)] = 0; + tmp[(int) (next_symbol << 1) + 1] = 0; + tmp[leaf] = (short) next_symbol++; + } + + } + /* + * follows the path and select either left or right + * for next bit + */ + leaf = tmp[leaf] << 1; + if (((pos >> (15 - fill)) & 1) != 0) + leaf++; + } + tmp[leaf] = sym; + + if ((pos += bit_mask) > table_mask) { + /* table overflow */ + throw new ChmParsingException("Table overflow"); + } + } + } + bit_mask >>= 1; + bit_num++; + } + } + + /* is it full table? */ + if (pos == table_mask) + return tmp; + + return tmp; + } + + public byte[] getContent() { + return content; + } + + public byte[] getContent(int startOffset, int endOffset) { + return (getContent() != null) ? ChmCommons.copyOfRange(getContent(), + startOffset, endOffset) : new byte[1]; + } + + public byte[] getContent(int start) { + return (getContent() != null) ? ChmCommons.copyOfRange(getContent(), + start, getContent().length) : new byte[1]; + } + + private void setContent(int contentLength) { + this.content = new byte[contentLength]; + } + + private void checkLzxBlock(ChmLzxBlock chmPrevLzxBlock) throws TikaException { + if (chmPrevLzxBlock == null && getBlockLength() < Integer.MAX_VALUE) + setState(new ChmLzxState((int) getBlockLength())); + else + //use clone to avoid changing a cached or to be cached block + setState(chmPrevLzxBlock.getState().clone()); + } + + private boolean validateConstructorParams(int blockNumber, + byte[] dataSegment, long blockLength) throws TikaException { + int goodParameter = 0; + if (blockNumber >= 0) + ++goodParameter; + else + throw new ChmParsingException("block number should be possitive"); + if (dataSegment != null && dataSegment.length > 0) + ++goodParameter; + else + throw new ChmParsingException("data segment should not be null"); + if (blockLength > 0) + ++goodParameter; + else + throw new ChmParsingException( + "block length should be more than zero"); + return (goodParameter == 3); + } + + public int getBlockNumber() { + return block_number; + } + + private void setBlockNumber(int block_number) { + this.block_number = block_number; + } + + private long getBlockLength() { + return block_length; + } + + private void setBlockLength(long block_length) { + this.block_length = block_length; + } + + public ChmLzxState getState() { + return state; + } + + private void setState(ChmLzxState state) { + this.state = state; + } +}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,327 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.chm.lzx; + +import java.util.concurrent.CancellationException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.chm.core.ChmCommons; +import org.apache.tika.parser.chm.core.ChmCommons.IntelState; +import org.apache.tika.parser.chm.core.ChmCommons.LzxState; +import org.apache.tika.parser.chm.core.ChmConstants; +import org.apache.tika.parser.chm.exception.ChmParsingException; + +public class ChmLzxState implements Cloneable { + /* Class' members */ + private int window; /* the actual decoding window */ + private long window_size; /* window size (32Kb through 2Mb) */ + private int window_position; /* current offset within the window */ + private int main_tree_elements; /* number of main tree elements */ + private LzxState hadStarted; /* have we started decoding at all yet? */ + private int block_type; /* type of this block */ + private int block_length; /* uncompressed length of this block */ + private int block_remaining; /* uncompressed bytes still left to decode */ + private int frames_read; /* the number of CFDATA blocks processed */ + private int intel_file_size; /* magic header value used for transform */ + private long intel_current_possition; /* current offset in transform space */ + private IntelState intel_state; /* have we seen any translatable data yet? */ + private long R0; /* for the LRU offset system */ + private long R1; /* for the LRU offset system */ + private long R2; /* for the LRU offset system */ + + // Trees - PRETREE, MAINTREE, LENGTH, ALIGNED + protected short[] mainTreeLengtsTable; + protected short[] mainTreeTable; + + protected short[] lengthTreeTable; + protected short[] lengthTreeLengtsTable; + + protected short[] alignedLenTable; + protected short[] alignedTreeTable; + + @Override + public ChmLzxState clone() { + try { + ChmLzxState clone = (ChmLzxState)super.clone(); + clone.mainTreeLengtsTable = arrayClone(mainTreeLengtsTable); + clone.mainTreeTable = arrayClone(mainTreeTable); + clone.lengthTreeTable = arrayClone(lengthTreeTable); + clone.lengthTreeLengtsTable = arrayClone(lengthTreeLengtsTable); + clone.alignedLenTable = arrayClone(alignedLenTable); + clone.alignedTreeTable = arrayClone(alignedTreeTable); + return clone; + } catch (CloneNotSupportedException ex) { + return null; + } + } + + protected short[] getMainTreeTable() { + return mainTreeTable; + } + + protected short[] getAlignedTreeTable() { + return alignedTreeTable; + } + + protected void setAlignedTreeTable(short[] alignedTreeTable) { + this.alignedTreeTable = alignedTreeTable; + } + + protected short[] getLengthTreeTable() throws TikaException { + if (lengthTreeTable != null) + return this.lengthTreeTable; + else + throw new ChmParsingException("lengthTreeTable is null"); + } + + protected void setLengthTreeTable(short[] lengthTreeTable) { + this.lengthTreeTable = lengthTreeTable; + } + + protected void setMainTreeTable(short[] mainTreeTable) { + this.mainTreeTable = mainTreeTable; + } + + protected short[] getAlignedLenTable() { + return this.alignedLenTable; + } + + protected void setAlignedLenTable(short[] alignedLenTable) { + this.alignedLenTable = alignedLenTable; + } + + /** + * It suits for informative outlook + */ + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("actual decoding window:=" + getWindow() + + System.getProperty("line.separator")); + sb.append("window size (32Kb through 2Mb):=" + getWindowSize() + + System.getProperty("line.separator")); + sb.append("current offset within the window:=" + getWindowPosition() + + System.getProperty("line.separator")); + sb.append("number of main tree elements:=" + getMainTreeElements() + + System.getProperty("line.separator")); + sb.append("have we started decoding at all yet?:=" + getHadStarted() + + System.getProperty("line.separator")); + sb.append("type of this block:=" + getBlockType() + + System.getProperty("line.separator")); + sb.append("uncompressed length of this block:=" + getBlockLength() + + System.getProperty("line.separator")); + sb.append("uncompressed bytes still left to decode:=" + + getBlockRemaining() + System.getProperty("line.separator")); + sb.append("the number of CFDATA blocks processed:=" + getFramesRead() + + System.getProperty("line.separator")); + sb.append("magic header value used for transform:=" + + getIntelFileSize() + System.getProperty("line.separator")); + sb.append("current offset in transform space:=" + + getIntelCurrentPossition() + + System.getProperty("line.separator")); + sb.append("have we seen any translatable data yet?:=" + getIntelState() + + System.getProperty("line.separator")); + sb.append("R0 for the LRU offset system:=" + getR0() + + System.getProperty("line.separator")); + sb.append("R1 for the LRU offset system:=" + getR1() + + System.getProperty("line.separator")); + sb.append("R2 for the LRU offset system:=" + getR2() + + System.getProperty("line.separator")); + sb.append("main tree length:=" + getMainTreeLengtsTable().length + + System.getProperty("line.separator")); + sb.append("secondary tree length:=" + getLengthTreeLengtsTable().length + + System.getProperty("line.separator")); + return sb.toString(); + } + + public ChmLzxState(int window) throws TikaException { + if (window >= 0) { + int position_slots; + int win = ChmCommons.getWindowSize(window); + setWindowSize(1 << win); + /* LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) */ + if (win < 15 || win > 21) + throw new ChmParsingException("window less than 15 or window greater than 21"); + + /* Calculates required position slots */ + if (win == 20) + position_slots = 42; + else if (win == 21) + position_slots = 50; + else + position_slots = win << 1; + //TODO: position_slots is not used ? + setR0(1); + setR1(1); + setR2(1); + setMainTreeElements(512); + setHadStarted(LzxState.NOT_STARTED_DECODING); + setFramesRead(0); + setBlockRemaining(0); + setBlockType(ChmConstants.LZX_BLOCKTYPE_INVALID); + setIntelCurrentPossition(0); + setIntelState(IntelState.NOT_STARTED); + setWindowPosition(0); + setMainTreeLengtsTable(new short[getMainTreeElements()]); + setLengthTreeLengtsTable(new short[ChmConstants.LZX_NUM_SECONDARY_LENGTHS]); + } else + throw new CancellationException( + "window size should be more than zero"); + } + + protected void setWindow(int window) { + this.window = window; + } + + protected int getWindow() { + return window; + } + + protected void setWindowSize(long window_size) { + this.window_size = window_size; + } + + protected long getWindowSize() { + return window_size; + } + + protected void setWindowPosition(int window_position) { + this.window_position = window_position; + } + + protected int getWindowPosition() { + return window_position; + } + + protected void setMainTreeElements(int main_tree_elements) { + this.main_tree_elements = main_tree_elements; + } + + protected int getMainTreeElements() { + return main_tree_elements; + } + + protected void setHadStarted(LzxState hadStarted) { + this.hadStarted = hadStarted; + } + + protected LzxState getHadStarted() { + return hadStarted; + } + + protected void setBlockType(int block_type) { + this.block_type = block_type; + } + + public int getBlockType() { + return block_type; + } + + protected void setBlockLength(int block_length) { + this.block_length = block_length; + } + + protected int getBlockLength() { + return block_length; + } + + protected void setBlockRemaining(int block_remaining) { + this.block_remaining = block_remaining; + } + + protected int getBlockRemaining() { + return block_remaining; + } + + protected void setFramesRead(int frames_read) { + this.frames_read = frames_read; + } + + protected void increaseFramesRead() { + this.frames_read = getFramesRead() + 1; + } + + protected int getFramesRead() { + return frames_read; + } + + protected void setIntelFileSize(int intel_file_size) { + this.intel_file_size = intel_file_size; + } + + protected int getIntelFileSize() { + return intel_file_size; + } + + protected void setIntelCurrentPossition(long intel_current_possition) { + this.intel_current_possition = intel_current_possition; + } + + protected long getIntelCurrentPossition() { + return intel_current_possition; + } + + protected void setIntelState(IntelState intel_state) { + this.intel_state = intel_state; + } + + protected IntelState getIntelState() { + return intel_state; + } + + protected void setR0(long r0) { + R0 = r0; + } + + protected long getR0() { + return R0; + } + + protected void setR1(long r1) { + R1 = r1; + } + + protected long getR1() { + return R1; + } + + protected void setR2(long r2) { + R2 = r2; + } + + protected long getR2() { + return R2; + } + + public void setMainTreeLengtsTable(short[] mainTreeLengtsTable) { + this.mainTreeLengtsTable = mainTreeLengtsTable; + } + + public short[] getMainTreeLengtsTable() { + return mainTreeLengtsTable; + } + + public void setLengthTreeLengtsTable(short[] lengthTreeLengtsTable) { + this.lengthTreeLengtsTable = lengthTreeLengtsTable; + } + + public short[] getLengthTreeLengtsTable() { + return lengthTreeLengtsTable; + } + + private static short[] arrayClone(short[] a) { + return a==null ? null : (short[]) a.clone(); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.chm.lzx; + +import java.math.BigInteger; +import java.util.Arrays; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.chm.core.ChmCommons; + +public class ChmSection { + final private byte[] data; + final private byte[] prevcontent; + private int swath;// kiks + private int total;// remains + private int buffer;// val + + public ChmSection(byte[] data) throws TikaException { + this(data, null); + } + + public ChmSection(byte[] data, byte[] prevconent) throws TikaException { + ChmCommons.assertByteArrayNotNull(data); + this.data = data; + this.prevcontent = prevconent; + //setData(data); + } + + /* Utilities */ + public byte[] reverseByteOrder(byte[] toBeReversed) throws TikaException { + ChmCommons.assertByteArrayNotNull(toBeReversed); + ChmCommons.reverse(toBeReversed); + return toBeReversed; + } + + public int checkBit(int i) { + return ((getBuffer() & (1 << (getTotal() - i))) == 0) ? 0 : 1; + } + + public int getSyncBits(int bit) { + return getDesyncBits(bit, bit); + } + + public int peekBits(int bit) { + return getDesyncBits(bit, 0); + } + + private int getDesyncBits(int bit, int removeBit) { + while (getTotal() < 16) { + setBuffer((getBuffer() << 16) + unmarshalUByte() + + (unmarshalUByte() << 8)); + setTotal(getTotal() + 16); + } + int tmp = (getBuffer() >>> (getTotal() - bit)); + setTotal(getTotal() - removeBit); + setBuffer(getBuffer() - ((getBuffer() >>> getTotal()) << getTotal())); + return tmp; + } + + public int unmarshalUByte() { + return getByte() & 255; + } + + public byte getByte() { + if (getSwath() < getData().length) { + setSwath(getSwath() + 1); + return getData()[getSwath() - 1]; + } else + return 0; + } + + public int getLeft() { + return (getData().length - getSwath()); + } + + public byte[] getData() { + return data; + } + + public byte[] getPrevContent() { + return prevcontent; + } + + public BigInteger getBigInteger(int i) { + if (getData() == null) + return BigInteger.ZERO; + if (getData().length - getSwath() < i) + i = getData().length - getSwath(); + byte[] tmp = new byte[i]; + for (int j = i - 1; j >= 0; j--) { + tmp[i - j - 1] = getData()[getSwath() + j]; + } + setSwath(getSwath() + i); + return new BigInteger(tmp); + } + + public byte[] stringToAsciiBytes(String s) { + char[] c = s.toCharArray(); + byte[] byteval = new byte[c.length]; + for (int i = 0; i < c.length; i++) + byteval[i] = (byte) c[i]; + return byteval; + } + + public BigInteger unmarshalUlong() { + return getBigInteger(8); + } + + public long unmarshalUInt() { + return getBigInteger(4).longValue(); + } + + public int unmarshalInt() { + return getBigInteger(4).intValue(); + } + + public byte[] unmarshalBytes(int i) { + if (i == 0) + return new byte[1]; + byte[] t = new byte[i]; + for (int j = 0; j < i; j++) + t[j] = getData()[j + getSwath()]; + setSwath(getSwath() + i); + return t; + } + + public BigInteger getEncint() { + byte ob; + BigInteger bi = BigInteger.ZERO; + byte[] nb = new byte[1]; + while ((ob = this.getByte()) < 0) { + nb[0] = (byte) ((ob & 0x7f)); + bi = bi.shiftLeft(7).add(new BigInteger(nb)); + } + nb[0] = (byte) ((ob & 0x7f)); + bi = bi.shiftLeft(7).add(new BigInteger(nb)); + return bi; + } + + public char unmarshalUtfChar() { + byte ob; + int i = 1; + byte[] ba; + ob = this.getByte(); + if (ob < 0) { + i = 2; + while ((ob << (24 + i)) < 0) + i++; + } + ba = new byte[i]; + ba[0] = ob; + int j = 1; + while (j < i) { + ba[j] = this.getByte(); + j++; + } + i = ba.length; + if (i == 1) + return (char) ba[0]; + else { + int n; + n = ba[0] & 15; // 00001111b, gets last 4 bits + j = 1; + while (j < i) + n = (n << 6) + (ba[j++] & 63);// 00111111b,gets last 6 bits + return (char) n; + } + } + +// private void setData(byte[] data) { +// this.data = data; +// } + + public int getSwath() { + return swath; + } + + public void setSwath(int swath) { + this.swath = swath; + } + + public int getTotal() { + return total; + } + + public void setTotal(int total) { + this.total = total; + } + + private int getBuffer() { + return buffer; + } + + private void setBuffer(int buffer) { + this.buffer = buffer; + } + + /** + * @param args + * @throws TikaException + */ + public static void main(String[] args) throws TikaException { + byte[] array = { 4, 78, -67, 90, 1, -33 }; + ChmSection chmSection = new ChmSection(array); + System.out.println("before " + Arrays.toString(array)); + System.out.println("after " + Arrays.toString(chmSection.reverseByteOrder(array))); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.mbox; + +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Collections; +import java.util.Date; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.Locale; +import java.util.Map; +import java.util.Queue; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Mbox (mailbox) parser. This version extracts each mail from Mbox and uses the + * DelegatingParser to process each mail. + */ +public class MboxParser extends AbstractParser { + + public static final String MBOX_MIME_TYPE = "application/mbox"; + public static final String MBOX_RECORD_DIVIDER = "From "; + public static final int MAIL_MAX_SIZE = 50000000; + /** + * Serial version UID + */ + private static final long serialVersionUID = -1762689436731160661L; + private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("mbox")); + private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)"); + private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>"); + + private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-"; + private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from"; + private final Map<Integer, Metadata> trackingMetadata = new HashMap<Integer, Metadata>(); + private boolean tracking = false; + + public static Date parseDate(String headerContent) throws ParseException { + SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US); + return dateFormat.parse(headerContent); + } + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, TikaException, SAXException { + + EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class, + new ParsingEmbeddedDocumentExtractor(context)); + + String charsetName = "windows-1252"; + + metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE); + metadata.set(Metadata.CONTENT_ENCODING, charsetName); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + InputStreamReader isr = new InputStreamReader(stream, charsetName); + try (BufferedReader reader = new BufferedReader(isr)) { + String curLine = reader.readLine(); + int mailItem = 0; + do { + if (curLine.startsWith(MBOX_RECORD_DIVIDER)) { + Metadata mailMetadata = new Metadata(); + Queue<String> multiline = new LinkedList<String>(); + mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length())); + mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822"); + curLine = reader.readLine(); + + ByteArrayOutputStream message = new ByteArrayOutputStream(100000); + do { + if (curLine.startsWith(" ") || curLine.startsWith("\t")) { + String latestLine = multiline.poll(); + latestLine += " " + curLine.trim(); + multiline.add(latestLine); + } else { + multiline.add(curLine); + } + + message.write(curLine.getBytes(charsetName)); + message.write(0x0A); + curLine = reader.readLine(); + } + while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE); + + for (String item : multiline) { + saveHeaderInMetadata(mailMetadata, item); + } + + ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray()); + message = null; + + if (extractor.shouldParseEmbedded(mailMetadata)) { + extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true); + } + + if (tracking) { + getTrackingMetadata().put(mailItem++, mailMetadata); + } + } else { + curLine = reader.readLine(); + } + + } while (curLine != null && !Thread.currentThread().isInterrupted()); + } + + xhtml.endDocument(); + } + + public boolean isTracking() { + return tracking; + } + + public void setTracking(boolean tracking) { + this.tracking = tracking; + } + + public Map<Integer, Metadata> getTrackingMetadata() { + return trackingMetadata; + } + + private void saveHeaderInMetadata(Metadata metadata, String curLine) { + Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine); + if (!headerMatcher.matches()) { + return; // ignore malformed header lines + } + + String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT); + String headerContent = headerMatcher.group(2); + + if (headerTag.equalsIgnoreCase("From")) { + metadata.set(TikaCoreProperties.CREATOR, headerContent); + } else if (headerTag.equalsIgnoreCase("To") || headerTag.equalsIgnoreCase("Cc") + || headerTag.equalsIgnoreCase("Bcc")) { + Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent); + if (address.find()) { + metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1)); + } else if (headerContent.indexOf('@') > -1) { + metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent); + } + + String property = Metadata.MESSAGE_TO; + if (headerTag.equalsIgnoreCase("Cc")) { + property = Metadata.MESSAGE_CC; + } else if (headerTag.equalsIgnoreCase("Bcc")) { + property = Metadata.MESSAGE_BCC; + } + metadata.add(property, headerContent); + } else if (headerTag.equalsIgnoreCase("Subject")) { + metadata.add(Metadata.SUBJECT, headerContent); + } else if (headerTag.equalsIgnoreCase("Date")) { + try { + Date date = parseDate(headerContent); + metadata.set(TikaCoreProperties.CREATED, date); + } catch (ParseException e) { + // ignoring date because format was not understood + } + } else if (headerTag.equalsIgnoreCase("Message-Id")) { + metadata.set(TikaCoreProperties.IDENTIFIER, headerContent); + } else if (headerTag.equalsIgnoreCase("In-Reply-To")) { + metadata.set(TikaCoreProperties.RELATION, headerContent); + } else if (headerTag.equalsIgnoreCase("Content-Type")) { + // TODO - key off content-type in headers to + // set mapping to use for content and convert if necessary. + + metadata.add(Metadata.CONTENT_TYPE, headerContent); + metadata.set(TikaCoreProperties.FORMAT, headerContent); + } else { + metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent); + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.mbox; + +import static java.lang.String.valueOf; +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Collections.singleton; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.Set; + +import com.pff.PSTAttachment; +import com.pff.PSTFile; +import com.pff.PSTFolder; +import com.pff.PSTMessage; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +/** + * Parser for MS Outlook PST email storage files + */ +public class OutlookPSTParser extends AbstractParser { + + private static final long serialVersionUID = 620998217748364063L; + + public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst"); + private static final Set<MediaType> SUPPORTED_TYPES = singleton(MS_OUTLOOK_PST_MIMETYPE); + + private static AttributesImpl createAttribute(String attName, String attValue) { + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", attName, attName, "CDATA", attValue); + return attributes; + } + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + + // Use the delegate parser to parse the contained document + EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class, + new ParsingEmbeddedDocumentExtractor(context)); + + metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString()); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + TikaInputStream in = TikaInputStream.get(stream); + PSTFile pstFile = null; + try { + pstFile = new PSTFile(in.getFile().getPath()); + metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length())); + boolean isValid = pstFile.getFileHandle().getFD().valid(); + metadata.set("isValid", valueOf(isValid)); + if (isValid) { + parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor); + } + } catch (Exception e) { + throw new TikaException(e.getMessage(), e); + } finally { + if (pstFile != null && pstFile.getFileHandle() != null) { + try { + pstFile.getFileHandle().close(); + } catch (IOException e) { + //swallow closing exception + } + } + } + + xhtml.endDocument(); + } + + private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, EmbeddedDocumentExtractor embeddedExtractor) + throws Exception { + if (pstFolder.getContentCount() > 0) { + PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild(); + while (pstMail != null) { + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId()); + handler.startElement("div", attributes); + handler.element("h1", pstMail.getSubject()); + + parserMailItem(handler, pstMail, embeddedExtractor); + parseMailAttachments(handler, pstMail, embeddedExtractor); + + handler.endElement("div"); + + pstMail = (PSTMessage) pstFolder.getNextChild(); + } + } + + if (pstFolder.hasSubfolders()) { + for (PSTFolder pstSubFolder : pstFolder.getSubFolders()) { + handler.startElement("div", createAttribute("class", "email-folder")); + handler.element("h1", pstSubFolder.getDisplayName()); + parseFolder(handler, pstSubFolder, embeddedExtractor); + handler.endElement("div"); + } + } + } + + private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException { + Metadata mailMetadata = new Metadata(); + mailMetadata.set(Metadata.RESOURCE_NAME_KEY, pstMail.getInternetMessageId()); + mailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pstMail.getInternetMessageId()); + mailMetadata.set(TikaCoreProperties.IDENTIFIER, pstMail.getInternetMessageId()); + mailMetadata.set(TikaCoreProperties.TITLE, pstMail.getSubject()); + mailMetadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName()); + mailMetadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName()); + mailMetadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime()); + mailMetadata.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime()); + mailMetadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment()); + mailMetadata.set("descriptorNodeId", valueOf(pstMail.getDescriptorNodeId())); + mailMetadata.set("senderEmailAddress", pstMail.getSenderEmailAddress()); + mailMetadata.set("recipients", pstMail.getRecipientsString()); + mailMetadata.set("displayTo", pstMail.getDisplayTo()); + mailMetadata.set("displayCC", pstMail.getDisplayCC()); + mailMetadata.set("displayBCC", pstMail.getDisplayBCC()); + mailMetadata.set("importance", valueOf(pstMail.getImportance())); + mailMetadata.set("priority", valueOf(pstMail.getPriority())); + mailMetadata.set("flagged", valueOf(pstMail.isFlagged())); + + byte[] mailContent = pstMail.getBody().getBytes(UTF_8); + embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true); + } + + private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, EmbeddedDocumentExtractor embeddedExtractor) + throws TikaException { + int numberOfAttachments = email.getNumberOfAttachments(); + for (int i = 0; i < numberOfAttachments; i++) { + File tempFile = null; + try { + PSTAttachment attach = email.getAttachment(i); + + // Get the filename; both long and short filenames can be used for attachments + String filename = attach.getLongFilename(); + if (filename.isEmpty()) { + filename = attach.getFilename(); + } + + xhtml.element("p", filename); + + Metadata attachMeta = new Metadata(); + attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename); + attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", filename); + xhtml.startElement("div", attributes); + if (embeddedExtractor.shouldParseEmbedded(attachMeta)) { + TemporaryResources tmp = new TemporaryResources(); + try { + TikaInputStream tis = TikaInputStream.get(attach.getFileInputStream(), tmp); + embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true); + } finally { + tmp.dispose(); + } + } + xhtml.endElement("div"); + + } catch (Exception e) { + throw new TikaException("Unable to unpack document stream", e); + } finally { + if (tempFile != null) + tempFile.delete(); + } + } + } + +}
