Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,913 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.chm.lzx; + +import java.math.BigInteger; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.chm.core.ChmCommons; +import org.apache.tika.parser.chm.core.ChmCommons.IntelState; +import org.apache.tika.parser.chm.core.ChmCommons.LzxState; +import org.apache.tika.parser.chm.core.ChmConstants; +import org.apache.tika.parser.chm.exception.ChmParsingException; + +/** + * Decompresses a chm block. 
Depending on chm block type chooses most relevant + * decompressing method. A chm block type can be as follows:</br> <li>UNDEFINED + * - no action taken, i.e. skipping the block <li>VERBATIM <li>ALIGNED_OFFSET + * <li>UNCOMPRESSED the most simplest In addition there are unknown types (4-7). + * Currently relying on previous chm block these types changing according to the + * previous chm block type. We need to invent more appropriate way to handle + * such types. + * + */ +public class ChmLzxBlock { + private int block_number; + private long block_length; + private ChmLzxState state; + private byte[] content = null; + private ChmSection chmSection = null; + private int contentLength = 0; + + // trying to find solution for bad blocks ... + private int previousBlockType = -1; + + public ChmLzxBlock(int blockNumber, byte[] dataSegment, long blockLength, + ChmLzxBlock prevBlock) throws TikaException { + try { + if (validateConstructorParams(blockNumber, dataSegment, blockLength)) { + setBlockNumber(blockNumber); + + if (prevBlock != null + && prevBlock.getState().getBlockLength() > prevBlock + .getState().getBlockRemaining()) + setChmSection(new ChmSection(dataSegment, prevBlock.getContent())); + else + setChmSection(new ChmSection(dataSegment)); + + setBlockLength(blockLength); + + // ============================================ + // we need to take care of previous context + // ============================================ + checkLzxBlock(prevBlock); + if (prevBlock == null + || blockLength < (int) getBlockLength()) { + setContent((int) getBlockLength()); + } + else { + setContent((int) blockLength); + } + + if (prevBlock != null && prevBlock.getState() != null) + previousBlockType = prevBlock.getState().getBlockType(); + + extractContent(); + } else + throw new TikaException("Check your chm lzx block parameters"); + } catch (TikaException e) { + throw e; + } + } + + protected int getContentLength() { + return contentLength; + } + + protected void setContentLength(int 
contentLength) { + this.contentLength = contentLength; + } + + private ChmSection getChmSection() { + return chmSection; + } + + private void setChmSection(ChmSection chmSection) { + this.chmSection = chmSection; + } + + private void assertStateNotNull() throws TikaException { + if (getState() == null) + throw new ChmParsingException("state is null"); + } + + private void extractContent() throws TikaException { + assertStateNotNull(); + if (getChmSection().getData() != null) { + boolean continueLoop = true; + while (continueLoop && getContentLength() < getBlockLength()) { + if (getState() != null && getState().getBlockRemaining() == 0) { + if (getState().getHadStarted() == LzxState.NOT_STARTED_DECODING) { + getState().setHadStarted(LzxState.STARTED_DECODING); + if (getChmSection().getSyncBits(1) == 1) { + int intelSizeTemp = (getChmSection() + .getSyncBits(16) << 16) + + getChmSection().getSyncBits(16); + if (intelSizeTemp >= 0) + getState().setIntelFileSize(intelSizeTemp); + else + getState().setIntelFileSize(0); + } + } + getState().setBlockType(getChmSection().getSyncBits(3)); + getState().setBlockLength( + (getChmSection().getSyncBits(16) << 8) + + getChmSection().getSyncBits(8)); + getState().setBlockRemaining(getState().getBlockLength()); + + // ---------------------------------------- + // Trying to handle 3 - 7 block types + // ---------------------------------------- + if (getState().getBlockType() > 3) { + if (previousBlockType >= 0 && previousBlockType < 3) + getState().setBlockType(previousBlockType); + } + + switch (getState().getBlockType()) { + case ChmCommons.ALIGNED_OFFSET: + createAlignedTreeTable(); + //fall through + case ChmCommons.VERBATIM: + /* Creates mainTreeTable */ + createMainTreeTable(); + createLengthTreeTable(); + if (getState().getMainTreeLengtsTable()[0xe8] != 0) + getState().setIntelState(IntelState.STARTED); + break; + case ChmCommons.UNCOMPRESSED: + getState().setIntelState(IntelState.STARTED); + if (getChmSection().getTotal() > 
16) + getChmSection().setSwath( + getChmSection().getSwath() - 1); + getState().setR0( + (new BigInteger(getChmSection() + .reverseByteOrder( + getChmSection().unmarshalBytes( + 4))).longValue())); + getState().setR1( + (new BigInteger(getChmSection() + .reverseByteOrder( + getChmSection().unmarshalBytes( + 4))).longValue())); + getState().setR2( + (new BigInteger(getChmSection() + .reverseByteOrder( + getChmSection().unmarshalBytes( + 4))).longValue())); + break; + default: + break; + } + } //end of if BlockRemaining == 0 + + int tempLen; + + if (getContentLength() + getState().getBlockRemaining() > getBlockLength()) { + getState().setBlockRemaining( + getContentLength() + getState().getBlockRemaining() + - (int) getBlockLength()); + tempLen = (int) getBlockLength(); + } else { + tempLen = getContentLength() + + getState().getBlockRemaining(); + getState().setBlockRemaining(0); + } + + int lastLength = getContentLength(); + switch (getState().getBlockType()) { + case ChmCommons.ALIGNED_OFFSET: + // if(prevblock.lzxState.length>prevblock.lzxState.remaining) + decompressAlignedBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());// prevcontext + break; + case ChmCommons.VERBATIM: + decompressVerbatimBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent()); + break; + case ChmCommons.UNCOMPRESSED: + decompressUncompressedBlock(tempLen, getChmSection().getPrevContent() == null ? 
getChmSection().getData() : getChmSection().getPrevContent()); + break; + } + getState().increaseFramesRead(); + if ((getState().getFramesRead() < 32768) + && getState().getIntelFileSize() != 0) + intelE8Decoding(); + + continueLoop = getContentLength() > lastLength; + } + } + } + + protected void intelE8Decoding() { + if (getBlockLength() <= ChmConstants.LZX_PRETREE_TABLEBITS + || (getState().getIntelState() == IntelState.NOT_STARTED)) { + getState().setBlockRemaining( + getState().getBlockRemaining() - (int) getBlockLength()); + } else { + long curpos = getState().getBlockRemaining(); + getState().setBlockRemaining( + getState().getBlockRemaining() - (int) getBlockLength()); + int i = 0; + while (i < getBlockLength() - 10) { + if (content[i] != 0xe8) { + i++; + continue; + } + byte[] b = new byte[4]; + b[0] = getContent()[i + 3]; + b[1] = getContent()[i + 2]; + b[2] = getContent()[i + 1]; + b[3] = getContent()[i + 0]; + long absoff = (new BigInteger(b)).longValue(); + if ((absoff >= -curpos) + && (absoff < getState().getIntelFileSize())) { + long reloff = (absoff >= 0) ? 
absoff - curpos : absoff + + getState().getIntelFileSize(); + getContent()[i + 0] = (byte) reloff; + getContent()[i + 1] = (byte) (reloff >>> 8); + getContent()[i + 2] = (byte) (reloff >>> 16); + getContent()[i + 3] = (byte) (reloff >>> 24); + } + i += 4; + curpos += 5; + } + } + } + + private short[] createPreLenTable() { + short[] tmp = new short[ChmConstants.LZX_PRETREE_MAXSYMBOLS]; + for (int i = 0; i < ChmConstants.LZX_PRETREE_MAXSYMBOLS; i++) { + tmp[i] = (short) getChmSection().getSyncBits( + ChmConstants.LZX_PRETREE_NUM_ELEMENTS_BITS); + } + return tmp; + } + + private void createLengthTreeTable() throws TikaException { + //Read Pre Tree Table + short[] prelentable = createPreLenTable(); + + if (prelentable == null) { + throw new ChmParsingException("pretreetable is null"); + } + + short[] pretreetable = createTreeTable2(prelentable, + (1 << ChmConstants.LZX_PRETREE_TABLEBITS) + + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1), + ChmConstants.LZX_PRETREE_TABLEBITS, + ChmConstants.LZX_PRETREE_MAXSYMBOLS); + + if (pretreetable == null) { + throw new ChmParsingException("pretreetable is null"); + } + + //Build Length Tree + createLengthTreeLenTable(0, ChmConstants.LZX_NUM_SECONDARY_LENGTHS, + pretreetable, prelentable); + + getState().setLengthTreeTable( + createTreeTable2(getState().getLengthTreeLengtsTable(), + (1 << ChmConstants.LZX_LENGTH_TABLEBITS) + + (ChmConstants.LZX_LENGTH_MAXSYMBOLS << 1), + ChmConstants.LZX_LENGTH_TABLEBITS, + ChmConstants.LZX_NUM_SECONDARY_LENGTHS)); + } + + private void decompressUncompressedBlock(int len, byte[] prevcontent) { + if (getContentLength() + getState().getBlockRemaining() <= getBlockLength()) { + for (int i = getContentLength(); i < (getContentLength() + getState() + .getBlockRemaining()); i++) + content[i] = getChmSection().getByte(); + + setContentLength(getContentLength() + + getState().getBlockRemaining()); + getState().setBlockRemaining(0); + } else { + for (int i = getContentLength(); i < getBlockLength(); i++) + 
content[i] = getChmSection().getByte(); + getState().setBlockRemaining( + (int) getBlockLength() - getContentLength());// = blockLen - + // contentlen; + setContentLength((int) getBlockLength()); + } + } + + private void decompressAlignedBlock(int len, byte[] prevcontent) throws TikaException { + + if ((getChmSection() == null) || (getState() == null) + || (getState().getMainTreeTable() == null)) + throw new ChmParsingException("chm section is null"); + + short s; + int x, i, border; + int matchlen = 0, matchfooter = 0, extra, rundest, runsrc; + int matchoffset = 0; + for (i = getContentLength(); i < len; i++) { + /* new code */ + //read huffman tree from main tree + border = getChmSection().peekBits( + ChmConstants.LZX_MAINTREE_TABLEBITS); + if (border >= getState().mainTreeTable.length) + throw new ChmParsingException("error decompressing aligned block."); + //break; + /* end new code */ + s = getState().mainTreeTable[getChmSection().peekBits( + ChmConstants.LZX_MAINTREE_TABLEBITS)]; + if (s >= getState().getMainTreeElements()) { + x = ChmConstants.LZX_MAINTREE_TABLEBITS; + do { + x++; + s <<= 1; + s += getChmSection().checkBit(x); + } while ((s = getState().mainTreeTable[s]) >= getState() + .getMainTreeElements()); + } + //System.out.printf("%d,", s); + //?getChmSection().getSyncBits(getState().mainTreeTable[s]); + getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]); + if (s < ChmConstants.LZX_NUM_CHARS) { + content[i] = (byte) s; + } else { + s -= ChmConstants.LZX_NUM_CHARS; + matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS; + if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) { + matchfooter = getState().lengthTreeTable[getChmSection() + .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)];//.LZX_MAINTREE_TABLEBITS)]; + if (matchfooter >= ChmConstants.LZX_LENGTH_MAXSYMBOLS/*?LZX_LENGTH_TABLEBITS*/) { + x = ChmConstants.LZX_LENGTH_TABLEBITS; + do { + x++; + matchfooter <<= 1; + matchfooter += getChmSection().checkBit(x); + } while ((matchfooter 
= getState().lengthTreeTable[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS); + } + getChmSection().getSyncBits( + getState().lengthTreeLengtsTable[matchfooter]); + matchlen += matchfooter; + } + matchlen += ChmConstants.LZX_MIN_MATCH; + matchoffset = s >>> 3; + if (matchoffset > 2) { + extra = ChmConstants.EXTRA_BITS[matchoffset]; + matchoffset = (ChmConstants.POSITION_BASE[matchoffset] - 2); + if (extra > 3) { + extra -= 3; + long verbatim_bits = getChmSection().getSyncBits(extra); + matchoffset += (verbatim_bits << 3); + //READ HUFF SYM in Aligned Tree + int aligned_bits = getChmSection().peekBits( + ChmConstants.LZX_NUM_PRIMARY_LENGTHS); + int t = getState().getAlignedTreeTable()[aligned_bits]; + if (t >= getState().getMainTreeElements()) { + x = ChmConstants.LZX_ALIGNED_TABLEBITS; //?LZX_MAINTREE_TABLEBITS; //?LZX_ALIGNED_TABLEBITS + do { + x++; + t <<= 1; + t += getChmSection().checkBit(x); + } while ((t = getState().getAlignedTreeTable()[t]) >= getState() + .getMainTreeElements()); + } + getChmSection().getSyncBits( + getState().getAlignedLenTable()[t]); + matchoffset += t; + } else if (extra == 3) { + int g = getChmSection().peekBits( + ChmConstants.LZX_NUM_PRIMARY_LENGTHS); + int t = getState().getAlignedTreeTable()[g]; + if (t >= getState().getMainTreeElements()) { + x = ChmConstants.LZX_ALIGNED_TABLEBITS; //?LZX_MAINTREE_TABLEBITS; + do { + x++; + t <<= 1; + t += getChmSection().checkBit(x); + } while ((t = getState().getAlignedTreeTable()[t]) >= getState() + .getMainTreeElements()); + } + getChmSection().getSyncBits( + getState().getAlignedLenTable()[t]); + matchoffset += t; + } else if (extra > 0) { + long l = getChmSection().getSyncBits(extra); + matchoffset += l; + } else + matchoffset = 1; + getState().setR2(getState().getR1()); + getState().setR1(getState().getR0()); + getState().setR0(matchoffset); + } else if (matchoffset == 0) { + matchoffset = (int) getState().getR0(); + } else if (matchoffset == 1) { + matchoffset = (int) 
getState().getR1(); + getState().setR1(getState().getR0()); + getState().setR0(matchoffset); + } else /** match_offset == 2 */ + { + matchoffset = (int) getState().getR2(); + getState().setR2(getState().getR0()); + getState().setR0(matchoffset); + } + rundest = i; + runsrc = rundest - matchoffset; + i += (matchlen - 1); + if (i > len) + break; + + if (runsrc < 0) { + if (matchlen + runsrc <= 0) { + runsrc = prevcontent.length + runsrc; + while (matchlen-- > 0) + content[rundest++] = prevcontent[runsrc++]; + } else { + runsrc = prevcontent.length + runsrc; + while (runsrc < prevcontent.length) + content[rundest++] = prevcontent[runsrc++]; + matchlen = matchlen + runsrc - prevcontent.length; + runsrc = 0; + while (matchlen-- > 0) + content[rundest++] = content[runsrc++]; + } + + } else { + /* copies any wrappes around source data */ + while ((runsrc < 0) && (matchlen-- > 0)) { + content[rundest++] = content[(int) (runsrc + getBlockLength())]; + runsrc++; + } + /* copies match data - no worries about destination wraps */ + while (matchlen-- > 0) + content[rundest++] = content[runsrc++]; + } + } + } + setContentLength(len); + } + + private void assertShortArrayNotNull(short[] array) throws TikaException { + if (array == null) + throw new ChmParsingException("short[] is null"); + } + + private void decompressVerbatimBlock(int len, byte[] prevcontent) throws TikaException { + short s; + int x, i; + int matchlen = 0, matchfooter = 0, extra, rundest, runsrc; + int matchoffset = 0; + for (i = getContentLength(); i < len; i++) { + int f = getChmSection().peekBits( + ChmConstants.LZX_MAINTREE_TABLEBITS); + assertShortArrayNotNull(getState().getMainTreeTable()); + s = getState().getMainTreeTable()[f]; + if (s >= ChmConstants.LZX_MAIN_MAXSYMBOLS) { + x = ChmConstants.LZX_MAINTREE_TABLEBITS; + do { + x++; + s <<= 1; + s += getChmSection().checkBit(x); + } while ((s = getState().getMainTreeTable()[s]) >= ChmConstants.LZX_MAIN_MAXSYMBOLS); + } + 
getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]); + if (s < ChmConstants.LZX_NUM_CHARS) { + content[i] = (byte) s; + } else { + s -= ChmConstants.LZX_NUM_CHARS; + matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS; + if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) { + matchfooter = getState().getLengthTreeTable()[getChmSection() + .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)]; + if (matchfooter >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS) { + x = ChmConstants.LZX_LENGTH_TABLEBITS; + do { + x++; + matchfooter <<= 1; + matchfooter += getChmSection().checkBit(x); + } while ((matchfooter = getState().getLengthTreeTable()[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS); + } + getChmSection().getSyncBits( + getState().getLengthTreeLengtsTable()[matchfooter]); + matchlen += matchfooter; + } + matchlen += ChmConstants.LZX_MIN_MATCH; + // shorter than 2 + matchoffset = s >>> 3; + if (matchoffset > 2) { + if (matchoffset != 3) { // should get other bits to retrieve + // offset + extra = ChmConstants.EXTRA_BITS[matchoffset]; + long l = getChmSection().getSyncBits(extra); + matchoffset = (int) (ChmConstants.POSITION_BASE[matchoffset] - 2 + l); + } else { + matchoffset = 1; + } + getState().setR2(getState().getR1()); + getState().setR1(getState().getR0()); + getState().setR0(matchoffset); + } else if (matchoffset == 0) { + matchoffset = (int) getState().getR0(); + } else if (matchoffset == 1) { + matchoffset = (int) getState().getR1(); + getState().setR1(getState().getR0()); + getState().setR0(matchoffset); + } else /* match_offset == 2 */ + { + matchoffset = (int) getState().getR2(); + getState().setR2(getState().getR0()); + getState().setR0(matchoffset); + } + rundest = i; + runsrc = rundest - matchoffset; + i += (matchlen - 1); + if (i > len) + break; + if (runsrc < 0) { + if (matchlen + runsrc <= 0) { + runsrc = prevcontent.length + runsrc; + while ((matchlen-- > 0) && (prevcontent != null) + && ((runsrc + 1) > 0)) + if ((rundest < 
content.length) + && (runsrc < content.length)) + content[rundest++] = prevcontent[runsrc++]; + } else { + runsrc = prevcontent.length + runsrc; + while (runsrc < prevcontent.length) + if ((rundest < content.length) + && (runsrc < content.length)) + content[rundest++] = prevcontent[runsrc++]; + matchlen = matchlen + runsrc - prevcontent.length; + runsrc = 0; + while (matchlen-- > 0) + content[rundest++] = content[runsrc++]; + } + + } else { + /* copies any wrapped source data */ + while ((runsrc < 0) && (matchlen-- > 0)) { + content[rundest++] = content[(int) (runsrc + getBlockLength())]; + runsrc++; + } + /* copies match data - no worries about destination wraps */ + while (matchlen-- > 0) { + if ((rundest < content.length) + && (runsrc < content.length)) + content[rundest++] = content[runsrc++]; + } + } + } + } + setContentLength(len); + } + + private void createLengthTreeLenTable(int offset, int tablelen, + short[] pretreetable, short[] prelentable) throws TikaException { + if (prelentable == null || getChmSection() == null + || pretreetable == null || prelentable == null) + throw new ChmParsingException("is null"); + + int i = offset; // represents offset + int z, y, x;// local counters + while (i < tablelen) { + //Read HUFF sym to z + z = pretreetable[getChmSection().peekBits( + ChmConstants.LZX_PRETREE_TABLEBITS)]; + if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 1 bug, should be + // 20 + x = ChmConstants.LZX_PRETREE_TABLEBITS; + do { + x++; + z <<= 1; + z += getChmSection().checkBit(x); + } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS); + } + getChmSection().getSyncBits(prelentable[z]); + + if (z < 17) { + z = getState().getLengthTreeLengtsTable()[i] - z; + if (z < 0) + z = z + 17; + getState().getLengthTreeLengtsTable()[i] = (short) z; + i++; + } else if (z == 17) { + y = getChmSection().getSyncBits(4); + y += 4; + for (int j = 0; j < y; j++) + if (i < getState().getLengthTreeLengtsTable().length) + 
getState().getLengthTreeLengtsTable()[i++] = 0; + } else if (z == 18) { + y = getChmSection().getSyncBits(5); + y += 20; + for (int j = 0; j < y; j++) + //no tolerate //if (i < getState().getLengthTreeLengtsTable().length) + getState().getLengthTreeLengtsTable()[i++] = 0; + } else if (z == 19) { + y = getChmSection().getSyncBits(1); + y += 4; + z = pretreetable[getChmSection().peekBits( + ChmConstants.LZX_PRETREE_TABLEBITS)]; + if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 20 + x = ChmConstants.LZX_PRETREE_TABLEBITS;// 6 + do { + x++; + z <<= 1; + z += getChmSection().checkBit(x); + } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS);//LZX_MAINTREE_TABLEBITS); + } + getChmSection().getSyncBits(prelentable[z]); + z = getState().getLengthTreeLengtsTable()[i] - z; + if (z < 0) + z = z + 17; + for (int j = 0; j < y; j++) + getState().getLengthTreeLengtsTable()[i++] = (short) z; + } + } + } + + private void createMainTreeTable() throws TikaException { + //Read Pre Tree Table + short[] prelentable = createPreLenTable(); + short[] pretreetable = createTreeTable2(prelentable, + (1 << ChmConstants.LZX_PRETREE_TABLEBITS) + + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1), + ChmConstants.LZX_PRETREE_TABLEBITS, + ChmConstants.LZX_PRETREE_MAXSYMBOLS); + + createMainTreeLenTable(0, ChmConstants.LZX_NUM_CHARS, pretreetable, + prelentable); + + //Read Pre Tree Table + prelentable = createPreLenTable(); + pretreetable = createTreeTable2(prelentable, + (1 << ChmConstants.LZX_PRETREE_TABLEBITS) + + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1), + ChmConstants.LZX_PRETREE_TABLEBITS, + ChmConstants.LZX_PRETREE_MAXSYMBOLS); + + createMainTreeLenTable(ChmConstants.LZX_NUM_CHARS, + getState().mainTreeLengtsTable.length, pretreetable, + prelentable); + + getState().setMainTreeTable( + createTreeTable2(getState().mainTreeLengtsTable, + (1 << ChmConstants.LZX_MAINTREE_TABLEBITS) + + (ChmConstants.LZX_MAINTREE_MAXSYMBOLS << 1), + ChmConstants.LZX_MAINTREE_TABLEBITS, 
getState() + .getMainTreeElements())); + } + + private void createMainTreeLenTable(int offset, int tablelen, + short[] pretreetable, short[] prelentable) throws TikaException { + if (pretreetable == null) + throw new ChmParsingException("pretreetable is null"); + int i = offset; + int z, y, x; + while (i < tablelen) { + int f = getChmSection().peekBits( + ChmConstants.LZX_PRETREE_TABLEBITS); + z = pretreetable[f]; + if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) { + x = ChmConstants.LZX_PRETREE_TABLEBITS; + do { + x++; + z <<= 1; + z += getChmSection().checkBit(x); + } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS); + } + getChmSection().getSyncBits(prelentable[z]); + if (z < 17) { + z = getState().getMainTreeLengtsTable()[i] - z; + if (z < 0) + z = z + 17; + getState().mainTreeLengtsTable[i] = (short) z; + i++; + } else if (z == 17) { + y = getChmSection().getSyncBits(4); + y += 4; + for (int j = 0; j < y; j++) { + assertInRange(getState().getMainTreeLengtsTable(), i); + getState().mainTreeLengtsTable[i++] = 0; + } + } else if (z == 18) { + y = getChmSection().getSyncBits(5); + y += 20; + for (int j = 0; j < y; j++) { + assertInRange(getState().getMainTreeLengtsTable(), i); + getState().mainTreeLengtsTable[i++] = 0; + } + } else if (z == 19) { + y = getChmSection().getSyncBits(1); + y += 4; + z = pretreetable[getChmSection().peekBits( + ChmConstants.LZX_PRETREE_TABLEBITS)]; + if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) { + x = ChmConstants.LZX_PRETREE_TABLEBITS; + do { + x++; + z <<= 1; + z += getChmSection().checkBit(x); + } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS); + } + getChmSection().getSyncBits(prelentable[z]); + z = getState().mainTreeLengtsTable[i] - z; + if (z < 0) + z = z + 17; + for (int j = 0; j < y; j++) + if (i < getState().getMainTreeLengtsTable().length) + getState().mainTreeLengtsTable[i++] = (short) z; + } + } + } + + private void assertInRange(short[] array, int index) throws ChmParsingException 
{ + if (index >= array.length) + throw new ChmParsingException(index + " is bigger than " + + array.length); + } + + private short[] createAlignedLenTable() { + int tablelen = ChmConstants.LZX_ALIGNED_NUM_ELEMENTS;//LZX_BLOCKTYPE_UNCOMPRESSED;// + int bits = ChmConstants.LZX_BLOCKTYPE_UNCOMPRESSED; + short[] tmp = new short[tablelen]; + for (int i = 0; i < tablelen; i++) { + tmp[i] = (short) getChmSection().getSyncBits(bits); + } + return tmp; + } + + private void createAlignedTreeTable() throws ChmParsingException { + getState().setAlignedLenTable(createAlignedLenTable()); + getState().setAlignedTreeTable(//setAlignedLenTable( + createTreeTable2(getState().getAlignedLenTable(), + (1 << ChmConstants.LZX_NUM_PRIMARY_LENGTHS) + + (ChmConstants.LZX_ALIGNED_MAXSYMBOLS << 1), + ChmConstants.LZX_NUM_PRIMARY_LENGTHS, + ChmConstants.LZX_ALIGNED_MAXSYMBOLS)); + } + + private short[] createTreeTable2(short[] lentable, int tablelen, int bits, + int maxsymbol) throws ChmParsingException { + short[] tmp = new short[tablelen]; + short sym; + int leaf; + int bit_num = 1; + long fill; + int pos = 0; + /* the current position in the decode table */ + long table_mask = (1 << bits); + long bit_mask = (table_mask >> 1); + long next_symbol = bit_mask; + + /* fills entries for short codes for a direct mapping */ + while (bit_num <= bits) { + for (sym = 0; sym < maxsymbol; sym++) { + if (lentable.length > sym && lentable[sym] == bit_num) { + leaf = pos; + + if ((pos += bit_mask) > table_mask) { + /* table overflow */ + throw new ChmParsingException("Table overflow"); + } + + fill = bit_mask; + while (fill-- > 0) + tmp[leaf++] = sym; + } + } + bit_mask >>= 1; + bit_num++; + } + + /* if there are any codes longer than nbits */ + if (pos != table_mask) { + /* clears the remainder of the table */ + for (leaf = pos; leaf < table_mask; leaf++) + tmp[leaf] = 0; + + /* gives ourselves room for codes to grow by up to 16 more bits */ + pos <<= 16; + table_mask <<= 16; + bit_mask = 1 << 15; + + 
while (bit_num <= 16) { + for (sym = 0; sym < maxsymbol; sym++) { + if ((lentable.length > sym) && (lentable[sym] == bit_num)) { + leaf = pos >> 16; + for (fill = 0; fill < bit_num - bits; fill++) { + /* + * if this path hasn't been taken yet, 'allocate' + * two entries + */ + if (tmp[leaf] == 0) { + if (((next_symbol << 1) + 1) < tmp.length) { + tmp[(int) (next_symbol << 1)] = 0; + tmp[(int) (next_symbol << 1) + 1] = 0; + tmp[leaf] = (short) next_symbol++; + } + + } + /* + * follows the path and select either left or right + * for next bit + */ + leaf = tmp[leaf] << 1; + if (((pos >> (15 - fill)) & 1) != 0) + leaf++; + } + tmp[leaf] = sym; + + if ((pos += bit_mask) > table_mask) { + /* table overflow */ + throw new ChmParsingException("Table overflow"); + } + } + } + bit_mask >>= 1; + bit_num++; + } + } + + /* is it full table? */ + if (pos == table_mask) + return tmp; + + return tmp; + } + + public byte[] getContent() { + return content; + } + + public byte[] getContent(int startOffset, int endOffset) { + return (getContent() != null) ? ChmCommons.copyOfRange(getContent(), + startOffset, endOffset) : new byte[1]; + } + + public byte[] getContent(int start) { + return (getContent() != null) ? 
ChmCommons.copyOfRange(getContent(), + start, getContent().length) : new byte[1]; + } + + private void setContent(int contentLength) { + this.content = new byte[contentLength]; + } + + private void checkLzxBlock(ChmLzxBlock chmPrevLzxBlock) throws TikaException { + if (chmPrevLzxBlock == null && getBlockLength() < Integer.MAX_VALUE) + setState(new ChmLzxState((int) getBlockLength())); + else + //use clone to avoid changing a cached or to be cached block + setState(chmPrevLzxBlock.getState().clone()); + } + + private boolean validateConstructorParams(int blockNumber, + byte[] dataSegment, long blockLength) throws TikaException { + int goodParameter = 0; + if (blockNumber >= 0) + ++goodParameter; + else + throw new ChmParsingException("block number should be possitive"); + if (dataSegment != null && dataSegment.length > 0) + ++goodParameter; + else + throw new ChmParsingException("data segment should not be null"); + if (blockLength > 0) + ++goodParameter; + else + throw new ChmParsingException( + "block length should be more than zero"); + return (goodParameter == 3); + } + + public int getBlockNumber() { + return block_number; + } + + private void setBlockNumber(int block_number) { + this.block_number = block_number; + } + + private long getBlockLength() { + return block_length; + } + + private void setBlockLength(long block_length) { + this.block_length = block_length; + } + + public ChmLzxState getState() { + return state; + } + + private void setState(ChmLzxState state) { + this.state = state; + } +}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,327 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/
package org.apache.tika.parser.chm.lzx;

import java.util.concurrent.CancellationException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.chm.core.ChmCommons;
import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
import org.apache.tika.parser.chm.core.ChmConstants;
import org.apache.tika.parser.chm.exception.ChmParsingException;

/**
 * Mutable bookkeeping for LZX decoding: window parameters, information about
 * the block currently being decoded, the R0-R2 offset registers and the
 * tables backing the main, length and aligned trees.
 *
 * <p>{@link #clone()} produces an independent copy: every table array is
 * duplicated, so changes to the copy's tables do not affect the original.
 */
public class ChmLzxState implements Cloneable {
    /* Class' members */
    private int window; /* the actual decoding window (not set by the constructor) */
    private long window_size; /* window size (32Kb through 2Mb) */
    private int window_position; /* current offset within the window */
    private int main_tree_elements; /* number of main tree elements */
    private LzxState hadStarted; /* have we started decoding at all yet? */
    private int block_type; /* type of this block */
    private int block_length; /* uncompressed length of this block */
    private int block_remaining; /* uncompressed bytes still left to decode */
    private int frames_read; /* the number of CFDATA blocks processed */
    private int intel_file_size; /* magic header value used for transform */
    private long intel_current_possition; /* current offset in transform space */
    private IntelState intel_state; /* have we seen any translatable data yet? */
    private long R0; /* for the LRU offset system */
    private long R1; /* for the LRU offset system */
    private long R2; /* for the LRU offset system */

    // Trees - PRETREE, MAINTREE, LENGTH, ALIGNED
    protected short[] mainTreeLengtsTable;
    protected short[] mainTreeTable;

    protected short[] lengthTreeTable;
    protected short[] lengthTreeLengtsTable;

    protected short[] alignedLenTable;
    protected short[] alignedTreeTable;

    /**
     * Creates a state for the given window parameter.
     *
     * <p>The parameter is translated by {@code ChmCommons.getWindowSize(int)}
     * into a power-of-two exponent, which must lie in the range 15..21.
     * The R0-R2 registers start at 1, the main tree gets 512 elements, and
     * both "lengths" tables are allocated (zero filled).
     *
     * @param window window parameter, must not be negative
     * @throws CancellationException if {@code window} is negative
     * @throws ChmParsingException if the resulting exponent is outside 15..21
     * @throws TikaException declared for callers; see the two cases above
     */
    public ChmLzxState(int window) throws TikaException {
        if (window < 0) {
            throw new CancellationException(
                    "window size should be more than zero");
        }
        int win = ChmCommons.getWindowSize(window);
        /* LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) */
        if (win < 15 || win > 21) {
            throw new ChmParsingException("window less than 15 or window greater than 21");
        }
        // Only publish the window size once it is known to be valid.
        setWindowSize(1 << win);

        setR0(1);
        setR1(1);
        setR2(1);
        setMainTreeElements(512);
        setHadStarted(LzxState.NOT_STARTED_DECODING);
        setFramesRead(0);
        setBlockRemaining(0);
        setBlockType(ChmConstants.LZX_BLOCKTYPE_INVALID);
        setIntelCurrentPossition(0);
        setIntelState(IntelState.NOT_STARTED);
        setWindowPosition(0);
        setMainTreeLengtsTable(new short[getMainTreeElements()]);
        setLengthTreeLengtsTable(new short[ChmConstants.LZX_NUM_SECONDARY_LENGTHS]);
    }

    /**
     * Returns a copy of this state whose six table arrays are themselves
     * copies (or {@code null} where the original table is {@code null}).
     *
     * @return an independent copy, never {@code null}
     */
    @Override
    public ChmLzxState clone() {
        try {
            ChmLzxState clone = (ChmLzxState) super.clone();
            clone.mainTreeLengtsTable = arrayClone(mainTreeLengtsTable);
            clone.mainTreeTable = arrayClone(mainTreeTable);
            clone.lengthTreeTable = arrayClone(lengthTreeTable);
            clone.lengthTreeLengtsTable = arrayClone(lengthTreeLengtsTable);
            clone.alignedLenTable = arrayClone(alignedLenTable);
            clone.alignedTreeTable = arrayClone(alignedTreeTable);
            return clone;
        } catch (CloneNotSupportedException ex) {
            // Cannot happen: this class implements Cloneable. Fail loudly
            // instead of handing a null state back to the caller.
            throw new AssertionError(ex);
        }
    }

    /** @return the main tree table, possibly {@code null} */
    protected short[] getMainTreeTable() {
        return mainTreeTable;
    }

    /** @return the aligned tree table, possibly {@code null} */
    protected short[] getAlignedTreeTable() {
        return alignedTreeTable;
    }

    protected void setAlignedTreeTable(short[] alignedTreeTable) {
        this.alignedTreeTable = alignedTreeTable;
    }

    /**
     * @return the length tree table
     * @throws ChmParsingException if the table has not been set
     * @throws TikaException declared for callers; see above
     */
    protected short[] getLengthTreeTable() throws TikaException {
        if (lengthTreeTable == null) {
            throw new ChmParsingException("lengthTreeTable is null");
        }
        return this.lengthTreeTable;
    }

    protected void setLengthTreeTable(short[] lengthTreeTable) {
        this.lengthTreeTable = lengthTreeTable;
    }

    protected void setMainTreeTable(short[] mainTreeTable) {
        this.mainTreeTable = mainTreeTable;
    }

    /** @return the aligned lengths table, possibly {@code null} */
    protected short[] getAlignedLenTable() {
        return this.alignedLenTable;
    }

    protected void setAlignedLenTable(short[] alignedLenTable) {
        this.alignedLenTable = alignedLenTable;
    }

    /**
     * Multi-line dump of every scalar field plus the lengths of the two
     * "lengths" tables, one {@code label:=value} pair per line.
     * A table that is {@code null} is reported as {@code null} rather than
     * causing a {@link NullPointerException}.
     */
    @Override
    public String toString() {
        String eol = System.getProperty("line.separator");
        StringBuilder sb = new StringBuilder();
        sb.append("actual decoding window:=").append(getWindow()).append(eol);
        sb.append("window size (32Kb through 2Mb):=").append(getWindowSize()).append(eol);
        sb.append("current offset within the window:=").append(getWindowPosition()).append(eol);
        sb.append("number of main tree elements:=").append(getMainTreeElements()).append(eol);
        sb.append("have we started decoding at all yet?:=").append(getHadStarted()).append(eol);
        sb.append("type of this block:=").append(getBlockType()).append(eol);
        sb.append("uncompressed length of this block:=").append(getBlockLength()).append(eol);
        sb.append("uncompressed bytes still left to decode:=").append(getBlockRemaining()).append(eol);
        sb.append("the number of CFDATA blocks processed:=").append(getFramesRead()).append(eol);
        sb.append("magic header value used for transform:=").append(getIntelFileSize()).append(eol);
        sb.append("current offset in transform space:=").append(getIntelCurrentPossition()).append(eol);
        sb.append("have we seen any translatable data yet?:=").append(getIntelState()).append(eol);
        sb.append("R0 for the LRU offset system:=").append(getR0()).append(eol);
        sb.append("R1 for the LRU offset system:=").append(getR1()).append(eol);
        sb.append("R2 for the LRU offset system:=").append(getR2()).append(eol);
        sb.append("main tree length:=").append(lengthOf(getMainTreeLengtsTable())).append(eol);
        sb.append("secondary tree length:=").append(lengthOf(getLengthTreeLengtsTable())).append(eol);
        return sb.toString();
    }

    protected void setWindow(int window) {
        this.window = window;
    }

    protected int getWindow() {
        return window;
    }

    protected void setWindowSize(long window_size) {
        this.window_size = window_size;
    }

    protected long getWindowSize() {
        return window_size;
    }

    protected void setWindowPosition(int window_position) {
        this.window_position = window_position;
    }

    protected int getWindowPosition() {
        return window_position;
    }

    protected void setMainTreeElements(int main_tree_elements) {
        this.main_tree_elements = main_tree_elements;
    }

    protected int getMainTreeElements() {
        return main_tree_elements;
    }

    protected void setHadStarted(LzxState hadStarted) {
        this.hadStarted = hadStarted;
    }

    protected LzxState getHadStarted() {
        return hadStarted;
    }

    protected void setBlockType(int block_type) {
        this.block_type = block_type;
    }

    public int getBlockType() {
        return block_type;
    }

    protected void setBlockLength(int block_length) {
        this.block_length = block_length;
    }

    protected int getBlockLength() {
        return block_length;
    }

    protected void setBlockRemaining(int block_remaining) {
        this.block_remaining = block_remaining;
    }

    protected int getBlockRemaining() {
        return block_remaining;
    }

    protected void setFramesRead(int frames_read) {
        this.frames_read = frames_read;
    }

    /** Adds one to the frames-read counter. */
    protected void increaseFramesRead() {
        this.frames_read = getFramesRead() + 1;
    }

    protected int getFramesRead() {
        return frames_read;
    }

    protected void setIntelFileSize(int intel_file_size) {
        this.intel_file_size = intel_file_size;
    }

    protected int getIntelFileSize() {
        return intel_file_size;
    }

    protected void setIntelCurrentPossition(long intel_current_possition) {
        this.intel_current_possition = intel_current_possition;
    }

    protected long getIntelCurrentPossition() {
        return intel_current_possition;
    }

    protected void setIntelState(IntelState intel_state) {
        this.intel_state = intel_state;
    }

    protected IntelState getIntelState() {
        return intel_state;
    }

    protected void setR0(long r0) {
        R0 = r0;
    }

    protected long getR0() {
        return R0;
    }

    protected void setR1(long r1) {
        R1 = r1;
    }

    protected long getR1() {
        return R1;
    }

    protected void setR2(long r2) {
        R2 = r2;
    }

    protected long getR2() {
        return R2;
    }

    public void setMainTreeLengtsTable(short[] mainTreeLengtsTable) {
        this.mainTreeLengtsTable = mainTreeLengtsTable;
    }

    public short[] getMainTreeLengtsTable() {
        return mainTreeLengtsTable;
    }

    public void setLengthTreeLengtsTable(short[] lengthTreeLengtsTable) {
        this.lengthTreeLengtsTable = lengthTreeLengtsTable;
    }

    public short[] getLengthTreeLengtsTable() {
        return lengthTreeLengtsTable;
    }

    /** Copies an array, passing {@code null} through unchanged. */
    private static short[] arrayClone(short[] a) {
        return a == null ? null : a.clone();
    }

    /** Length of a table as text, or {@code "null"} when the table is absent. */
    private static String lengthOf(short[] table) {
        return table == null ? "null" : Integer.toString(table.length);
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
*/
package org.apache.tika.parser.chm.lzx;

import java.math.BigInteger;
import java.util.Arrays;

import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.chm.core.ChmCommons;

/**
 * Sequential reader over a byte array that offers both byte-oriented
 * accessors (little-endian integers, variable-length integers, UTF-8 style
 * characters) and a bit-oriented accessor that refills an internal bit
 * buffer 16 bits at a time, low byte first.
 *
 * <p>Reading a single byte past the end of the data yields {@code 0} rather
 * than an exception (see {@link #getByte()}).
 */
public class ChmSection {
    private final byte[] data;
    private final byte[] prevcontent;
    private int swath; // read position within data
    private int total; // number of valid bits currently held in buffer
    private int buffer; // bit buffer

    /**
     * @param data bytes to read from, must not be {@code null}
     * @throws TikaException if {@code ChmCommons.assertByteArrayNotNull} rejects {@code data}
     */
    public ChmSection(byte[] data) throws TikaException {
        this(data, null);
    }

    /**
     * @param data bytes to read from, must not be {@code null}
     * @param prevconent content of a previous section, may be {@code null};
     *        only stored and returned by {@link #getPrevContent()}
     * @throws TikaException if {@code ChmCommons.assertByteArrayNotNull} rejects {@code data}
     */
    public ChmSection(byte[] data, byte[] prevconent) throws TikaException {
        ChmCommons.assertByteArrayNotNull(data);
        this.data = data;
        this.prevcontent = prevconent;
    }

    /* Utilities */

    /**
     * Passes the array to {@code ChmCommons.reverse} and returns the same
     * array instance.
     *
     * @param toBeReversed array to reverse, must not be {@code null}
     * @return {@code toBeReversed} itself
     * @throws TikaException if the array is rejected by the null check
     */
    public byte[] reverseByteOrder(byte[] toBeReversed) throws TikaException {
        ChmCommons.assertByteArrayNotNull(toBeReversed);
        ChmCommons.reverse(toBeReversed);
        return toBeReversed;
    }

    /**
     * Tests a single bit of the bit buffer.
     *
     * @param i bit position counted down from the current bit total
     * @return 1 if bit {@code (total - i)} of the buffer is set, else 0
     */
    public int checkBit(int i) {
        return ((getBuffer() & (1 << (getTotal() - i))) == 0) ? 0 : 1;
    }

    /** Reads {@code bit} bits from the bit buffer and consumes them. */
    public int getSyncBits(int bit) {
        return getDesyncBits(bit, bit);
    }

    /** Reads {@code bit} bits from the bit buffer without consuming them. */
    public int peekBits(int bit) {
        return getDesyncBits(bit, 0);
    }

    /**
     * Returns the top {@code bit} bits of the bit buffer and then drops
     * {@code removeBit} bits from it. The buffer is topped up in 16-bit
     * units (two bytes, low byte first) until it holds at least 16 bits.
     */
    private int getDesyncBits(int bit, int removeBit) {
        while (getTotal() < 16) {
            setBuffer((getBuffer() << 16) + unmarshalUByte()
                    + (unmarshalUByte() << 8));
            setTotal(getTotal() + 16);
        }
        int tmp = (getBuffer() >>> (getTotal() - bit));
        setTotal(getTotal() - removeBit);
        // Clear everything above the remaining 'total' low bits.
        setBuffer(getBuffer() - ((getBuffer() >>> getTotal()) << getTotal()));
        return tmp;
    }

    /** @return the next byte as a value in 0..255 (0 once the data is exhausted) */
    public int unmarshalUByte() {
        return getByte() & 255;
    }

    /**
     * @return the next byte and advances the position; {@code 0} without
     *         advancing when the position is at or past the end of the data
     */
    public byte getByte() {
        if (getSwath() < getData().length) {
            setSwath(getSwath() + 1);
            return getData()[getSwath() - 1];
        } else
            return 0;
    }

    /** @return number of bytes between the current position and the end of the data */
    public int getLeft() {
        return (getData().length - getSwath());
    }

    public byte[] getData() {
        return data;
    }

    public byte[] getPrevContent() {
        return prevcontent;
    }

    /**
     * Reads up to {@code i} bytes as a little-endian two's-complement
     * integer and advances the position by the number of bytes consumed.
     * When fewer than {@code i} bytes remain, only the remaining bytes are
     * used.
     *
     * @param i number of bytes requested
     * @return the decoded value; {@link BigInteger#ZERO} when no bytes are
     *         available (previously this case failed with a
     *         {@link NumberFormatException} from the BigInteger constructor)
     */
    public BigInteger getBigInteger(int i) {
        if (getData() == null)
            return BigInteger.ZERO;
        if (getData().length - getSwath() < i)
            i = getData().length - getSwath();
        if (i <= 0) {
            // Nothing left to read: new BigInteger(new byte[0]) would throw.
            return BigInteger.ZERO;
        }
        byte[] tmp = new byte[i];
        // Reverse into big-endian order, which is what BigInteger expects.
        for (int j = i - 1; j >= 0; j--) {
            tmp[i - j - 1] = getData()[getSwath() + j];
        }
        setSwath(getSwath() + i);
        return new BigInteger(tmp);
    }

    /**
     * Narrows every char of {@code s} to a byte by plain cast.
     *
     * @param s text to convert
     * @return one byte per char of {@code s}
     */
    public byte[] stringToAsciiBytes(String s) {
        char[] c = s.toCharArray();
        byte[] byteval = new byte[c.length];
        for (int i = 0; i < c.length; i++)
            byteval[i] = (byte) c[i];
        return byteval;
    }

    /** Reads 8 bytes via {@link #getBigInteger(int)}. */
    public BigInteger unmarshalUlong() {
        return getBigInteger(8);
    }

    /** Reads 4 bytes via {@link #getBigInteger(int)} and returns them as a long. */
    public long unmarshalUInt() {
        return getBigInteger(4).longValue();
    }

    /** Reads 4 bytes via {@link #getBigInteger(int)} and returns them as an int. */
    public int unmarshalInt() {
        return getBigInteger(4).intValue();
    }

    /**
     * Copies the next {@code i} bytes and advances the position.
     *
     * @param i number of bytes to copy
     * @return the copied bytes; note that a request for 0 bytes returns a
     *         one-element array (existing behaviour, kept for callers)
     * @throws ArrayIndexOutOfBoundsException if fewer than {@code i} bytes remain
     */
    public byte[] unmarshalBytes(int i) {
        if (i == 0)
            return new byte[1];
        byte[] t = new byte[i];
        for (int j = 0; j < i; j++)
            t[j] = getData()[j + getSwath()];
        setSwath(getSwath() + i);
        return t;
    }

    /**
     * Reads a variable-length integer: each byte contributes its low 7
     * bits, most significant group first, and a set high bit means another
     * byte follows.
     */
    public BigInteger getEncint() {
        byte ob;
        BigInteger bi = BigInteger.ZERO;
        while ((ob = this.getByte()) < 0) {
            bi = bi.shiftLeft(7).add(BigInteger.valueOf(ob & 0x7f));
        }
        bi = bi.shiftLeft(7).add(BigInteger.valueOf(ob & 0x7f));
        return bi;
    }

    /**
     * Reads one multi-byte encoded character. The number of leading one
     * bits in the first byte determines how many bytes are consumed; the
     * value is assembled from the low 4 bits of the first byte and the low
     * 6 bits of each following byte.
     */
    public char unmarshalUtfChar() {
        byte ob;
        int i = 1;
        byte[] ba;
        ob = this.getByte();
        if (ob < 0) {
            i = 2;
            // Count leading one bits. The i < 8 bound is essential: for a
            // lead byte of 0xFF the promoted int is all ones and the shift
            // count wraps modulo 32, so the unbounded loop never ended.
            while (i < 8 && (ob << (24 + i)) < 0)
                i++;
        }
        ba = new byte[i];
        ba[0] = ob;
        int j = 1;
        while (j < i) {
            ba[j] = this.getByte();
            j++;
        }
        i = ba.length;
        if (i == 1)
            return (char) ba[0];
        else {
            int n;
            n = ba[0] & 15; // 00001111b, gets last 4 bits
            j = 1;
            while (j < i)
                n = (n << 6) + (ba[j++] & 63);// 00111111b,gets last 6 bits
            return (char) n;
        }
    }

    public int getSwath() {
        return swath;
    }

    public void setSwath(int swath) {
        this.swath = swath;
    }

    public int getTotal() {
        return total;
    }

    public void setTotal(int total) {
        this.total = total;
    }

    private int getBuffer() {
        return buffer;
    }

    private void setBuffer(int buffer) {
        this.buffer = buffer;
    }

    /**
     * Small manual demo: prints an array before and after
     * {@link #reverseByteOrder(byte[])}.
     *
     * @param args ignored
     * @throws TikaException
     */
    public static void main(String[] args) throws TikaException {
        byte[] array = { 4, 78, -67, 90, 1, -33 };
        ChmSection chmSection = new ChmSection(array);
        System.out.println("before " + Arrays.toString(array));
        System.out.println("after " + Arrays.toString(chmSection.reverseByteOrder(array)));
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,269 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.poi.hwpf.converter.NumberFormatter;

/**
 * Base class for list-numbering support: keeps one
 * {@link ParagraphLevelCounter} per list and optional per-list override
 * tuples, and renders the label text ("1.", "a)", a bullet, ...) for each
 * numbered paragraph.
 */
public abstract class AbstractListManager {
    private final static String BULLET = "\u00b7";

    protected Map<Integer, ParagraphLevelCounter> listLevelMap = new HashMap<Integer, ParagraphLevelCounter>();
    protected Map<Integer, LevelTuple[]> overrideTupleMap = new HashMap<Integer, LevelTuple[]>();

    //helper class that is docx/doc format agnostic
    protected class ParagraphLevelCounter {

        //counts can == 0 if the format is decimal, make sure
        //that flag values are < 0
        private static final int NOT_SEEN_YET = -1;
        private static final int FIRST_SKIPPED = -2;
        private final LevelTuple[] levelTuples;
        final Pattern LEVEL_INTERPOLATOR = Pattern.compile("%(\\d+)");
        private List<Integer> counts = new ArrayList<Integer>();
        private int lastLevel = -1;

        public ParagraphLevelCounter(LevelTuple[] levelTuples) {
            this.levelTuples = levelTuples;
        }

        public int getNumberOfLevels() {
            return levelTuples.length;
        }

        /**
         * Apply this to every numbered paragraph in order.
         *
         * <p>Levels between the previously used level and
         * {@code levelNumber} that have no count yet are initialised to
         * their start value; deeper levels are reset as configured.
         *
         * @param levelNumber         level number that is being incremented
         * @param overrideLevelTuples per-level overrides, may be {@code null}
         * @return the new formatted number string for this level
         */
        public String incrementLevel(int levelNumber, LevelTuple[] overrideLevelTuples) {

            for (int i = lastLevel + 1; i < levelNumber; i++) {
                if (i >= counts.size()) {
                    int val = getStart(i, overrideLevelTuples);
                    counts.add(i, val);
                } else {
                    int count = counts.get(i);
                    if (count == NOT_SEEN_YET) {
                        count = getStart(i, overrideLevelTuples);
                        counts.set(i, count);
                    }
                }
            }

            if (levelNumber < counts.size()) {
                resetAfter(levelNumber, overrideLevelTuples);
                int count = counts.get(levelNumber);
                if (count == NOT_SEEN_YET) {
                    count = getStart(levelNumber, overrideLevelTuples);
                } else {
                    count++;
                }
                counts.set(levelNumber, count);
                lastLevel = levelNumber;
                return format(levelNumber, overrideLevelTuples);
            }

            counts.add(levelNumber, getStart(levelNumber, overrideLevelTuples));
            lastLevel = levelNumber;
            return format(levelNumber, overrideLevelTuples);
        }

        /**
         * Looks up the override tuple for a level without risking an
         * out-of-range index or a null array.
         *
         * @return the override tuple, or {@code null} if there is none
         */
        private LevelTuple overrideFor(LevelTuple[] overrideLevelTuples, int level) {
            if (overrideLevelTuples == null || level < 0 || level >= overrideLevelTuples.length) {
                return null;
            }
            return overrideLevelTuples[level];
        }

        /**
         * @param level which level to format
         * @return the string that represents the number and the surrounding text for this paragraph
         */
        private String format(int level, LevelTuple[] overrideLevelTuples) {
            if (level < 0 || level >= levelTuples.length) {
                //log?
                return "";
            }
            LevelTuple base = levelTuples[level];
            LevelTuple override = overrideFor(overrideLevelTuples, level);
            boolean isLegal = (override != null) ? override.isLegal : base.isLegal;
            //short circuit bullet
            String numFmt = getNumFormat(level, isLegal, overrideLevelTuples);
            if ("bullet".equals(numFmt)) {
                return BULLET + " ";
            }

            String lvlText = (override == null || override.lvlText == null) ?
                    base.lvlText : override.lvlText;
            StringBuilder sb = new StringBuilder();
            Matcher m = LEVEL_INTERPOLATOR.matcher(lvlText);
            int last = 0;
            while (m.find()) {
                sb.append(lvlText, last, m.start());
                int lvlNum = -1;
                try {
                    lvlNum = Integer.parseInt(m.group(1));
                } catch (NumberFormatException ignored) {
                    // digits too large for an int: keep -1, which formatNum
                    // treats as an unknown level
                }
                //need to subtract 1 because, e.g. %1 is the format
                //for the number at array offset 0
                sb.append(formatNum(lvlNum - 1, isLegal, overrideLevelTuples));
                last = m.end();
            }
            sb.append(lvlText.substring(last));
            if (sb.length() > 0) {
                //TODO: add in character after number
                sb.append(" ");
            }
            return sb.toString();
        }

        //actual level number; can return empty string if numberformatter fails
        private String formatNum(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) {

            int numFmtStyle = 0;
            String numFmt = getNumFormat(lvlNum, isLegal, overrideLevelTuples);

            int count = getCount(lvlNum);
            if (count < 0) {
                count = 1;
            }
            if ("lowerLetter".equals(numFmt)) {
                numFmtStyle = 4;
            } else if ("lowerRoman".equals(numFmt)) {
                numFmtStyle = 2;
            } else if ("decimal".equals(numFmt)) {
                numFmtStyle = 0;
            } else if ("upperLetter".equals(numFmt)) {
                numFmtStyle = 3;
            } else if ("upperRoman".equals(numFmt)) {
                numFmtStyle = 1;
            } else if ("bullet".equals(numFmt)) {
                return "";
                //not yet handled by NumberFormatter...TODO: add to NumberFormatter?
            } else if ("ordinal".equals(numFmt)) {
                return ordinalize(count);
            } else if ("decimalZero".equals(numFmt)) {
                // decimalZero pads only single-digit numbers: 01..09, 10, 11, ...
                String number = NumberFormatter.getNumber(count, 0);
                return (count < 10) ? "0" + number : number;
            } else if ("none".equals(numFmt)) {
                return "";
            }
            try {
                return NumberFormatter.getNumber(count, numFmtStyle);
            } catch (IllegalArgumentException e) {
                return "";
            }
        }

        /**
         * English ordinal for a count: 1st, 2nd, 3rd, 4th, ... with the
         * teens handled correctly (11th, 12th, 13th, 111th, ...).
         */
        private String ordinalize(int count) {
            //this is only good for locale == English
            String countString = Integer.toString(count);
            int lastTwo = Math.abs(count % 100);
            if (lastTwo >= 11 && lastTwo <= 13) {
                return countString + "th";
            }
            switch (Math.abs(count % 10)) {
                case 1:
                    return countString + "st";
                case 2:
                    return countString + "nd";
                case 3:
                    return countString + "rd";
                default:
                    return countString + "th";
            }
        }

        /**
         * @return the number format name for a level: always "decimal" for
         *         unknown levels or when {@code isLegal} is set, otherwise
         *         the override's format if present, else the base format
         */
        private String getNumFormat(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) {
            if (lvlNum < 0 || lvlNum >= levelTuples.length) {
                //log?
                return "decimal";
            }
            if (isLegal) {
                //return decimal no matter the level if isLegal is true
                return "decimal";
            }
            LevelTuple override = overrideFor(overrideLevelTuples, lvlNum);
            return (override == null || override.numFmt == null) ?
                    levelTuples[lvlNum].numFmt : override.numFmt;
        }

        /** @return the stored count for a level, or 1 for a level that has no entry */
        private int getCount(int lvlNum) {
            if (lvlNum < 0 || lvlNum >= counts.size()) {
                //log?
                return 1;
            }
            return counts.get(lvlNum);
        }

        /**
         * Marks levels deeper than {@code startlevelNumber} as not seen,
         * honouring each level's restart setting.
         */
        private void resetAfter(int startlevelNumber, LevelTuple[] overrideLevelTuples) {
            for (int levelNumber = startlevelNumber + 1; levelNumber < counts.size(); levelNumber++) {
                int cnt = counts.get(levelNumber);
                if (cnt == NOT_SEEN_YET) {
                    //do nothing
                } else if (cnt == FIRST_SKIPPED) {
                    //do nothing
                } else if (levelTuples.length > levelNumber) {
                    //never reset if restarts == 0
                    LevelTuple override = overrideFor(overrideLevelTuples, levelNumber);
                    int restart = (override == null || override.restart < 0) ?
                            levelTuples[levelNumber].restart : override.restart;
                    if (restart == 0) {
                        // NOTE(review): this also stops processing deeper
                        // levels; kept as in the original - confirm intent.
                        return;
                    } else if (restart == -1 ||
                            startlevelNumber <= restart - 1) {
                        counts.set(levelNumber, NOT_SEEN_YET);
                    } else {
                        //do nothing/don't reset
                    }
                } else {
                    //reset!
                    counts.set(levelNumber, NOT_SEEN_YET);
                }
            }
        }

        /**
         * @return the first number for a level: 1 for unknown levels,
         *         otherwise the override's non-negative start value if
         *         present, else the base start value
         */
        private int getStart(int levelNumber, LevelTuple[] overrideLevelTuples) {
            if (levelNumber >= levelTuples.length) {
                return 1;
            }
            LevelTuple override = overrideFor(overrideLevelTuples, levelNumber);
            return (override == null || override.start < 0) ?
                    levelTuples[levelNumber].start : override.start;
        }
    }

    /** Immutable numbering settings for a single list level. */
    protected class LevelTuple {
        private final int start;
        private final int restart;
        private final String lvlText;
        private final String numFmt;
        private final boolean isLegal;

        /** Decimal level starting at 1 with restart -1 and the given text pattern. */
        public LevelTuple(String lvlText) {
            this.lvlText = lvlText;
            start = 1;
            restart = -1;
            numFmt = "decimal";
            isLegal = false;
        }

        public LevelTuple(int start, int restart, String lvlText, String numFmt, boolean isLegal) {
            this.start = start;
            this.restart = restart;
            this.lvlText = lvlText;
            this.numFmt = numFmt;
            this.isLegal = isLegal;
        }
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,234 @@ +/* + * Licensed to the Apache Software Foundation (ASF)
under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft;

import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
import org.apache.tika.parser.pkg.ZipContainerDetector;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;

/**
 * Shared plumbing for extractors that work on POIFS (OLE2) containers:
 * resolves the embedded-document extractor, detector, MIME repository and
 * password provider from the {@link ParseContext}, and hands embedded
 * resources / embedded office documents to the embedded-document extractor.
 */
abstract class AbstractPOIFSExtractor {
    private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class);
    private final EmbeddedDocumentExtractor extractor;
    private PasswordProvider passwordProvider;
    private TikaConfig tikaConfig;
    private MimeTypes mimeTypes;
    private Detector detector;
    private Metadata metadata;

    protected AbstractPOIFSExtractor(ParseContext context) {
        this(context, null);
    }

    /**
     * @param context  source of the optional collaborators; when it carries
     *                 no {@link EmbeddedDocumentExtractor}, a
     *                 {@link ParsingEmbeddedDocumentExtractor} is used
     * @param metadata metadata of the document being parsed, handed to the
     *                 password provider; may be {@code null}
     */
    protected AbstractPOIFSExtractor(ParseContext context, Metadata metadata) {
        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);

        if (ex == null) {
            this.extractor = new ParsingEmbeddedDocumentExtractor(context);
        } else {
            this.extractor = ex;
        }

        this.passwordProvider = context.get(PasswordProvider.class);
        this.tikaConfig = context.get(TikaConfig.class);
        this.mimeTypes = context.get(MimeTypes.class);
        this.detector = context.get(Detector.class);
        this.metadata = metadata;
    }

    // Note - these cache, but avoid creating the default TikaConfig if not needed
    protected TikaConfig getTikaConfig() {
        if (tikaConfig == null) {
            tikaConfig = TikaConfig.getDefaultConfig();
        }
        return tikaConfig;
    }

    protected Detector getDetector() {
        if (detector != null) return detector;

        detector = getTikaConfig().getDetector();
        return detector;
    }

    protected MimeTypes getMimeTypes() {
        if (mimeTypes != null) return mimeTypes;

        mimeTypes = getTikaConfig().getMimeRepository();
        return mimeTypes;
    }

    /**
     * Returns the password to be used for this file, or null
     * if no / default password should be used
     */
    protected String getPassword() {
        if (passwordProvider != null) {
            return passwordProvider.getPassword(metadata);
        }
        return null;
    }

    /**
     * Builds metadata for an embedded resource from the non-null arguments
     * and, if the extractor accepts it, parses the resource. The resource
     * stream is always closed before returning.
     *
     * @param resource       stream of the embedded resource; closed by this method
     * @param filename       resource name, may be {@code null}
     * @param relationshipID relationship id, may be {@code null}
     * @param mediaType      content type, may be {@code null}
     * @param xhtml          handler receiving the output
     * @param outputHtml     passed through to the extractor
     */
    protected void handleEmbeddedResource(TikaInputStream resource, String filename,
                                          String relationshipID, String mediaType, XHTMLContentHandler xhtml,
                                          boolean outputHtml)
            throws IOException, SAXException, TikaException {
        try {
            // Named to avoid shadowing the parent-document 'metadata' field.
            Metadata embeddedMetadata = new Metadata();
            if (filename != null) {
                embeddedMetadata.set(Metadata.TIKA_MIME_FILE, filename);
                embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, filename);
            }
            if (relationshipID != null) {
                embeddedMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, relationshipID);
            }
            if (mediaType != null) {
                embeddedMetadata.set(Metadata.CONTENT_TYPE, mediaType);
            }

            if (extractor.shouldParseEmbedded(embeddedMetadata)) {
                extractor.parseEmbedded(resource, xhtml, embeddedMetadata, outputHtml);
            }
        } finally {
            resource.close();
        }
    }

    /**
     * Handle an office document that's embedded at the POIFS level
     */
    protected void handleEmbeddedOfficeDoc(
            DirectoryEntry dir, XHTMLContentHandler xhtml)
            throws IOException, SAXException, TikaException {

        // Is it an embedded OLE2 document, or an embedded OOXML document?

        if (dir.hasEntry("Package")) {
            // It's OOXML (has a ZipFile):
            Entry ooxml = dir.getEntry("Package");

            try (TikaInputStream stream = TikaInputStream.get(
                    new DocumentInputStream((DocumentEntry) ooxml))) {
                ZipContainerDetector zipDetector = new ZipContainerDetector();
                MediaType type = zipDetector.detect(stream, new Metadata());
                handleEmbeddedResource(stream, null, dir.getName(), type.toString(), xhtml, true);
                return;
            }
        }

        // It's regular OLE2:

        // What kind of document is it?
        Metadata embeddedMetadata = new Metadata();
        embeddedMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
        POIFSDocumentType type = POIFSDocumentType.detectType(dir);
        TikaInputStream embedded = null;

        try {
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                try {
                    // Try to un-wrap the OLE10Native record:
                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
                    if (ole.getLabel() != null) {
                        embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
                    }
                    byte[] data = ole.getDataBuffer();
                    embedded = TikaInputStream.get(data);
                } catch (Ole10NativeException ignored) {
                    // Not a valid OLE10Native record, skip it
                } catch (Exception e) {
                    logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), e);
                }
            } else if (type == POIFSDocumentType.COMP_OBJ) {
                try {
                    // Grab the contents and process
                    DocumentEntry contentsEntry;
                    try {
                        contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
                    } catch (FileNotFoundException ioe) {
                        contentsEntry = (DocumentEntry) dir.getEntry("Contents");
                    }
                    // Read everything into memory and release the POIFS
                    // stream straight away (it was previously left open).
                    try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                        byte[] contents = new byte[contentsEntry.getSize()];
                        inp.readFully(contents);
                        embedded = TikaInputStream.get(contents);
                    }

                    // Try to work out what it is
                    MediaType mediaType = getDetector().detect(embedded, new Metadata());
                    String extension = type.getExtension();
                    try {
                        MimeType mimeType = getMimeTypes().forName(mediaType.toString());
                        extension = mimeType.getExtension();
                    } catch (MimeTypeException mte) {
                        // No details on this type are known
                    }

                    // Record what we can do about it
                    // NOTE(review): getType() looks like it yields only the
                    // top-level type rather than the full type/subtype -
                    // confirm whether mediaType.toString() was intended.
                    embeddedMetadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
                    // NOTE(review): no '.' is inserted here, unlike the
                    // fallback branch below - presumably
                    // MimeType.getExtension() already includes it; verify.
                    embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
                } catch (Exception e) {
                    throw new TikaException("Invalid embedded resource", e);
                }
            } else {
                embeddedMetadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
                embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
            }

            // Should we parse it?
            if (extractor.shouldParseEmbedded(embeddedMetadata)) {
                if (embedded == null) {
                    // Make a TikaInputStream that just
                    // passes the root directory of the
                    // embedded document, and is otherwise
                    // empty (byte[0]):
                    embedded = TikaInputStream.get(new byte[0]);
                    embedded.setOpenContainer(dir);
                }
                extractor.parseEmbedded(embedded, xhtml, embeddedMetadata, true);
            }
        } finally {
            if (embedded != null) {
                embedded.close();
            }
        }
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */
package org.apache.tika.parser.microsoft;

import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;

/**
 * A single piece of content that can write itself to the XHTML SAX event
 * stream. Tika parsers (currently just the MS Excel parser) use
 * implementations of this interface to keep track of individual pieces of
 * content before they are rendered.
 */
public interface Cell {

    /**
     * Writes this cell's content to the given XHTML SAX event stream.
     *
     * @param handler the XHTML content handler that receives the output
     * @throws SAXException if the content cannot be written to the handler
     */
    void render(XHTMLContentHandler handler) throws SAXException;

}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */
package org.apache.tika.parser.microsoft;

import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;

/**
 * Cell decorator: a {@link Cell} that wraps another cell and forwards
 * {@link #render(XHTMLContentHandler)} to it unchanged. Subclasses may
 * override {@code render} to emit additional output around the wrapped cell.
 */
public class CellDecorator implements Cell {

    /** The wrapped cell that rendering is delegated to. */
    private final Cell cell;

    /**
     * Creates a decorator around the given cell.
     *
     * @param cell the cell to delegate to; it is stored as-is without a
     *             null check, so a {@code null} value only fails later, in
     *             {@link #render(XHTMLContentHandler)}
     */
    public CellDecorator(Cell cell) {
        this.cell = cell;
    }

    /**
     * Renders the wrapped cell to the given XHTML SAX event stream.
     *
     * @param handler the XHTML content handler passed through to the wrapped cell
     * @throws SAXException if the wrapped cell's {@code render} throws it
     */
    @Override
    public void render(XHTMLContentHandler handler) throws SAXException {
        cell.render(handler);
    }

}
