Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.chm.accessor; + +import java.util.Arrays; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.chm.assertion.ChmAssert; +import org.apache.tika.parser.chm.core.ChmCommons; +import org.apache.tika.parser.chm.core.ChmConstants; +import org.apache.tika.parser.chm.exception.ChmParsingException; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * Description Note: not always exists An index chunk has the following format: + * 0000: char[4] 'PMGI' 0004: DWORD Length of quickref/free area at end of + * directory chunk 0008: Directory index entries (to quickref/free area) The + * quickref area in an PMGI is the same as in an PMGL The format of a directory + * index entry is as follows: BYTE: length of name BYTEs: name (UTF-8 encoded) + * ENCINT: directory listing chunk which starts with name Encoded Integers aka + * ENCINT An ENCINT is a variable-length integer. The high bit of each byte + * indicates "continued to the next byte". Bytes are stored most significant to + * least significant. So, for example, $EA $15 is (((0xEA&0x7F)<<7)|0x15) = + * 0x3515. 
+ * + * <p> + * Note: This class is not in use + * + * {@link http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?show-translation-form=1 } + * + * + */ +public class ChmPmgiHeader implements ChmAccessor<ChmPmgiHeader> { + private static final long serialVersionUID = -2092282339894303701L; + private byte[] signature; + private long free_space; /* 4 */ + + /* local usage */ + private int dataRemained; + private int currentPlace = 0; + + public ChmPmgiHeader() { + signature = ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8); /* 0 (PMGI) */ + } + + private int getDataRemained() { + return dataRemained; + } + + private void setDataRemained(int dataRemained) { + this.dataRemained = dataRemained; + } + + private int getCurrentPlace() { + return currentPlace; + } + + private void setCurrentPlace(int currentPlace) { + this.currentPlace = currentPlace; + } + + private void unmarshalCharArray(byte[] data, ChmPmgiHeader chmPmgiHeader, + int count) throws ChmParsingException { + int index = -1; + ChmAssert.assertByteArrayNotNull(data); + ChmAssert.assertChmAccessorNotNull(chmPmgiHeader); + ChmAssert.assertPositiveInt(count); + this.setDataRemained(data.length); + index = ChmCommons.indexOf(data, + ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8)); + + if (index >= 0) + System.arraycopy(data, index, chmPmgiHeader.getSignature(), 0, count); + else{ + //Some chm documents (actually most of them) do not contain + //PMGI header, in this case, we just notice about it. 
+ } + this.setCurrentPlace(this.getCurrentPlace() + count); + this.setDataRemained(this.getDataRemained() - count); + } + + private long unmarshalUInt32(byte[] data, long dest) throws ChmParsingException { + ChmAssert.assertByteArrayNotNull(data); + + if (4 > getDataRemained()) + throw new ChmParsingException("4 > dataLenght"); + dest = (data[this.getCurrentPlace()] & 0xff) + | (data[this.getCurrentPlace() + 1] & 0xff) << 8 + | (data[this.getCurrentPlace() + 2] & 0xff) << 16 + | (data[this.getCurrentPlace() + 3] & 0xff) << 24; + + setDataRemained(this.getDataRemained() - 4); + this.setCurrentPlace(this.getCurrentPlace() + 4); + return dest; + } + + /** + * Returns pmgi signature if exists + * + * @return signature + */ + public byte[] getSignature() { + return signature; + } + + /** + * Sets pmgi signature + * + * @param signature + */ + protected void setSignature(byte[] signature) { + this.signature = signature; + } + + /** + * Returns pmgi free space + * + * @return free_space + */ + public long getFreeSpace() { + return free_space; + } + + /** + * Sets pmgi free space + * + * @param free_space + */ + protected void setFreeSpace(long free_space) { + this.free_space = free_space; + } + + /** + * Returns textual representation of the pmgi header + */ + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("signature:=" + new String(getSignature(), UTF_8) + ", "); + sb.append("free space:=" + getFreeSpace() + + System.getProperty("line.separator")); + return sb.toString(); + } + + // @Override + public void parse(byte[] data, ChmPmgiHeader chmPmgiHeader) throws TikaException { + /* we only know how to deal with a 0x8 byte structures */ + if (data.length < ChmConstants.CHM_PMGI_LEN) + throw new TikaException("we only know how to deal with a 0x8 byte structures"); + + /* unmarshal fields */ + chmPmgiHeader.unmarshalCharArray(data, chmPmgiHeader, ChmConstants.CHM_SIGNATURE_LEN); + 
chmPmgiHeader.setFreeSpace(chmPmgiHeader.unmarshalUInt32(data, chmPmgiHeader.getFreeSpace())); + + /* check structure */ + if (!Arrays.equals(chmPmgiHeader.getSignature(), + ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8))) + throw new TikaException( + "it does not seem to be valid a PMGI signature, check ChmItsp index_root if it was -1, means no PMGI, use PMGL insted"); + + } +}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.chm.accessor; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.chm.assertion.ChmAssert; +import org.apache.tika.parser.chm.core.ChmConstants; +import org.apache.tika.parser.chm.exception.ChmParsingException; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * Description There are two types of directory chunks -- index chunks, and + * listing chunks. 
The index chunk will be omitted if there is only one listing + * chunk. A listing chunk has the following format: 0000: char[4] 'PMGL' 0004: + * DWORD Length of free space and/or quickref area at end of directory chunk + * 0008: DWORD Always 0 000C: DWORD Chunk number of previous listing chunk when + * reading directory in sequence (-1 if this is the first listing chunk) 0010: + * DWORD Chunk number of next listing chunk when reading directory in sequence + * (-1 if this is the last listing chunk) 0014: Directory listing entries (to + * quickref area) Sorted by filename; the sort is case-insensitive The quickref + * area is written backwards from the end of the chunk. One quickref entry + * exists for every n entries in the file, where n is calculated as 1 + (1 << + * quickref density). So for density = 2, n = 5 Chunklen-0002: WORD Number of + * entries in the chunk Chunklen-0004: WORD Offset of entry n from entry 0 + * Chunklen-0008: WORD Offset of entry 2n from entry 0 Chunklen-000C: WORD + * Offset of entry 3n from entry 0 ... The format of a directory listing entry + * is as follows BYTE: length of name BYTEs: name (UTF-8 encoded) ENCINT: + * content section ENCINT: offset ENCINT: length The offset is from the + * beginning of the content section the file is in, after the section has been + * decompressed (if appropriate). The length also refers to length of the file + * in the section after decompression. There are two kinds of file represented + * in the directory: user data and format related files. The files which are + * format-related have names which begin with '::', the user data files have + * names which begin with "/". 
+ * + * {@link http + * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original + * /?show-translation-form=1 } + * + * @author olegt + * + */ +public class ChmPmglHeader implements ChmAccessor<ChmPmglHeader> { + private static final long serialVersionUID = -6139486487475923593L; + private byte[] signature; + private long free_space; /* 4 */ + private long unknown_0008; /* 8 */ + private int block_prev; /* c */ + private int block_next; /* 10 */ + + /* local usage */ + private int dataRemained; + private int currentPlace = 0; + + public ChmPmglHeader() { + signature = ChmConstants.PMGL.getBytes(UTF_8); /* + * 0 + * (PMGL + * ) + */ + } + + private int getDataRemained() { + return dataRemained; + } + + private void setDataRemained(int dataRemained) { + this.dataRemained = dataRemained; + } + + private int getCurrentPlace() { + return currentPlace; + } + + private void setCurrentPlace(int currentPlace) { + this.currentPlace = currentPlace; + } + + public long getFreeSpace() { + return free_space; + } + + public void setFreeSpace(long free_space) throws TikaException { + if (free_space < 0) { + throw new TikaException("Bad PMGLheader.FreeSpace="+free_space); + } + this.free_space = free_space; + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("signatute:=" + new String(getSignature(), UTF_8) + ", "); + sb.append("free space:=" + getFreeSpace() + ", "); + sb.append("unknown0008:=" + getUnknown0008() + ", "); + sb.append("prev block:=" + getBlockPrev() + ", "); + sb.append("next block:=" + getBlockNext() + + System.getProperty("line.separator")); + return sb.toString(); + } + + protected void unmarshalCharArray(byte[] data, ChmPmglHeader chmPmglHeader, + int count) throws TikaException { + ChmAssert.assertByteArrayNotNull(data); + this.setDataRemained(data.length); + System.arraycopy(data, 0, chmPmglHeader.signature, 0, count); + this.setCurrentPlace(this.getCurrentPlace() + count); + 
this.setDataRemained(this.getDataRemained() - count); + } + + private int unmarshalInt32(byte[] data) throws TikaException { + ChmAssert.assertByteArrayNotNull(data); + int dest; + if (4 > this.getDataRemained()) + throw new TikaException("4 > dataLenght"); + dest = (data[this.getCurrentPlace()] & 0xff) + | (data[this.getCurrentPlace() + 1] & 0xff) << 8 + | (data[this.getCurrentPlace() + 2] & 0xff) << 16 + | (data[this.getCurrentPlace() + 3] & 0xff) << 24; + + this.setCurrentPlace(this.getCurrentPlace() + 4); + this.setDataRemained(this.getDataRemained() - 4); + return dest; + } + + private long unmarshalUInt32(byte[] data) throws ChmParsingException { + ChmAssert.assertByteArrayNotNull(data); + long dest; + if (4 > getDataRemained()) + throw new ChmParsingException("4 > dataLenght"); + dest = (data[this.getCurrentPlace()] & 0xff) + | (data[this.getCurrentPlace() + 1] & 0xff) << 8 + | (data[this.getCurrentPlace() + 2] & 0xff) << 16 + | (data[this.getCurrentPlace() + 3] & 0xff) << 24; + + setDataRemained(this.getDataRemained() - 4); + this.setCurrentPlace(this.getCurrentPlace() + 4); + return dest; + } + + // @Override + public void parse(byte[] data, ChmPmglHeader chmPmglHeader) throws TikaException { + if (data.length < ChmConstants.CHM_PMGL_LEN) + throw new TikaException(ChmPmglHeader.class.getName() + + " we only know how to deal with a 0x14 byte structures"); + + /* unmarshal fields */ + chmPmglHeader.unmarshalCharArray(data, chmPmglHeader, + ChmConstants.CHM_SIGNATURE_LEN); + chmPmglHeader.setFreeSpace(chmPmglHeader.unmarshalUInt32(data)); + chmPmglHeader.setUnknown0008(chmPmglHeader.unmarshalUInt32(data)); + chmPmglHeader.setBlockPrev(chmPmglHeader.unmarshalInt32(data)); + chmPmglHeader.setBlockNext(chmPmglHeader.unmarshalInt32(data)); + + /* check structure */ + if (!new String(chmPmglHeader.getSignature(), UTF_8).equals(ChmConstants.PMGL)) + throw new ChmParsingException(ChmPmglHeader.class.getName() + + " pmgl != pmgl.signature"); + } + + public byte[] 
getSignature() { + return signature; + } + + protected void setSignature(byte[] signature) { + this.signature = signature; + } + + public long getUnknown0008() { + return unknown_0008; + } + + protected void setUnknown0008(long unknown_0008) { + this.unknown_0008 = unknown_0008; + } + + public int getBlockPrev() { + return block_prev; + } + + protected void setBlockPrev(int block_prev) { + this.block_prev = block_prev; + } + + public int getBlockNext() { + return block_next; + } + + protected void setBlockNext(int block_next) { + this.block_next = block_next; + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.chm.accessor; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.chm.assertion.ChmAssert; +import org.apache.tika.parser.chm.core.ChmCommons; + +/** + * The format of a directory listing entry is as follows: BYTE: length of name + * BYTEs: name (UTF-8 encoded) ENCINT: content section ENCINT: offset ENCINT: + * length The offset is from the beginning of the content section the file is + * in, after the section has been decompressed (if appropriate). The length also + * refers to length of the file in the section after decompression. There are + * two kinds of file represented in the directory: user data and format related + * files. The files which are format-related have names which begin with '::', + * the user data files have names which begin with "/". 
+ * + */ +public class DirectoryListingEntry { + /* Length of the entry name */ + private int name_length; + /* Entry name or directory name */ + private String name; + /* Entry type */ + private ChmCommons.EntryType entryType; + /* Entry offset */ + private int offset; + /* Entry size */ + private int length; + + public DirectoryListingEntry() { + + } + + /** + * Constructs directoryListingEntry + * + * @param name_length + * int + * @param name + * String + * @param isCompressed + * ChmCommons.EntryType + * @param offset + * int + * @param length + * int + * @throws TikaException + */ + public DirectoryListingEntry(int name_length, String name, + ChmCommons.EntryType isCompressed, int offset, int length) throws TikaException { + ChmAssert.assertDirectoryListingEntry(name_length, name, isCompressed, offset, length); + setNameLength(name_length); + setName(name); + setEntryType(isCompressed); + setOffset(offset); + setLength(length); + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("name_length:=" + getNameLength() + System.getProperty("line.separator")); + sb.append("name:=" + getName() + System.getProperty("line.separator")); + sb.append("entryType:=" + getEntryType() + System.getProperty("line.separator")); + sb.append("offset:=" + getOffset() + System.getProperty("line.separator")); + sb.append("length:=" + getLength()); + return sb.toString(); + } + + /** + * Returns an entry name length + * + * @return int + */ + public int getNameLength() { + return name_length; + } + + /** + * Sets an entry name length + * + * @param name_length + * int + */ + protected void setNameLength(int name_length) { + this.name_length = name_length; + } + + /** + * Returns an entry name + * + * @return String + */ + public String getName() { + return name; + } + + /** + * Sets entry name + * + * @param name + * String + */ + protected void setName(String name) { + this.name = name; + } + + /** + * Returns ChmCommons.EntryType (COMPRESSED or 
UNCOMPRESSED) + * + * @return ChmCommons.EntryType + */ + public ChmCommons.EntryType getEntryType() { + return entryType; + } + + protected void setEntryType(ChmCommons.EntryType entryType) { + this.entryType = entryType; + } + + public int getOffset() { + return offset; + } + + protected void setOffset(int offset) { + this.offset = offset; + } + + public int getLength() { + return length; + } + + protected void setLength(int length) { + this.length = length; + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.chm.assertion; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.chm.accessor.ChmAccessor; +import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable; +import org.apache.tika.parser.chm.core.ChmCommons; +import org.apache.tika.parser.chm.exception.ChmParsingException; + +/** + * Contains chm extractor assertions + */ +public class ChmAssert { + /** + * Checks a validity of the chmBlockSegment parameters + * + * @param data + * byte[] + * @param resetTable + * ChmLzxcResetTable + * @param blockNumber + * int + * @param lzxcBlockOffset + * int + * @param lzxcBlockLength + * int + * @throws TikaException + */ + public static final void assertChmBlockSegment(byte[] data, + ChmLzxcResetTable resetTable, int blockNumber, int lzxcBlockOffset, + int lzxcBlockLength) throws TikaException { + if ((data == null)) + throw new TikaException("data[] is null"); + + if ((data.length <= 0)) + throw new TikaException("data[] length should be greater than zero"); + + if (resetTable == null) + throw new TikaException("resetTable is null"); + + if (resetTable.getBlockAddress().length <= 1) + throw new TikaException("resetTable.getBlockAddress().length should be greater than zero"); + + if (blockNumber < 0) + throw new TikaException("blockNumber should be positive number"); + + if (lzxcBlockOffset < 0) + throw new TikaException("lzxcBlockOffset should be positive number"); + + if (lzxcBlockLength < 0) + throw new TikaException("lzxcBlockLength should be positive number"); + } + + /** + * Checks if InputStream is not null + * + * @param is + * InputStream + * @throws ChmParsingException + * @throws IOException + */ + public static final void assertInputStreamNotNull(InputStream is) throws IOException { + if (is == null) + throw new IOException("input sream is null"); + } + + /** + * Checks validity of ChmAccessor parameters + * + * @param data + * @param 
chmItsfHeader + * @param count + * @throws ChmParsingException + */ + public static final void assertChmAccessorParameters(byte[] data, + ChmAccessor<?> chmAccessor, int count) throws ChmParsingException { + assertByteArrayNotNull(data); + assertChmAccessorNotNull(chmAccessor); + } + + /** + * Checks if byte[] is not null + * + * @param data + * @throws ChmParsingException + */ + public static final void assertByteArrayNotNull(byte[] data) throws ChmParsingException { + if (data == null) + throw new ChmParsingException("byte[] data is null"); + } + + /** + * Checks if ChmAccessor is not null In case of null throws exception + * + * @param ChmAccessor + * @throws ChmParsingException + */ + public static final void assertChmAccessorNotNull(ChmAccessor<?> chmAccessor) throws ChmParsingException { + if (chmAccessor == null) + throw new ChmParsingException("chm header is null"); + } + + /** + * Checks validity of the DirectoryListingEntry's parameters In case of + * invalid parameter(s) throws an exception + * + * @param name_length + * length of the chm entry name + * @param name + * chm entry name + * @param entryType + * EntryType + * @param offset + * @param length + * @throws ChmParsingException + */ + public static final void assertDirectoryListingEntry(int name_length, + String name, ChmCommons.EntryType entryType, int offset, int length) throws ChmParsingException { + if (name_length < 0) + throw new ChmParsingException("invalid name length"); + if (name == null) + throw new ChmParsingException("invalid name"); + + if ((entryType != ChmCommons.EntryType.COMPRESSED) + && (entryType != ChmCommons.EntryType.UNCOMPRESSED)) + throw new ChmParsingException("invalid compressed type, should be EntryType.COMPRESSED | EntryType.UNCOMPRESSED"); + + if (offset < 0) + throw new ChmParsingException("invalid offset"); + + if (length < 0) + throw new ChmParsingException("invalid length"); + } + + public static void assertCopyingDataIndex(int index, int dataLength) throws 
ChmParsingException { + if (index >= dataLength) + throw new ChmParsingException("cannot parse chm file index > data.length"); + } + + /** + * Checks if int param is greater than zero In case param <=0 throws an + * exception + * + * @param param + * @throws ChmParsingException + */ + public static void assertPositiveInt(int param) throws ChmParsingException { + if (param <= 0) + throw new ChmParsingException("resetTable.getBlockAddress().length should be greater than zero"); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,361 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.chm.core; + +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.List; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable; +import org.apache.tika.parser.chm.accessor.DirectoryListingEntry; +import org.apache.tika.parser.chm.assertion.ChmAssert; +import org.apache.tika.parser.chm.exception.ChmParsingException; + +public class ChmCommons { + /* Prevents initialization */ + private ChmCommons() { + } + + public static void assertByteArrayNotNull(byte[] data) throws TikaException { + if (data == null) + throw new TikaException("byte[] is null"); + } + + /** + * Represents entry types: uncompressed, compressed + */ + public enum EntryType { + UNCOMPRESSED, COMPRESSED + } + + /** + * Represents lzx states: started decoding, not started decoding + */ + public enum LzxState { + STARTED_DECODING, NOT_STARTED_DECODING + } + + /** + * Represents intel file states during decompression + */ + public enum IntelState { + STARTED, NOT_STARTED + } + + /** + * Represents lzx block types in order to decompress differently + */ + public final static int UNDEFINED = 0; + public final static int VERBATIM = 1; + public final static int ALIGNED_OFFSET = 2; + public final static int UNCOMPRESSED = 3; + + /** + * LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) Returns X, + * i.e 2^X + * + * @param window + * chmLzxControlData.getWindowSize() + * + * @return window size + */ + public static int getWindowSize(int window) { + int win = 0; + while (window > 1) { + window >>>= 1; + win++; + } + return win; + } + + public static byte[] getChmBlockSegment(byte[] data, + ChmLzxcResetTable resetTable, int blockNumber, int lzxcBlockOffset, + int lzxcBlockLength) throws TikaException { + ChmAssert.assertChmBlockSegment(data, resetTable, blockNumber, + lzxcBlockOffset, lzxcBlockLength); + int blockLength = -1; + // TODO add 
int_max_value checking + if (blockNumber < (resetTable.getBlockAddress().length - 1)) { + blockLength = (int) (resetTable.getBlockAddress()[blockNumber + 1] - resetTable + .getBlockAddress()[blockNumber]); + } else { + /* new code */ + if (blockNumber >= resetTable.getBlockAddress().length) + blockLength = 0; + else + /* end new code */ + blockLength = (int) (lzxcBlockLength - resetTable + .getBlockAddress()[blockNumber]); + } + byte[] t = ChmCommons + .copyOfRange( + data, + (int) (lzxcBlockOffset + resetTable.getBlockAddress()[blockNumber]), + (int) (lzxcBlockOffset + + resetTable.getBlockAddress()[blockNumber] + blockLength)); + return (t != null) ? t : new byte[1]; + } + + /** + * Returns textual representation of LangID + * + * @param langID + * + * @return language name + */ + public static String getLanguage(long langID) { + /* Potential problem with casting */ + switch ((int) langID) { + case 1025: + return "Arabic"; + case 1069: + return "Basque"; + case 1027: + return "Catalan"; + case 2052: + return "Chinese (Simplified)"; + case 1028: + return "Chinese (Traditional)"; + case 1029: + return "Czech"; + case 1030: + return "Danish"; + case 1043: + return "Dutch"; + case 1033: + return "English (United States)"; + case 1035: + return "Finnish"; + case 1036: + return "French"; + case 1031: + return "German"; + case 1032: + return "Greek"; + case 1037: + return "Hebrew"; + case 1038: + return "Hungarian"; + case 1040: + return "Italian"; + case 1041: + return "Japanese"; + case 1042: + return "Korean"; + case 1044: + return "Norwegian"; + case 1045: + return "Polish"; + case 2070: + return "Portuguese"; + case 1046: + return "Portuguese (Brazil)"; + case 1049: + return "Russian"; + case 1051: + return "Slovakian"; + case 1060: + return "Slovenian"; + case 3082: + return "Spanish"; + case 1053: + return "Swedish"; + case 1055: + return "Turkish"; + default: + return "unknown - http://msdn.microsoft.com/en-us/library/bb165625%28VS.80%29.aspx"; + } + } + + /** + 
* Checks skippable patterns + * + * @param directoryListingEntry + * + * @return boolean + */ + public static boolean hasSkip(DirectoryListingEntry directoryListingEntry) { + return (directoryListingEntry.getName().startsWith("/$") + || directoryListingEntry.getName().startsWith("/#") || directoryListingEntry + .getName().startsWith("::")) ? true : false; + } + + /** + * Writes byte[][] to the file + * + * @param buffer + * @param fileToBeSaved + * file name + * @throws TikaException + */ + public static void writeFile(byte[][] buffer, String fileToBeSaved) throws TikaException { + FileOutputStream output = null; + if (buffer != null && fileToBeSaved != null + && !ChmCommons.isEmpty(fileToBeSaved)) { + try { + output = new FileOutputStream(fileToBeSaved); + for (byte[] bufferEntry : buffer) { + output.write(bufferEntry); + } + } catch (FileNotFoundException e) { + throw new TikaException(e.getMessage()); + } catch (IOException e) { + e.printStackTrace(); + } finally { + if (output != null) + try { + output.flush(); + output.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + + /** + * Reverses the order of given array + * + * @param array + */ + public static void reverse(byte[] array) { + if (array == null) { + return; + } + int i = 0; + int j = array.length - 1; + byte tmp; + while (j > i) { + tmp = array[j]; + array[j] = array[i]; + array[i] = tmp; + j--; + i++; + } + } + + /** + * Returns an index of the reset table + * + * @param text + * @param pattern + * @return index of the reset table + * @throws ChmParsingException + */ + public static final int indexOfResetTableBlock(byte[] text, byte[] pattern) throws ChmParsingException { + return (indexOf(text, pattern)) - 4; + } + + /** + * Searches some pattern in byte[] + * + * @param text + * byte[] + * @param pattern + * byte[] + * @return an index, if nothing found returns -1 + * @throws ChmParsingException + */ + public static int indexOf(byte[] text, byte[] pattern) throws 
ChmParsingException { + int[] next = null; + int i = 0, j = -1; + + /* Preprocessing */ + if (pattern != null && text != null) { + next = new int[pattern.length]; + next[0] = -1; + } else + throw new ChmParsingException("pattern and/or text should not be null"); + + /* Computes a failure function */ + while (i < pattern.length - 1) { + if (j == -1 || pattern[i] == pattern[j]) { + i++; + j++; + if (pattern[i] != pattern[j]) + next[i] = j; + else + next[i] = next[j]; + } else + j = next[j]; + } + + /* Reinitializes local variables */ + i = j = 0; + + /* Matching */ + while (i < text.length && j < pattern.length) { + if (j == -1 || pattern[j] == text[i]) { + i++; + j++; + } else + j = next[j]; + } + if (j == pattern.length) + return (i - j); // match found at offset i - M + else + return -1; // not found + } + + /** + * Searches for some pattern in the directory listing entry list + * + * @param list + * @param pattern + * @return an index, if nothing found returns -1 + */ + public static int indexOf(List<DirectoryListingEntry> list, String pattern) { + int place = 0; + for (DirectoryListingEntry directoryListingEntry : list) { + if (directoryListingEntry.toString().contains(pattern)) return place; + ++place; + } + return -1;// not found + } + + /* + * This method is added because of supporting of Java 5 + */ + public static byte[] copyOfRange(byte[] original, int from, int to) { + checkCopyOfRangeParams(original, from, to); + int newLength = to - from; + if (newLength < 0) + throw new IllegalArgumentException(from + " > " + to); + byte[] copy = new byte[newLength]; + System.arraycopy(original, from, copy, 0, Math.min(original.length - from, newLength)); + return copy; + } + + private static void checkCopyOfRangeParams(byte[] original, int from, int to) { + if (original == null) + throw new NullPointerException("array is null"); + if (from < 0) + throw new IllegalArgumentException(from + " should be > 0"); + if (to < 0) + throw new IllegalArgumentException(to + " 
should be > 0"); + } + + /* + * This method is added because of supporting of Java 5 + */ + public static boolean isEmpty(String str) { + return str == null || str.length() == 0; + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/
package org.apache.tika.parser.chm.core;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Constants used by the CHM parser: section markers, header lengths and the
 * values defined by the LZX specification.
 */
public class ChmConstants {
    /* Prevents instantiation */
    private ChmConstants() {
    }

    public static final String DEFAULT_CHARSET = UTF_8.name();
    public static final String ITSF = "ITSF";
    public static final String ITSP = "ITSP";
    public static final String PMGL = "PMGL";
    public static final String LZXC = "LZXC";
    public static final String CHM_PMGI_MARKER = "PMGI";
    /* The misspelled name is kept because it is part of the public interface */
    public static final int BYTE_ARRAY_LENGHT = 16;
    public static final int CHM_ITSF_V2_LEN = 0x58;
    public static final int CHM_ITSF_V3_LEN = 0x60;
    public static final int CHM_ITSP_V1_LEN = 0x54;
    public static final int CHM_PMGL_LEN = 0x14;
    public static final int CHM_PMGI_LEN = 0x08;
    public static final int CHM_LZXC_RESETTABLE_V1_LEN = 0x28;
    public static final int CHM_LZXC_MIN_LEN = 0x18;
    public static final int CHM_LZXC_V2_LEN = 0x1c;
    public static final int CHM_SIGNATURE_LEN = 4;
    public static final int CHM_VER_2 = 2;
    public static final int CHM_VER_3 = 3;
    public static final int CHM_VER_1 = 1;
    public static final int CHM_WINDOW_SIZE_BLOCK = 0x8000;

    /* my hacking */
    public static final int START_PMGL = 0xCC;
    public static final String CONTROL_DATA = "ControlData";
    public static final String RESET_TABLE = "ResetTable";
    public static final String CONTENT = "Content";

    /* some constants defined by the LZX specification */
    public static final int LZX_MIN_MATCH = 2;
    public static final int LZX_MAX_MATCH = 257;
    public static final int LZX_NUM_CHARS = 256;
    /* also blocktypes 4-7 invalid */
    public static final int LZX_BLOCKTYPE_INVALID = 0;
    public static final int LZX_BLOCKTYPE_VERBATIM = 1;
    public static final int LZX_BLOCKTYPE_ALIGNED = 2;
    public static final int LZX_BLOCKTYPE_UNCOMPRESSED = 3;
    public static final int LZX_PRETREE_NUM_ELEMENTS_BITS = 4; /* ??? */
    public static final int LZX_PRETREE_NUM_ELEMENTS = 20;
    /* aligned offset tree #elements */
    public static final int LZX_ALIGNED_NUM_ELEMENTS = 8;
    /* this one missing from spec! */
    public static final int LZX_NUM_PRIMARY_LENGTHS = 7;
    /* length tree #elements */
    public static final int LZX_NUM_SECONDARY_LENGTHS = 249;

    /* LZX huffman defines: tweak tablebits as desired */
    public static final int LZX_PRETREE_MAXSYMBOLS = LZX_PRETREE_NUM_ELEMENTS;
    public static final int LZX_PRETREE_TABLEBITS = 6;
    public static final int LZX_MAINTREE_MAXSYMBOLS = LZX_NUM_CHARS + 50 * 8;
    public static final int LZX_MAIN_MAXSYMBOLS = LZX_NUM_CHARS * 2;
    public static final int LZX_MAINTREE_TABLEBITS = 12;
    public static final int LZX_LENGTH_MAXSYMBOLS = LZX_NUM_SECONDARY_LENGTHS + 1;
    public static final int LZX_LENGTH_TABLEBITS = 12;
    public static final int LZX_ALIGNED_MAXSYMBOLS = LZX_ALIGNED_NUM_ELEMENTS;
    public static final int LZX_ALIGNED_TABLEBITS = 7;
    public static final int LZX_LENTABLE_SAFETY = 64;

    /*
     * The two lookup tables are final so the references cannot be reassigned.
     * The array contents are still mutable; callers must treat them as
     * read-only.
     */
    public static final short[] EXTRA_BITS = { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,
            5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
            15, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
            17, 17 };

    public static final int[] POSITION_BASE = { 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32,
            48, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536, 2048, 3072,
            4096, 6144, 8192, 12288, 16384, 24576, 32768, 49152, 65536, 98304,
            131072, 196608, 262144, 393216, 524288, 655360, 786432, 917504,
            1048576, 1179648, 1310720, 1441792, 1572864, 1703936, 1835008,
            1966080, 2097152 };
}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,392 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.chm.core;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
import org.apache.tika.parser.chm.accessor.ChmItspHeader;
import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
import org.apache.tika.parser.chm.assertion.ChmAssert;
import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
import org.apache.tika.parser.chm.lzx.ChmLzxBlock;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Extracts text from chm file. Enumerates chm entries.
 */
public class ChmExtractor {
    private List<ChmLzxBlock> lzxBlocksCache = null;
    private ChmDirectoryListingSet chmDirList = null;
    private ChmItsfHeader chmItsfHeader = null;
    private ChmItspHeader chmItspHeader = null;
    private ChmLzxcResetTable chmLzxcResetTable = null;
    private ChmLzxcControlData chmLzxcControlData = null;
    private byte[] data = null;
    private int indexOfContent;
    private long lzxBlockOffset;
    private long lzxBlockLength;

    /**
     * Returns lzxc control data.
     *
     * @return ChmLzxcControlData
     */
    private ChmLzxcControlData getChmLzxcControlData() {
        return chmLzxcControlData;
    }

    /**
     * Sets lzxc control data
     *
     * @param chmLzxcControlData
     */
    private void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
        this.chmLzxcControlData = chmLzxcControlData;
    }

    private ChmItspHeader getChmItspHeader() {
        return chmItspHeader;
    }

    private void setChmItspHeader(ChmItspHeader chmItspHeader) {
        this.chmItspHeader = chmItspHeader;
    }

    /**
     * Returns lzxc reset table
     *
     * @return ChmLzxcResetTable
     */
    private ChmLzxcResetTable getChmLzxcResetTable() {
        return chmLzxcResetTable;
    }

    /**
     * Sets lzxc reset table
     *
     * @param chmLzxcResetTable
     */
    private void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
        this.chmLzxcResetTable = chmLzxcResetTable;
    }

    /**
     * Returns the length of the "Content" directory entry.
     *
     * @return lzxBlockLength
     */
    private long getLzxBlockLength() {
        return lzxBlockLength;
    }

    /**
     * Sets the length of the "Content" directory entry.
     *
     * @param lzxBlockLength
     */
    private void setLzxBlockLength(long lzxBlockLength) {
        this.lzxBlockLength = lzxBlockLength;
    }

    /**
     * Returns the offset of the "Content" directory entry, including the
     * ITSF data offset.
     *
     * @return lzxBlockOffset
     */
    private long getLzxBlockOffset() {
        return lzxBlockOffset;
    }

    /**
     * Sets the offset of the "Content" directory entry.
     */
    private void setLzxBlockOffset(long lzxBlockOffset) {
        this.lzxBlockOffset = lzxBlockOffset;
    }

    private int getIndexOfContent() {
        return indexOfContent;
    }

    private void setIndexOfContent(int indexOfContent) {
        this.indexOfContent = indexOfContent;
    }

    private byte[] getData() {
        return data;
    }

    private void setData(byte[] data) {
        this.data = data;
    }

    /**
     * Reads the whole stream into memory and parses the ITSF and ITSP
     * headers, the directory listing, the LZXC control data and the LZXC
     * reset table.
     *
     * @param is
     *            stream holding the chm file, must not be null
     * @throws TikaException
     *             if one of the structures cannot be parsed
     * @throws IOException
     *             if the stream cannot be read; the error is propagated so a
     *             partially initialised extractor is never handed out
     */
    public ChmExtractor(InputStream is) throws TikaException, IOException {
        ChmAssert.assertInputStreamNotNull(is);
        setData(IOUtils.toByteArray(is));

        /* Creates and parses chm itsf header */
        setChmItsfHeader(new ChmItsfHeader());
        getChmItsfHeader().parse(ChmCommons.copyOfRange(getData(), 0,
                ChmConstants.CHM_ITSF_V3_LEN - 1), getChmItsfHeader());

        /* Creates and parses chm itsp header */
        setChmItspHeader(new ChmItspHeader());
        getChmItspHeader().parse(
                ChmCommons.copyOfRange(getData(), (int) getChmItsfHeader()
                        .getDirOffset(), (int) getChmItsfHeader().getDirOffset()
                        + ChmConstants.CHM_ITSP_V1_LEN), getChmItspHeader());

        /* Creates instance of ChmDirListingContainer */
        setChmDirList(new ChmDirectoryListingSet(getData(),
                getChmItsfHeader(), getChmItspHeader()));

        int indexOfControlData = getChmDirList().getControlDataIndex();
        int indexOfResetData = ChmCommons.indexOfResetTableBlock(getData(),
                ChmConstants.LZXC.getBytes(UTF_8));
        // stays null when the LZXC marker is not located at a positive index
        byte[] dir_chunk = null;
        if (indexOfResetData > 0) {
            dir_chunk = ChmCommons.copyOfRange(getData(), indexOfResetData, indexOfResetData
                    + getChmDirList().getDirectoryListingEntryList().get(indexOfControlData).getLength());
        }

        /* Creates and parses chm control data */
        setChmLzxcControlData(new ChmLzxcControlData());
        getChmLzxcControlData().parse(dir_chunk, getChmLzxcControlData());

        int indexOfResetTable = getChmDirList().getResetTableIndex();
        setChmLzxcResetTable(new ChmLzxcResetTable());

        int startIndex = (int) getChmDirList().getDataOffset()
                + getChmDirList().getDirectoryListingEntryList()
                        .get(indexOfResetTable).getOffset();

        // assert startIndex < data.length
        ChmAssert.assertCopyingDataIndex(startIndex, getData().length);

        dir_chunk = ChmCommons.copyOfRange(getData(), startIndex, startIndex
                + getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable).getLength());

        getChmLzxcResetTable().parse(dir_chunk, getChmLzxcResetTable());

        setIndexOfContent(ChmCommons.indexOf(getChmDirList().getDirectoryListingEntryList(),
                ChmConstants.CONTENT));
        setLzxBlockOffset((getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()).getOffset()
                + getChmItsfHeader().getDataOffset()));
        setLzxBlockLength(getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()).getLength());

        setLzxBlocksCache(new ArrayList<ChmLzxBlock>());
    }

    /**
     * Enumerates chm entities
     *
     * @return list of chm entity names, in directory listing order
     */
    public List<String> enumerateChm() {
        List<String> listOfEntries = new ArrayList<String>();
        for (DirectoryListingEntry directoryListingEntry : getChmDirList().getDirectoryListingEntryList()) {
            listOfEntries.add(directoryListingEntry.getName());
        }
        return listOfEntries;
    }

    /**
     * Decompresses a chm entry.
     * Entries matched by {@link ChmCommons#hasSkip} and entries of any other
     * type than UNCOMPRESSED or COMPRESSED produce an empty array.
     *
     * @param directoryListingEntry
     *            entry to extract
     * @return decompressed data
     * @throws TikaException
     *             if extraction fails or, for a compressed entry, the
     *             extracted length differs from the entry length; any other
     *             exception is wrapped with itself as the cause
     */
    public byte[] extractChmEntry(DirectoryListingEntry directoryListingEntry) throws TikaException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        ChmLzxBlock lzxBlock = null;
        try {
            /* UNCOMPRESSED type is easiest one */
            if (directoryListingEntry.getEntryType() == EntryType.UNCOMPRESSED
                    && directoryListingEntry.getLength() > 0
                    && !ChmCommons.hasSkip(directoryListingEntry)) {
                int dataOffset = (int) (getChmItsfHeader().getDataOffset() + directoryListingEntry
                        .getOffset());
                buffer.write(ChmCommons.copyOfRange(
                        getData(), dataOffset,
                        dataOffset + directoryListingEntry.getLength()));
            } else if (directoryListingEntry.getEntryType() == EntryType.COMPRESSED
                    && !ChmCommons.hasSkip(directoryListingEntry)) {
                /* Gets a chm block info */
                ChmBlockInfo bb = ChmBlockInfo.getChmBlockInfoInstance(
                        directoryListingEntry, (int) getChmLzxcResetTable()
                                .getBlockLen(), getChmLzxcControlData());

                if ((getLzxBlockLength() < Integer.MAX_VALUE)
                        && (getLzxBlockOffset() < Integer.MAX_VALUE)) {
                    // TODO: Improve the caching
                    // Pick the cached block with the highest number inside
                    // [iniBlock, startBlock]; decoding resumes from there.
                    int start = -1;
                    int hitCache = 0;
                    for (int c = 0; c < getLzxBlocksCache().size(); c++) {
                        int bn = getLzxBlocksCache().get(c).getBlockNumber();
                        if (bn >= bb.getIniBlock() && bn <= bb.getStartBlock() && bn > start) {
                            start = bn;
                            hitCache = c;
                        }
                        if (start == bb.getStartBlock()) {
                            break;
                        }
                    }

                    if (start < 0) {
                        // nothing usable cached: decode from the reset point
                        start = bb.getIniBlock();

                        byte[] dataSegment = ChmCommons.getChmBlockSegment(
                                getData(),
                                getChmLzxcResetTable(), start,
                                (int) getLzxBlockOffset(),
                                (int) getLzxBlockLength());

                        lzxBlock = new ChmLzxBlock(start, dataSegment,
                                getChmLzxcResetTable().getBlockLen(), null);

                        getLzxBlocksCache().add(lzxBlock);
                    } else {
                        lzxBlock = getLzxBlocksCache().get(hitCache);
                    }

                    for (int i = start; i <= bb.getEndBlock();) {
                        if (i == bb.getStartBlock() && i == bb.getEndBlock()) {
                            buffer.write(lzxBlock.getContent(
                                    bb.getStartOffset(), bb.getEndOffset()));
                            break;
                        }

                        if (i == bb.getStartBlock()) {
                            buffer.write(lzxBlock.getContent(
                                    bb.getStartOffset()));
                        }

                        if (i > bb.getStartBlock() && i < bb.getEndBlock()) {
                            buffer.write(lzxBlock.getContent());
                        }

                        if (i == bb.getEndBlock()) {
                            buffer.write(lzxBlock.getContent(
                                    0, bb.getEndOffset()));
                            break;
                        }

                        i++;

                        // a block on a reset-interval boundary starts without a predecessor
                        ChmLzxBlock previous =
                                (i % getChmLzxcControlData().getResetInterval() == 0) ? null : lzxBlock;
                        lzxBlock = new ChmLzxBlock(i,
                                ChmCommons.getChmBlockSegment(getData(),
                                        getChmLzxcResetTable(), i,
                                        (int) getLzxBlockOffset(),
                                        (int) getLzxBlockLength()),
                                getChmLzxcResetTable().getBlockLen(),
                                previous);

                        getLzxBlocksCache().add(lzxBlock);
                    }

                    if (getLzxBlocksCache().size() > getChmLzxcResetTable()
                            .getBlockCount()) {
                        getLzxBlocksCache().clear();
                    }
                } //end of if

                if (buffer.size() != directoryListingEntry.getLength()) {
                    throw new TikaException("CHM file extract error: extracted Length is wrong.");
                }
            } //end of if compressed
        } catch (TikaException e) {
            // already the declared type: keep its concrete class and stack trace
            throw e;
        } catch (Exception e) {
            throw new TikaException(e.getMessage(), e);
        }

        return buffer.toByteArray();
    }

    private void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
        this.lzxBlocksCache = lzxBlocksCache;
    }

    private List<ChmLzxBlock> getLzxBlocksCache() {
        return lzxBlocksCache;
    }

    private void setChmDirList(ChmDirectoryListingSet chmDirList) {
        this.chmDirList = chmDirList;
    }

    public ChmDirectoryListingSet getChmDirList() {
        return chmDirList;
    }

    private void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
        this.chmItsfHeader = chmItsfHeader;
    }

    private ChmItsfHeader getChmItsfHeader() {
        return chmItsfHeader;
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,147 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.parser.chm.core;

import java.util.List;

import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
import org.apache.tika.parser.chm.accessor.ChmItspHeader;
import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
import org.apache.tika.parser.chm.lzx.ChmLzxBlock;

/**
 * Plain holder for the parsed pieces of a chm file. It only stores values:
 * every accessor below reads or writes exactly one field. Accessors are
 * listed in field declaration order.
 */
public class ChmWrapper {
    /* object references start out null, numeric fields start out 0 */
    private List<ChmLzxBlock> lzxBlocksCache;
    private ChmDirectoryListingSet chmDirList;
    private ChmItsfHeader chmItsfHeader;
    private ChmItspHeader chmItspHeader;
    private ChmLzxcResetTable chmLzxcResetTable;
    private ChmLzxcControlData chmLzxcControlData;
    private byte[] data;
    private int indexOfContent;
    private long lzxBlockOffset;
    private long lzxBlockLength;
    private int indexOfResetData;
    private int indexOfResetTable;
    private int startIndex;

    /* ---- lzx blocks cache ---- */

    protected List<ChmLzxBlock> getLzxBlocksCache() {
        return this.lzxBlocksCache;
    }

    protected void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
        this.lzxBlocksCache = lzxBlocksCache;
    }

    /* ---- directory listing ---- */

    protected ChmDirectoryListingSet getChmDirList() {
        return this.chmDirList;
    }

    protected void setChmDirList(ChmDirectoryListingSet chmDirList) {
        this.chmDirList = chmDirList;
    }

    /* ---- itsf header ---- */

    protected ChmItsfHeader getChmItsfHeader() {
        return this.chmItsfHeader;
    }

    protected void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
        this.chmItsfHeader = chmItsfHeader;
    }

    /* ---- itsp header ---- */

    protected ChmItspHeader getChmItspHeader() {
        return this.chmItspHeader;
    }

    protected void setChmItspHeader(ChmItspHeader chmItspHeader) {
        this.chmItspHeader = chmItspHeader;
    }

    /* ---- lzxc reset table ---- */

    protected ChmLzxcResetTable getChmLzxcResetTable() {
        return this.chmLzxcResetTable;
    }

    protected void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
        this.chmLzxcResetTable = chmLzxcResetTable;
    }

    /* ---- lzxc control data ---- */

    protected ChmLzxcControlData getChmLzxcControlData() {
        return this.chmLzxcControlData;
    }

    protected void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
        this.chmLzxcControlData = chmLzxcControlData;
    }

    /* ---- raw data ---- */

    protected byte[] getData() {
        return this.data;
    }

    protected void setData(byte[] data) {
        this.data = data;
    }

    /* ---- index of content ---- */

    protected int getIndexOfContent() {
        return this.indexOfContent;
    }

    protected void setIndexOfContent(int indexOfContent) {
        this.indexOfContent = indexOfContent;
    }

    /* ---- lzx block offset ---- */

    protected long getLzxBlockOffset() {
        return this.lzxBlockOffset;
    }

    protected void setLzxBlockOffset(long lzxBlockOffset) {
        this.lzxBlockOffset = lzxBlockOffset;
    }

    /* ---- lzx block length ---- */

    protected long getLzxBlockLength() {
        return this.lzxBlockLength;
    }

    protected void setLzxBlockLength(long lzxBlockLength) {
        this.lzxBlockLength = lzxBlockLength;
    }

    /* ---- index of reset data ---- */

    protected int getIndexOfResetData() {
        return this.indexOfResetData;
    }

    protected void setIndexOfResetData(int indexOfResetData) {
        this.indexOfResetData = indexOfResetData;
    }

    /* ---- index of reset table ---- */

    protected int getIndexOfResetTable() {
        return this.indexOfResetTable;
    }

    protected void setIndexOfResetTable(int indexOfResetTable) {
        this.indexOfResetTable = indexOfResetTable;
    }

    /* ---- start index ---- */

    protected int getStartIndex() {
        return this.startIndex;
    }

    protected void setStartIndex(int startIndex) {
        this.startIndex = startIndex;
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java (added)
+++
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.chm.exception; + +import org.apache.tika.exception.TikaException; + +public class ChmParsingException extends TikaException { + private static final long serialVersionUID = 6497936044733665210L; + + public ChmParsingException(String description) { + super(description); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java Sat Jan 16 18:23:01 2016 @@ 
-0,0 +1,235 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.chm.lzx;

import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
import org.apache.tika.parser.chm.exception.ChmParsingException;

/**
 * A container that holds chm block information:
 * <ol>
 * <li>initial block - used to reset the main tree</li>
 * <li>start block - where to start</li>
 * <li>end block - where to stop</li>
 * <li>start offset - where to start reading</li>
 * <li>end offset - where to stop reading</li>
 * </ol>
 */
public class ChmBlockInfo {
    /* class members */
    private int iniBlock;
    private int startBlock;
    private int endBlock;
    private int startOffset;
    private int endOffset;

    /* Last instance built by getChmBlockInfoInstance; kept for existing callers */
    private static ChmBlockInfo chmBlockInfo = null;

    private ChmBlockInfo() {

    }

    /**
     * Fills the supplied {@code chmBlockInfo} from a directory listing entry.
     *
     * @param dle
     *            - DirectoryListingEntry
     * @param bytesPerBlock
     *            - int, = chmLzxcResetTable.block_length
     * @param clcd
     *            - ChmLzxcControlData
     * @param chmBlockInfo
     *            - ChmBlockInfo to fill
     *
     * @return the same {@code chmBlockInfo}, populated
     * @throws TikaException
     *             if an argument is null or {@code bytesPerBlock} is not
     *             positive
     */
    protected ChmBlockInfo getChmBlockInfo(DirectoryListingEntry dle,
            int bytesPerBlock, ChmLzxcControlData clcd,
            ChmBlockInfo chmBlockInfo) throws TikaException {
        if (!validateParameters(dle, bytesPerBlock, clcd, chmBlockInfo)) {
            throw new ChmParsingException("Please check you parameters");
        }
        populate(chmBlockInfo, dle, bytesPerBlock, clcd);
        return chmBlockInfo;
    }

    /**
     * Builds a new block info for a directory listing entry.
     * The instance is filled through a local reference and only then stored
     * in the static field, so a concurrent call cannot write into the object
     * returned here. The arguments are not validated.
     *
     * @param dle
     *            - DirectoryListingEntry
     * @param bytesPerBlock
     *            - int, = chmLzxcResetTable.block_length
     * @param clcd
     *            - ChmLzxcControlData
     * @return a newly created, populated ChmBlockInfo
     */
    public static ChmBlockInfo getChmBlockInfoInstance(
            DirectoryListingEntry dle, int bytesPerBlock,
            ChmLzxcControlData clcd) {
        ChmBlockInfo info = new ChmBlockInfo();
        populate(info, dle, bytesPerBlock, clcd);
        // keep the static accessor pointing at the most recent instance
        setChmBlockInfo(info);
        return info;
    }

    /**
     * Computes the block and offset values of {@code info} from the entry
     * offset and length.
     */
    private static void populate(ChmBlockInfo info, DirectoryListingEntry dle,
            int bytesPerBlock, ChmLzxcControlData clcd) {
        info.setStartBlock(dle.getOffset() / bytesPerBlock);
        info.setEndBlock((dle.getOffset() + dle.getLength()) / bytesPerBlock);
        info.setStartOffset(dle.getOffset() % bytesPerBlock);
        info.setEndOffset((dle.getOffset() + dle.getLength()) % bytesPerBlock);
        // potential problem with casting long to int
        // initial block = start block rounded down to a reset-interval boundary
        info.setIniBlock(info.startBlock
                - info.startBlock % (int) clcd.getResetInterval());
    }

    /**
     * Returns textual representation of ChmBlockInfo
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("iniBlock:=").append(getIniBlock()).append(", ");
        sb.append("startBlock:=").append(getStartBlock()).append(", ");
        sb.append("endBlock:=").append(getEndBlock()).append(", ");
        sb.append("startOffset:=").append(getStartOffset()).append(", ");
        sb.append("endOffset:=").append(getEndOffset())
                .append(System.getProperty("line.separator"));
        return sb.toString();
    }

    /**
     * Tells whether all references are non-null and the block size is
     * positive.
     */
    private boolean validateParameters(DirectoryListingEntry dle,
            int bytesPerBlock, ChmLzxcControlData clcd,
            ChmBlockInfo chmBlockInfo) {
        return dle != null && bytesPerBlock > 0 && clcd != null
                && chmBlockInfo != null;
    }

    public static void main(String[] args) {
    }

    /**
     * Returns an initial block index
     *
     * @return int
     */
    public int getIniBlock() {
        return iniBlock;
    }

    /**
     * Sets the initial block index
     *
     * @param iniBlock
     *            - int
     */
    private void setIniBlock(int iniBlock) {
        this.iniBlock = iniBlock;
    }

    /**
     * Returns the start block index
     *
     * @return int
     */
    public int getStartBlock() {
        return startBlock;
    }

    /**
     * Sets the start block index
     *
     * @param startBlock
     *            - int
     */
    private void setStartBlock(int startBlock) {
        this.startBlock = startBlock;
    }

    /**
     * Returns the end block index
     *
     * @return - int
     */
    public int getEndBlock() {
        return endBlock;
    }

    /**
     * Sets the end block index
     *
     * @param endBlock
     *            - int
     */
    private void setEndBlock(int endBlock) {
        this.endBlock = endBlock;
    }

    /**
     * Returns the start offset index
     *
     * @return - int
     */
    public int getStartOffset() {
        return startOffset;
    }

    /**
     * Sets the start offset index
     *
     * @param startOffset
     *            - int
     */
    private void setStartOffset(int startOffset) {
        this.startOffset = startOffset;
    }

    /**
     * Returns the end offset index
     *
     * @return - int
     */
    public int getEndOffset() {
        return endOffset;
    }

    /**
     * Sets the end offset index
     *
     * @param endOffset
     *            - int
     */
    private void setEndOffset(int endOffset) {
        this.endOffset = endOffset;
    }

    public static void setChmBlockInfo(ChmBlockInfo chmBlockInfo) {
        ChmBlockInfo.chmBlockInfo = chmBlockInfo;
    }

    public static ChmBlockInfo getChmBlockInfo() {
        return chmBlockInfo;
    }
}
