Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/pom.xml URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/pom.xml?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/pom.xml (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/pom.xml Sat Jan 16 18:23:01 2016 @@ -0,0 +1,111 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. See the NOTICE file distributed with this work for additional + information regarding copyright ownership. The ASF licenses this file to + you under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. See the License for the specific + language governing permissions and limitations under the License. 
--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-modules</artifactId> + <version>2.0-SNAPSHOT</version> + </parent> + + <artifactId>tika-office-parser-module</artifactId> + <name>Apache Tika Office Parser Module</name> + <url>http://tika.apache.org/</url> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + </dependency> + + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.poi</groupId> + <artifactId>poi</artifactId> + <version>${poi.version}</version> + </dependency> + <dependency> + <groupId>org.apache.poi</groupId> + <artifactId>poi-scratchpad</artifactId> + <version>${poi.version}</version> + </dependency> + <dependency> + <groupId>org.apache.poi</groupId> + <artifactId>poi-ooxml</artifactId> + <version>${poi.version}</version> + <exclusions> + <exclusion> + <groupId>stax</groupId> + <artifactId>stax-api</artifactId> + </exclusion> + <exclusion> + <groupId>xml-apis</groupId> + <artifactId>xml-apis</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>com.healthmarketscience.jackcess</groupId> + <artifactId>jackcess</artifactId> + <version>2.1.2</version> + </dependency> + <dependency> + <groupId>com.healthmarketscience.jackcess</groupId> + <artifactId>jackcess-encrypt</artifactId> + <version>2.1.1</version> + </dependency> + <dependency> + <groupId>com.pff</groupId> + <artifactId>java-libpst</artifactId> + <version>0.8.1</version> + </dependency> + <dependency> + 
<groupId>${project.groupId}</groupId> + <artifactId>tika-package-parser-module</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-web-parser-module</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-text-parser-module</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + <scope>test</scope> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.chm; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.chm.accessor.DirectoryListingEntry; +import org.apache.tika.parser.chm.core.ChmExtractor; +import org.apache.tika.parser.html.HtmlParser; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +

/**
 * Parser for CHM (HTML Help) containers.
 *
 * <p>The parser walks the directory listing produced by {@link ChmExtractor},
 * extracts every entry whose name ends in {@code .html} or {@code .htm}, and
 * feeds the extracted bytes through {@link HtmlParser} so that the body of
 * each page is appended to a single XHTML output document.</p>
 */
public class ChmParser extends AbstractParser {

    /** Serial version UID */
    private static final long serialVersionUID = 5938777307516469802L;

    /** Media types this parser declares support for. */
    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                    MediaType.application("vnd.ms-htmlhelp"),
                    MediaType.application("chm"),
                    MediaType.application("x-chm"))));

    /**
     * Returns the fixed, unmodifiable set of supported media types.
     *
     * @param context parse context (not consulted)
     * @return the supported media types
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Extracts the HTML pages of a CHM stream and emits their text as one
     * XHTML document.
     *
     * @param stream   the CHM data; handed to {@link ChmExtractor}
     * @param handler  receiver of the generated XHTML events
     * @param metadata receives the content type
     *                 {@code application/vnd.ms-htmlhelp}
     * @param context  parse context (currently not forwarded to the embedded
     *                 HTML parser, which gets a fresh context per page)
     * @throws IOException   if the extractor fails to read the stream
     * @throws SAXException  if the output handler rejects an event, including
     *                       events produced while parsing an embedded page
     * @throws TikaException if the CHM structure or an embedded page cannot
     *                       be processed
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context) throws IOException,
            SAXException, TikaException {
        ChmExtractor chmExtractor = new ChmExtractor(stream);

        // metadata
        metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");

        // content
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        for (DirectoryListingEntry entry
                : chmExtractor.getChmDirList().getDirectoryListingEntryList()) {
            if (isHtmlEntry(entry.getName())) {
                parsePage(chmExtractor.extractChmEntry(entry), xhtml);
            }
        }

        xhtml.endDocument();
    }

    /**
     * Tells whether a directory entry should be treated as an HTML page.
     * The comparison is case-sensitive, as in the original implementation.
     */
    private static boolean isHtmlEntry(String entryName) {
        return entryName.endsWith(".html") || entryName.endsWith(".htm");
    }

    /**
     * Parses one extracted HTML page and appends its body content to the
     * enclosing XHTML document.
     *
     * @param byteObject raw bytes of the extracted page
     * @param xhtml      handler of the enclosing document
     * @throws TikaException if the HTML parser reports a parse failure
     * @throws SAXException  if the output handler rejects an event; this is
     *                       propagated unchanged instead of being wrapped in
     *                       a {@code RuntimeException}, because
     *                       {@link #parse} already declares it and callers
     *                       need to see the original exception type
     */
    private void parsePage(byte[] byteObject, ContentHandler xhtml)
            throws TikaException, SAXException {
        // Only the body of the embedded page is forwarded; the embedded
        // handler keeps the page's own start/end document events away from
        // the enclosing document.
        ContentHandler pageHandler =
                new EmbeddedContentHandler(new BodyContentHandler(xhtml));
        InputStream pageStream = new ByteArrayInputStream(byteObject);
        try {
            new HtmlParser().parse(pageStream, pageHandler, new Metadata(),
                    new ParseContext());
        } catch (IOException ignored) {
            // Deliberately best-effort: the source is an in-memory array, and
            // the original author notes the only IOException seen here is a
            // pushback overflow raised by tagsoup. The page is skipped and
            // extraction continues with the next entry.
        }
    }

}

 Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.chm.accessor; + +import java.io.Serializable; + +import org.apache.tika.exception.TikaException; +

/**
 * Contract for objects that can be populated from raw CHM bytes.
 *
 * <p>Implementations are {@link Serializable}. Callers visible in this
 * package pass the implementing object itself as the second argument, e.g.
 * {@code header.parse(bytes, header)}.</p>
 *
 * @param <T> the type of the object that receives the parsed values
 */
public interface ChmAccessor<T> extends Serializable {

    /**
     * Reads {@code data} and stores what was read in {@code chmAccessor}.
     *
     * @param data        the bytes to parse
     * @param chmAccessor the object to fill in
     * @throws TikaException if the bytes cannot be parsed
     */
    void parse(byte[] data, T chmAccessor) throws TikaException;
}

 Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,398 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.chm.accessor; + +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.List; +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.chm.core.ChmCommons; +import org.apache.tika.parser.chm.core.ChmConstants; +import org.apache.tika.parser.chm.exception.ChmParsingException; + +import static java.nio.charset.StandardCharsets.UTF_8; +

/**
 * Holds the directory listing entries of a CHM file.
 *
 * <p>The constructor walks the chain of directory blocks described by the
 * ITSF/ITSP headers, decodes every entry found in the PMGL blocks and
 * remembers the list positions of the control-data and reset-table entries.
 * Enumeration is best-effort: a malformed block stops the walk but keeps
 * the entries collected so far.</p>
 */
public class ChmDirectoryListingSet {
    private List<DirectoryListingEntry> dlel;
    private byte[] data;
    /** Read cursor inside the directory block currently being decoded. */
    private int placeHolder = -1;
    private long dataOffset = -1;
    private int controlDataIndex = -1;
    private int resetTableIndex = -1;

    private boolean isNotControlDataFound = true;
    private boolean isNotResetTableFound = true;

    /** Header of the directory block currently being decoded. */
    private ChmPmglHeader pmglHeader;

    /**
     * Constructs chm directory listing set
     *
     * @param data
     *            the bytes of the chm file; must not be null
     * @param chmItsHeader
     *            ITSF header, source of the directory and data offsets
     * @param chmItspHeader
     *            ITSP header, source of the block length, header length and
     *            index of the first block
     * @throws TikaException
     *             if {@code data} fails the not-null assertion
     */
    public ChmDirectoryListingSet(byte[] data, ChmItsfHeader chmItsHeader,
            ChmItspHeader chmItspHeader) throws TikaException {
        // Assign the field directly: calling the public (overridable) setter
        // from a constructor would run subclass code on a half-built object.
        this.dlel = new ArrayList<DirectoryListingEntry>();
        ChmCommons.assertByteArrayNotNull(data);
        setData(data);
        enumerateChmDirectoryListingList(chmItsHeader, chmItspHeader);
    }

    /**
     * Returns the entry list followed by the number of entries.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("list:=").append(getDirectoryListingEntryList().toString())
                .append(System.getProperty("line.separator"));
        sb.append("number of list items:=")
                .append(getDirectoryListingEntryList().size());
        return sb.toString();
    }

    /**
     * Returns control data index that located in List
     *
     * @return control data index, or -1 if no such entry was seen
     */
    public int getControlDataIndex() {
        return controlDataIndex;
    }

    /**
     * Sets control data index
     *
     * @param controlDataIndex
     */
    protected void setControlDataIndex(int controlDataIndex) {
        this.controlDataIndex = controlDataIndex;
    }

    /**
     * Return index of reset table
     *
     * @return reset table index, or -1 if no such entry was seen
     */
    public int getResetTableIndex() {
        return resetTableIndex;
    }

    /**
     * Sets reset table index
     *
     * @param resetTableIndex
     */
    protected void setResetTableIndex(int resetTableIndex) {
        this.resetTableIndex = resetTableIndex;
    }

    /**
     * Enumerates chm directory listing entries by following the block chain
     * that starts at the ITSP header's index head and continues through each
     * block's "next block" link until a negative index is reached.
     *
     * @param chmItsHeader
     *            chm itsf header
     * @param chmItspHeader
     *            chm itsp header
     */
    private void enumerateChmDirectoryListingList(ChmItsfHeader chmItsHeader,
            ChmItspHeader chmItspHeader) {
        try {
            final int blockLen = (int) chmItspHeader.getBlock_len();
            final int dirOffset = (int) (chmItsHeader.getDirOffset()
                    + chmItspHeader.getHeader_len());
            setDataOffset(chmItsHeader.getDataOffset());

            if (blockLen <= 0) {
                throw new ChmParsingException(
                        "Bad directory block length: " + blockLen);
            }

            // A well-formed chain cannot visit more blocks than fit in the
            // file, so a longer walk means the "next" links form a cycle.
            final int maxBlocks = getData().length / blockLen + 1;
            int visited = 0;

            /* loops over all pmgls */
            int block = chmItspHeader.getIndex_head();
            while (block >= 0) {
                if (++visited > maxBlocks) {
                    throw new ChmParsingException(
                            "Directory block chain does not terminate.");
                }
                int start = block * blockLen + dirOffset;
                byte[] dirChunk = ChmCommons.copyOfRange(getData(), start,
                        start + blockLen);

                pmglHeader = new ChmPmglHeader();
                pmglHeader.parse(dirChunk, pmglHeader);
                enumerateOneSegment(dirChunk);

                block = pmglHeader.getBlockNext();
            }
        } catch (Exception e) {
            // Deliberate best-effort kept from the original: a broken block
            // ends the enumeration but entries decoded so far stay usable.
            // NOTE(review): replace with the project's logging facility.
            e.printStackTrace();
        } finally {
            // The raw file bytes are not needed once enumeration is over.
            setData(null);
        }
    }

    /**
     * Records the list position of the first entry whose name contains
     * {@code ChmConstants.CONTROL_DATA}. Called before the entry is added,
     * so the current list size is the entry's future index.
     *
     * @param dle
     *            chm directory listing entry
     */
    private void checkControlData(DirectoryListingEntry dle) {
        if (isNotControlDataFound
                && dle.getName().contains(ChmConstants.CONTROL_DATA)) {
            setControlDataIndex(getDirectoryListingEntryList().size());
            isNotControlDataFound = false;
        }
    }

    /**
     * Records the list position of the first entry whose name contains
     * {@code ChmConstants.RESET_TABLE}. Called before the entry is added,
     * so the current list size is the entry's future index.
     *
     * @param dle
     *            chm directory listing entry
     */
    private void checkResetTable(DirectoryListingEntry dle) {
        if (isNotResetTableFound
                && dle.getName().contains(ChmConstants.RESET_TABLE)) {
            setResetTableIndex(getDirectoryListingEntryList().size());
            isNotResetTableFound = false;
        }
    }

    /**
     * Tells whether {@code data} begins with the characters of
     * {@code prefix}, comparing each byte with the corresponding char.
     *
     * @param data
     *            bytes to inspect
     * @param prefix
     *            expected leading characters
     * @return true if every prefix character matches; false on a mismatch,
     *         or when either argument is null or {@code data} is shorter
     *         than the prefix (previously these cases threw)
     */
    public static final boolean startsWith(byte[] data, String prefix) {
        if (data == null || prefix == null || data.length < prefix.length()) {
            return false;
        }
        for (int i = 0; i < prefix.length(); i++) {
            if (data[i] != prefix.charAt(i)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Enumerates chm directory listing entries in single chm segment.
     *
     * <p>Blocks starting with the PMGI marker are skipped; blocks that start
     * with neither marker are rejected. Each entry of a PMGL block is read
     * as: encoded name length, name bytes (UTF-8), one byte whose zero value
     * marks the entry as uncompressed, encoded offset, encoded length.</p>
     *
     * @param dir_chunk
     *            bytes of one directory block; null is ignored
     * @throws ChmParsingException
     *             if the block has an unknown marker or an entry is
     *             malformed
     */
    private void enumerateOneSegment(byte[] dir_chunk) throws ChmParsingException {
        if (dir_chunk == null) {
            return;
        }
        if (startsWith(dir_chunk, ChmConstants.CHM_PMGI_MARKER)) {
            return; //skip PMGI
        }
        if (!startsWith(dir_chunk, ChmConstants.PMGL)) {
            throw new ChmParsingException("Bad dir entry block.");
        }

        placeHolder = ChmConstants.CHM_PMGL_LEN;

        // Entries stop where the block's free space begins; the extra min()
        // keeps the cursor inside the array even if the header reports a
        // negative free-space value.
        final long entriesEnd = Math.min((long) dir_chunk.length,
                dir_chunk.length - pmglHeader.getFreeSpace());

        while (placeHolder > 0 && placeHolder < entriesEnd) {
            // The name length uses the same variable-length encoding as the
            // offset and length fields. The former inline decoder compared a
            // signed byte with 0x80 (never true) and masked the whole sum
            // because '+' binds tighter than '&', so names of 128 bytes or
            // more were mis-parsed.
            int nameLength = getEncint(dir_chunk);

            // Compare with the bytes actually left, not with the block size.
            if (nameLength < 0 || nameLength > dir_chunk.length - placeHolder) {
                throw new ChmParsingException("Bad data of a string length.");
            }

            DirectoryListingEntry dle = new DirectoryListingEntry();
            dle.setNameLength(nameLength);
            dle.setName(new String(ChmCommons.copyOfRange(dir_chunk,
                    placeHolder, placeHolder + dle.getNameLength()), UTF_8));

            checkControlData(dle);
            checkResetTable(dle);
            placeHolder += dle.getNameLength();

            /* Sets entry type */
            if (placeHolder < dir_chunk.length && dir_chunk[placeHolder] == 0) {
                dle.setEntryType(ChmCommons.EntryType.UNCOMPRESSED);
            } else {
                dle.setEntryType(ChmCommons.EntryType.COMPRESSED);
            }
            placeHolder++;

            dle.setOffset(getEncint(dir_chunk));
            dle.setLength(getEncint(dir_chunk));
            getDirectoryListingEntryList().add(dle);
        }
    }

    /**
     * Decodes a variable-length encoded integer at the current cursor and
     * moves the cursor past it.
     *
     * <p>Every byte contributes its low 7 bits, most significant group
     * first; a set high bit means another byte follows. The result is the
     * low 32 bits of the decoded value, which is what the previous
     * {@code BigInteger}-based implementation returned.</p>
     *
     * @param data_chunk
     *            block being decoded
     * @return the decoded value, or 0 without moving the cursor when the
     *         cursor is already at or past the end of the block
     * @throws ChmParsingException
     *             if the block ends in the middle of a number
     */
    private int getEncint(byte[] data_chunk) throws ChmParsingException {
        int value = 0;
        if (placeHolder < data_chunk.length) {
            byte current;
            do {
                if (placeHolder >= data_chunk.length) {
                    throw new ChmParsingException(
                            "Truncated encoded integer in directory block.");
                }
                current = data_chunk[placeHolder++];
                value = (value << 7) | (current & 0x7f);
            } while (current < 0); // high bit set: continuation byte follows
        }
        return value;
    }

    /**
     * Sets chm directory listing entry list
     *
     * @param dlel
     *            chm directory listing entry list
     */
    public void setDirectoryListingEntryList(List<DirectoryListingEntry> dlel) {
        this.dlel = dlel;
    }

    /**
     * Returns chm directory listing entry list
     *
     * @return the live entry list held by this object
     */
    public List<DirectoryListingEntry> getDirectoryListingEntryList() {
        return dlel;
    }

    /**
     * Sets data
     *
     * @param data
     */
    private void setData(byte[] data) {
        this.data = data;
    }

    /**
     * Returns data
     *
     * @return the chm bytes; null once enumeration has finished
     */
    private byte[] getData() {
        return data;
    }

    /**
     * Sets data offset
     *
     * @param dataOffset
     */
    private void setDataOffset(long dataOffset) {
        this.dataOffset = dataOffset;
    }

    /**
     * Returns data offset
     *
     * @return dataOffset as read from the ITSF header, or -1 if unset
     */
    public long getDataOffset() {
        return dataOffset;
    }
}

 Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,492 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.chm.accessor; + +import java.math.BigInteger; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.chm.assertion.ChmAssert; +import org.apache.tika.parser.chm.core.ChmConstants; +import org.apache.tika.parser.chm.exception.ChmParsingException; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * The Header 0000: char[4] 'ITSF' 0004: DWORD 3 (Version number) 0008: DWORD + * Total header length, including header section table and following data. 000C: + * DWORD 1 (unknown) 0010: DWORD a timestamp 0014: DWORD Windows Language ID + * 0018: GUID {7C01FD10-7BAA-11D0-9E0C-00A0-C922-E6EC} 0028: GUID + * {7C01FD11-7BAA-11D0-9E0C-00A0-C922-E6EC} Note: a GUID is $10 bytes, arranged + * as 1 DWORD, 2 WORDs, and 8 BYTEs. 0000: QWORD Offset of section from + * beginning of file 0008: QWORD Length of section Following the header section + * table is 8 bytes of additional header data. In Version 2 files, this data is + * not there and the content section starts immediately after the directory. 
+ * + * {@link http + * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original + * /?show-translation-form=1} + * + */ +/* structure of ITSF headers */ +public class ChmItsfHeader implements ChmAccessor<ChmItsfHeader> { + private static final long serialVersionUID = 2215291838533213826L; + private byte[] signature; + private int version; /* 4 */ + private int header_len; /* 8 */ + private int unknown_000c; /* c */ + private long last_modified; /* 10 */ + private long lang_id; /* 14 */ + private byte[] dir_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 18 */ + private byte[] stream_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 28 */ + private long unknown_offset; /* 38 */ + private long unknown_len; /* 40 */ + private long dir_offset; /* 48 */ + private long dir_len; /* 50 */ + private long data_offset; /* 58 (Not present before V3) */ + + /* local usage */ + private int dataRemained; + private int currentPlace = 0; + + public ChmItsfHeader() { + signature = ChmConstants.ITSF.getBytes(UTF_8); /* 0 (ITSF) */ + } + + /** + * Prints the values of ChmfHeader + */ + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(new String(getSignature(), UTF_8) + " "); + sb.append(getVersion() + " "); + sb.append(getHeaderLen() + " "); + sb.append(getUnknown_000c() + " "); + sb.append(getLastModified() + " "); + sb.append(getLangId() + " "); + sb.append(getDir_uuid() + " "); + sb.append(getStream_uuid() + " "); + sb.append(getUnknownOffset() + " "); + sb.append(getUnknownLen() + " "); + sb.append(getDirOffset() + " "); + sb.append(getDirLen() + " "); + sb.append(getDataOffset() + " "); + return sb.toString(); + } + + /** + * Returns a signature of itsf header + * + * @return itsf header + */ + public byte[] getSignature() { + return signature; + } + + /** + * Sets itsf header signature + * + * @param signature + */ + protected void setSignature(byte[] signature) { + this.signature = signature; + } + + /** + * Returns itsf 
header version + * + * @return itsf version + */ + public int getVersion() { + return version; + } + + /** + * Sets itsf version + * + * @param version + */ + protected void setVersion(int version) { + this.version = version; + } + + /** + * Returns itsf header length + * + * @return length + */ + public int getHeaderLen() { + return header_len; + } + + /** + * Sets itsf header length + * + * @param header_len + */ + protected void setHeaderLen(int header_len) { + this.header_len = header_len; + } + + /** + * Returns unknown_00c value + * + * @return unknown_00c + */ + public int getUnknown_000c() { + return unknown_000c; + } + + /** + * Sets unknown_00c + * + * @param unknown_000c + */ + protected void setUnknown_000c(int unknown_000c) { + this.unknown_000c = unknown_000c; + } + + /** + * Returns last modified date of the chm file + * + * @return last modified date as long + */ + public long getLastModified() { + return last_modified; + } + + /** + * Sets last modified date of the chm file + * + * @param last_modified + */ + protected void setLastModified(long last_modified) { + this.last_modified = last_modified; + } + + /** + * Returns language ID + * + * @return language_id + */ + public long getLangId() { + return lang_id; + } + + /** + * Sets language_id + * + * @param lang_id + */ + protected void setLangId(long lang_id) { + this.lang_id = lang_id; + } + + /** + * Returns directory uuid + * + * @return dir_uuid + */ + public byte[] getDir_uuid() { + return dir_uuid; + } + + /** + * Sets directory uuid + * + * @param dir_uuid + */ + protected void setDir_uuid(byte[] dir_uuid) { + this.dir_uuid = dir_uuid; + } + + /** + * Returns stream uuid + * + * @return stream_uuid + */ + public byte[] getStream_uuid() { + return stream_uuid; + } + + /** + * Sets stream uuid + * + * @param stream_uuid + */ + protected void setStream_uuid(byte[] stream_uuid) { + this.stream_uuid = stream_uuid; + } + + /** + * Returns unknown offset + * + * @return unknown_offset + */ + 
public long getUnknownOffset() { + return unknown_offset; + } + + /** + * Sets unknown offset + * + * @param unknown_offset + */ + protected void setUnknownOffset(long unknown_offset) { + this.unknown_offset = unknown_offset; + } + + /** + * Returns unknown length + * + * @return unknown_length + */ + public long getUnknownLen() { + return unknown_len; + } + + /** + * Sets unknown length + * + * @param unknown_len + */ + protected void setUnknownLen(long unknown_len) { + this.unknown_len = unknown_len; + } + + /** + * Returns directory offset + * + * @return directory_offset + */ + public long getDirOffset() { + return dir_offset; + } + + /** + * Sets directory offset + * + * @param dir_offset + */ + protected void setDirOffset(long dir_offset) { + this.dir_offset = dir_offset; + } + + /** + * Returns directory length + * + * @return directory_offset + */ + public long getDirLen() { + return dir_len; + } + + /** + * Sets directory length + * + * @param dir_len + */ + protected void setDirLen(long dir_len) { + this.dir_len = dir_len; + } + + /** + * Returns data offset + * + * @return data_offset + */ + public long getDataOffset() { + return data_offset; + } + + /** + * Sets data offset + * + * @param data_offset + */ + protected void setDataOffset(long data_offset) { + this.data_offset = data_offset; + } + + /** + * Copies 4 first bytes of the byte[] + * + * @param data + * @param chmItsfHeader + * @param count + * @throws TikaException + */ + private void unmarshalCharArray(byte[] data, ChmItsfHeader chmItsfHeader, + int count) throws TikaException { + ChmAssert.assertChmAccessorParameters(data, chmItsfHeader, count); + System.arraycopy(data, 0, chmItsfHeader.signature, 0, count); + this.setCurrentPlace(this.getCurrentPlace() + count); + this.setDataRemained(this.getDataRemained() - count); + } + + /** + * Copies X bytes of source byte[] to the dest byte[] + * + * @param data + * @param dest + * @param count + * @return + */ + private byte[] unmarshalUuid(byte[] 
data, byte[] dest, int count) { + System.arraycopy(data, this.getCurrentPlace(), dest, 0, count); + this.setCurrentPlace(this.getCurrentPlace() + count); + this.setDataRemained(this.getDataRemained() - count); + return dest; + } + + /** + * Takes 8 bytes and reverses them + * + * @param data + * @param dest + * @return + * @throws TikaException + */ + private long unmarshalUint64(byte[] data, long dest) throws TikaException{ + byte[] temp = new byte[8]; + int i, j; + + if (8 > this.getDataRemained()) + throw new TikaException("8 > this.getDataRemained()"); + + for (i = 8, j = 7; i > 0; i--) { + temp[j--] = data[this.getCurrentPlace()]; + this.setCurrentPlace(this.getCurrentPlace() + 1); + } + + dest = new BigInteger(temp).longValue(); + this.setDataRemained(this.getDataRemained() - 8); + return dest; + } + + private int unmarshalInt32(byte[] data, int dest) throws TikaException{ + ChmAssert.assertByteArrayNotNull(data); + + if (4 > this.getDataRemained()) + throw new TikaException("4 > dataLenght"); + dest = (data[this.getCurrentPlace()] & 0xff) + | (data[this.getCurrentPlace() + 1] & 0xff) << 8 + | (data[this.getCurrentPlace() + 2] & 0xff) << 16 + | (data[this.getCurrentPlace() + 3] & 0xff) << 24; + + this.setCurrentPlace(this.getCurrentPlace() + 4); + this.setDataRemained(this.getDataRemained() - 4); + return dest; + } + + private long unmarshalUInt32(byte[] data, long dest) throws TikaException{ + ChmAssert.assertByteArrayNotNull(data); + if (4 > getDataRemained()) + throw new TikaException("4 > dataLenght"); + dest = data[this.getCurrentPlace()] + | data[this.getCurrentPlace() + 1] << 8 + | data[this.getCurrentPlace() + 2] << 16 + | data[this.getCurrentPlace() + 3] << 24; + + setDataRemained(this.getDataRemained() - 4); + this.setCurrentPlace(this.getCurrentPlace() + 4); + return dest; + } + + public static void main(String[] args) { + } + + /** + * Sets data remained to be processed + * + * @param dataRemained + */ + private void setDataRemained(int 
dataRemained) { + this.dataRemained = dataRemained; + } + + /** + * Returns data remained + * + * @return data_remainned + */ + private int getDataRemained() { + return dataRemained; + } + + /** + * Sets current place in the byte[] + * + * @param currentPlace + */ + private void setCurrentPlace(int currentPlace) { + this.currentPlace = currentPlace; + } + + /** + * Returns current place in the byte[] + * + * @return current place + */ + private int getCurrentPlace() { + return currentPlace; + } + + // @Override + public void parse(byte[] data, ChmItsfHeader chmItsfHeader) throws TikaException { + if (data.length < ChmConstants.CHM_ITSF_V2_LEN + || data.length > ChmConstants.CHM_ITSF_V3_LEN) + throw new TikaException("we only know how to deal with the 0x58 and 0x60 byte structures"); + + chmItsfHeader.setDataRemained(data.length); + chmItsfHeader.unmarshalCharArray(data, chmItsfHeader, ChmConstants.CHM_SIGNATURE_LEN); + chmItsfHeader.setVersion(chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getVersion())); + chmItsfHeader.setHeaderLen(chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getHeaderLen())); + chmItsfHeader.setUnknown_000c(chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getUnknown_000c())); + chmItsfHeader.setLastModified(chmItsfHeader.unmarshalUInt32(data, chmItsfHeader.getLastModified())); + chmItsfHeader.setLangId(chmItsfHeader.unmarshalUInt32(data, chmItsfHeader.getLangId())); + chmItsfHeader.setDir_uuid(chmItsfHeader.unmarshalUuid(data, chmItsfHeader.getDir_uuid(), 16)); + chmItsfHeader.setStream_uuid(chmItsfHeader.unmarshalUuid(data, chmItsfHeader.getStream_uuid(), 16)); + chmItsfHeader.setUnknownOffset(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getUnknownOffset())); + chmItsfHeader.setUnknownLen(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getUnknownLen())); + chmItsfHeader.setDirOffset(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getDirOffset())); + chmItsfHeader.setDirLen(chmItsfHeader.unmarshalUint64(data, 
chmItsfHeader.getDirLen())); + if (!new String(chmItsfHeader.getSignature(), UTF_8).equals(ChmConstants.ITSF)) + throw new TikaException("seems not valid file"); + if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_2) { + if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V2_LEN) + throw new TikaException("something wrong with header"); + } else if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_3) { + if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V3_LEN) + throw new TikaException("unknown v3 header lenght"); + } else + throw new ChmParsingException("unsupported chm format"); + + /* + * now, if we have a V3 structure, unmarshal the rest, otherwise, + * compute it + */ + if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_3) { + if (chmItsfHeader.getDataRemained() >= 0) + chmItsfHeader.setDataOffset(chmItsfHeader.getDirOffset() + + chmItsfHeader.getDirLen()); + else + throw new TikaException("cannot set data offset, no data remained"); + } else + chmItsfHeader.setDataOffset(chmItsfHeader.getDirOffset() + + chmItsfHeader.getDirLen()); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,548 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.chm.accessor; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.chm.assertion.ChmAssert; +import org.apache.tika.parser.chm.core.ChmCommons; +import org.apache.tika.parser.chm.core.ChmConstants; +import org.apache.tika.parser.chm.exception.ChmParsingException; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * Directory header The directory starts with a header; its format is as + * follows: 0000: char[4] 'ITSP' 0004: DWORD Version number 1 0008: DWORD Length + * of the directory header 000C: DWORD $0a (unknown) 0010: DWORD $1000 Directory + * chunk size 0014: DWORD "Density" of quickref section, usually 2 0018: DWORD + * Depth of the index tree - 1 there is no index, 2 if there is one level of + * PMGI chunks 001C: DWORD Chunk number of root index chunk, -1 if there is none + * (though at least one file has 0 despite there being no index chunk, probably + * a bug) 0020: DWORD Chunk number of first PMGL (listing) chunk 0024: DWORD + * Chunk number of last PMGL (listing) chunk 0028: DWORD -1 (unknown) 002C: + * DWORD Number of directory chunks (total) 0030: DWORD Windows language ID + * 0034: GUID {5D02926A-212E-11D0-9DF9-00A0C922E6EC} 0044: DWORD $54 (This is + * the length again) 0048: DWORD -1 
(unknown) 004C: DWORD -1 (unknown) 0050:
 * DWORD -1 (unknown)
 *
 * {@link http
 * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
 * /?show-translation-form=1}
 *
 */
public class ChmItspHeader implements ChmAccessor<ChmItspHeader> {
    // TODO: refactor all unmarshals
    private static final long serialVersionUID = 1962394421998181341L;
    private byte[] signature;
    private int version; /* 4 */
    private int header_len; /* 8 */
    private int unknown_000c; /* c */
    private long block_len; /* 10 */
    private int blockidx_intvl; /* 14 */
    private int index_depth; /* 18 */
    private int index_root; /* 1c */
    private int index_head; /* 20 */
    private int unknown_0024; /* 24 */
    private long num_blocks; /* 28 */
    private int unknown_002c; /* 2c */
    private long lang_id; /* 30 */
    private byte[] system_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 34 */
    private byte[] unknown_0044 = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 44 */

    /* local usage: read cursor into the byte[] being parsed and bytes left */
    private int dataRemained;
    private int currentPlace = 0;

    /**
     * Creates a header whose signature is pre-filled with the ITSP marker.
     */
    public ChmItspHeader() {
        signature = ChmConstants.ITSP.getBytes(UTF_8); /* 0 (ITSP) */
    }

    /**
     * Returns a multi-line dump of every header field. The two byte-array
     * fields are rendered element by element (concatenating a
     * {@code byte[]} would only print its identity hash).
     */
    @Override
    public String toString() {
        String nl = System.getProperty("line.separator");
        StringBuilder sb = new StringBuilder();
        sb.append("[ signature:=").append(new String(getSignature(), UTF_8)).append(nl);
        sb.append("version:=\t").append(getVersion()).append(nl);
        sb.append("header_len:=\t").append(getHeader_len()).append(nl);
        sb.append("unknown_00c:=\t").append(getUnknown_000c()).append(nl);
        sb.append("block_len:=\t").append(getBlock_len())
                .append(" [directory chunk size]").append(nl);
        sb.append("blockidx_intvl:=").append(getBlockidx_intvl())
                .append(", density of quickref section, usually 2").append(nl);
        sb.append("index_depth:=\t").append(getIndex_depth())
                .append(", depth of the index tree - 1 there is no index, 2 if there is one level of PMGI chunk")
                .append(nl);
        sb.append("index_root:=\t").append(getIndex_root())
                .append(", chunk number of root index chunk, -1 if there is none").append(nl);
        sb.append("index_head:=\t").append(getIndex_head())
                .append(", chunk number of first PMGL (listing) chunk").append(nl);
        sb.append("unknown_0024:=\t").append(getUnknown_0024())
                .append(", chunk number of last PMGL (listing) chunk").append(nl);
        sb.append("num_blocks:=\t").append(getNum_blocks()).append(", -1 (unknown)").append(nl);
        sb.append("unknown_002c:=\t").append(getUnknown_002c())
                .append(", number of directory chunks (total)").append(nl);
        sb.append("lang_id:=\t").append(getLang_id()).append(" - ")
                .append(ChmCommons.getLanguage(getLang_id())).append(nl);
        sb.append("system_uuid:=").append(java.util.Arrays.toString(getSystem_uuid())).append(nl);
        sb.append("unknown_0044:=").append(java.util.Arrays.toString(getUnknown_0044())).append(" ]");
        return sb.toString();
    }

    /** Moves the read cursor forward and shrinks the remaining-bytes counter. */
    private void advance(int count) {
        this.setCurrentPlace(this.getCurrentPlace() + count);
        this.setDataRemained(this.getDataRemained() - count);
    }

    /**
     * Copies the first {@code count} bytes of {@code data} into the header's
     * signature array and advances the cursor by {@code count}.
     *
     * @param data source bytes
     * @param chmItspHeader header whose signature array receives the bytes
     * @param count number of bytes to copy
     * @throws TikaException if one of the assertions fails
     */
    private void unmarshalCharArray(byte[] data, ChmItspHeader chmItspHeader,
            int count) throws TikaException {
        ChmAssert.assertByteArrayNotNull(data);
        ChmAssert.assertChmAccessorNotNull(chmItspHeader);
        System.arraycopy(data, 0, chmItspHeader.signature, 0, count);
        advance(count);
    }

    /**
     * Reads 4 bytes at the cursor as a little-endian signed 32-bit value.
     *
     * @param data source bytes
     * @return the decoded value
     * @throws TikaException if fewer than 4 bytes remain
     */
    private int unmarshalInt32(byte[] data) throws TikaException {
        ChmAssert.assertByteArrayNotNull(data);
        if (4 > this.getDataRemained())
            throw new TikaException("4 > dataLenght");
        int start = this.getCurrentPlace();
        int value = (data[start] & 0xff)
                | (data[start + 1] & 0xff) << 8
                | (data[start + 2] & 0xff) << 16
                | (data[start + 3] & 0xff) << 24;
        advance(4);
        return value;
    }

    /**
     * Reads 4 bytes at the cursor into a {@code long}. The value is still
     * assembled at int width, exactly as before, so 0xFFFFFFFF is returned
     * as -1: toString() labels num_blocks as "-1 (unknown)", so that value
     * is kept rather than turned into 4294967295.
     *
     * @param data source bytes
     * @return the decoded value, sign-extended from 32 bits
     * @throws TikaException if fewer than 4 bytes remain
     */
    private long unmarshalUInt32(byte[] data) throws TikaException {
        return unmarshalInt32(data);
    }

    /**
     * Copies {@code count} bytes, starting at the cursor, into {@code dest}.
     *
     * @param data source bytes
     * @param dest destination array (filled from index 0)
     * @param count number of bytes to copy
     * @return {@code dest}
     */
    private byte[] unmarshalUuid(byte[] data, byte[] dest, int count) {
        System.arraycopy(data, this.getCurrentPlace(), dest, 0, count);
        advance(count);
        return dest;
    }

    /**
     * Returns how many bytes remained
     *
     * @return number of unread bytes
     */
    private int getDataRemained() {
        return dataRemained;
    }

    /**
     * Sets how many bytes remained
     *
     * @param dataRemained number of unread bytes
     */
    private void setDataRemained(int dataRemained) {
        this.dataRemained = dataRemained;
    }

    /**
     * Returns the read cursor
     *
     * @return current place
     */
    private int getCurrentPlace() {
        return currentPlace;
    }

    /**
     * Sets the read cursor
     *
     * @param currentPlace read cursor
     */
    private void setCurrentPlace(int currentPlace) {
        this.currentPlace = currentPlace;
    }

    /**
     * Returns the signature bytes of the header.
     *
     * @return itsp signature (the internal array, not a copy)
     */
    public byte[] getSignature() {
        return signature;
    }

    /**
     * Sets the itsp signature.
     *
     * @param signature signature bytes
     */
    protected void setSignature(byte[] signature) {
        this.signature = signature;
    }

    /**
     * Returns the version of the itsp header.
     *
     * @return version
     */
    public int getVersion() {
        return version;
    }

    /**
     * Sets the version of the itsp header.
     *
     * @param version version
     */
    protected void setVersion(int version) {
        this.version = version;
    }

    /**
     * Returns the header length.
     *
     * @return header length
     */
    public int getHeader_len() {
        return header_len;
    }

    /**
     * Sets the itsp header length.
     *
     * @param header_len header length
     */
    protected void setHeader_len(int header_len) {
        this.header_len = header_len;
    }

    /**
     * Returns the unknown value stored at offset 0x0c.
     *
     * @return unknown_000c
     */
    public int getUnknown_000c() {
        return unknown_000c;
    }

    /**
     * Sets the unknown value stored at offset 0x0c. "Unknown" means that the
     * people who reverse-engineered the chm format do not know its purpose.
     *
     * @param unknown_000c value
     */
    protected void setUnknown_000c(int unknown_000c) {
        this.unknown_000c = unknown_000c;
    }

    /**
     * Returns the block length.
     *
     * @return block_len
     */
    public long getBlock_len() {
        return block_len;
    }

    /**
     * Sets the block length.
     *
     * @param block_len block length
     */
    protected void setBlock_len(long block_len) {
        this.block_len = block_len;
    }

    /**
     * Returns the block index interval.
     *
     * @return blockidx_intvl
     */
    public int getBlockidx_intvl() {
        return blockidx_intvl;
    }

    /**
     * Sets the block index interval.
     *
     * @param blockidx_intvl block index interval
     */
    protected void setBlockidx_intvl(int blockidx_intvl) {
        this.blockidx_intvl = blockidx_intvl;
    }

    /**
     * Returns the index depth.
     *
     * @return index_depth
     */
    public int getIndex_depth() {
        return index_depth;
    }

    /**
     * Sets the index depth.
     *
     * @param index_depth index depth
     */
    protected void setIndex_depth(int index_depth) {
        this.index_depth = index_depth;
    }

    /**
     * Returns the index root.
     *
     * @return index_root
     */
    public int getIndex_root() {
        return index_root;
    }

    /**
     * Sets the index root.
     *
     * @param index_root index root
     */
    protected void setIndex_root(int index_root) {
        this.index_root = index_root;
    }

    /**
     * Returns the index head.
     *
     * @return index_head
     */
    public int getIndex_head() {
        return index_head;
    }

    /**
     * Sets the index head.
     *
     * @param index_head index head
     */
    protected void setIndex_head(int index_head) {
        this.index_head = index_head;
    }

    /**
     * Returns the unknown value stored at offset 0x24.
     *
     * @return unknown_0024
     */
    public int getUnknown_0024() {
        return unknown_0024;
    }

    /**
     * Sets the unknown value stored at offset 0x24.
     *
     * @param unknown_0024 value
     */
    protected void setUnknown_0024(int unknown_0024) {
        this.unknown_0024 = unknown_0024;
    }

    /**
     * Returns the number of blocks.
     *
     * @return num_blocks
     */
    public long getNum_blocks() {
        return num_blocks;
    }

    /**
     * Sets the number of blocks contained in the chm file.
     *
     * @param num_blocks number of blocks
     */
    protected void setNum_blocks(long num_blocks) {
        this.num_blocks = num_blocks;
    }

    /**
     * Returns the unknown value stored at offset 0x2c.
     *
     * @return unknown_002c
     */
    public int getUnknown_002c() {
        return unknown_002c;
    }

    /**
     * Sets the unknown value stored at offset 0x2c.
     *
     * @param unknown_002c value
     */
    protected void setUnknown_002c(int unknown_002c) {
        this.unknown_002c = unknown_002c;
    }

    /**
     * Returns the language id.
     *
     * @return lang_id
     */
    public long getLang_id() {
        return lang_id;
    }

    /**
     * Sets the language id.
     *
     * @param lang_id language id
     */
    protected void setLang_id(long lang_id) {
        this.lang_id = lang_id;
    }

    /**
     * Returns the system uuid.
     *
     * @return system_uuid (the internal array, not a copy)
     */
    public byte[] getSystem_uuid() {
        return system_uuid;
    }

    /**
     * Sets the system uuid.
     *
     * @param system_uuid system uuid bytes
     */
    protected void setSystem_uuid(byte[] system_uuid) {
        this.system_uuid = system_uuid;
    }

    /**
     * Returns the unknown bytes stored at offset 0x44.
     *
     * @return unknown_0044 (the internal array, not a copy)
     */
    public byte[] getUnknown_0044() {
        return unknown_0044;
    }

    /**
     * Sets the unknown bytes stored at offset 0x44.
     *
     * @param unknown_0044 value
     */
    protected void setUnknown_0044(byte[] unknown_0044) {
        this.unknown_0044 = unknown_0044;
    }

    /**
     * Decodes an ITSP header from {@code data} into {@code chmItspHeader} and
     * validates its signature, version and declared header length.
     *
     * @param data raw header bytes; must be exactly
     *            {@code CHM_ITSP_V1_LEN} bytes long
     * @param chmItspHeader header object that receives the decoded fields
     * @throws TikaException if the length, signature, version or declared
     *             header length is not acceptable
     */
    // @Override
    public void parse(byte[] data, ChmItspHeader chmItspHeader) throws TikaException {
        ChmAssert.assertByteArrayNotNull(data);
        /* only a header of exactly CHM_ITSP_V1_LEN bytes is accepted */
        if (data.length != ChmConstants.CHM_ITSP_V1_LEN)
            throw new ChmParsingException("we only know how to deal with the 0x58 and 0x60 byte structures");

        // start from the beginning even if this object was used for a parse before
        chmItspHeader.setCurrentPlace(0);
        chmItspHeader.setDataRemained(data.length);

        /* unmarshal the fields in on-disk order */
        chmItspHeader.unmarshalCharArray(data, chmItspHeader, ChmConstants.CHM_SIGNATURE_LEN);
        chmItspHeader.setVersion(chmItspHeader.unmarshalInt32(data));
        chmItspHeader.setHeader_len(chmItspHeader.unmarshalInt32(data));
        chmItspHeader.setUnknown_000c(chmItspHeader.unmarshalInt32(data));
        chmItspHeader.setBlock_len(chmItspHeader.unmarshalUInt32(data));
        chmItspHeader.setBlockidx_intvl(chmItspHeader.unmarshalInt32(data));
        chmItspHeader.setIndex_depth(chmItspHeader.unmarshalInt32(data));
        chmItspHeader.setIndex_root(chmItspHeader.unmarshalInt32(data));
        chmItspHeader.setIndex_head(chmItspHeader.unmarshalInt32(data));
        chmItspHeader.setUnknown_0024(chmItspHeader.unmarshalInt32(data));
        chmItspHeader.setNum_blocks(chmItspHeader.unmarshalUInt32(data));
        chmItspHeader.setUnknown_002c(chmItspHeader.unmarshalInt32(data));
        chmItspHeader.setLang_id(chmItspHeader.unmarshalUInt32(data));
        chmItspHeader.setSystem_uuid(chmItspHeader.unmarshalUuid(data,
                chmItspHeader.getSystem_uuid(), ChmConstants.BYTE_ARRAY_LENGHT));
        chmItspHeader.setUnknown_0044(chmItspHeader.unmarshalUuid(data,
                chmItspHeader.getUnknown_0044(), ChmConstants.BYTE_ARRAY_LENGHT));

        /* Checks validity of the itsp header */
        if (!new String(chmItspHeader.getSignature(), UTF_8).equals(ChmConstants.ITSP))
            throw new ChmParsingException("seems not valid signature");

        if (chmItspHeader.getVersion() != ChmConstants.CHM_VER_1)
            throw new ChmParsingException("!=ChmConstants.CHM_VER_1");

        if (chmItspHeader.getHeader_len() != ChmConstants.CHM_ITSP_V1_LEN)
            throw new ChmParsingException("!= ChmConstants.CHM_ITSP_V1_LEN");
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,319 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.chm.accessor; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.chm.assertion.ChmAssert; +import org.apache.tika.parser.chm.core.ChmConstants; +import org.apache.tika.parser.chm.exception.ChmParsingException; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * + * ::DataSpace/Storage/<SectionName>/ControlData This file contains $20 bytes of + * information on the compression. 
The information is partially known: 0000: + * DWORD 6 (unknown) 0004: ASCII 'LZXC' Compression type identifier 0008: DWORD + * 2 (Possibly numeric code for LZX) 000C: DWORD The Huffman reset interval in + * $8000-byte blocks 0010: DWORD The window size in $8000-byte blocks 0014: + * DWORD unknown (sometimes 2, sometimes 1, sometimes 0) 0018: DWORD 0 (unknown) + * 001C: DWORD 0 (unknown) + * + * {@link http + * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original + * /?page=2 } + * + */ +public class ChmLzxcControlData implements ChmAccessor<ChmLzxcControlData> { + private static final long serialVersionUID = -7897854774939631565L; + /* class' members */ + private long size; /* 0 */ + private byte[] signature; + private long version; /* 8 */ + private long resetInterval; /* c */ + private long windowSize; /* 10 */ + private long windowsPerReset; /* 14 */ + private long unknown_18; /* 18 */ + + /* local usage */ + private int dataRemained; + private int currentPlace = 0; + + public ChmLzxcControlData() { + signature = ChmConstants.LZXC.getBytes(UTF_8); /* + * 4 + * (LZXC + * ) + */ + } + + /** + * Returns a remained data + * + * @return dataRemained + */ + private int getDataRemained() { + return dataRemained; + } + + /** + * Sets a remained data + * + * @param dataRemained + */ + private void setDataRemained(int dataRemained) { + this.dataRemained = dataRemained; + } + + /** + * Returns a place holder + * + * @return current_place + */ + private int getCurrentPlace() { + return currentPlace; + } + + /** + * Sets a place holder + * + * @param current_place + */ + private void setCurrentPlace(int currentPlace) { + this.currentPlace = currentPlace; + } + + /** + * Returns a size of control data + * + * @return size + */ + public long getSize() { + return size; + } + + /** + * Sets a size of control data + * + * @param size + */ + protected void setSize(long size) { + this.size = size; + } + + /** + * Returns a signature of control data block + * + 
* @return signature + */ + public byte[] getSignature() { + return signature; + } + + /** + * Sets a signature of control data block + * + * @param signature + */ + protected void setSignature(byte[] signature) { + this.signature = signature; + } + + /** + * Returns a version of control data block + * + * @return version + */ + public long getVersion() { + return version; + } + + /** + * Sets version of control data block + * + * @param version + */ + protected void setVersion(long version) { + this.version = version; + } + + /** + * Returns reset interval + * + * @return reset_interval + */ + public long getResetInterval() { + return resetInterval; + } + + /** + * Sets a reset interval + * + * @param resetInterval + */ + protected void setResetInterval(long resetInterval) { + this.resetInterval = resetInterval; + } + + /** + * Returns a window size + * + * @return window_size + */ + public long getWindowSize() { + return windowSize; + } + + /** + * Sets a window size + * + * @param window_size + */ + protected void setWindowSize(long windowSize) { + this.windowSize = windowSize; + } + + /** + * Returns windows per reset + * + * @return + */ + public long getWindowsPerReset() { + return windowsPerReset; + } + + /** + * Sets windows per reset + * + * @param windows_per_reset + */ + protected void setWindowsPerReset(long windowsPerReset) { + this.windowsPerReset = windowsPerReset; + } + + /** + * Returns unknown 18 bytes + * + * @return unknown_18 + */ + public long getUnknown_18() { + return unknown_18; + } + + /** + * Sets unknown 18 bytes + * + * @param unknown_18 + */ + protected void setUnknown_18(long unknown_18) { + this.unknown_18 = unknown_18; + } + + private long unmarshalUInt32(byte[] data, long dest) throws ChmParsingException { + assert (data != null && data.length > 0); + if (4 > getDataRemained()) + throw new ChmParsingException("4 > dataLenght"); + dest = data[this.getCurrentPlace()] + | data[this.getCurrentPlace() + 1] << 8 + | 
data[this.getCurrentPlace() + 2] << 16 + | data[this.getCurrentPlace() + 3] << 24; + + setDataRemained(this.getDataRemained() - 4); + this.setCurrentPlace(this.getCurrentPlace() + 4); + return dest; + } + + private void unmarshalCharArray(byte[] data, + ChmLzxcControlData chmLzxcControlData, int count) throws TikaException { + ChmAssert.assertByteArrayNotNull(data); + ChmAssert.assertChmAccessorNotNull(chmLzxcControlData); + ChmAssert.assertPositiveInt(count); + System.arraycopy(data, 4, chmLzxcControlData.getSignature(), 0, count); + this.setCurrentPlace(this.getCurrentPlace() + count); + this.setDataRemained(this.getDataRemained() - count); + } + + /** + * Returns textual representation of ChmLzxcControlData + */ + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("size(unknown):=" + this.getSize() + ", "); + sb.append("signature(Compression type identifier):=" + + new String(this.getSignature(), UTF_8) + ", "); + sb.append("version(Possibly numeric code for LZX):=" + + this.getVersion() + System.getProperty("line.separator")); + sb.append("resetInterval(The Huffman reset interval):=" + + this.getResetInterval() + ", "); + sb.append("windowSize:=" + this.getWindowSize() + ", "); + sb.append("windowsPerReset(unknown (sometimes 2, sometimes 1, sometimes 0):=" + + this.getWindowsPerReset() + ", "); + sb.append("unknown_18:=" + this.getUnknown_18() + + System.getProperty("line.separator")); + return sb.toString(); + } + + // @Override + public void parse(byte[] data, ChmLzxcControlData chmLzxcControlData) throws TikaException { + if (data == null || (data.length < ChmConstants.CHM_LZXC_MIN_LEN)) + throw new ChmParsingException("we want at least 0x18 bytes"); + chmLzxcControlData.setDataRemained(data.length); + chmLzxcControlData.setSize(unmarshalUInt32(data, chmLzxcControlData.getSize())); + chmLzxcControlData.unmarshalCharArray(data, chmLzxcControlData, + ChmConstants.CHM_SIGNATURE_LEN); + 
chmLzxcControlData.setVersion(unmarshalUInt32(data, + chmLzxcControlData.getVersion())); + chmLzxcControlData.setResetInterval(unmarshalUInt32(data, + chmLzxcControlData.getResetInterval())); + chmLzxcControlData.setWindowSize(unmarshalUInt32(data, + chmLzxcControlData.getWindowSize())); + chmLzxcControlData.setWindowsPerReset(unmarshalUInt32(data, + chmLzxcControlData.getWindowsPerReset())); + + if (data.length >= ChmConstants.CHM_LZXC_V2_LEN) + chmLzxcControlData.setUnknown_18(unmarshalUInt32(data, + chmLzxcControlData.getUnknown_18())); + else + chmLzxcControlData.setUnknown_18(0); + + if (chmLzxcControlData.getVersion() == 2) { + chmLzxcControlData.setWindowSize(getWindowSize() + * ChmConstants.CHM_WINDOW_SIZE_BLOCK); + } + + if (chmLzxcControlData.getWindowSize() == 0 + || chmLzxcControlData.getResetInterval() == 0) + throw new ChmParsingException( + "window size / resetInterval should be more than zero"); + + if (chmLzxcControlData.getWindowSize() == 1) + throw new ChmParsingException( + "window size / resetInterval should be more than 1"); + + /* checks a signature */ + if (!new String(chmLzxcControlData.getSignature(), UTF_8) + .equals(ChmConstants.LZXC)) + throw new ChmParsingException( + "the signature does not seem to be correct"); + } + + /** + * @param args + */ + public static void main(String[] args) { + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java (added) +++ 
// tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,341 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.chm.accessor;

import java.math.BigInteger;
import java.util.Arrays;

import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.chm.assertion.ChmAssert;
import org.apache.tika.parser.chm.core.ChmConstants;
import org.apache.tika.parser.chm.exception.ChmParsingException;

/**
 * LZXC reset table, needed for decompression. Reads the block named
 * "::DataSpace/Storage/&lt;SectionName&gt;/Transform/{7FC28940-9D31-11D0-9B27-00A0C91E9C7C}/InstanceData/ResetTable".
 *
 * <p>
 * As read by {@link #parse(byte[], ChmLzxcResetTable)} the block consists of
 * four 32-bit fields (version, block count, unknown, table offset), three
 * 64-bit fields (uncompressed length, compressed length, block length) and
 * then a run of 64-bit block addresses. Every field is little-endian.
 *
 * @see <a href="http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?page=2">
 *      Microsoft's HTML Help (.chm) format (incomplete)</a>
 */
public class ChmLzxcResetTable implements ChmAccessor<ChmLzxcResetTable> {
    private static final long serialVersionUID = -8209574429411707460L;

    /** Upper bound on the number of block addresses that are extracted. */
    private static final int MAX_BLOCK_COUNT = 5000;
    /** Width in bytes of a DWORD field. */
    private static final int DWORD_LEN = 4;
    /** Width in bytes of a QWORD field (also the size of one block address). */
    private static final int QWORD_LEN = 8;

    /* class members */
    /** 0000: DWORD 2 unknown (possibly a version number). */
    private long version;
    /** 0004: DWORD Number of entries in reset table. */
    private long block_count;
    /** 0008: DWORD 8 unknown. */
    private long unknown;
    /** 000C: DWORD $28 Length of table header (area before table entries). */
    private long table_offset;
    /** 0010: QWORD Uncompressed Length. */
    private long uncompressed_len;
    /** 0018: QWORD Compressed Length. */
    private long compressed_len;
    /** 0020: QWORD 0x8000 block size for locations below. */
    private long block_len;
    /** Block addresses that follow the header. */
    private long[] block_address;

    /* local usage: read cursor over the byte array being parsed */
    private int dataRemained;
    private int currentPlace = 0;

    private int getDataRemained() {
        return dataRemained;
    }

    private void setDataRemained(int dataRemained) {
        this.dataRemained = dataRemained;
    }

    /**
     * Returns block addresses
     *
     * @return block addresses
     */
    public long[] getBlockAddress() {
        return block_address;
    }

    /**
     * Sets block addresses
     *
     * @param block_address
     *            block addresses
     */
    public void setBlockAddress(long[] block_address) {
        this.block_address = block_address;
    }

    private int getCurrentPlace() {
        return currentPlace;
    }

    private void setCurrentPlace(int currentPlace) {
        this.currentPlace = currentPlace;
    }

    /**
     * Returns every field as a {@code name:=value} pair, one per line, with the
     * block addresses last.
     */
    @Override
    public String toString() {
        String eol = System.getProperty("line.separator");
        StringBuilder sb = new StringBuilder();
        sb.append("version:=").append(getVersion()).append(eol);
        sb.append("block_count:=").append(getBlockCount()).append(eol);
        sb.append("unknown:=").append(getUnknown()).append(eol);
        sb.append("table_offset:=").append(getTableOffset()).append(eol);
        sb.append("uncompressed_len:=").append(getUncompressedLen()).append(eol);
        sb.append("compressed_len:=").append(getCompressedLen()).append(eol);
        sb.append("block_len:=").append(getBlockLen()).append(eol);
        sb.append("block_addresses:=").append(Arrays.toString(getBlockAddress()));
        return sb.toString();
    }

    /**
     * Enumerates chm block addresses.
     *
     * <p>
     * The block count is first normalised: values above
     * {@value #MAX_BLOCK_COUNT} are clamped to that limit, and a negative
     * count (see {@link #unmarshalUInt32(byte[], long)}) is replaced by the
     * number of whole 8-byte entries still unread, which may be zero. At most
     * that many entries are then read, so neither the cap nor trailing bytes
     * after the table can overrun the result array.
     *
     * @param data
     *            bytes of the reset table block
     *
     * @return array of block addresses, of length equal to the normalised
     *         block count; entries beyond the available data stay 0
     * @throws TikaException
     *             if {@code data} is rejected by the assertion or an entry
     *             cannot be read
     */
    private long[] enumerateBlockAddresses(byte[] data) throws TikaException {
        ChmAssert.assertByteArrayNotNull(data);
        int available = Math.max(getDataRemained() / QWORD_LEN, 0);

        /* we have limit of number of blocks to be extracted */
        if (getBlockCount() > MAX_BLOCK_COUNT) {
            setBlockCount(MAX_BLOCK_COUNT);
        } else if (getBlockCount() < 0) {
            // Never leave the count negative: it is used as an array size below.
            setBlockCount(available);
        }

        long[] addresses = new long[(int) getBlockCount()];
        int entries = Math.min(available, addresses.length);
        for (int i = 0; i < entries; i++) {
            addresses[i] = unmarshalUint64(data, -1L);
        }
        return addresses;
    }

    /**
     * Validates parameters such as byte[] and chm lzxc reset table
     *
     * @param data
     *            bytes to be parsed
     * @param chmLzxcResetTable
     *            object to be populated
     *
     * @return {@code true} when both assertions pass
     * @throws TikaException
     *             raised by the assertions when a parameter is rejected
     */
    private boolean validateParamaters(byte[] data,
            ChmLzxcResetTable chmLzxcResetTable) throws TikaException {
        ChmAssert.assertByteArrayNotNull(data);
        ChmAssert.assertChmAccessorNotNull(chmLzxcResetTable);
        return true;
    }

    /**
     * Reads a little-endian 32-bit field at the current place and advances the
     * cursor by four bytes.
     *
     * <p>
     * The value is assembled in {@code int} arithmetic before being widened,
     * so a field with its top bit set is returned as a negative number;
     * {@link #enumerateBlockAddresses(byte[])} checks for a negative block
     * count.
     *
     * @param data
     *            bytes being parsed
     * @param dest
     *            ignored on input; kept for signature compatibility
     * @return the value read
     * @throws TikaException
     *             if fewer than four bytes are left at the current place
     */
    private long unmarshalUInt32(byte[] data, long dest) throws TikaException {
        ChmAssert.assertByteArrayNotNull(data);
        int pos = getCurrentPlace();
        if (pos < 0 || data.length - pos < DWORD_LEN) {
            throw new TikaException("data is too small to read a 32-bit field");
        }
        dest = (data[pos] & 0xff)
                | ((data[pos + 1] & 0xff) << 8)
                | ((data[pos + 2] & 0xff) << 16)
                | ((data[pos + 3] & 0xff) << 24);

        setDataRemained(getDataRemained() - DWORD_LEN);
        setCurrentPlace(pos + DWORD_LEN);
        return dest;
    }

    /**
     * Reads a little-endian 64-bit field at the current place and advances the
     * cursor by eight bytes. The cursor is left untouched when the read fails.
     *
     * @param data
     *            bytes being parsed
     * @param dest
     *            ignored on input; kept for signature compatibility
     * @return the value read
     * @throws TikaException
     *             if fewer than eight bytes are left at the current place
     */
    private long unmarshalUint64(byte[] data, long dest) throws TikaException {
        ChmAssert.assertByteArrayNotNull(data);
        int pos = getCurrentPlace();
        if (pos < 0 || data.length - pos < QWORD_LEN) {
            throw new TikaException("data is too small to calculate address block");
        }

        // BigInteger expects big-endian input, so reverse the byte order.
        byte[] bigEndian = new byte[QWORD_LEN];
        for (int i = 0; i < QWORD_LEN; i++) {
            bigEndian[QWORD_LEN - 1 - i] = data[pos + i];
        }
        dest = new BigInteger(bigEndian).longValue();

        setCurrentPlace(pos + QWORD_LEN);
        setDataRemained(getDataRemained() - QWORD_LEN);
        return dest;
    }

    /**
     * Returns the version
     *
     * @return - long
     */
    public long getVersion() {
        return version;
    }

    /**
     * Sets the version
     *
     * @param version
     *            - long
     */
    public void setVersion(long version) {
        this.version = version;
    }

    /**
     * Gets a block count
     *
     * @return - long
     */
    public long getBlockCount() {
        return block_count;
    }

    /**
     * Sets a block count
     *
     * @param block_count
     *            - long
     */
    public void setBlockCount(long block_count) {
        this.block_count = block_count;
    }

    /**
     * Gets unknown
     *
     * @return - long
     */
    public long getUnknown() {
        return unknown;
    }

    /**
     * Sets an unknown
     *
     * @param unknown
     *            - long
     */
    public void setUnknown(long unknown) {
        this.unknown = unknown;
    }

    /**
     * Gets a table offset
     *
     * @return - long
     */
    public long getTableOffset() {
        return table_offset;
    }

    /**
     * Sets a table offset
     *
     * @param table_offset
     *            - long
     */
    public void setTableOffset(long table_offset) {
        this.table_offset = table_offset;
    }

    /**
     * Gets uncompressed length
     *
     * @return - long
     */
    public long getUncompressedLen() {
        return uncompressed_len;
    }

    /**
     * Sets uncompressed length
     *
     * @param uncompressed_len
     *            - long
     */
    public void setUncompressedLen(long uncompressed_len) {
        this.uncompressed_len = uncompressed_len;
    }

    /**
     * Gets compressed length
     *
     * @return - long
     */
    public long getCompressedLen() {
        return compressed_len;
    }

    /**
     * Sets compressed length
     *
     * @param compressed_len
     *            - long
     */
    public void setCompressedLen(long compressed_len) {
        this.compressed_len = compressed_len;
    }

    /**
     * Gets a block length
     *
     * @return - long
     */
    public long getBlockLen() {
        return block_len;
    }

    /**
     * Sets a block length
     *
     * @param block_len
     *            - long
     */
    public void setBlockLen(long block_len) {
        this.block_len = block_len;
    }

    /**
     * Sets a block length. Misspelled legacy name, kept so existing callers
     * keep compiling; behaves exactly like {@link #setBlockLen(long)}.
     *
     * @param block_len
     *            - long
     */
    public void setBlockLlen(long block_len) {
        setBlockLen(block_len);
    }

    /**
     * Parses a reset table block into {@code chmLzxcResetTable}.
     *
     * <p>
     * The read cursor and the block count used while enumerating addresses
     * live on this instance, whereas the parsed values are stored through the
     * setters of {@code chmLzxcResetTable}; the two only agree when the
     * argument is this instance. The cursor is reset on every call, so an
     * instance can be parsed more than once.
     *
     * @param data
     *            bytes of the reset table block
     * @param chmLzxcResetTable
     *            object to be populated
     * @throws TikaException
     *             if a parameter is rejected, the data is too short for the
     *             header fields, or the version is not
     *             {@code ChmConstants.CHM_VER_2}
     */
    // @Override
    public void parse(byte[] data, ChmLzxcResetTable chmLzxcResetTable) throws TikaException {
        // Validate before touching data.length so null input fails in the assertion.
        validateParamaters(data, chmLzxcResetTable);
        setCurrentPlace(0);
        setDataRemained(data.length);

        /* unmarshal fields */
        chmLzxcResetTable.setVersion(unmarshalUInt32(data, chmLzxcResetTable.getVersion()));
        chmLzxcResetTable.setBlockCount(unmarshalUInt32(data, chmLzxcResetTable.getBlockCount()));
        chmLzxcResetTable.setUnknown(unmarshalUInt32(data, chmLzxcResetTable.getUnknown()));
        chmLzxcResetTable.setTableOffset(unmarshalUInt32(data, chmLzxcResetTable.getTableOffset()));
        chmLzxcResetTable.setUncompressedLen(unmarshalUint64(data, chmLzxcResetTable.getUncompressedLen()));
        chmLzxcResetTable.setCompressedLen(unmarshalUint64(data, chmLzxcResetTable.getCompressedLen()));
        chmLzxcResetTable.setBlockLlen(unmarshalUint64(data, chmLzxcResetTable.getBlockLen()));
        chmLzxcResetTable.setBlockAddress(enumerateBlockAddresses(data));

        /* checks chmLzxcResetTable */
        if (chmLzxcResetTable.getVersion() != ChmConstants.CHM_VER_2) {
            throw new ChmParsingException(
                    "does not seem currect version of chmLzxcResetTable");
        }
    }
}
