Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,1430 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.rtf; + +import java.io.IOException; +import java.io.InputStream; +import java.io.PushbackInputStream; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CoderResult; +import java.nio.charset.CodingErrorAction; +import java.util.Calendar; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.Locale; +import java.util.Map; +import java.util.TimeZone; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; +import org.apache.tika.metadata.OfficeOpenXMLCore; +import org.apache.tika.metadata.OfficeOpenXMLExtended; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.CharsetUtils; +import org.xml.sax.SAXException; + +/* Tokenizes and performs a "shallow" parse of the RTF + * document, just enough to properly decode the text. + * + * TODO: we should cutover to a "real" tokenizer (eg JFlex); + * it should give better perf, by replacing the excessive + * "else if" string compares with FSA traversal. 
*/ + +final class TextExtractor { + + private static final Charset ASCII = Charset.forName("US-ASCII"); + private static final Charset WINDOWS_1252 = getCharset("WINDOWS-1252"); + private static final Charset MAC_ROMAN = getCharset("MacRoman"); + private static final Charset SHIFT_JIS = getCharset("Shift_JIS"); + private static final Charset WINDOWS_57011 = getCharset("windows-57011"); + private static final Charset WINDOWS_57010 = getCharset("windows-57010"); + private static final Charset WINDOWS_57009 = getCharset("windows-57009"); + private static final Charset WINDOWS_57008 = getCharset("windows-57008"); + private static final Charset WINDOWS_57007 = getCharset("windows-57007"); + private static final Charset WINDOWS_57006 = getCharset("windows-57006"); + private static final Charset WINDOWS_57005 = getCharset("windows-57005"); + private static final Charset WINDOWS_57004 = getCharset("windows-57004"); + private static final Charset WINDOWS_57003 = getCharset("windows-57003"); + private static final Charset X_ISCII91 = getCharset("x-ISCII91"); + private static final Charset X_MAC_CENTRAL_EUROPE = getCharset("x-MacCentralEurope"); + private static final Charset MAC_CYRILLIC = getCharset("MacCyrillic"); + private static final Charset X_JOHAB = getCharset("x-Johab"); + private static final Charset CP12582 = getCharset("CP1258"); + private static final Charset CP12572 = getCharset("CP1257"); + private static final Charset CP12562 = getCharset("CP1256"); + private static final Charset CP12552 = getCharset("CP1255"); + private static final Charset CP12542 = getCharset("CP1254"); + private static final Charset CP12532 = getCharset("CP1253"); + private static final Charset CP1252 = getCharset("CP1252"); + private static final Charset CP12512 = getCharset("CP1251"); + private static final Charset CP12502 = getCharset("CP1250"); + private static final Charset CP950 = getCharset("CP950"); + private static final Charset CP949 = getCharset("CP949"); + private static final 
Charset MS9362 = getCharset("MS936"); + private static final Charset MS8742 = getCharset("MS874"); + private static final Charset CP866 = getCharset("CP866"); + private static final Charset CP865 = getCharset("CP865"); + private static final Charset CP864 = getCharset("CP864"); + private static final Charset CP863 = getCharset("CP863"); + private static final Charset CP862 = getCharset("CP862"); + private static final Charset CP860 = getCharset("CP860"); + private static final Charset CP852 = getCharset("CP852"); + private static final Charset CP8502 = getCharset("CP850"); + private static final Charset CP819 = getCharset("CP819"); + private static final Charset WINDOWS_720 = getCharset("windows-720"); + private static final Charset WINDOWS_711 = getCharset("windows-711"); + private static final Charset WINDOWS_710 = getCharset("windows-710"); + private static final Charset WINDOWS_709 = getCharset("windows-709"); + private static final Charset ISO_8859_6 = getCharset("ISO-8859-6"); + private static final Charset CP4372 = getCharset("CP437"); + private static final Charset CP850 = getCharset("cp850"); + private static final Charset CP437 = getCharset("cp437"); + private static final Charset MS874 = getCharset("ms874"); + private static final Charset CP1257 = getCharset("cp1257"); + private static final Charset CP1256 = getCharset("cp1256"); + private static final Charset CP1255 = getCharset("cp1255"); + private static final Charset CP1258 = getCharset("cp1258"); + private static final Charset CP1254 = getCharset("cp1254"); + private static final Charset CP1253 = getCharset("cp1253"); + private static final Charset MS950 = getCharset("ms950"); + private static final Charset MS936 = getCharset("ms936"); + private static final Charset MS1361 = getCharset("ms1361"); + private static final Charset MS932 = getCharset("MS932"); + private static final Charset CP1251 = getCharset("cp1251"); + private static final Charset CP1250 = getCharset("cp1250"); + private static final 
Charset MAC_THAI = getCharset("MacThai"); + private static final Charset MAC_TURKISH = getCharset("MacTurkish"); + private static final Charset MAC_GREEK = getCharset("MacGreek"); + private static final Charset MAC_ARABIC = getCharset("MacArabic"); + private static final Charset MAC_HEBREW = getCharset("MacHebrew"); + private static final Charset JOHAB = getCharset("johab"); + private static final Charset BIG5 = getCharset("Big5"); + private static final Charset GB2312 = getCharset("GB2312"); + private static final Charset MS949 = getCharset("ms949"); + // The RTF doc has a "font table" that assigns ords + // (f0, f1, f2, etc.) to fonts and charsets, using the + // \fcharsetN control word. This mapping maps from the + // N to corresponding Java charset: + private static final Map<Integer, Charset> FCHARSET_MAP = + new HashMap<Integer, Charset>(); + // The RTF may specify the \ansicpgN charset in the + // header; this maps the N to the corresponding Java + // character set: + private static final Map<Integer, Charset> ANSICPG_MAP = + new HashMap<Integer, Charset>(); + + static { + FCHARSET_MAP.put(0, WINDOWS_1252); // ANSI + // charset 1 is Default + // charset 2 is Symbol + + FCHARSET_MAP.put(77, MAC_ROMAN); // Mac Roman + FCHARSET_MAP.put(78, SHIFT_JIS); // Mac Shift Jis + FCHARSET_MAP.put(79, MS949); // Mac Hangul + FCHARSET_MAP.put(80, GB2312); // Mac GB2312 + FCHARSET_MAP.put(81, BIG5); // Mac Big5 + FCHARSET_MAP.put(82, JOHAB); // Mac Johab (old) + FCHARSET_MAP.put(83, MAC_HEBREW); // Mac Hebrew + FCHARSET_MAP.put(84, MAC_ARABIC); // Mac Arabic + FCHARSET_MAP.put(85, MAC_GREEK); // Mac Greek + FCHARSET_MAP.put(86, MAC_TURKISH); // Mac Turkish + FCHARSET_MAP.put(87, MAC_THAI); // Mac Thai + FCHARSET_MAP.put(88, CP1250); // Mac East Europe + FCHARSET_MAP.put(89, CP1251); // Mac Russian + + FCHARSET_MAP.put(128, MS932); // Shift JIS + FCHARSET_MAP.put(129, MS949); // Hangul + FCHARSET_MAP.put(130, MS1361); // Johab + FCHARSET_MAP.put(134, MS936); // GB2312 + 
FCHARSET_MAP.put(136, MS950); // Big5 + FCHARSET_MAP.put(161, CP1253); // Greek + FCHARSET_MAP.put(162, CP1254); // Turkish + FCHARSET_MAP.put(163, CP1258); // Vietnamese + FCHARSET_MAP.put(177, CP1255); // Hebrew + FCHARSET_MAP.put(178, CP1256); // Arabic + // FCHARSET_MAP.put( 179, "" ); // Arabic Traditional + // FCHARSET_MAP.put( 180, "" ); // Arabic user + // FCHARSET_MAP.put( 181, "" ); // Hebrew user + FCHARSET_MAP.put(186, CP1257); // Baltic + + FCHARSET_MAP.put(204, CP1251); // Russian + FCHARSET_MAP.put(222, MS874); // Thai + FCHARSET_MAP.put(238, CP1250); // Eastern European + FCHARSET_MAP.put(254, CP437); // PC 437 + FCHARSET_MAP.put(255, CP850); // OEM + } + + static { + ANSICPG_MAP.put(437, CP4372); // US IBM + ANSICPG_MAP.put(708, ISO_8859_6); // Arabic (ASMO 708) + + ANSICPG_MAP.put(709, WINDOWS_709); // Arabic (ASMO 449+, BCON V4) + ANSICPG_MAP.put(710, WINDOWS_710); // Arabic (transparent Arabic) + ANSICPG_MAP.put(710, WINDOWS_711); // Arabic (Nafitha Enhanced) + ANSICPG_MAP.put(710, WINDOWS_720); // Arabic (transparent ASMO) + ANSICPG_MAP.put(819, CP819); // Windows 3.1 (US & Western Europe) + ANSICPG_MAP.put(819, CP819); // Windows 3.1 (US & Western Europe) + + ANSICPG_MAP.put(819, CP819); // Windows 3.1 (US & Western Europe) + ANSICPG_MAP.put(850, CP8502); // IBM Multilingual + ANSICPG_MAP.put(852, CP852); // Eastern European + ANSICPG_MAP.put(860, CP860); // Portuguese + ANSICPG_MAP.put(862, CP862); // Hebrew + ANSICPG_MAP.put(863, CP863); // French Canadian + ANSICPG_MAP.put(864, CP864); // Arabic + ANSICPG_MAP.put(865, CP865); // Norwegian + ANSICPG_MAP.put(866, CP866); // Soviet Union + ANSICPG_MAP.put(874, MS8742); // Thai + ANSICPG_MAP.put(932, MS932); // Japanese + ANSICPG_MAP.put(936, MS9362); // Simplified Chinese + ANSICPG_MAP.put(949, CP949); // Korean + ANSICPG_MAP.put(950, CP950); // Traditional Chinese + ANSICPG_MAP.put(1250, CP12502); // Eastern European + ANSICPG_MAP.put(1251, CP12512); // Cyrillic + ANSICPG_MAP.put(1252, 
CP1252); // Western European + ANSICPG_MAP.put(1253, CP12532); // Greek + ANSICPG_MAP.put(1254, CP12542); // Turkish + ANSICPG_MAP.put(1255, CP12552); // Hebrew + ANSICPG_MAP.put(1256, CP12562); // Arabic + ANSICPG_MAP.put(1257, CP12572); // Baltic + ANSICPG_MAP.put(1258, CP12582); // Vietnamese + ANSICPG_MAP.put(1361, X_JOHAB); // Johab + ANSICPG_MAP.put(10000, MAC_ROMAN); // Mac Roman + ANSICPG_MAP.put(10001, SHIFT_JIS); // Mac Japan + ANSICPG_MAP.put(10004, MAC_ARABIC); // Mac Arabic + ANSICPG_MAP.put(10005, MAC_HEBREW); // Mac Hebrew + ANSICPG_MAP.put(10006, MAC_GREEK); // Mac Hebrew + ANSICPG_MAP.put(10007, MAC_CYRILLIC); // Mac Cyrillic + ANSICPG_MAP.put(10029, X_MAC_CENTRAL_EUROPE); // MAC Latin2 + ANSICPG_MAP.put(10081, MAC_TURKISH); // Mac Turkish + ANSICPG_MAP.put(57002, X_ISCII91); // Devanagari + + // TODO: in theory these other charsets are simple + // shifts off of Devanagari, so we could impl that + // here: + ANSICPG_MAP.put(57003, WINDOWS_57003); // Bengali + ANSICPG_MAP.put(57004, WINDOWS_57004); // Tamil + ANSICPG_MAP.put(57005, WINDOWS_57005); // Telugu + ANSICPG_MAP.put(57006, WINDOWS_57006); // Assamese + ANSICPG_MAP.put(57007, WINDOWS_57007); // Oriya + ANSICPG_MAP.put(57008, WINDOWS_57008); // Kannada + ANSICPG_MAP.put(57009, WINDOWS_57009); // Malayalam + ANSICPG_MAP.put(57010, WINDOWS_57010); // Gujariti + ANSICPG_MAP.put(57011, WINDOWS_57011); // Punjabi + } + + // Used when we decode bytes -> chars using CharsetDecoder: + private final char[] outputArray = new char[128]; + private final CharBuffer outputBuffer = CharBuffer.wrap(outputArray); + // Holds the font table from this RTF doc, mapping + // the font number (from \fN control word) to the + // corresponding charset: + private final Map<Integer, Charset> fontToCharset = + new HashMap<Integer, Charset>(); + // Group stack: when we open a new group, we push + // the previous group state onto the stack; when we + // close the group, we restore it + private final LinkedList<GroupState> 
groupStates = new LinkedList<GroupState>();
    // Text collected while in the header or inside a field instruction
    // (see addOutputChar / pushBytes) rather than sent to the handler:
    private final StringBuilder pendingBuffer = new StringBuilder();
    private final XHTMLContentHandler out;
    private final Metadata metadata;
    private final RTFEmbObjHandler embObjHandler;
    // How many next ansi chars we should skip; this
    // is 0 except when we are still in the "ansi
    // shadow" after seeing a unicode escape, at which
    // point it's set to the last ucN skip we had seen:
    int ansiSkip = 0;
    private int written = 0;
    // Hold pending bytes (encoded in the current charset)
    // for text output:
    private byte[] pendingBytes = new byte[16];
    private int pendingByteCount;
    private ByteBuffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
    // Holds pending chars for text output
    private char[] pendingChars = new char[10];
    private int pendingCharCount;
    // Holds chars for a still-being-tokenized control word
    private byte[] pendingControl = new byte[10];
    private int pendingControlCount;
    // Reused when possible:
    private CharsetDecoder decoder;
    private Charset lastCharset;
    private Charset globalCharset = WINDOWS_1252;
    private int globalDefaultFont = -1;
    private int curFontID = -1;
    // Current group state; in theory this initial
    // GroupState is unused because the RTF doc should
    // immediately open the top group (start with {):
    private GroupState groupState = new GroupState();
    private boolean inHeader = true;
    private int fontTableState;
    private int fontTableDepth;
    // Non null if we are processing metadata (title,
    // keywords, etc.) inside the info group:
    private Property nextMetaData;
    private boolean inParagraph;
    // Non-zero if we are processing inside a field destination:
    private int fieldState;
    // Non-zero list index
    private int pendingListEnd;
    private Map<Integer, ListDescriptor> listTable = new HashMap<Integer, ListDescriptor>();
    private Map<Integer, ListDescriptor> listOverrideTable = new HashMap<Integer, ListDescriptor>();
    private Map<Integer, ListDescriptor> currentListTable;
    private ListDescriptor currentList;
    private int listTableLevel = -1;
    private boolean ignoreLists;
    // Non-null if we've seen the url for a HYPERLINK but not yet
    // its text:
    private String pendingURL;
    // Used to process the sub-groups inside the upr
    // group:
    private int uprState = -1;
    // Used when extracting CREATION date:
    private int year, month, day, hour, minute;

    /**
     * Creates an extractor that writes to the given sinks.
     *
     * @param out           handler that receives the extracted XHTML events
     * @param metadata      receives the document metadata found while parsing
     * @param embObjHandler receives embedded object / picture data
     */
    public TextExtractor(XHTMLContentHandler out, Metadata metadata,
                         RTFEmbObjHandler embObjHandler) {
        this.metadata = metadata;
        this.out = out;
        this.embObjHandler = embObjHandler;
    }

    /**
     * Looks up a charset by name via {@code CharsetUtils.forName}.
     *
     * @param name charset name to resolve
     * @return the resolved charset, or US-ASCII when the lookup throws
     */
    private static Charset getCharset(String name) {
        try {
            return CharsetUtils.forName(name);
        } catch (Exception e) {
            // Deliberate best effort: this runs from static initialisers, so
            // an unavailable charset must not prevent the class from loading.
            return ASCII;
        }
    }

    /**
     * @param ch a byte value, or -1 for end of stream
     * @return true if {@code ch} is one of 0-9, a-f or A-F
     */
    protected static boolean isHexChar(int ch) {
        return (ch >= '0' && ch <= '9') ||
                (ch >= 'a' && ch <= 'f') ||
                (ch >= 'A' && ch <= 'F');
    }

    /**
     * @param ch a byte value, or -1 for end of stream
     * @return true if {@code ch} is an ASCII letter (a-z or A-Z)
     */
    private static boolean isAlpha(int ch) {
        return (ch >= 'a' && ch <= 'z') ||
                (ch >= 'A' && ch <= 'Z');
    }

    /**
     * @param ch a byte value, or -1 for end of stream
     * @return true if {@code ch} is an ASCII digit (0-9)
     */
    private static boolean isDigit(int ch) {
        return ch >= '0' && ch <= '9';
    }

    /**
     * Converts a single hex digit to its numeric value.
     * <p>
     * Precondition: {@link #isHexChar(int)} is true for {@code ch}.  The
     * letter ranges are a-f / A-F so that they agree with isHexChar; the
     * previous a-z / A-Z ranges let the assertion pass for letters that are
     * not hex digits.
     *
     * @param ch a hex digit (0-9, a-f or A-F)
     * @return the value of the digit, 0..15
     */
    protected static int hexValue(int ch) {
        if (ch >= '0' && ch <= '9') {
            return ch - '0';
        } else if (ch >= 'a' && ch <= 'f') {
            return 10 + (ch - 'a');
        } else {
            assert ch >= 'A' && ch <= 'F';
            return 10 + (ch - 'A');
        }
    }

    /**
     * @return true if list structure is being ignored (see {@link #setIgnoreLists(boolean)})
     */
    public boolean isIgnoringLists() {
        return ignoreLists;
    }

    /**
     * @param ignore true to suppress list handling; inList(), startList() and
     *               endList() all check this flag
     */
    public void setIgnoreLists(boolean ignore) {
        this.ignoreLists = ignore;
    }

    // Push pending bytes or pending chars:
    private void
pushText() throws IOException, SAXException, TikaException { + if (pendingByteCount != 0) { + assert pendingCharCount == 0; + pushBytes(); + } else { + pushChars(); + } + } + + // Buffers the byte (unit in the current charset) for + // output: + private void addOutputByte(int b) throws IOException, SAXException, TikaException { + assert b >= 0 && b < 256 : "byte value out of range: " + b; + + if (pendingCharCount != 0) { + pushChars(); + } + if (groupState.pictDepth > 0) { + embObjHandler.writeMetadataChar((char) b); + } else { + // Save the byte in pending buffer: + if (pendingByteCount == pendingBytes.length) { + // Gradual but exponential growth: + final byte[] newArray = new byte[(int) (pendingBytes.length * 1.25)]; + System.arraycopy(pendingBytes, 0, newArray, 0, pendingBytes.length); + pendingBytes = newArray; + pendingByteBuffer = ByteBuffer.wrap(pendingBytes); + } + pendingBytes[pendingByteCount++] = (byte) b; + } + } + + // Buffers a byte as part of a control word: + private void addControl(int b) { + assert isAlpha(b); + // Save the byte in pending buffer: + if (pendingControlCount == pendingControl.length) { + // Gradual but exponential growth: + final byte[] newArray = new byte[(int) (pendingControl.length * 1.25)]; + System.arraycopy(pendingControl, 0, newArray, 0, pendingControl.length); + pendingControl = newArray; + } + pendingControl[pendingControlCount++] = (byte) b; + } + + // Buffers a UTF16 code unit for output + private void addOutputChar(char ch) throws IOException, SAXException, TikaException { + if (pendingByteCount != 0) { + pushBytes(); + } + + if (inHeader || fieldState == 1) { + pendingBuffer.append(ch); + } else if (groupState.sn == true || groupState.sv == true) { + embObjHandler.writeMetadataChar(ch); + } else { + if (pendingCharCount == pendingChars.length) { + // Gradual but exponential growth: + final char[] newArray = new char[(int) (pendingChars.length * 1.25)]; + System.arraycopy(pendingChars, 0, newArray, 0, 
pendingChars.length); + pendingChars = newArray; + } + pendingChars[pendingCharCount++] = ch; + } + } + + // Shallow parses the entire doc, writing output to + // this.out and this.metadata + public void extract(InputStream in) throws IOException, SAXException, TikaException { +// in = new FilterInputStream(in) { +// public int read() throws IOException { +// int r = super.read(); +// System.out.write(r); +// System.out.flush(); +// return r; +// } +// public int read(byte b[], int off, int len) throws IOException { +// int r = super.read(b, off, len); +// System.out.write(b, off, r); +// System.out.flush(); +// return r; +// } +// }; + extract(new PushbackInputStream(in, 2)); + } + + private void extract(PushbackInputStream in) throws IOException, SAXException, TikaException { + out.startDocument(); + + while (true) { + final int b = in.read(); + if (b == -1) { + break; + } else if (b == '\\') { + parseControlToken(in); + } else if (b == '{') { + pushText(); + processGroupStart(in); + } else if (b == '}') { + pushText(); + processGroupEnd(); + if (groupStates.isEmpty()) { + // parsed document closing brace + break; + } + } else if (groupState.objdata == true || + groupState.pictDepth == 1) { + embObjHandler.writeHexChar(b); + } else if (b != '\r' && b != '\n' + && (!groupState.ignore || nextMetaData != null || + groupState.sn == true || groupState.sv == true)) { + // Linefeed and carriage return are not + // significant + if (ansiSkip != 0) { + ansiSkip--; + } else { + addOutputByte(b); + } + } + } + + endParagraph(false); + out.endDocument(); + } + + private void parseControlToken(PushbackInputStream in) throws IOException, SAXException, TikaException { + int b = in.read(); + if (b == '\'') { + // escaped hex char + parseHexChar(in); + } else if (isAlpha(b)) { + // control word + parseControlWord((char) b, in); + } else if (b == '{' || b == '}' || b == '\\' || b == '\r' || b == '\n') { + // escaped char + addOutputByte(b); + } else if (b != -1) { + // control 
symbol, eg \* or \~ + processControlSymbol((char) b); + } + } + + private void parseHexChar(PushbackInputStream in) throws IOException, SAXException, TikaException { + int hex1 = in.read(); + if (!isHexChar(hex1)) { + // DOC ERROR (malformed hex escape): ignore + in.unread(hex1); + return; + } + + int hex2 = in.read(); + if (!isHexChar(hex2)) { + // TODO: log a warning here, somehow? + // DOC ERROR (malformed hex escape): + // ignore + in.unread(hex2); + return; + } + + if (ansiSkip != 0) { + // Skip this ansi char since we are + // still in the shadow of a unicode + // escape: + ansiSkip--; + } else { + // Unescape: + addOutputByte(16 * hexValue(hex1) + hexValue(hex2)); + } + } + + private void parseControlWord(int firstChar, PushbackInputStream in) throws IOException, SAXException, TikaException { + addControl(firstChar); + + int b = in.read(); + while (isAlpha(b)) { + addControl(b); + b = in.read(); + } + + boolean hasParam = false; + boolean negParam = false; + if (b == '-') { + negParam = true; + hasParam = true; + b = in.read(); + } + + int param = 0; + while (isDigit(b)) { + param *= 10; + param += (b - '0'); + hasParam = true; + b = in.read(); + } + + // space is consumed as part of the + // control word, but is not added to the + // control word + if (b != ' ') { + in.unread(b); + } + + if (hasParam) { + if (negParam) { + param = -param; + } + processControlWord(param, in); + } else { + processControlWord(); + } + + pendingControlCount = 0; + } + + private void lazyStartParagraph() throws IOException, SAXException, TikaException { + if (!inParagraph) { + // Ensure </i></b> order + if (groupState.italic) { + end("i"); + } + if (groupState.bold) { + end("b"); + } + if (pendingListEnd != 0 && groupState.list != pendingListEnd) { + endList(pendingListEnd); + pendingListEnd = 0; + } + if (inList() && pendingListEnd != groupState.list) { + startList(groupState.list); + } + if (inList()) { + out.startElement("li"); + } else { + out.startElement("p"); + } + + // 
Ensure <b><i> order + if (groupState.bold) { + start("b"); + } + if (groupState.italic) { + start("i"); + } + inParagraph = true; + } + } + + private void endParagraph(boolean preserveStyles) throws IOException, SAXException, TikaException { + pushText(); + //maintain consecutive new lines + if (!inParagraph) { + lazyStartParagraph(); + } + if (inParagraph) { + if (groupState.italic) { + end("i"); + groupState.italic = preserveStyles; + } + if (groupState.bold) { + end("b"); + groupState.bold = preserveStyles; + } + if (inList()) { + out.endElement("li"); + } else { + out.endElement("p"); + } + + if (preserveStyles && (groupState.bold || groupState.italic)) { + start("p"); + if (groupState.bold) { + start("b"); + } + if (groupState.italic) { + start("i"); + } + inParagraph = true; + } else { + inParagraph = false; + } + } + + // Ensure closing the list at document end + if (!preserveStyles && pendingListEnd != 0) { + endList(pendingListEnd); + pendingListEnd = 0; + } + } + + // Push pending UTF16 units to out ContentHandler + private void pushChars() throws IOException, SAXException, TikaException { + if (pendingCharCount != 0) { + lazyStartParagraph(); + out.characters(pendingChars, 0, pendingCharCount); + pendingCharCount = 0; + } + } + + // Decodes the buffered bytes in pendingBytes + // into UTF16 code units, and sends the characters + // to the out ContentHandler, if we are in the body, + // else appends the characters to the pendingBuffer + private void pushBytes() throws IOException, SAXException, TikaException { + if (pendingByteCount > 0 && (!groupState.ignore || nextMetaData != null)) { + + final CharsetDecoder decoder = getDecoder(); + pendingByteBuffer.limit(pendingByteCount); + assert pendingByteBuffer.position() == 0; + assert outputBuffer.position() == 0; + + while (true) { + // We pass true for endOfInput because, when + // we are called, we should have seen a + // complete sequence of characters for this + // charset: + final CoderResult result = 
decoder.decode(pendingByteBuffer, outputBuffer, true); + + final int pos = outputBuffer.position(); + if (pos > 0) { + if (inHeader || fieldState == 1) { + pendingBuffer.append(outputArray, 0, pos); + } else { + lazyStartParagraph(); + out.characters(outputArray, 0, pos); + } + outputBuffer.position(0); + } + + if (result == CoderResult.UNDERFLOW) { + break; + } + } + + while (true) { + final CoderResult result = decoder.flush(outputBuffer); + + final int pos = outputBuffer.position(); + if (pos > 0) { + if (inHeader || fieldState == 1) { + pendingBuffer.append(outputArray, 0, pos); + } else { + lazyStartParagraph(); + out.characters(outputArray, 0, pos); + } + outputBuffer.position(0); + } + + if (result == CoderResult.UNDERFLOW) { + break; + } + } + + // Reset for next decode + decoder.reset(); + pendingByteBuffer.position(0); + } + + pendingByteCount = 0; + } + + // NOTE: s must be ascii alpha only + private boolean equals(String s) { + if (pendingControlCount != s.length()) { + return false; + } + for (int idx = 0; idx < pendingControlCount; idx++) { + assert isAlpha(s.charAt(idx)); + if (((byte) s.charAt(idx)) != pendingControl[idx]) { + return false; + } + } + return true; + } + + private void processControlSymbol(char ch) throws IOException, SAXException, TikaException { + switch (ch) { + case '~': + // Non-breaking space -> unicode NON-BREAKING SPACE + addOutputChar('\u00a0'); + break; + case '*': + // Ignorable destination (control words defined after + // the 1987 RTF spec). 
These are already handled by + // processGroupStart() + break; + case '-': + // Optional hyphen -> unicode SOFT HYPHEN + addOutputChar('\u00ad'); + break; + case '_': + // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN + addOutputChar('\u2011'); + break; + default: + break; + } + } + + private CharsetDecoder getDecoder() throws TikaException { + Charset charset = getCharset(); + + // Common case: charset is same as last time, so + // just reuse it: + if (lastCharset == null || !charset.equals(lastCharset)) { + decoder = charset.newDecoder(); + decoder.onMalformedInput(CodingErrorAction.REPLACE); + decoder.onUnmappableCharacter(CodingErrorAction.REPLACE); + lastCharset = charset; + } + + return decoder; + } + + // Return current charset in-use + private Charset getCharset() throws TikaException { + // If a specific font (fN) was set, use its charset + if (groupState.fontCharset != null) { + return groupState.fontCharset; + } + + // Else, if global default font (defN) was set, use that one + if (globalDefaultFont != -1 && !inHeader) { + Charset cs = fontToCharset.get(globalDefaultFont); + if (cs != null) { + return cs; + } + } + + // Else, use the global charset + if (globalCharset == null) { + throw new TikaException("unable to determine charset"); + } + + return globalCharset; + } + + // Handle control word that takes a parameter: + private void processControlWord(int param, PushbackInputStream in) throws IOException, SAXException, TikaException { + + // TODO: afN? (associated font number) + + // TODO: do these alter text output...? 
+ /* + } else if (equals("stshfdbch")) { + // font to be used by default in + // style sheet for East Asian chars + // arg N is font table entry + } else if (equals("stshfloch")) { + // font to be used by default in + // style sheet for ASCII chars + // arg N is font table entry + } else if (equals("stshfhich")) { + // font to be used by default in + // style sheet for High Ansi chars + // arg N is font table entry + } else if (equals("stshfbi")) { + // style sheet for Complex Scripts (BIDI) chars + // arg N is font table entry + */ + + // TODO: inefficient that we check equals N times; + // we'd get better perf w/ real lexer (eg + // JFlex), which uses single-pass FSM to do cmp: + if (inHeader) { + if (equals("ansicpg")) { + // ANSI codepage + Charset cs = ANSICPG_MAP.get(param); + if (cs != null) { + globalCharset = cs; + } + } else if (equals("deff")) { + // Default font + globalDefaultFont = param; + } else if (equals("nofpages")) { + metadata.add(Office.PAGE_COUNT, Integer.toString(param)); + } else if (equals("nofwords")) { + metadata.add(Office.WORD_COUNT, Integer.toString(param)); + } else if (equals("nofchars")) { + metadata.add(Office.CHARACTER_COUNT, Integer.toString(param)); + } else if (equals("yr")) { + year = param; + } else if (equals("mo")) { + month = param; + } else if (equals("dy")) { + day = param; + } else if (equals("hr")) { + hour = param; + } else if (equals("min")) { + minute = param; + } + + if (fontTableState == 1) { + // Still inside font table -- record the + // mappings of fN to the fcharset: + if (groupState.depth < fontTableDepth) { + fontTableState = 2; + } else { + if (equals("f")) { + // Start new font definition + curFontID = param; + } else if (equals("fcharset")) { + Charset cs = FCHARSET_MAP.get(param); + if (cs != null) { + fontToCharset.put(curFontID, cs); + } + } + } + } + + if (currentList != null) { + if (equals("listid")) { + currentList.id = param; + currentListTable.put(currentList.id, currentList); + } else if 
(equals("listtemplateid")) { + currentList.templateID = param; + } else if (equals("levelnfc") || equals("levelnfcn")) { + //sanity check to make sure list information isn't corrupt + if (listTableLevel > -1 && + listTableLevel < currentList.numberType.length) { + currentList.numberType[listTableLevel] = param; + } + } + } + } else { + // In document + if (equals("b")) { + // b0 + assert param == 0; + if (groupState.bold) { + pushText(); + if (groupState.italic) { + end("i"); + } + end("b"); + if (groupState.italic) { + start("i"); + } + groupState.bold = false; + } + } else if (equals("i")) { + // i0 + assert param == 0; + if (groupState.italic) { + pushText(); + end("i"); + groupState.italic = false; + } + } else if (equals("f")) { + // Change current font + Charset fontCharset = fontToCharset.get(param); + + // Push any buffered text before changing + // font: + pushText(); + + if (fontCharset != null) { + groupState.fontCharset = fontCharset; + } else { + // DOC ERROR: font change referenced a + // non-table'd font number + // TODO: log a warning? Throw an exc? + groupState.fontCharset = null; + } + } else if (equals("ls")) { + groupState.list = param; + } else if (equals("lslvl")) { + groupState.listLevel = param; + } + } + + // Process unicode escape. 
This can appear in doc + // or in header, since the metadata (info) fields + // in the header can be unicode escaped as well: + if (equals("u")) { + // Unicode escape + if (!groupState.ignore || groupState.sv || groupState.sn) { + final char utf16CodeUnit = (char) (param & 0xffff); + addOutputChar(utf16CodeUnit); + } + + // After seeing a unicode escape we must + // skip the next ucSkip ansi chars (the + // "unicode shadow") + ansiSkip = groupState.ucSkip; + } else if (equals("uc")) { + // Change unicode shadow length + groupState.ucSkip = param; + } else if (equals("bin")) { + if (param >= 0) { + if (groupState.pictDepth == 1) { + try { + embObjHandler.writeBytes(in, param); + } catch (IOException e) { + //param was out of bounds or something went wrong during writing. + //skip this obj and move on + //TODO: log.warn + embObjHandler.reset(); + } + } else { + int bytesToRead = param; + byte[] tmpArray = new byte[Math.min(1024, bytesToRead)]; + while (bytesToRead > 0) { + int r = in.read(tmpArray, 0, Math.min(bytesToRead, tmpArray.length)); + if (r < 0) { + throw new TikaException("unexpected end of file: need " + param + " bytes of binary data, found " + (param - bytesToRead)); + } + bytesToRead -= r; + } + } + } else { + // log some warning? + } + } + } + + private boolean inList() { + return !ignoreLists && groupState.list != 0; + } + + /** + * Marks the current list as pending to end. This is done to be able to merge list items of + * the same list within the same enclosing list tag (ie. either <code>"ul"</code>, or + * <code>"ol"</code>). + */ + private void pendingListEnd() { + pendingListEnd = groupState.list; + groupState.list = 0; + } + + /** + * Emits the end tag of a list. Uses {@link #isUnorderedList(int)} to determine the list + * type for the given <code>listID</code>. + * + * @param listID The ID of the list. 
+ * @throws IOException + * @throws SAXException + * @throws TikaException + */ + private void endList(int listID) throws IOException, SAXException, TikaException { + if (!ignoreLists) { + out.endElement(isUnorderedList(listID) ? "ul" : "ol"); + } + } + + /** + * Emits the start tag of a list. Uses {@link #isUnorderedList(int)} to determine the list + * type for the given <code>listID</code>. + * + * @param listID The ID of the list. + * @throws IOException + * @throws SAXException + * @throws TikaException + */ + private void startList(int listID) throws IOException, SAXException, TikaException { + if (!ignoreLists) { + out.startElement(isUnorderedList(listID) ? "ul" : "ol"); + } + } + + private boolean isUnorderedList(int listID) { + ListDescriptor list = listTable.get(listID); + if (list != null) { + return list.isUnordered(groupState.listLevel); + } + return true; + } + + private void end(String tag) throws IOException, SAXException, TikaException { + out.endElement(tag); + } + + private void start(String tag) throws IOException, SAXException, TikaException { + out.startElement(tag); + } + + // Handle non-parameter control word: + private void processControlWord() throws IOException, SAXException, TikaException { + if (inHeader) { + if (equals("ansi")) { + globalCharset = WINDOWS_1252; + } else if (equals("pca")) { + globalCharset = CP850; + } else if (equals("pc")) { + globalCharset = CP437; + } else if (equals("mac")) { + globalCharset = MAC_ROMAN; + } + + if (equals("colortbl") || equals("stylesheet") || equals("fonttbl")) { + groupState.ignore = true; + } else if (equals("listtable")) { + currentListTable = listTable; + } else if (equals("listoverridetable")) { + currentListTable = listOverrideTable; + } + + if (uprState == -1) { + // TODO: we can also parse \creatim, \revtim, + // \printim, \version, etc. 
+ if (equals("author")) { + nextMetaData = TikaCoreProperties.CREATOR; + } else if (equals("title")) { + nextMetaData = TikaCoreProperties.TITLE; + } else if (equals("subject")) { + // TODO: Move to OO subject in Tika 2.0 + nextMetaData = TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT; + } else if (equals("keywords")) { + nextMetaData = TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT; + } else if (equals("category")) { + nextMetaData = OfficeOpenXMLCore.CATEGORY; + } else if (equals("comment")) { + nextMetaData = TikaCoreProperties.COMMENTS; + } else if (equals("company")) { + nextMetaData = OfficeOpenXMLExtended.COMPANY; + } else if (equals("manager")) { + nextMetaData = OfficeOpenXMLExtended.MANAGER; + } else if (equals("template")) { + nextMetaData = OfficeOpenXMLExtended.TEMPLATE; + } else if (equals("creatim")) { + nextMetaData = TikaCoreProperties.CREATED; + } + } + + if (fontTableState == 0) { + // Didn't see font table yet + if (equals("fonttbl")) { + fontTableState = 1; + fontTableDepth = groupState.depth; + } + } else if (fontTableState == 1) { + // Inside font table + if (groupState.depth < fontTableDepth) { + fontTableState = 2; + } + } + + // List table handling + if (currentListTable != null) { + if (equals("list") || equals("listoverride")) { + currentList = new ListDescriptor(); + listTableLevel = -1; + } else if (currentList != null) { + if (equals("liststylename")) { + currentList.isStyle = true; + } else if (equals("listlevel")) { + listTableLevel++; + } + } + } + + if (!groupState.ignore && (equals("par") || equals("pard") || equals("sect") || equals("sectd") || equals("plain") || equals("ltrch") || equals("rtlch"))) { + inHeader = false; + } + } else { + if (equals("b")) { + if (!groupState.bold) { + pushText(); + lazyStartParagraph(); + if (groupState.italic) { + // Make sure nesting is always <b><i> + end("i"); + } + groupState.bold = true; + start("b"); + if (groupState.italic) { + start("i"); + } + } + } else if (equals("i")) { + 
if (!groupState.italic) { + pushText(); + lazyStartParagraph(); + groupState.italic = true; + start("i"); + } + } + } + + final boolean ignored = groupState.ignore; + + if (equals("pard")) { + // Reset styles + pushText(); + if (groupState.italic) { + end("i"); + groupState.italic = false; + } + if (groupState.bold) { + end("b"); + groupState.bold = false; + } + if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0)) + pendingListEnd(); + } + } else if (equals("par")) { + if (!ignored) { + endParagraph(true); + } + } else if (equals("shptxt")) { + pushText(); + // Text inside a shape + groupState.ignore = false; + } else if (equals("atnid")) { + pushText(); + // Annotation ID + groupState.ignore = false; + } else if (equals("atnauthor")) { + pushText(); + // Annotation author + groupState.ignore = false; + } else if (equals("annotation")) { + pushText(); + // Annotation + groupState.ignore = false; + } else if (equals("listtext")) { + groupState.ignore = true; + } else if (equals("cell")) { + // TODO: we should produce a table output here? + //addOutputChar(' '); + endParagraph(true); + } else if (equals("sp")) { + groupState.sp = true; + } else if (equals("sn")) { + embObjHandler.startSN(); + groupState.sn = true; + } else if (equals("sv")) { + embObjHandler.startSV(); + groupState.sv = true; + } else if (equals("object")) { + pushText(); + embObjHandler.setInObject(true); + groupState.object = true; + } else if (equals("objdata")) { + groupState.objdata = true; + embObjHandler.startObjData(); + } else if (equals("pict")) { + pushText(); + // TODO: create img tag? but can that support + // embedded image data? 
+ groupState.pictDepth = 1; + embObjHandler.startPict(); + } else if (equals("line")) { + if (!ignored) { + addOutputChar('\n'); + } + } else if (equals("column")) { + if (!ignored) { + addOutputChar(' '); + } + } else if (equals("page")) { + if (!ignored) { + addOutputChar('\n'); + } + } else if (equals("softline")) { + if (!ignored) { + addOutputChar('\n'); + } + } else if (equals("softcolumn")) { + if (!ignored) { + addOutputChar(' '); + } + } else if (equals("softpage")) { + if (!ignored) { + addOutputChar('\n'); + } + } else if (equals("tab")) { + if (!ignored) { + addOutputChar('\t'); + } + } else if (equals("upr")) { + uprState = 0; + } else if (equals("ud") && uprState == 1) { + uprState = -1; + // 2nd group inside the upr destination, which + // contains the unicode encoding of the text, so + // we want to keep that: + groupState.ignore = false; + } else if (equals("bullet")) { + if (!ignored) { + // unicode BULLET + addOutputChar('\u2022'); + } + } else if (equals("endash")) { + if (!ignored) { + // unicode EN DASH + addOutputChar('\u2013'); + } + } else if (equals("emdash")) { + if (!ignored) { + // unicode EM DASH + addOutputChar('\u2014'); + } + } else if (equals("enspace")) { + if (!ignored) { + // unicode EN SPACE + addOutputChar('\u2002'); + } + } else if (equals("qmspace")) { + if (!ignored) { + // quarter em space -> unicode FOUR-PER-EM SPACE + addOutputChar('\u2005'); + } + } else if (equals("emspace")) { + if (!ignored) { + // unicode EM SPACE + addOutputChar('\u2003'); + } + } else if (equals("lquote")) { + if (!ignored) { + // unicode LEFT SINGLE QUOTATION MARK + addOutputChar('\u2018'); + } + } else if (equals("rquote")) { + if (!ignored) { + // unicode RIGHT SINGLE QUOTATION MARK + addOutputChar('\u2019'); + } + } else if (equals("ldblquote")) { + if (!ignored) { + // unicode LEFT DOUBLE QUOTATION MARK + addOutputChar('\u201C'); + } + } else if (equals("rdblquote")) { + if (!ignored) { + // unicode RIGHT DOUBLE QUOTATION MARK + 
addOutputChar('\u201D'); + } + } else if (equals("fldinst")) { + fieldState = 1; + groupState.ignore = false; + } else if (equals("fldrslt") && fieldState == 2) { + assert pendingURL != null; + lazyStartParagraph(); + out.startElement("a", "href", pendingURL); + pendingURL = null; + fieldState = 3; + groupState.ignore = false; + } + } + + // Push new GroupState + private void processGroupStart(PushbackInputStream in) throws IOException { + ansiSkip = 0; + // Push current groupState onto the stack + groupStates.add(groupState); + + // Make new GroupState + groupState = new GroupState(groupState); + assert groupStates.size() == groupState.depth : "size=" + groupStates.size() + " depth=" + groupState.depth; + + if (uprState == 0) { + uprState = 1; + groupState.ignore = true; + } + + // Check for ignorable groups. Note that + // sometimes we un-ignore within this group, eg + // when handling upr escape. + int b2 = in.read(); + if (b2 == '\\') { + int b3 = in.read(); + if (b3 == '*') { + groupState.ignore = true; + } + in.unread(b3); + } + in.unread(b2); + } + + // Pop current GroupState + private void processGroupEnd() throws IOException, SAXException, TikaException { + if (inHeader) { + if (nextMetaData != null) { + if (nextMetaData == TikaCoreProperties.CREATED) { + Calendar cal = Calendar.getInstance(TimeZone.getDefault(), Locale.ROOT); + cal.set(year, month - 1, day, hour, minute, 0); + metadata.set(nextMetaData, cal.getTime()); + } else if (nextMetaData.isMultiValuePermitted()) { + metadata.add(nextMetaData, pendingBuffer.toString()); + } else { + metadata.set(nextMetaData, pendingBuffer.toString()); + } + nextMetaData = null; + } + pendingBuffer.setLength(0); + } + + assert groupState.depth > 0; + ansiSkip = 0; + + if (groupState.objdata == true) { + embObjHandler.handleCompletedObject(); + groupState.objdata = false; + } else if (groupState.pictDepth > 0) { + if (groupState.sn == true) { + embObjHandler.endSN(); + } else if (groupState.sv == true) { + 
embObjHandler.endSV(); + } else if (groupState.sp == true) { + embObjHandler.endSP(); + } else if (groupState.pictDepth == 1) { + embObjHandler.handleCompletedObject(); + } + } + + if (groupState.object == true) { + embObjHandler.setInObject(false); + } + + // Be robust if RTF doc is corrupt (has too many + // closing }s): + // TODO: log a warning? + if (groupStates.size() > 0) { + // Restore group state: + final GroupState outerGroupState = groupStates.removeLast(); + + // Close italic, if outer does not have italic or + // bold changed: + if (groupState.italic) { + if (!outerGroupState.italic || + groupState.bold != outerGroupState.bold) { + end("i"); + groupState.italic = false; + } + } + + // Close bold + if (groupState.bold && !outerGroupState.bold) { + end("b"); + } + + // Open bold + if (!groupState.bold && outerGroupState.bold) { + start("b"); + } + + // Open italic + if (!groupState.italic && outerGroupState.italic) { + start("i"); + } + groupState = outerGroupState; + } + assert groupStates.size() == groupState.depth; + + if (fieldState == 1) { + String s = pendingBuffer.toString().trim(); + pendingBuffer.setLength(0); + if (s.startsWith("HYPERLINK")) { + s = s.substring(9).trim(); + // TODO: what other instructions can be in a + // HYPERLINK destination? + final boolean isLocalLink = s.contains("\\l "); + int idx = s.indexOf('"'); + if (idx != -1) { + int idx2 = s.indexOf('"', 1 + idx); + if (idx2 != -1) { + s = s.substring(1 + idx, idx2); + } + } + pendingURL = (isLocalLink ? "#" : "") + s; + fieldState = 2; + } else { + fieldState = 0; + } + + // TODO: we could process the other known field + // types. Right now, we will extract their text + // inlined, but fail to record them in metadata + // as a field value. + } else if (fieldState == 3) { + out.endElement("a"); + fieldState = 0; + } + } +}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector Sat Jan 16 18:23:01 2016 @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +org.apache.tika.parser.microsoft.POIFSContainerDetector Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Sat Jan 16 18:23:01 2016 @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +org.apache.tika.parser.chm.ChmParser +org.apache.tika.parser.microsoft.JackcessParser +org.apache.tika.parser.microsoft.OfficeParser +org.apache.tika.parser.microsoft.OldExcelParser +org.apache.tika.parser.microsoft.TNEFParser +org.apache.tika.parser.microsoft.ooxml.OOXMLParser +#org.apache.tika.parser.odf.OpenDocumentContentParser +#org.apache.tika.parser.odf.OpenDocumentMetaParser +org.apache.tika.parser.odf.OpenDocumentParser +#org.apache.tika.parser.opendocument.OpenOfficeParser +org.apache.tika.parser.rtf.RTFParser Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.chm; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.Assert.assertTrue; + +import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet; +import org.apache.tika.parser.chm.accessor.ChmItsfHeader; +import org.apache.tika.parser.chm.accessor.ChmItspHeader; +import org.apache.tika.parser.chm.accessor.ChmLzxcControlData; +import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable; +import org.apache.tika.parser.chm.accessor.DirectoryListingEntry; +import org.apache.tika.parser.chm.core.ChmCommons; +import org.apache.tika.parser.chm.core.ChmConstants; +import org.apache.tika.parser.chm.lzx.ChmBlockInfo; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Tests major functionality of ChmBlockInfo + * + */ +public class TestChmBlockInfo { + private byte[] data; + private ChmBlockInfo chmBlockInfo; + private ChmDirectoryListingSet chmDirListCont = null; + private ChmLzxcResetTable clrt = null; + private ChmLzxcControlData chmLzxcControlData = null; + + @Before + public void setUp() throws Exception { + data = TestParameters.chmData; + /* Creates and parses itsf header */ + ChmItsfHeader chmItsHeader = new ChmItsfHeader(); + // chmItsHeader.parse(Arrays.copyOfRange(data, 0, + // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader); + chmItsHeader.parse(ChmCommons.copyOfRange(data, 0, + ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader); + /* Creates and parses itsp block */ + ChmItspHeader chmItspHeader = new ChmItspHeader(); + // chmItspHeader.parse(Arrays.copyOfRange( data, (int) + // chmItsHeader.getDirOffset(), + // (int) chmItsHeader.getDirOffset() + // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader); + chmItspHeader.parse(ChmCommons.copyOfRange(data, + (int) chmItsHeader.getDirOffset(), + (int) chmItsHeader.getDirOffset() + + ChmConstants.CHM_ITSP_V1_LEN), 
chmItspHeader); + /* Creating instance of ChmDirListingContainer */ + chmDirListCont = new ChmDirectoryListingSet(data, chmItsHeader, + chmItspHeader); + int indexOfControlData = chmDirListCont.getControlDataIndex(); + + int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data, + ChmConstants.LZXC.getBytes(UTF_8)); + byte[] dir_chunk = null; + if (indexOfResetTable > 0) { + // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable, + // indexOfResetTable + // + + // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength()); + dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable, + indexOfResetTable + + chmDirListCont.getDirectoryListingEntryList() + .get(indexOfControlData).getLength()); + } + + /* Creates and parses control block */ + chmLzxcControlData = new ChmLzxcControlData(); + chmLzxcControlData.parse(dir_chunk, chmLzxcControlData); + + int indexOfFeList = chmDirListCont.getResetTableIndex(); + int startIndex = (int) chmDirListCont.getDataOffset() + + chmDirListCont.getDirectoryListingEntryList() + .get(indexOfFeList).getOffset(); + // dir_chunk = Arrays.copyOfRange(data, startIndex , startIndex + + // chmDirListCont.getDirectoryListingEntryList().get(indexOfFeList).getLength()); + dir_chunk = ChmCommons.copyOfRange(data, startIndex, startIndex + + chmDirListCont.getDirectoryListingEntryList().get(indexOfFeList).getLength()); + clrt = new ChmLzxcResetTable(); + clrt.parse(dir_chunk, clrt); + } + + @Test + public void testToString() { + if (chmBlockInfo == null) + testGetChmBlockInfo(); + assertTrue(chmBlockInfo.toString().length() > 0); + } + + @Test + public void testGetChmBlockInfo() { + for (DirectoryListingEntry directoryListingEntry : chmDirListCont.getDirectoryListingEntryList()) { + chmBlockInfo = ChmBlockInfo.getChmBlockInfoInstance( + directoryListingEntry, (int) clrt.getBlockLen(), + chmLzxcControlData); + // Assert.assertTrue(!directoryListingEntry.getName().isEmpty() && + // chmBlockInfo.toString() != null); 
+ assertTrue(!ChmCommons.isEmpty(directoryListingEntry + .getName()) && chmBlockInfo.toString() != null); + } + } + + @After + public void tearDown() throws Exception { + data = null; + chmBlockInfo = null; + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.chm; + +import static java.nio.charset.StandardCharsets.ISO_8859_1; +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Set; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.regex.Pattern; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet; +import org.apache.tika.parser.chm.accessor.DirectoryListingEntry; +import org.apache.tika.parser.chm.core.ChmExtractor; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.SAXException; + +public class TestChmExtraction { + + private final Parser parser = new ChmParser(); + + private final List<String> files = Arrays.asList( + "/test-documents/testChm.chm", + "/test-documents/testChm2.chm", + "/test-documents/testChm3.chm"); + + @Test + public void testGetText() throws Exception { + BodyContentHandler handler = new BodyContentHandler(); + new ChmParser().parse( + new ByteArrayInputStream(TestParameters.chmData), + handler, new Metadata(), new ParseContext()); + assertTrue(handler.toString().contains( + "The TCard method accepts only numeric arguments")); + } + + @Test + public void testChmParser() throws Exception{ + for (String fileName : files) { + InputStream stream = TestChmExtraction.class.getResourceAsStream(fileName); + testingChm(stream); + } + } + + private void testingChm(InputStream stream) throws IOException, SAXException, TikaException { + try { + BodyContentHandler handler = new BodyContentHandler(-1); + 
parser.parse(stream, handler, new Metadata(), new ParseContext()); + assertTrue(!handler.toString().isEmpty()); + } finally { + stream.close(); + } + } + + @Test + public void testExtractChmEntries() throws TikaException, IOException{ + for (String fileName : files) { + try (InputStream stream = TestChmExtraction.class.getResourceAsStream(fileName)) { + testExtractChmEntry(stream); + } + } + } + + protected boolean findZero(byte[] textData) { + for (byte b : textData) { + if (b==0) { + return true; + } + } + + return false; + } + + protected boolean niceAscFileName(String name) { + for (char c : name.toCharArray()) { + if (c>=127 || c<32) { + //non-ascii char or control char + return false; + } + } + + return true; + } + + protected void testExtractChmEntry(InputStream stream) throws TikaException, IOException{ + ChmExtractor chmExtractor = new ChmExtractor(stream); + ChmDirectoryListingSet entries = chmExtractor.getChmDirList(); + final Pattern htmlPairP = Pattern.compile("\\Q<html\\E.+\\Q</html>\\E" + , Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL); + + Set<String> names = new HashSet<String>(); + + for (DirectoryListingEntry directoryListingEntry : entries.getDirectoryListingEntryList()) { + byte[] data = chmExtractor.extractChmEntry(directoryListingEntry); + + //Entry names should be nice. Disable this if the test chm do have bad looking but valid entry names. + if (! niceAscFileName(directoryListingEntry.getName())) { + throw new TikaException("Warning: File name contains a non ascii char : " + directoryListingEntry.getName()); + } + + final String lowName = directoryListingEntry.getName().toLowerCase(Locale.ROOT); + + //check duplicate entry name which is seen before. 
+ if (names.contains(lowName)) { + throw new TikaException("Duplicate File name detected : " + directoryListingEntry.getName()); + } + names.add(lowName); + + if (lowName.endsWith(".html") + || lowName.endsWith(".htm") + || lowName.endsWith(".hhk") + || lowName.endsWith(".hhc") + //|| name.endsWith(".bmp") + ) { + if (findZero(data)) { + throw new TikaException("Xhtml/text file contains '\\0' : " + directoryListingEntry.getName()); + } + + //validate html + String html = new String(data, ISO_8859_1); + if (! htmlPairP.matcher(html).find()) { + System.err.println(lowName + " is invalid."); + System.err.println(html); + throw new TikaException("Invalid xhtml file : " + directoryListingEntry.getName()); + } +// else { +// System.err.println(directoryListingEntry.getName() + " is valid."); +// } + } + } + } + + + @Test + public void testMultiThreadedChmExtraction() throws InterruptedException { + ExecutorService executor = Executors.newFixedThreadPool(TestParameters.NTHREADS); + for (int i = 0; i < TestParameters.NTHREADS; i++) { + executor.execute(new Runnable() { + public void run() { + for (String fileName : files) { + InputStream stream = null; + try { + stream = TestChmExtraction.class.getResourceAsStream(fileName); + BodyContentHandler handler = new BodyContentHandler(-1); + parser.parse(stream, handler, new Metadata(), new ParseContext()); + assertTrue(!handler.toString().isEmpty()); + } catch (Exception e) { + e.printStackTrace(); + } finally { + try { + stream.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + }); + } + executor.shutdown(); + // Waits until all threads will have finished + while (!executor.isTerminated()) { + Thread.sleep(500); + } + } + + @Test + public void test_TIKA_1446() throws Exception { + URL chmDir = TestChmExtraction.class.getResource("/test-documents/chm/"); + File chmFolder = new File(chmDir.toURI()); + for (String fileName : chmFolder.list()) { + File file = new File(chmFolder, fileName); + InputStream 
stream = new FileInputStream(file); + testingChm(stream); + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.chm; + +import java.io.ByteArrayInputStream; +import java.util.List; +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet; +import org.apache.tika.parser.chm.accessor.DirectoryListingEntry; +import org.apache.tika.parser.chm.core.ChmExtractor; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import org.junit.Before; +import org.junit.Test; + +public class TestChmExtractor { + private ChmExtractor chmExtractor = null; + + @Before + public void setUp() throws Exception { + chmExtractor = new ChmExtractor( + new ByteArrayInputStream(TestParameters.chmData)); + } + + @Test + public void testEnumerateChm() { + List<String> chmEntries = chmExtractor.enumerateChm(); + assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER, + chmEntries.size()); + } + + @Test + public void testGetChmDirList() { + assertNotNull(chmExtractor.getChmDirList()); + } + + @Test + public void testExtractChmEntry() throws TikaException{ + ChmDirectoryListingSet entries = chmExtractor.getChmDirList(); + + int count = 0; + for (DirectoryListingEntry directoryListingEntry : entries.getDirectoryListingEntryList()) { + chmExtractor.extractChmEntry(directoryListingEntry); + ++count; + } + assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER, count); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java (added) +++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.chm; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +import org.apache.tika.parser.chm.accessor.ChmItsfHeader; +import org.apache.tika.parser.chm.core.ChmCommons; +import org.apache.tika.parser.chm.core.ChmConstants; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Tests all public functions of ChmItsfHeader + * + */ +public class TestChmItsfHeader { + private ChmItsfHeader chmItsfHeader = null; + + @Before + public void setUp() throws Exception { + chmItsfHeader = new ChmItsfHeader(); + byte[] data = TestParameters.chmData; + // chmItsfHeader.parse(Arrays.copyOfRange(data, 0, + // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader); + chmItsfHeader.parse(ChmCommons.copyOfRange(data, 0, + ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader); + } + + @Test + public void getDataOffset() { + assertEquals(TestParameters.VP_DATA_OFFSET_LENGTH, + 
chmItsfHeader.getDataOffset()); + } + + @Test + public void getDir_uuid() { + assertNotNull(chmItsfHeader.getDir_uuid()); + } + + @Test + public void getDirLen() { + assertEquals(TestParameters.VP_DIRECTORY_LENGTH, + chmItsfHeader.getDirLen()); + } + + @Test + public void getDirOffset() { + assertEquals(TestParameters.VP_DIRECTORY_OFFSET, + chmItsfHeader.getDirOffset()); + } + + @Test + public void getHeaderLen() { + assertEquals(TestParameters.VP_ITSF_HEADER_LENGTH, + chmItsfHeader.getHeaderLen()); + } + + @Test + public void getLangId() { + assertEquals(TestParameters.VP_LANGUAGE_ID, + chmItsfHeader.getLangId()); + } + + @Test + public void getLastModified() { + assertEquals(TestParameters.VP_LAST_MODIFIED, + chmItsfHeader.getLastModified()); + } + + @Test + public void getUnknown_000c() { + assertEquals(TestParameters.VP_UNKNOWN_000C, + chmItsfHeader.getUnknown_000c()); + } + + @Test + public void getUnknownLen() { + assertEquals(TestParameters.VP_UNKNOWN_LEN, + chmItsfHeader.getUnknownLen()); + } + + @Test + public void getUnknownOffset() { + assertEquals(TestParameters.VP_UNKNOWN_OFFSET, + chmItsfHeader.getUnknownOffset()); + } + + @Test + public void getVersion() { + assertEquals(TestParameters.VP_VERSION, + chmItsfHeader.getVersion()); + } + + @Test + public void testToString() { + assertTrue(chmItsfHeader.toString().contains( + TestParameters.VP_ISTF_SIGNATURE)); + } + + @After + public void tearDown() throws Exception { + chmItsfHeader = null; + } +}
