tika-advanced-parser-m...

bob Sat, 16 Jan 2016 10:24:05 -0800

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,1430 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.rtf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
+import java.util.Calendar;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Locale;
+import java.util.Map;
+import java.util.TimeZone;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.CharsetUtils;
+import org.xml.sax.SAXException;
+
+/* Tokenizes and performs a "shallow" parse of the RTF
+ * document, just enough to properly decode the text.
+ *
+ * TODO: we should cutover to a "real" tokenizer (eg JFlex);
+ * it should give better perf, by replacing the excessive
+ * "else if" string compares with FSA traversal. */
+
+final class TextExtractor {
+
+    private static final Charset ASCII = Charset.forName("US-ASCII");
+    private static final Charset WINDOWS_1252 = getCharset("WINDOWS-1252");
+    private static final Charset MAC_ROMAN = getCharset("MacRoman");
+    private static final Charset SHIFT_JIS = getCharset("Shift_JIS");
+    private static final Charset WINDOWS_57011 = getCharset("windows-57011");
+    private static final Charset WINDOWS_57010 = getCharset("windows-57010");
+    private static final Charset WINDOWS_57009 = getCharset("windows-57009");
+    private static final Charset WINDOWS_57008 = getCharset("windows-57008");
+    private static final Charset WINDOWS_57007 = getCharset("windows-57007");
+    private static final Charset WINDOWS_57006 = getCharset("windows-57006");
+    private static final Charset WINDOWS_57005 = getCharset("windows-57005");
+    private static final Charset WINDOWS_57004 = getCharset("windows-57004");
+    private static final Charset WINDOWS_57003 = getCharset("windows-57003");
+    private static final Charset X_ISCII91 = getCharset("x-ISCII91");
+    private static final Charset X_MAC_CENTRAL_EUROPE = 
getCharset("x-MacCentralEurope");
+    private static final Charset MAC_CYRILLIC = getCharset("MacCyrillic");
+    private static final Charset X_JOHAB = getCharset("x-Johab");
+    private static final Charset CP12582 = getCharset("CP1258");
+    private static final Charset CP12572 = getCharset("CP1257");
+    private static final Charset CP12562 = getCharset("CP1256");
+    private static final Charset CP12552 = getCharset("CP1255");
+    private static final Charset CP12542 = getCharset("CP1254");
+    private static final Charset CP12532 = getCharset("CP1253");
+    private static final Charset CP1252 = getCharset("CP1252");
+    private static final Charset CP12512 = getCharset("CP1251");
+    private static final Charset CP12502 = getCharset("CP1250");
+    private static final Charset CP950 = getCharset("CP950");
+    private static final Charset CP949 = getCharset("CP949");
+    private static final Charset MS9362 = getCharset("MS936");
+    private static final Charset MS8742 = getCharset("MS874");
+    private static final Charset CP866 = getCharset("CP866");
+    private static final Charset CP865 = getCharset("CP865");
+    private static final Charset CP864 = getCharset("CP864");
+    private static final Charset CP863 = getCharset("CP863");
+    private static final Charset CP862 = getCharset("CP862");
+    private static final Charset CP860 = getCharset("CP860");
+    private static final Charset CP852 = getCharset("CP852");
+    private static final Charset CP8502 = getCharset("CP850");
+    private static final Charset CP819 = getCharset("CP819");
+    private static final Charset WINDOWS_720 = getCharset("windows-720");
+    private static final Charset WINDOWS_711 = getCharset("windows-711");
+    private static final Charset WINDOWS_710 = getCharset("windows-710");
+    private static final Charset WINDOWS_709 = getCharset("windows-709");
+    private static final Charset ISO_8859_6 = getCharset("ISO-8859-6");
+    private static final Charset CP4372 = getCharset("CP437");
+    private static final Charset CP850 = getCharset("cp850");
+    private static final Charset CP437 = getCharset("cp437");
+    private static final Charset MS874 = getCharset("ms874");
+    private static final Charset CP1257 = getCharset("cp1257");
+    private static final Charset CP1256 = getCharset("cp1256");
+    private static final Charset CP1255 = getCharset("cp1255");
+    private static final Charset CP1258 = getCharset("cp1258");
+    private static final Charset CP1254 = getCharset("cp1254");
+    private static final Charset CP1253 = getCharset("cp1253");
+    private static final Charset MS950 = getCharset("ms950");
+    private static final Charset MS936 = getCharset("ms936");
+    private static final Charset MS1361 = getCharset("ms1361");
+    private static final Charset MS932 = getCharset("MS932");
+    private static final Charset CP1251 = getCharset("cp1251");
+    private static final Charset CP1250 = getCharset("cp1250");
+    private static final Charset MAC_THAI = getCharset("MacThai");
+    private static final Charset MAC_TURKISH = getCharset("MacTurkish");
+    private static final Charset MAC_GREEK = getCharset("MacGreek");
+    private static final Charset MAC_ARABIC = getCharset("MacArabic");
+    private static final Charset MAC_HEBREW = getCharset("MacHebrew");
+    private static final Charset JOHAB = getCharset("johab");
+    private static final Charset BIG5 = getCharset("Big5");
+    private static final Charset GB2312 = getCharset("GB2312");
+    private static final Charset MS949 = getCharset("ms949");
+    // The RTF doc has a "font table" that assigns ordinals
+    // (f0, f1, f2, etc.) to fonts and charsets, using the
+    // \fcharsetN control word.  This map translates that
+    // N into the corresponding Java charset:
+    private static final Map<Integer, Charset> FCHARSET_MAP =
+            new HashMap<Integer, Charset>();
+    // The RTF may specify the \ansicpgN charset in the
+    // header; this map translates that N into the
+    // corresponding Java charset:
+    private static final Map<Integer, Charset> ANSICPG_MAP =
+            new HashMap<Integer, Charset>();
+
+    static {
+        FCHARSET_MAP.put(0, WINDOWS_1252); // ANSI
+        // charset 1 is Default (no entry registered here)
+        // charset 2 is Symbol (no entry registered here)
+
+        FCHARSET_MAP.put(77, MAC_ROMAN); // Mac Roman
+        FCHARSET_MAP.put(78, SHIFT_JIS); // Mac Shift Jis
+        FCHARSET_MAP.put(79, MS949); // Mac Hangul
+        FCHARSET_MAP.put(80, GB2312); // Mac GB2312
+        FCHARSET_MAP.put(81, BIG5); // Mac Big5
+        FCHARSET_MAP.put(82, JOHAB); // Mac Johab (old)
+        FCHARSET_MAP.put(83, MAC_HEBREW); // Mac Hebrew
+        FCHARSET_MAP.put(84, MAC_ARABIC); // Mac Arabic
+        FCHARSET_MAP.put(85, MAC_GREEK); // Mac Greek
+        FCHARSET_MAP.put(86, MAC_TURKISH); // Mac Turkish
+        FCHARSET_MAP.put(87, MAC_THAI); // Mac Thai
+        FCHARSET_MAP.put(88, CP1250); // Mac East Europe -- NOTE(review): mapped to Windows cp1250, not a Mac charset; confirm intended
+        FCHARSET_MAP.put(89, CP1251); // Mac Russian -- NOTE(review): mapped to Windows cp1251, not a Mac charset; confirm intended
+
+        FCHARSET_MAP.put(128, MS932); // Shift JIS
+        FCHARSET_MAP.put(129, MS949); // Hangul
+        FCHARSET_MAP.put(130, MS1361); // Johab
+        FCHARSET_MAP.put(134, MS936); // GB2312
+        FCHARSET_MAP.put(136, MS950); // Big5
+        FCHARSET_MAP.put(161, CP1253); // Greek
+        FCHARSET_MAP.put(162, CP1254); // Turkish
+        FCHARSET_MAP.put(163, CP1258); // Vietnamese
+        FCHARSET_MAP.put(177, CP1255); // Hebrew
+        FCHARSET_MAP.put(178, CP1256); // Arabic
+        // FCHARSET_MAP.put( 179, "" ); // Arabic Traditional
+        // FCHARSET_MAP.put( 180, "" ); // Arabic user
+        // FCHARSET_MAP.put( 181, "" ); // Hebrew user
+        FCHARSET_MAP.put(186, CP1257); // Baltic
+
+        FCHARSET_MAP.put(204, CP1251); // Russian
+        FCHARSET_MAP.put(222, MS874); // Thai
+        FCHARSET_MAP.put(238, CP1250); // Eastern European
+        FCHARSET_MAP.put(254, CP437); // PC 437
+        FCHARSET_MAP.put(255, CP850); // OEM
+    }
+
+    // Populates ANSICPG_MAP: the N of the header's \ansicpgN control
+    // word -> Java charset.  Every code page is registered once, under
+    // its own number.
+    static {
+        ANSICPG_MAP.put(437, CP4372);   // US IBM
+        ANSICPG_MAP.put(708, ISO_8859_6);   // Arabic (ASMO 708)
+
+        // Each Arabic variant needs its own key; sharing key 710 would
+        // let the last put() win and leave 711/720 unmapped:
+        ANSICPG_MAP.put(709, WINDOWS_709);  // Arabic (ASMO 449+, BCON V4)
+        ANSICPG_MAP.put(710, WINDOWS_710);  // Arabic (transparent Arabic)
+        ANSICPG_MAP.put(711, WINDOWS_711);  // Arabic (Nafitha Enhanced)
+        ANSICPG_MAP.put(720, WINDOWS_720);  // Arabic (transparent ASMO)
+
+        ANSICPG_MAP.put(819, CP819);  // Windows 3.1 (US & Western Europe)
+        ANSICPG_MAP.put(850, CP8502);  // IBM Multilingual
+        ANSICPG_MAP.put(852, CP852);  // Eastern European
+        ANSICPG_MAP.put(860, CP860);  // Portuguese
+        ANSICPG_MAP.put(862, CP862);  // Hebrew
+        ANSICPG_MAP.put(863, CP863);  // French Canadian
+        ANSICPG_MAP.put(864, CP864);  // Arabic
+        ANSICPG_MAP.put(865, CP865);  // Norwegian
+        ANSICPG_MAP.put(866, CP866);  // Soviet Union
+        ANSICPG_MAP.put(874, MS8742);  // Thai
+        ANSICPG_MAP.put(932, MS932);  // Japanese
+        ANSICPG_MAP.put(936, MS9362);  // Simplified Chinese
+        ANSICPG_MAP.put(949, CP949);  // Korean
+        ANSICPG_MAP.put(950, CP950);  // Traditional Chinese
+        ANSICPG_MAP.put(1250, CP12502);  // Eastern European
+        ANSICPG_MAP.put(1251, CP12512);  // Cyrillic
+        ANSICPG_MAP.put(1252, CP1252);  // Western European
+        ANSICPG_MAP.put(1253, CP12532);  // Greek
+        ANSICPG_MAP.put(1254, CP12542);  // Turkish
+        ANSICPG_MAP.put(1255, CP12552);  // Hebrew
+        ANSICPG_MAP.put(1256, CP12562);  // Arabic
+        ANSICPG_MAP.put(1257, CP12572);  // Baltic
+        ANSICPG_MAP.put(1258, CP12582);  // Vietnamese
+        ANSICPG_MAP.put(1361, X_JOHAB);  // Johab
+        ANSICPG_MAP.put(10000, MAC_ROMAN);  // Mac Roman
+        ANSICPG_MAP.put(10001, SHIFT_JIS);  // Mac Japan
+        ANSICPG_MAP.put(10004, MAC_ARABIC);  // Mac Arabic
+        ANSICPG_MAP.put(10005, MAC_HEBREW);  // Mac Hebrew
+        ANSICPG_MAP.put(10006, MAC_GREEK);  // Mac Greek
+        ANSICPG_MAP.put(10007, MAC_CYRILLIC);  // Mac Cyrillic
+        ANSICPG_MAP.put(10029, X_MAC_CENTRAL_EUROPE);  // MAC Latin2
+        ANSICPG_MAP.put(10081, MAC_TURKISH);  // Mac Turkish
+        ANSICPG_MAP.put(57002, X_ISCII91);   // Devanagari
+
+        // TODO: in theory these other charsets are simple
+        // shifts off of Devanagari, so we could impl that
+        // here:
+        ANSICPG_MAP.put(57003, WINDOWS_57003);   // Bengali
+        ANSICPG_MAP.put(57004, WINDOWS_57004);   // Tamil
+        ANSICPG_MAP.put(57005, WINDOWS_57005);   // Telugu
+        ANSICPG_MAP.put(57006, WINDOWS_57006);   // Assamese
+        ANSICPG_MAP.put(57007, WINDOWS_57007);   // Oriya
+        ANSICPG_MAP.put(57008, WINDOWS_57008);   // Kannada
+        ANSICPG_MAP.put(57009, WINDOWS_57009);   // Malayalam
+        ANSICPG_MAP.put(57010, WINDOWS_57010);   // Gujarati
+        ANSICPG_MAP.put(57011, WINDOWS_57011);   // Punjabi
+    }
+
+    // Scratch output used when decoding bytes -> chars with a CharsetDecoder:
+    private final char[] outputArray = new char[128];
+    private final CharBuffer outputBuffer = CharBuffer.wrap(outputArray);
+    // Holds the font table from this RTF doc, mapping
+    // the font number (from the \fN control word) to the
+    // corresponding charset:
+    private final Map<Integer, Charset> fontToCharset =
+            new HashMap<Integer, Charset>();
+    // Group stack: when we open a new group, we push
+    // the previous group state onto the stack; when we
+    // close the group, we restore it
+    private final LinkedList<GroupState> groupStates = new 
LinkedList<GroupState>();
+    private final StringBuilder pendingBuffer = new StringBuilder();
+    private final XHTMLContentHandler out;
+    private final Metadata metadata;
+    private final RTFEmbObjHandler embObjHandler;
+    // How many upcoming ansi chars we should skip; this
+    // is 0 except while we are still in the "ansi
+    // shadow" after seeing a unicode escape, at which
+    // point it's set to the last ucN skip we had seen:
+    int ansiSkip = 0;
+    private int written = 0;
+    // Holds pending bytes (encoded in the current charset)
+    // for text output:
+    private byte[] pendingBytes = new byte[16];
+    private int pendingByteCount;
+    private ByteBuffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
+    // Holds pending chars (UTF16 code units) for text output
+    private char[] pendingChars = new char[10];
+    private int pendingCharCount;
+    // Holds the bytes of a still-being-tokenized control word
+    private byte[] pendingControl = new byte[10];
+    private int pendingControlCount;
+    // Decoder and its charset, reused while the charset is unchanged:
+    private CharsetDecoder decoder;
+    private Charset lastCharset;
+    private Charset globalCharset = WINDOWS_1252;
+    private int globalDefaultFont = -1;
+    private int curFontID = -1;
+    // Current group state; in theory this initial
+    // GroupState is unused because the RTF doc should
+    // immediately open the top group (start with {):
+    private GroupState groupState = new GroupState();
+    private boolean inHeader = true;
+    private int fontTableState;
+    private int fontTableDepth;
+    // Non-null if we are processing metadata (title,
+    // keywords, etc.) inside the info group:
+    private Property nextMetaData;
+    private boolean inParagraph;
+    // Non-zero if we are processing inside a field destination:
+    private int fieldState;
+    // Id of a list whose end is still pending; 0 when there is none
+    private int pendingListEnd;
+    private Map<Integer, ListDescriptor> listTable = new HashMap<Integer, 
ListDescriptor>();
+    private Map<Integer, ListDescriptor> listOverrideTable = new 
HashMap<Integer, ListDescriptor>();
+    private Map<Integer, ListDescriptor> currentListTable;
+    private ListDescriptor currentList;
+    private int listTableLevel = -1;
+    private boolean ignoreLists;
+    // Non-null if we've seen the url for a HYPERLINK but not yet
+    // its text:
+    private String pendingURL;
+    // Used to process the sub-groups inside the upr
+    // group:
+    private int uprState = -1;
+    // Date parts collected when extracting the CREATION date:
+    private int year, month, day, hour, minute;
+
+    /**
+     * Creates an extractor bound to the given output targets.
+     *
+     * @param out           handler that receives the extracted XHTML
+     * @param metadata      metadata object kept for use during extraction
+     * @param embObjHandler handler that receives embedded object / picture data
+     */
+    public TextExtractor(XHTMLContentHandler out, Metadata metadata,
+                         RTFEmbObjHandler embObjHandler) {
+        this.out = out;
+        this.embObjHandler = embObjHandler;
+        this.metadata = metadata;
+    }
+
+    /**
+     * Resolves a charset by name through CharsetUtils; if the lookup
+     * throws for any reason, US-ASCII is returned instead.
+     */
+    private static Charset getCharset(String name) {
+        Charset resolved;
+        try {
+            resolved = CharsetUtils.forName(name);
+        } catch (Exception e) {
+            // Deliberate best effort: any lookup failure degrades to ASCII
+            resolved = ASCII;
+        }
+        return resolved;
+    }
+
+    /** Returns true if ch is an ASCII hex digit (0-9, a-f or A-F). */
+    protected static boolean isHexChar(int ch) {
+        if (ch >= '0' && ch <= '9') {
+            return true;
+        }
+        if (ch >= 'a' && ch <= 'f') {
+            return true;
+        }
+        return ch >= 'A' && ch <= 'F';
+    }
+
+    /** Returns true if ch is an ASCII letter (a-z or A-Z). */
+    private static boolean isAlpha(int ch) {
+        if (ch >= 'a' && ch <= 'z') {
+            return true;
+        }
+        return ch >= 'A' && ch <= 'Z';
+    }
+
+    /** Returns true if ch is an ASCII decimal digit (0-9). */
+    private static boolean isDigit(int ch) {
+        return '0' <= ch && ch <= '9';
+    }
+
+    /**
+     * Returns the numeric value (0-15) of an ASCII hex digit.
+     * Callers must pass a character accepted by isHexChar; the letter
+     * ranges here are a-f / A-F so that they agree with that check and
+     * the assertion can actually catch a non-hex letter.
+     *
+     * @param ch hex digit character
+     * @return value of the digit, in 0..15
+     */
+    protected static int hexValue(int ch) {
+        if (ch >= '0' && ch <= '9') {
+            return ch - '0';
+        } else if (ch >= 'a' && ch <= 'f') {
+            return 10 + (ch - 'a');
+        } else {
+            assert ch >= 'A' && ch <= 'F' : "not a hex digit: " + ch;
+            return 10 + (ch - 'A');
+        }
+    }
+
+    /** Returns the current value of the ignore-lists flag. */
+    public boolean isIgnoringLists() {
+        return this.ignoreLists;
+    }
+
+    /**
+     * Sets the ignore-lists flag.
+     *
+     * @param ignore new value of the flag
+     */
+    public void setIgnoreLists(boolean ignore) {
+        ignoreLists = ignore;
+    }
+
+    // Push pending bytes or pending chars:
+    // Flushes whichever kind of text is pending: bytes if any are
+    // buffered, otherwise chars.
+    private void pushText() throws IOException, SAXException, TikaException {
+        if (pendingByteCount == 0) {
+            pushChars();
+            return;
+        }
+        assert pendingCharCount == 0;
+        pushBytes();
+    }
+
+    // Buffers the byte (unit in the current charset) for
+    // output:
+    // Buffers one byte (a unit in the current charset) for output.
+    // While inside a pict group the byte is handed to the embedded
+    // object handler instead of being buffered.
+    private void addOutputByte(int b) throws IOException, SAXException, TikaException {
+        assert b >= 0 && b < 256 : "byte value out of range: " + b;
+
+        // Flush any pending chars before dealing with this byte
+        if (pendingCharCount != 0) {
+            pushChars();
+        }
+
+        if (groupState.pictDepth > 0) {
+            embObjHandler.writeMetadataChar((char) b);
+            return;
+        }
+
+        final int capacity = pendingBytes.length;
+        if (pendingByteCount == capacity) {
+            // Buffer is full: grow it by 25% and re-wrap it
+            final byte[] grown = new byte[(int) (capacity * 1.25)];
+            System.arraycopy(pendingBytes, 0, grown, 0, capacity);
+            pendingBytes = grown;
+            pendingByteBuffer = ByteBuffer.wrap(grown);
+        }
+        pendingBytes[pendingByteCount++] = (byte) b;
+    }
+
+    // Buffers a byte as part of a control word:
+    // Appends one letter to the control word currently being tokenized,
+    // growing the buffer by 25% when it is full.
+    private void addControl(int b) {
+        assert isAlpha(b);
+        final int capacity = pendingControl.length;
+        if (pendingControlCount == capacity) {
+            final byte[] grown = new byte[(int) (capacity * 1.25)];
+            System.arraycopy(pendingControl, 0, grown, 0, capacity);
+            pendingControl = grown;
+        }
+        pendingControl[pendingControlCount] = (byte) b;
+        pendingControlCount++;
+    }
+
+    // Buffers a UTF16 code unit for output
+    // Buffers one UTF16 code unit for output.  Depending on state the
+    // char goes to pendingBuffer (header / field state 1), to the
+    // embedded object handler (sn / sv groups), or to pendingChars.
+    private void addOutputChar(char ch) throws IOException, SAXException, TikaException {
+        // Flush any pending bytes before dealing with this char
+        if (pendingByteCount != 0) {
+            pushBytes();
+        }
+
+        if (inHeader || fieldState == 1) {
+            pendingBuffer.append(ch);
+            return;
+        }
+        if (groupState.sn == true || groupState.sv == true) {
+            embObjHandler.writeMetadataChar(ch);
+            return;
+        }
+
+        final int capacity = pendingChars.length;
+        if (pendingCharCount == capacity) {
+            // Buffer is full: grow it by 25%
+            final char[] grown = new char[(int) (capacity * 1.25)];
+            System.arraycopy(pendingChars, 0, grown, 0, capacity);
+            pendingChars = grown;
+        }
+        pendingChars[pendingCharCount++] = ch;
+    }
+
+    // Shallow parses the entire doc, writing output to
+    // this.out and this.metadata
+    /**
+     * Shallow parses the entire doc, writing output to
+     * this.out and this.metadata.
+     *
+     * @param in stream positioned at the start of the RTF document
+     */
+    public void extract(InputStream in) throws IOException, SAXException, TikaException {
+        // The tokenizer pushes bytes back, so wrap the stream with a
+        // pushback buffer of size 2
+        final PushbackInputStream pushback = new PushbackInputStream(in, 2);
+        extract(pushback);
+    }
+
+    private void extract(PushbackInputStream in) throws IOException, 
SAXException, TikaException {
+        out.startDocument();
+
+        while (true) {
+            final int b = in.read();
+            if (b == -1) {
+                break;
+            } else if (b == '\\') {
+                parseControlToken(in);
+            } else if (b == '{') {
+                pushText();
+                processGroupStart(in);
+            } else if (b == '}') {
+                pushText();
+                processGroupEnd();
+                if (groupStates.isEmpty()) {
+                    // parsed document closing brace
+                    break;
+                }
+            } else if (groupState.objdata == true ||
+                    groupState.pictDepth == 1) {
+                embObjHandler.writeHexChar(b);
+            } else if (b != '\r' && b != '\n'
+                    && (!groupState.ignore || nextMetaData != null ||
+                    groupState.sn == true || groupState.sv == true)) {
+                // Linefeed and carriage return are not
+                // significant
+                if (ansiSkip != 0) {
+                    ansiSkip--;
+                } else {
+                    addOutputByte(b);
+                }
+            }
+        }
+
+        endParagraph(false);
+        out.endDocument();
+    }
+
+    private void parseControlToken(PushbackInputStream in) throws IOException, 
SAXException, TikaException {
+        int b = in.read();
+        if (b == '\'') {
+            // escaped hex char
+            parseHexChar(in);
+        } else if (isAlpha(b)) {
+            // control word
+            parseControlWord((char) b, in);
+        } else if (b == '{' || b == '}' || b == '\\' || b == '\r' || b == 
'\n') {
+            // escaped char
+            addOutputByte(b);
+        } else if (b != -1) {
+            // control symbol, eg \* or \~
+            processControlSymbol((char) b);
+        }
+    }
+
+    private void parseHexChar(PushbackInputStream in) throws IOException, 
SAXException, TikaException {
+        int hex1 = in.read();
+        if (!isHexChar(hex1)) {
+            // DOC ERROR (malformed hex escape): ignore 
+            in.unread(hex1);
+            return;
+        }
+
+        int hex2 = in.read();
+        if (!isHexChar(hex2)) {
+            // TODO: log a warning here, somehow?
+            // DOC ERROR (malformed hex escape):
+            // ignore
+            in.unread(hex2);
+            return;
+        }
+
+        if (ansiSkip != 0) {
+            // Skip this ansi char since we are
+            // still in the shadow of a unicode
+            // escape:
+            ansiSkip--;
+        } else {
+            // Unescape:
+            addOutputByte(16 * hexValue(hex1) + hexValue(hex2));
+        }
+    }
+
+    // Tokenizes a control word (letters, then an optional signed
+    // decimal parameter, then an optional single space delimiter) and
+    // dispatches it to the matching processControlWord overload.
+    // firstChar is the already-consumed first letter of the word.
+    private void parseControlWord(int firstChar, PushbackInputStream in) throws IOException, SAXException, TikaException {
+        addControl(firstChar);
+
+        int b = in.read();
+        while (isAlpha(b)) {
+            addControl(b);
+            b = in.read();
+        }
+
+        boolean hasParam = false;
+        boolean negParam = false;
+        if (b == '-') {
+            negParam = true;
+            hasParam = true;
+            b = in.read();
+        }
+
+        int param = 0;
+        while (isDigit(b)) {
+            param *= 10;
+            param += (b - '0');
+            hasParam = true;
+            b = in.read();
+        }
+
+        // space is consumed as part of the
+        // control word, but is not added to the
+        // control word.  Any other byte belongs to what follows and is
+        // pushed back -- except the EOF marker: unread(-1) would insert
+        // a real 0xFF byte into the stream.
+        if (b != ' ' && b != -1) {
+            in.unread(b);
+        }
+
+        if (hasParam) {
+            if (negParam) {
+                param = -param;
+            }
+            processControlWord(param, in);
+        } else {
+            processControlWord();
+        }
+
+        pendingControlCount = 0;
+    }
+
+    private void lazyStartParagraph() throws IOException, SAXException, 
TikaException {
+        if (!inParagraph) {
+            // Ensure </i></b> order
+            if (groupState.italic) {
+                end("i");
+            }
+            if (groupState.bold) {
+                end("b");
+            }
+            if (pendingListEnd != 0 && groupState.list != pendingListEnd) {
+                endList(pendingListEnd);
+                pendingListEnd = 0;
+            }
+            if (inList() && pendingListEnd != groupState.list) {
+                startList(groupState.list);
+            }
+            if (inList()) {
+                out.startElement("li");
+            } else {
+                out.startElement("p");
+            }
+
+            // Ensure <b><i> order
+            if (groupState.bold) {
+                start("b");
+            }
+            if (groupState.italic) {
+                start("i");
+            }
+            inParagraph = true;
+        }
+    }
+
+    private void endParagraph(boolean preserveStyles) throws IOException, 
SAXException, TikaException {
+        pushText();
+        //maintain consecutive new lines
+        if (!inParagraph) {
+            lazyStartParagraph();
+        }
+        if (inParagraph) {
+            if (groupState.italic) {
+                end("i");
+                groupState.italic = preserveStyles;
+            }
+            if (groupState.bold) {
+                end("b");
+                groupState.bold = preserveStyles;
+            }
+            if (inList()) {
+                out.endElement("li");
+            } else {
+                out.endElement("p");
+            }
+
+            if (preserveStyles && (groupState.bold || groupState.italic)) {
+                start("p");
+                if (groupState.bold) {
+                    start("b");
+                }
+                if (groupState.italic) {
+                    start("i");
+                }
+                inParagraph = true;
+            } else {
+                inParagraph = false;
+            }
+        }
+
+        // Ensure closing the list at document end
+        if (!preserveStyles && pendingListEnd != 0) {
+            endList(pendingListEnd);
+            pendingListEnd = 0;
+        }
+    }
+
+    // Push pending UTF16 units to out ContentHandler
+    // Sends the pending UTF16 units, if any, to the out ContentHandler
+    private void pushChars() throws IOException, SAXException, TikaException {
+        if (pendingCharCount == 0) {
+            return;
+        }
+        lazyStartParagraph();
+        out.characters(pendingChars, 0, pendingCharCount);
+        pendingCharCount = 0;
+    }
+
+    // Decodes the buffered bytes in pendingBytes
+    // into UTF16 code units, and sends the characters
+    // to the out ContentHandler, if we are in the body,
+    // else appends the characters to the pendingBuffer
+    // Decodes the bytes buffered in pendingBytes into UTF16 code
+    // units and forwards them (see drainOutputBuffer).  The bytes are
+    // dropped without decoding when the current group is ignored and
+    // no metadata property is being collected.
+    private void pushBytes() throws IOException, SAXException, TikaException {
+        final boolean wanted = !groupState.ignore || nextMetaData != null;
+        if (pendingByteCount > 0 && wanted) {
+
+            final CharsetDecoder activeDecoder = getDecoder();
+            pendingByteBuffer.limit(pendingByteCount);
+            assert pendingByteBuffer.position() == 0;
+            assert outputBuffer.position() == 0;
+
+            // We pass true for endOfInput because, when
+            // we are called, we should have seen a
+            // complete sequence of characters for this
+            // charset:
+            CoderResult status;
+            do {
+                status = activeDecoder.decode(pendingByteBuffer, outputBuffer, true);
+                drainOutputBuffer();
+            } while (status != CoderResult.UNDERFLOW);
+
+            do {
+                status = activeDecoder.flush(outputBuffer);
+                drainOutputBuffer();
+            } while (status != CoderResult.UNDERFLOW);
+
+            // Reset for next decode
+            activeDecoder.reset();
+            pendingByteBuffer.position(0);
+        }
+
+        pendingByteCount = 0;
+    }
+
+    // Forwards whatever the decoder has written to outputBuffer: to
+    // pendingBuffer while in the header or in field state 1, otherwise
+    // to the out ContentHandler; then rewinds outputBuffer.
+    private void drainOutputBuffer() throws IOException, SAXException, TikaException {
+        final int produced = outputBuffer.position();
+        if (produced == 0) {
+            return;
+        }
+        if (inHeader || fieldState == 1) {
+            pendingBuffer.append(outputArray, 0, produced);
+        } else {
+            lazyStartParagraph();
+            out.characters(outputArray, 0, produced);
+        }
+        outputBuffer.position(0);
+    }
+
+    // NOTE: s must be ascii alpha only
+    private boolean equals(String s) {
+        if (pendingControlCount != s.length()) {
+            return false;
+        }
+        for (int idx = 0; idx < pendingControlCount; idx++) {
+            assert isAlpha(s.charAt(idx));
+            if (((byte) s.charAt(idx)) != pendingControl[idx]) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    // Handles a control symbol (backslash followed by a non-letter).
+    // Three symbols produce a character; every other symbol, including
+    // \* (ignorable destination, already handled by
+    // processGroupStart()), produces nothing here.
+    private void processControlSymbol(char ch) throws IOException, SAXException, TikaException {
+        if (ch == '~') {
+            // Non-breaking space -> unicode NON-BREAKING SPACE
+            addOutputChar('\u00a0');
+        } else if (ch == '-') {
+            // Optional hyphen -> unicode SOFT HYPHEN
+            addOutputChar('\u00ad');
+        } else if (ch == '_') {
+            // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN
+            addOutputChar('\u2011');
+        }
+    }
+
+    private CharsetDecoder getDecoder() throws TikaException {
+        Charset charset = getCharset();
+
+        // Common case: charset is same as last time, so
+        // just reuse it:
+        if (lastCharset == null || !charset.equals(lastCharset)) {
+            decoder = charset.newDecoder();
+            decoder.onMalformedInput(CodingErrorAction.REPLACE);
+            decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
+            lastCharset = charset;
+        }
+
+        return decoder;
+    }
+
+    // Return current charset in-use
+    private Charset getCharset() throws TikaException {
+        // If a specific font (fN) was set, use its charset
+        if (groupState.fontCharset != null) {
+            return groupState.fontCharset;
+        }
+
+        // Else, if global default font (defN) was set, use that one
+        if (globalDefaultFont != -1 && !inHeader) {
+            Charset cs = fontToCharset.get(globalDefaultFont);
+            if (cs != null) {
+                return cs;
+            }
+        }
+
+        // Else, use the global charset
+        if (globalCharset == null) {
+            throw new TikaException("unable to determine charset");
+        }
+
+        return globalCharset;
+    }
+
+    /**
+     * Handles a control word that takes a numeric parameter. The control word
+     * itself is matched against the pending control buffer via {@code equals(String)}.
+     * <p>
+     * While {@code inHeader} is set, this records the ANSI code page, default font,
+     * page/word/character counts, date parts, font table entries and list table
+     * entries. Otherwise it applies bold/italic switch-off, font changes and list
+     * state. Unicode escapes, the unicode skip count and {@code bin} payloads are
+     * handled in both modes.
+     *
+     * @param param the numeric parameter that followed the control word
+     * @param in    the stream being parsed; the raw bytes of a {@code bin} payload
+     *              are consumed from it
+     * @throws IOException   if reading from {@code in} fails while skipping a
+     *                       {@code bin} payload (an IOException raised while handing
+     *                       picture bytes to the embedded-object handler is caught)
+     * @throws SAXException  declared for the output helpers called here
+     * @throws TikaException if the stream ends before a skipped {@code bin}
+     *                       payload is complete
+     */
+    private void processControlWord(int param, PushbackInputStream in)
+            throws IOException, SAXException, TikaException {
+
+        // TODO: afN?  (associated font number)
+
+        // TODO: do these alter text output...?
+        /*
+            } else if (equals("stshfdbch")) {
+                // font to be used by default in
+                // style sheet for East Asian chars
+                // arg N is font table entry
+            } else if (equals("stshfloch")) {
+                // font to be used by default in
+                // style sheet for ASCII chars
+                // arg N is font table entry
+            } else if (equals("stshfhich")) {
+                // font to be used by default in
+                // style sheet for High Ansi chars
+                // arg N is font table entry
+            } else if (equals("stshfbi")) {
+                // style sheet for Complex Scripts (BIDI) chars
+                // arg N is font table entry
+                */
+
+        // TODO: inefficient that we check equals N times;
+        // we'd get better perf w/ real lexer (eg
+        // JFlex), which uses single-pass FSM to do cmp:
+        if (inHeader) {
+            if (equals("ansicpg")) {
+                // ANSI codepage
+                Charset cs = ANSICPG_MAP.get(param);
+                if (cs != null) {
+                    globalCharset = cs;
+                }
+            } else if (equals("deff")) {
+                // Default font
+                globalDefaultFont = param;
+            } else if (equals("nofpages")) {
+                metadata.add(Office.PAGE_COUNT, Integer.toString(param));
+            } else if (equals("nofwords")) {
+                metadata.add(Office.WORD_COUNT, Integer.toString(param));
+            } else if (equals("nofchars")) {
+                metadata.add(Office.CHARACTER_COUNT, Integer.toString(param));
+            } else if (equals("yr")) {
+                year = param;
+            } else if (equals("mo")) {
+                month = param;
+            } else if (equals("dy")) {
+                day = param;
+            } else if (equals("hr")) {
+                hour = param;
+            } else if (equals("min")) {
+                minute = param;
+            }
+
+            if (fontTableState == 1) {
+                // Still inside font table -- record the
+                // mappings of fN to the fcharset:
+                if (groupState.depth < fontTableDepth) {
+                    fontTableState = 2;
+                } else {
+                    if (equals("f")) {
+                        // Start new font definition
+                        curFontID = param;
+                    } else if (equals("fcharset")) {
+                        Charset cs = FCHARSET_MAP.get(param);
+                        if (cs != null) {
+                            fontToCharset.put(curFontID, cs);
+                        }
+                    }
+                }
+            }
+
+            if (currentList != null) {
+                if (equals("listid")) {
+                    currentList.id = param;
+                    currentListTable.put(currentList.id, currentList);
+                } else if (equals("listtemplateid")) {
+                    currentList.templateID = param;
+                } else if (equals("levelnfc") || equals("levelnfcn")) {
+                    //sanity check to make sure list information isn't corrupt
+                    if (listTableLevel > -1 &&
+                            listTableLevel < currentList.numberType.length) {
+                        currentList.numberType[listTableLevel] = param;
+                    }
+                }
+            }
+        } else {
+            // In document
+            if (equals("b")) {
+                // b0
+                assert param == 0;
+                if (groupState.bold) {
+                    pushText();
+                    // Italic is nested inside bold, so it has to be closed
+                    // before bold and re-opened afterwards.
+                    if (groupState.italic) {
+                        end("i");
+                    }
+                    end("b");
+                    if (groupState.italic) {
+                        start("i");
+                    }
+                    groupState.bold = false;
+                }
+            } else if (equals("i")) {
+                // i0
+                assert param == 0;
+                if (groupState.italic) {
+                    pushText();
+                    end("i");
+                    groupState.italic = false;
+                }
+            } else if (equals("f")) {
+                // Change current font
+                Charset fontCharset = fontToCharset.get(param);
+
+                // Push any buffered text before changing
+                // font:
+                pushText();
+
+                if (fontCharset != null) {
+                    groupState.fontCharset = fontCharset;
+                } else {
+                    // DOC ERROR: font change referenced a
+                    // non-table'd font number
+                    // TODO: log a warning?  Throw an exc?
+                    groupState.fontCharset = null;
+                }
+            } else if (equals("ls")) {
+                groupState.list = param;
+            } else if (equals("lslvl")) {
+                groupState.listLevel = param;
+            }
+        }
+
+        // Process unicode escape. This can appear in doc
+        // or in header, since the metadata (info) fields
+        // in the header can be unicode escaped as well:
+        if (equals("u")) {
+            // Unicode escape
+            if (!groupState.ignore || groupState.sv || groupState.sn) {
+                // Keep only the low 16 bits so that a negative parameter
+                // maps onto the corresponding UTF-16 code unit.
+                final char utf16CodeUnit = (char) (param & 0xffff);
+                addOutputChar(utf16CodeUnit);
+            }
+
+            // After seeing a unicode escape we must
+            // skip the next ucSkip ansi chars (the
+            // "unicode shadow")
+            ansiSkip = groupState.ucSkip;
+        } else if (equals("uc")) {
+            // Change unicode shadow length
+            groupState.ucSkip = param;
+        } else if (equals("bin")) {
+            if (param >= 0) {
+                if (groupState.pictDepth == 1) {
+                    try {
+                        embObjHandler.writeBytes(in, param);
+                    } catch (IOException e) {
+                        // param was out of bounds or something went wrong during writing.
+                        // skip this obj and move on
+                        // TODO: log.warn
+                        embObjHandler.reset();
+                    }
+                } else {
+                    // Not picture data: read and discard exactly param bytes.
+                    int bytesToRead = param;
+                    byte[] tmpArray = new byte[Math.min(1024, bytesToRead)];
+                    while (bytesToRead > 0) {
+                        int r = in.read(tmpArray, 0, Math.min(bytesToRead, tmpArray.length));
+                        if (r < 0) {
+                            throw new TikaException("unexpected end of file: need " + param
+                                    + " bytes of binary data, found " + (param - bytesToRead));
+                        }
+                        bytesToRead -= r;
+                    }
+                }
+            } else {
+                // log some warning?
+            }
+        }
+    }
+
+    /**
+     * Tells whether the current group is inside a list that should be emitted.
+     *
+     * @return {@code true} if lists are not being ignored and the current
+     * group's list ID is non-zero
+     */
+    private boolean inList() {
+        if (ignoreLists) {
+            return false;
+        }
+        return groupState.list != 0;
+    }
+
+    /**
+     * Marks the current list as pending to end. This is done to be able to 
merge list items of
+     * the same list within the same enclosing list tag (ie. either 
<code>"ul"</code>, or
+     * <code>"ol"</code>).
+     */
+    private void pendingListEnd() {
+        pendingListEnd = groupState.list;
+        groupState.list = 0;
+    }
+
+    /**
+     * Emits the end tag of a list. Uses {@link #isUnorderedList(int)} to 
determine the list
+     * type for the given <code>listID</code>.
+     *
+     * @param listID The ID of the list.
+     * @throws IOException
+     * @throws SAXException
+     * @throws TikaException
+     */
+    private void endList(int listID) throws IOException, SAXException, 
TikaException {
+        if (!ignoreLists) {
+            out.endElement(isUnorderedList(listID) ? "ul" : "ol");
+        }
+    }
+
+    /**
+     * Emits the start tag of a list. Uses {@link #isUnorderedList(int)} to 
determine the list
+     * type for the given <code>listID</code>.
+     *
+     * @param listID The ID of the list.
+     * @throws IOException
+     * @throws SAXException
+     * @throws TikaException
+     */
+    private void startList(int listID) throws IOException, SAXException, 
TikaException {
+        if (!ignoreLists) {
+            out.startElement(isUnorderedList(listID) ? "ul" : "ol");
+        }
+    }
+
+    private boolean isUnorderedList(int listID) {
+        ListDescriptor list = listTable.get(listID);
+        if (list != null) {
+            return list.isUnordered(groupState.listLevel);
+        }
+        return true;
+    }
+
+    /**
+     * Writes the closing element for {@code tag} to the output handler.
+     *
+     * @param tag the element name to close
+     */
+    private void end(String tag)
+            throws IOException, SAXException, TikaException {
+        out.endElement(tag);
+    }
+
+    /**
+     * Writes the opening element for {@code tag} to the output handler.
+     *
+     * @param tag the element name to open
+     */
+    private void start(String tag)
+            throws IOException, SAXException, TikaException {
+        out.startElement(tag);
+    }
+
+    /**
+     * Handles a control word that carries no parameter. The control word itself
+     * is matched against the pending control buffer via {@code equals(String)}.
+     * <p>
+     * While {@code inHeader} is set, this selects the global charset, marks the
+     * color/style/font tables as ignored, tracks list and font table state,
+     * remembers which metadata property the following text belongs to, and
+     * leaves header mode on the first paragraph- or section-level control word
+     * in a non-ignored group. Outside the header it opens bold/italic elements.
+     * In both modes it then handles paragraph reset/end, destinations that
+     * toggle the ignore flag, embedded object/picture markers, special
+     * characters and hyperlink fields.
+     *
+     * @throws IOException   declared for the output helpers called here
+     * @throws SAXException  declared for the output helpers called here
+     * @throws TikaException declared for the output helpers called here
+     */
+    private void processControlWord()
+            throws IOException, SAXException, TikaException {
+        if (inHeader) {
+            if (equals("ansi")) {
+                globalCharset = WINDOWS_1252;
+            } else if (equals("pca")) {
+                globalCharset = CP850;
+            } else if (equals("pc")) {
+                globalCharset = CP437;
+            } else if (equals("mac")) {
+                globalCharset = MAC_ROMAN;
+            }
+
+            if (equals("colortbl") || equals("stylesheet") || equals("fonttbl")) {
+                groupState.ignore = true;
+            } else if (equals("listtable")) {
+                currentListTable = listTable;
+            } else if (equals("listoverridetable")) {
+                currentListTable = listOverrideTable;
+            }
+
+            if (uprState == -1) {
+                // TODO: we can also parse \creatim, \revtim,
+                // \printim, \version, etc.
+                if (equals("author")) {
+                    nextMetaData = TikaCoreProperties.CREATOR;
+                } else if (equals("title")) {
+                    nextMetaData = TikaCoreProperties.TITLE;
+                } else if (equals("subject")) {
+                    // TODO: Move to OO subject in Tika 2.0
+                    nextMetaData = TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT;
+                } else if (equals("keywords")) {
+                    nextMetaData = TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT;
+                } else if (equals("category")) {
+                    nextMetaData = OfficeOpenXMLCore.CATEGORY;
+                } else if (equals("comment")) {
+                    nextMetaData = TikaCoreProperties.COMMENTS;
+                } else if (equals("company")) {
+                    nextMetaData = OfficeOpenXMLExtended.COMPANY;
+                } else if (equals("manager")) {
+                    nextMetaData = OfficeOpenXMLExtended.MANAGER;
+                } else if (equals("template")) {
+                    nextMetaData = OfficeOpenXMLExtended.TEMPLATE;
+                } else if (equals("creatim")) {
+                    nextMetaData = TikaCoreProperties.CREATED;
+                }
+            }
+
+            if (fontTableState == 0) {
+                // Didn't see font table yet
+                if (equals("fonttbl")) {
+                    fontTableState = 1;
+                    fontTableDepth = groupState.depth;
+                }
+            } else if (fontTableState == 1) {
+                // Inside font table
+                if (groupState.depth < fontTableDepth) {
+                    fontTableState = 2;
+                }
+            }
+
+            // List table handling
+            if (currentListTable != null) {
+                if (equals("list") || equals("listoverride")) {
+                    currentList = new ListDescriptor();
+                    listTableLevel = -1;
+                } else if (currentList != null) {
+                    if (equals("liststylename")) {
+                        currentList.isStyle = true;
+                    } else if (equals("listlevel")) {
+                        listTableLevel++;
+                    }
+                }
+            }
+
+            // The first paragraph/section/character-formatting control word
+            // seen in a non-ignored group ends the header.
+            if (!groupState.ignore
+                    && (equals("par") || equals("pard") || equals("sect") || equals("sectd")
+                    || equals("plain") || equals("ltrch") || equals("rtlch"))) {
+                inHeader = false;
+            }
+        } else {
+            if (equals("b")) {
+                if (!groupState.bold) {
+                    pushText();
+                    lazyStartParagraph();
+                    if (groupState.italic) {
+                        // Make sure nesting is always <b><i>
+                        end("i");
+                    }
+                    groupState.bold = true;
+                    start("b");
+                    if (groupState.italic) {
+                        start("i");
+                    }
+                }
+            } else if (equals("i")) {
+                if (!groupState.italic) {
+                    pushText();
+                    lazyStartParagraph();
+                    groupState.italic = true;
+                    start("i");
+                }
+            }
+        }
+
+        // Snapshot of the ignore flag, taken after the header handling above
+        // may have changed it and before the branches below change it again.
+        final boolean ignored = groupState.ignore;
+
+        if (equals("pard")) {
+            // Reset styles
+            pushText();
+            if (groupState.italic) {
+                end("i");
+                groupState.italic = false;
+            }
+            if (groupState.bold) {
+                end("b");
+                groupState.bold = false;
+            }
+            // Disabled extra condition:
+            // && (groupStates.size() == 1 || groupStates.peekLast().list < 0)
+            if (inList()) {
+                pendingListEnd();
+            }
+        } else if (equals("par")) {
+            if (!ignored) {
+                endParagraph(true);
+            }
+        } else if (equals("shptxt")) {
+            pushText();
+            // Text inside a shape
+            groupState.ignore = false;
+        } else if (equals("atnid")) {
+            pushText();
+            // Annotation ID
+            groupState.ignore = false;
+        } else if (equals("atnauthor")) {
+            pushText();
+            // Annotation author
+            groupState.ignore = false;
+        } else if (equals("annotation")) {
+            pushText();
+            // Annotation
+            groupState.ignore = false;
+        } else if (equals("listtext")) {
+            groupState.ignore = true;
+        } else if (equals("cell")) {
+            // TODO: we should produce a table output here?
+            //addOutputChar(' ');
+            endParagraph(true);
+        } else if (equals("sp")) {
+            groupState.sp = true;
+        } else if (equals("sn")) {
+            embObjHandler.startSN();
+            groupState.sn = true;
+        } else if (equals("sv")) {
+            embObjHandler.startSV();
+            groupState.sv = true;
+        } else if (equals("object")) {
+            pushText();
+            embObjHandler.setInObject(true);
+            groupState.object = true;
+        } else if (equals("objdata")) {
+            groupState.objdata = true;
+            embObjHandler.startObjData();
+        } else if (equals("pict")) {
+            pushText();
+            // TODO: create img tag?  but can that support
+            // embedded image data?
+            groupState.pictDepth = 1;
+            embObjHandler.startPict();
+        } else if (equals("line")) {
+            if (!ignored) {
+                addOutputChar('\n');
+            }
+        } else if (equals("column")) {
+            if (!ignored) {
+                addOutputChar(' ');
+            }
+        } else if (equals("page")) {
+            if (!ignored) {
+                addOutputChar('\n');
+            }
+        } else if (equals("softline")) {
+            if (!ignored) {
+                addOutputChar('\n');
+            }
+        } else if (equals("softcolumn")) {
+            if (!ignored) {
+                addOutputChar(' ');
+            }
+        } else if (equals("softpage")) {
+            if (!ignored) {
+                addOutputChar('\n');
+            }
+        } else if (equals("tab")) {
+            if (!ignored) {
+                addOutputChar('\t');
+            }
+        } else if (equals("upr")) {
+            uprState = 0;
+        } else if (equals("ud") && uprState == 1) {
+            uprState = -1;
+            // 2nd group inside the upr destination, which
+            // contains the unicode encoding of the text, so
+            // we want to keep that:
+            groupState.ignore = false;
+        } else if (equals("bullet")) {
+            if (!ignored) {
+                // unicode BULLET
+                addOutputChar('\u2022');
+            }
+        } else if (equals("endash")) {
+            if (!ignored) {
+                // unicode EN DASH
+                addOutputChar('\u2013');
+            }
+        } else if (equals("emdash")) {
+            if (!ignored) {
+                // unicode EM DASH
+                addOutputChar('\u2014');
+            }
+        } else if (equals("enspace")) {
+            if (!ignored) {
+                // unicode EN SPACE
+                addOutputChar('\u2002');
+            }
+        } else if (equals("qmspace")) {
+            if (!ignored) {
+                // quarter em space -> unicode FOUR-PER-EM SPACE
+                addOutputChar('\u2005');
+            }
+        } else if (equals("emspace")) {
+            if (!ignored) {
+                // unicode EM SPACE
+                addOutputChar('\u2003');
+            }
+        } else if (equals("lquote")) {
+            if (!ignored) {
+                // unicode LEFT SINGLE QUOTATION MARK
+                addOutputChar('\u2018');
+            }
+        } else if (equals("rquote")) {
+            if (!ignored) {
+                // unicode RIGHT SINGLE QUOTATION MARK
+                addOutputChar('\u2019');
+            }
+        } else if (equals("ldblquote")) {
+            if (!ignored) {
+                // unicode LEFT DOUBLE QUOTATION MARK
+                addOutputChar('\u201C');
+            }
+        } else if (equals("rdblquote")) {
+            if (!ignored) {
+                // unicode RIGHT DOUBLE QUOTATION MARK
+                addOutputChar('\u201D');
+            }
+        } else if (equals("fldinst")) {
+            fieldState = 1;
+            groupState.ignore = false;
+        } else if (equals("fldrslt") && fieldState == 2) {
+            assert pendingURL != null;
+            lazyStartParagraph();
+            out.startElement("a", "href", pendingURL);
+            pendingURL = null;
+            fieldState = 3;
+            groupState.ignore = false;
+        }
+    }
+
+    /**
+     * Opens a new group: saves the current {@code GroupState} on the stack,
+     * makes a new state derived from it the current one, and peeks at the next
+     * two bytes to detect an ignorable destination (a backslash followed by
+     * an asterisk).
+     * <p>
+     * The peeked bytes are pushed back so the caller sees them again. A
+     * {@code read()} result of -1 (end of stream) is never pushed back, because
+     * {@code unread(-1)} would insert a spurious 0xFF byte into the stream.
+     *
+     * @param in the stream being parsed; at most two bytes are read and pushed back
+     * @throws IOException if reading from or pushing back to {@code in} fails
+     */
+    private void processGroupStart(PushbackInputStream in) throws IOException {
+        ansiSkip = 0;
+        // Push current groupState onto the stack
+        groupStates.add(groupState);
+
+        // Make new GroupState
+        groupState = new GroupState(groupState);
+        assert groupStates.size() == groupState.depth
+                : "size=" + groupStates.size() + " depth=" + groupState.depth;
+
+        if (uprState == 0) {
+            uprState = 1;
+            groupState.ignore = true;
+        }
+
+        // Check for ignorable groups. Note that
+        // sometimes we un-ignore within this group, eg
+        // when handling upr escape.
+        int b2 = in.read();
+        if (b2 == '\\') {
+            int b3 = in.read();
+            if (b3 == '*') {
+                groupState.ignore = true;
+            }
+            // Only push back real bytes; -1 means end of stream
+            if (b3 != -1) {
+                in.unread(b3);
+            }
+        }
+        if (b2 != -1) {
+            in.unread(b2);
+        }
+    }
+
+    /**
+     * Closes the current group and restores the enclosing {@code GroupState}.
+     * <p>
+     * In the header, any pending metadata property is stored first: the
+     * creation date is built from the collected year/month/day/hour/minute,
+     * other properties take the buffered text. Embedded object and picture
+     * bookkeeping is then finished, bold/italic elements are closed or
+     * re-opened to match the outer group, and hyperlink field state is advanced.
+     *
+     * @throws IOException   declared for the output and embedded-object helpers called here
+     * @throws SAXException  declared for the output helpers called here
+     * @throws TikaException declared for the output and embedded-object helpers called here
+     */
+    private void processGroupEnd()
+            throws IOException, SAXException, TikaException {
+        if (inHeader) {
+            if (nextMetaData != null) {
+                if (nextMetaData == TikaCoreProperties.CREATED) {
+                    Calendar cal = Calendar.getInstance(TimeZone.getDefault(), Locale.ROOT);
+                    // Calendar months are zero-based
+                    cal.set(year, month - 1, day, hour, minute, 0);
+                    // set(...) above leaves MILLISECOND at the current wall-clock
+                    // value; clear it so the same document always yields the same date.
+                    cal.set(Calendar.MILLISECOND, 0);
+                    metadata.set(nextMetaData, cal.getTime());
+                } else if (nextMetaData.isMultiValuePermitted()) {
+                    metadata.add(nextMetaData, pendingBuffer.toString());
+                } else {
+                    metadata.set(nextMetaData, pendingBuffer.toString());
+                }
+                nextMetaData = null;
+            }
+            pendingBuffer.setLength(0);
+        }
+
+        assert groupState.depth > 0;
+        ansiSkip = 0;
+
+        if (groupState.objdata) {
+            embObjHandler.handleCompletedObject();
+            groupState.objdata = false;
+        } else if (groupState.pictDepth > 0) {
+            if (groupState.sn) {
+                embObjHandler.endSN();
+            } else if (groupState.sv) {
+                embObjHandler.endSV();
+            } else if (groupState.sp) {
+                embObjHandler.endSP();
+            } else if (groupState.pictDepth == 1) {
+                embObjHandler.handleCompletedObject();
+            }
+        }
+
+        if (groupState.object) {
+            embObjHandler.setInObject(false);
+        }
+
+        // Be robust if RTF doc is corrupt (has too many
+        // closing }s):
+        // TODO: log a warning?
+        if (groupStates.size() > 0) {
+            // Restore group state:
+            final GroupState outerGroupState = groupStates.removeLast();
+
+            // Close italic, if outer does not have italic or
+            // bold changed:
+            if (groupState.italic) {
+                if (!outerGroupState.italic ||
+                        groupState.bold != outerGroupState.bold) {
+                    end("i");
+                    groupState.italic = false;
+                }
+            }
+
+            // Close bold
+            if (groupState.bold && !outerGroupState.bold) {
+                end("b");
+            }
+
+            // Open bold
+            if (!groupState.bold && outerGroupState.bold) {
+                start("b");
+            }
+
+            // Open italic
+            if (!groupState.italic && outerGroupState.italic) {
+                start("i");
+            }
+            groupState = outerGroupState;
+        }
+        assert groupStates.size() == groupState.depth;
+
+        if (fieldState == 1) {
+            String s = pendingBuffer.toString().trim();
+            pendingBuffer.setLength(0);
+            if (s.startsWith("HYPERLINK")) {
+                // Drop the 9-character "HYPERLINK" keyword
+                s = s.substring(9).trim();
+                // TODO: what other instructions can be in a
+                // HYPERLINK destination?
+                final boolean isLocalLink = s.contains("\\l ");
+                // Use the text between the first pair of double quotes, if present
+                int idx = s.indexOf('"');
+                if (idx != -1) {
+                    int idx2 = s.indexOf('"', 1 + idx);
+                    if (idx2 != -1) {
+                        s = s.substring(1 + idx, idx2);
+                    }
+                }
+                pendingURL = (isLocalLink ? "#" : "") + s;
+                fieldState = 2;
+            } else {
+                fieldState = 0;
+            }
+
+            // TODO: we could process the other known field
+            // types.  Right now, we will extract their text
+            // inlined, but fail to record them in metadata
+            // as a field value.
+        } else if (fieldState == 3) {
+            out.endElement("a");
+            fieldState = 0;
+        }
+    }
+}


Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.tika.parser.microsoft.POIFSContainerDetector

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,27 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+
+org.apache.tika.parser.chm.ChmParser
+org.apache.tika.parser.microsoft.JackcessParser
+org.apache.tika.parser.microsoft.OfficeParser
+org.apache.tika.parser.microsoft.OldExcelParser
+org.apache.tika.parser.microsoft.TNEFParser
+org.apache.tika.parser.microsoft.ooxml.OOXMLParser
+#org.apache.tika.parser.odf.OpenDocumentContentParser
+#org.apache.tika.parser.odf.OpenDocumentMetaParser
+org.apache.tika.parser.odf.OpenDocumentParser
+#org.apache.tika.parser.opendocument.OpenOfficeParser
+org.apache.tika.parser.rtf.RTFParser

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests the major functionality of {@link ChmBlockInfo}.
+ * <p>
+ * The fixture parses the ITSF header, the ITSP header, the directory listing,
+ * the LZXC control data and the LZXC reset table out of
+ * {@code TestParameters.chmData}. The tests then create a {@code ChmBlockInfo}
+ * for every entry of the directory listing.
+ */
+public class TestChmBlockInfo {
+    private byte[] data;
+    private ChmBlockInfo chmBlockInfo;
+    private ChmDirectoryListingSet chmDirListCont = null;
+    private ChmLzxcResetTable clrt = null;
+    private ChmLzxcControlData chmLzxcControlData = null;
+
+    /**
+     * Parses every structure the tests need from the shared CHM test data.
+     *
+     * @throws Exception if any of the CHM structures cannot be parsed
+     */
+    @Before
+    public void setUp() throws Exception {
+        data = TestParameters.chmData;
+
+        /* Creates and parses itsf header */
+        ChmItsfHeader chmItsHeader = new ChmItsfHeader();
+        chmItsHeader.parse(ChmCommons.copyOfRange(data, 0,
+                ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
+
+        /* Creates and parses itsp block */
+        ChmItspHeader chmItspHeader = new ChmItspHeader();
+        int dirOffset = (int) chmItsHeader.getDirOffset();
+        chmItspHeader.parse(ChmCommons.copyOfRange(data, dirOffset,
+                dirOffset + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+
+        /* Creating instance of ChmDirListingContainer */
+        chmDirListCont = new ChmDirectoryListingSet(data, chmItsHeader,
+                chmItspHeader);
+        int indexOfControlData = chmDirListCont.getControlDataIndex();
+
+        int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
+                ChmConstants.LZXC.getBytes(UTF_8));
+        // Fail here with a clear message instead of handing a null chunk to
+        // ChmLzxcControlData.parse() further down.
+        assertTrue("LZXC marker not found in the CHM test data",
+                indexOfResetTable > 0);
+        byte[] dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
+                indexOfResetTable
+                        + chmDirListCont.getDirectoryListingEntryList()
+                                .get(indexOfControlData).getLength());
+
+        /* Creates and parses control block */
+        chmLzxcControlData = new ChmLzxcControlData();
+        chmLzxcControlData.parse(dir_chunk, chmLzxcControlData);
+
+        /* Creates and parses reset table */
+        int indexOfFeList = chmDirListCont.getResetTableIndex();
+        int startIndex = (int) chmDirListCont.getDataOffset()
+                + chmDirListCont.getDirectoryListingEntryList()
+                        .get(indexOfFeList).getOffset();
+        dir_chunk = ChmCommons.copyOfRange(data, startIndex, startIndex
+                + chmDirListCont.getDirectoryListingEntryList()
+                        .get(indexOfFeList).getLength());
+        clrt = new ChmLzxcResetTable();
+        clrt.parse(dir_chunk, clrt);
+    }
+
+    /**
+     * Checks that the last created {@code ChmBlockInfo} has a non-empty
+     * string representation.
+     */
+    @Test
+    public void testToString() {
+        if (chmBlockInfo == null) {
+            testGetChmBlockInfo();
+        }
+        // Still null here means the loop in testGetChmBlockInfo() never ran.
+        assertTrue("no ChmBlockInfo was created from the directory listing",
+                chmBlockInfo != null);
+        assertTrue(chmBlockInfo.toString().length() > 0);
+    }
+
+    /**
+     * Creates a {@code ChmBlockInfo} for every directory listing entry and
+     * checks that the entry has a name and the block info can be printed.
+     */
+    @Test
+    public void testGetChmBlockInfo() {
+        for (DirectoryListingEntry entry : chmDirListCont.getDirectoryListingEntryList()) {
+            chmBlockInfo = ChmBlockInfo.getChmBlockInfoInstance(
+                    entry, (int) clrt.getBlockLen(), chmLzxcControlData);
+            assertTrue("directory listing entry has an empty name",
+                    !ChmCommons.isEmpty(entry.getName()));
+            assertTrue("ChmBlockInfo has no string representation",
+                    chmBlockInfo != null && chmBlockInfo.toString() != null);
+        }
+    }
+
+    /**
+     * Drops the references created for the test.
+     *
+     * @throws Exception never thrown by this implementation
+     */
+    @After
+    public void tearDown() throws Exception {
+        data = null;
+        chmBlockInfo = null;
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,207 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.core.ChmExtractor;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.SAXException;
+
+/**
+ * Tests text and entry extraction from CHM files, both through
+ * {@link ChmParser} and directly through {@link ChmExtractor}.
+ */
+public class TestChmExtraction {
+
+    /**
+     * Finds an opening {@code <html} followed, anywhere later in the text, by
+     * a closing {@code </html>}. Compiled once because a Pattern is immutable.
+     */
+    private static final Pattern HTML_PAIR = Pattern.compile(
+            "\\Q<html\\E.+\\Q</html>\\E",
+            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);
+
+    /** One parser instance shared by all tests, including the multi-threaded one. */
+    private final Parser parser = new ChmParser();
+
+    private final List<String> files = Arrays.asList(
+            "/test-documents/testChm.chm",
+            "/test-documents/testChm2.chm",
+            "/test-documents/testChm3.chm");
+
+    /**
+     * Opens a classpath resource and fails the test if it does not exist.
+     *
+     * @param name absolute classpath name of the resource
+     * @return an open stream; the caller is responsible for closing it
+     */
+    private static InputStream openResource(String name) {
+        InputStream stream = TestChmExtraction.class.getResourceAsStream(name);
+        assertTrue("Missing test resource: " + name, stream != null);
+        return stream;
+    }
+
+    /** Parses the in-memory test CHM and looks for a known sentence. */
+    @Test
+    public void testGetText() throws Exception {
+        BodyContentHandler handler = new BodyContentHandler();
+        new ChmParser().parse(
+                new ByteArrayInputStream(TestParameters.chmData),
+                handler, new Metadata(), new ParseContext());
+        assertTrue(handler.toString().contains(
+                "The TCard method accepts only numeric arguments"));
+    }
+
+    /** Parses every file in {@link #files} and expects non-empty text. */
+    @Test
+    public void testChmParser() throws Exception {
+        for (String fileName : files) {
+            testingChm(openResource(fileName));
+        }
+    }
+
+    /**
+     * Parses the stream with the shared parser, asserts that some text was
+     * produced, and closes the stream.
+     *
+     * @param stream CHM content; always closed before this method returns
+     */
+    private void testingChm(InputStream stream) throws IOException, SAXException, TikaException {
+        try (InputStream in = stream) {
+            BodyContentHandler handler = new BodyContentHandler(-1);
+            parser.parse(in, handler, new Metadata(), new ParseContext());
+            assertTrue(!handler.toString().isEmpty());
+        }
+    }
+
+    /** Runs {@link #testExtractChmEntry(InputStream)} on every file in {@link #files}. */
+    @Test
+    public void testExtractChmEntries() throws TikaException, IOException {
+        for (String fileName : files) {
+            try (InputStream stream = openResource(fileName)) {
+                testExtractChmEntry(stream);
+            }
+        }
+    }
+
+    /**
+     * @param textData bytes to scan
+     * @return {@code true} if any byte equals zero
+     */
+    protected boolean findZero(byte[] textData) {
+        for (byte b : textData) {
+            if (b == 0) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * @param name entry name to check
+     * @return {@code true} if every char is in the range 32..126
+     */
+    protected boolean niceAscFileName(String name) {
+        for (char c : name.toCharArray()) {
+            if (c >= 127 || c < 32) {
+                //non-ascii char or control char
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Extracts every entry of the CHM and validates entry names and the
+     * content of html-like entries.
+     *
+     * @param stream CHM content; not closed by this method
+     * @throws TikaException if an entry name contains a non-ascii or control
+     *         char, an entry name repeats (case-insensitively), an html-like
+     *         entry contains a zero byte, or an html-like entry has no
+     *         {@code <html ... </html>} pair
+     */
+    protected void testExtractChmEntry(InputStream stream) throws TikaException, IOException {
+        ChmExtractor chmExtractor = new ChmExtractor(stream);
+        ChmDirectoryListingSet entries = chmExtractor.getChmDirList();
+
+        Set<String> names = new HashSet<String>();
+
+        for (DirectoryListingEntry directoryListingEntry : entries.getDirectoryListingEntryList()) {
+            byte[] data = chmExtractor.extractChmEntry(directoryListingEntry);
+
+            //Entry names should be nice. Disable this if the test chm do have bad looking but valid entry names.
+            if (!niceAscFileName(directoryListingEntry.getName())) {
+                throw new TikaException("Warning: File name contains a non ascii char : "
+                        + directoryListingEntry.getName());
+            }
+
+            final String lowName = directoryListingEntry.getName().toLowerCase(Locale.ROOT);
+
+            //check duplicate entry name which is seen before.
+            //Set.add returns false when the name is already present.
+            if (!names.add(lowName)) {
+                throw new TikaException("Duplicate File name detected : "
+                        + directoryListingEntry.getName());
+            }
+
+            if (lowName.endsWith(".html")
+                    || lowName.endsWith(".htm")
+                    || lowName.endsWith(".hhk")
+                    || lowName.endsWith(".hhc")) {
+                if (findZero(data)) {
+                    throw new TikaException("Xhtml/text file contains '\\0' : "
+                            + directoryListingEntry.getName());
+                }
+
+                //validate html
+                String html = new String(data, ISO_8859_1);
+                if (!HTML_PAIR.matcher(html).find()) {
+                    System.err.println(lowName + " is invalid.");
+                    System.err.println(html);
+                    throw new TikaException("Invalid xhtml file : "
+                            + directoryListingEntry.getName());
+                }
+            }
+        }
+    }
+
+    /**
+     * Parses all test files concurrently from {@code TestParameters.NTHREADS}
+     * threads through the shared parser.
+     * <p>
+     * The first failure seen in any worker is recorded and rethrown on the
+     * test thread; an exception or assertion error raised inside a pool
+     * thread would otherwise never reach JUnit.
+     */
+    @Test
+    public void testMultiThreadedChmExtraction() throws InterruptedException {
+        final AtomicReference<Throwable> firstFailure = new AtomicReference<Throwable>();
+        ExecutorService executor = Executors.newFixedThreadPool(TestParameters.NTHREADS);
+        try {
+            for (int i = 0; i < TestParameters.NTHREADS; i++) {
+                executor.execute(new Runnable() {
+                    @Override
+                    public void run() {
+                        for (String fileName : files) {
+                            try {
+                                testingChm(openResource(fileName));
+                            } catch (Throwable t) {
+                                // Throwable, not Exception: assertion failures are Errors.
+                                firstFailure.compareAndSet(null, t);
+                            }
+                        }
+                    }
+                });
+            }
+            executor.shutdown();
+            // Waits until all threads will have finished
+            while (!executor.awaitTermination(500, TimeUnit.MILLISECONDS)) {
+                // not terminated yet; keep waiting
+            }
+        } finally {
+            // No-op after normal termination; stops the workers if this
+            // thread was interrupted while waiting.
+            executor.shutdownNow();
+        }
+
+        Throwable failure = firstFailure.get();
+        if (failure != null) {
+            throw new AssertionError("CHM extraction failed in a worker thread", failure);
+        }
+    }
+
+    /** Parses every file found in the {@code /test-documents/chm/} resource folder. */
+    @Test
+    public void test_TIKA_1446() throws Exception {
+        URL chmDir = TestChmExtraction.class.getResource("/test-documents/chm/");
+        File chmFolder = new File(chmDir.toURI());
+        String[] fileNames = chmFolder.list();
+        // File.list() returns null when the path is not a readable directory.
+        assertTrue("Cannot list " + chmFolder, fileNames != null);
+        for (String fileName : fileNames) {
+            File file = new File(chmFolder, fileName);
+            testingChm(new FileInputStream(file));
+        }
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import java.io.ByteArrayInputStream;
+import java.util.List;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.core.ChmExtractor;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Exercises {@link ChmExtractor} against the CHM bytes held in
+ * {@code TestParameters.chmData}.
+ */
+public class TestChmExtractor {
+    private ChmExtractor chmExtractor;
+
+    /** Builds a fresh extractor over the in-memory CHM data. */
+    @Before
+    public void setUp() throws Exception {
+        ByteArrayInputStream chmStream = new ByteArrayInputStream(TestParameters.chmData);
+        chmExtractor = new ChmExtractor(chmStream);
+    }
+
+    /** The enumeration must list the expected number of entries. */
+    @Test
+    public void testEnumerateChm() {
+        int entryCount = chmExtractor.enumerateChm().size();
+        assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER, entryCount);
+    }
+
+    /** The extractor must expose a directory listing. */
+    @Test
+    public void testGetChmDirList() {
+        ChmDirectoryListingSet listing = chmExtractor.getChmDirList();
+        assertNotNull(listing);
+    }
+
+    /** Extracts every listed entry and checks how many were processed. */
+    @Test
+    public void testExtractChmEntry() throws TikaException {
+        ChmDirectoryListingSet listing = chmExtractor.getChmDirList();
+
+        int extracted = 0;
+        for (DirectoryListingEntry entry : listing.getDirectoryListingEntryList()) {
+            chmExtractor.extractChmEntry(entry);
+            extracted++;
+        }
+
+        assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER, extracted);
+    }
+
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests all public functions of ChmItsfHeader
+ * 
+ */
+public class TestChmItsfHeader {
+    private ChmItsfHeader chmItsfHeader = null;
+
+    @Before
+    public void setUp() throws Exception {
+        chmItsfHeader = new ChmItsfHeader();
+        byte[] data = TestParameters.chmData;
+        // chmItsfHeader.parse(Arrays.copyOfRange(data, 0,
+        // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
+        chmItsfHeader.parse(ChmCommons.copyOfRange(data, 0,
+                ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
+    }
+
+    @Test
+    public void getDataOffset() {
+        assertEquals(TestParameters.VP_DATA_OFFSET_LENGTH,
+                chmItsfHeader.getDataOffset());
+    }
+
+    @Test
+    public void getDir_uuid() {
+        assertNotNull(chmItsfHeader.getDir_uuid());
+    }
+
+    @Test
+    public void getDirLen() {
+        assertEquals(TestParameters.VP_DIRECTORY_LENGTH,
+                chmItsfHeader.getDirLen());
+    }
+
+    @Test
+    public void getDirOffset() {
+        assertEquals(TestParameters.VP_DIRECTORY_OFFSET,
+                chmItsfHeader.getDirOffset());
+    }
+
+    @Test
+    public void getHeaderLen() {
+        assertEquals(TestParameters.VP_ITSF_HEADER_LENGTH,
+                chmItsfHeader.getHeaderLen());
+    }
+
+    @Test
+    public void getLangId() {
+        assertEquals(TestParameters.VP_LANGUAGE_ID,
+                chmItsfHeader.getLangId());
+    }
+
+    @Test
+    public void getLastModified() {
+        assertEquals(TestParameters.VP_LAST_MODIFIED,
+                chmItsfHeader.getLastModified());
+    }
+
+    @Test
+    public void getUnknown_000c() {
+        assertEquals(TestParameters.VP_UNKNOWN_000C,
+                chmItsfHeader.getUnknown_000c());
+    }
+
+    @Test
+    public void getUnknownLen() {
+        assertEquals(TestParameters.VP_UNKNOWN_LEN,
+                chmItsfHeader.getUnknownLen());
+    }
+    
+    @Test
+    public void getUnknownOffset() {
+        assertEquals(TestParameters.VP_UNKNOWN_OFFSET,
+                chmItsfHeader.getUnknownOffset());
+    }
+
+    @Test
+    public void getVersion() {
+        assertEquals(TestParameters.VP_VERSION,
+                chmItsfHeader.getVersion());
+    }
+
+    @Test
+    public void testToString() {
+        assertTrue(chmItsfHeader.toString().contains(
+                TestParameters.VP_ISTF_SIGNATURE));
+    }
+
+    @After
+    public void tearDown() throws Exception {
+        chmItsfHeader = null;
+    }
+}

svn commit: r1725014 [13/28] - in /tika/branches/2.x: tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-parser-module/ tika-parser-modules/tika-advanced-parser-m...

Reply via email to