TIKA-2179 -- add detection and parsing for word2006ml files

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/81fad8c9
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/81fad8c9
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/81fad8c9

Branch: refs/heads/master
Commit: 81fad8c97e60a3de7d926dc4ce10cbd235549583
Parents: a9a9e08
Author: tballison <[email protected]>
Authored: Wed Nov 23 14:04:50 2016 -0500
Committer: tballison <[email protected]>
Committed: Wed Nov 23 14:04:50 2016 -0500

----------------------------------------------------------------------
 CHANGES.txt                                     |    2 +
 .../org/apache/tika/mime/tika-mimetypes.xml     |    6 +
 .../parser/microsoft/MSOfficeParserConfig.java  |   38 +
 .../microsoft/ooxml/xwpf/BinaryDataHandler.java |  120 ++
 .../ooxml/xwpf/BodyContentHandler.java          |  271 +++
 .../ooxml/xwpf/CorePropertiesHandler.java       |  144 ++
 .../ooxml/xwpf/ExtendedPropertiesHandler.java   |   67 +
 .../microsoft/ooxml/xwpf/PartHandler.java       |   43 +
 .../microsoft/ooxml/xwpf/Relationship.java      |   52 +
 .../ooxml/xwpf/RelationshipsHandler.java        |   86 +
 .../ooxml/xwpf/RelationshipsManager.java        |   58 +
 .../microsoft/ooxml/xwpf/Word2006MLHandler.java |  168 ++
 .../microsoft/ooxml/xwpf/Word2006MLParser.java  |   67 +
 .../services/org.apache.tika.parser.Parser      |    1 +
 .../ooxml/xwpf/Word2006MLParserTest.java        |  182 ++
 .../test-documents/testWORD_2003ml.xml          | 1042 +++++++++++
 .../test-documents/testWORD_2006ml.xml          | 1678 ++++++++++++++++++
 .../test-documents/testWORD_2006ml_src.docx     |  Bin 0 -> 99960 bytes
 18 files changed, 4025 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 0c4afca..3bda350 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.15 - ??
 
+  * Add mime detection and parser for Word 2006ML format (TIKA-2179).
+
   * Upgrade to POI 3.16-beta1 (TIKA-2116).
 
   * Allow configuration of timeout for ForkParser (TIKA-2170).

http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git 
a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index fffb9bb..30068da 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -645,6 +645,12 @@
     <_comment>Word 2003 xml format, pre-ooxml</_comment>
     <_comment>glob pattern typically *.doc</_comment>
   </mime-type>
+  <mime-type type="application/vnd.ms-word2006ml">
+    <root-XML localName="package" 
namespaceURI="http://schemas.microsoft.com/office/2006/xmlPackage"/>
+    <sub-class-of type="application/xml"/>
+    <_comment>Word 2006 xml format, pre-ooxml</_comment>
+    <_comment>glob pattern typically *.xml</_comment>
+  </mime-type>
 
   <mime-type type="application/rdf+xml">
     <root-XML localName="RDF"/>

http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOfficeParserConfig.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOfficeParserConfig.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOfficeParserConfig.java
new file mode 100644
index 0000000..8f8086a
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOfficeParserConfig.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+
+public class MSOfficeParserConfig {
+
+    private boolean includeDeletedContent = true;
+
+    /**
+     * Sets whether or not the parser should include deleted content.
+     * <b>This has not been implemented in all MSOffice parsers yet!!!</b>
+     * @param includeDeletedContent true to include deleted content (the default), false to skip it
+     */
+    public void setIncludeDeletedContent(boolean includeDeletedContent) {
+        this.includeDeletedContent = includeDeletedContent;
+    }
+
+    public boolean getIncludeDeletedContent() {
+        return includeDeletedContent;
+    }
+}
+
+

http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BinaryDataHandler.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BinaryDataHandler.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BinaryDataHandler.java
new file mode 100644
index 0000000..c2177cf
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BinaryDataHandler.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+class BinaryDataHandler extends PartHandler {
+
+    private final XHTMLContentHandler handler;
+    private final Metadata metadata;
+    private final ParseContext parseContext;
+
+    private boolean inBinaryData = false;
+    private StringBuilder buffer = new StringBuilder();
+
+    final Base64 base64 = new Base64();
+
+
+    public BinaryDataHandler(XHTMLContentHandler handler, Metadata metadata, 
ParseContext context) {
+        this.handler = handler;
+        this.metadata = metadata;
+        this.parseContext = context;
+    }
+
+
+    @Override
+    public void startDocument() throws SAXException {
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+
+    }
+
+    @Override
+    void endPart() throws SAXException, TikaException {
+        if (hasData()) {
+            EmbeddedDocumentExtractor embeddedDocumentExtractor =
+                    
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
+            Metadata embeddedMetadata = new Metadata();
+            try (TikaInputStream stream = 
TikaInputStream.get(getInputStream())) {
+                embeddedDocumentExtractor.parseEmbedded(stream, handler, 
embeddedMetadata, false);
+            } catch (IOException e) {
+                throw new TikaException("error in finishing part", e);
+            }
+            buffer.setLength(0);
+        }
+
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, 
Attributes atts) throws SAXException {
+
+        if (uri.equals(Word2006MLHandler.PKG_NS) && 
localName.equals("binaryData")) {
+            inBinaryData = true;
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws 
SAXException {
+        if (uri.equals(Word2006MLHandler.PKG_NS) && 
localName.equals("binaryData")) {
+            inBinaryData = false;
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws 
SAXException {
+        if (inBinaryData) {
+            buffer.append(ch, start, length);
+        }
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws 
SAXException {
+
+    }
+
+    @Override
+    public String getPartContentType() {
+        return "";
+    }
+
+    boolean hasData() {
+        return buffer.length() > 0;
+    }
+
+    private InputStream getInputStream() {
+        byte[] bytes = base64.decode(buffer.toString());
+        return new ByteArrayInputStream(bytes);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BodyContentHandler.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BodyContentHandler.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BodyContentHandler.java
new file mode 100644
index 0000000..ea16191
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BodyContentHandler.java
@@ -0,0 +1,271 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.MSOfficeParserConfig;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * This class is intended to handle anything that might contain IBodyElements:
+ * main document, headers, footers, notes, etc.
+ */
+
+class BodyContentHandler extends PartHandler {
+
+
+    private enum EditType{
+        NONE,
+        INSERT,
+        DELETE
+    };
+
+    private final static String W_NS = 
"http://schemas.openxmlformats.org/wordprocessingml/2006/main";
+    private final static String MC_NS = 
"http://schemas.openxmlformats.org/markup-compatibility/2006";
+    private final static String OFFICE_DOC_RELATIONSHIP_NS = 
"http://schemas.openxmlformats.org/officeDocument/2006/relationships";
+
+    private final static char[] TAB = new char[1];
+
+    static {
+        TAB[0] = '\t';
+    }
+
+    private final String partName;
+    private final RelationshipsManager relationshipsManager;
+    private final XHTMLContentHandler handler;
+    private final Metadata metadata;
+    private final ParseContext parseContext;
+    private final boolean includeDeletedContent;
+
+    private boolean inR = false;
+    private boolean inT = false;
+    private boolean inRPr = false;
+    private boolean inDelText = false;
+    private boolean inAlternateContent = false; //in alternate content section
+    private boolean inACChoice = false; //inside the Choice branch of an AlternateContent section
+    private boolean inACFallback = false;
+    private boolean hasWrittenAHref = false;
+    private boolean hasWrittenFormatting = false;
+    private String editAuthor = null;
+    private String editDate = null;
+    private EditType editType = EditType.NONE;
+    private String hyperlink = null;
+
+    private TmpFormatting currFormat = new TmpFormatting();
+
+    public BodyContentHandler(String partName, RelationshipsManager 
relationshipsManager,
+                              XHTMLContentHandler handler, Metadata metadata, 
ParseContext context) {
+        this.partName = partName;
+        this.relationshipsManager = relationshipsManager;
+        this.handler = handler;
+        this.metadata = metadata;
+        this.parseContext = context;
+        MSOfficeParserConfig config = context.get(MSOfficeParserConfig.class);
+        boolean tmpIncludeDeleted = true;
+        if (config != null) {
+            tmpIncludeDeleted = config.getIncludeDeletedContent();
+        }
+        includeDeletedContent = tmpIncludeDeleted;
+    }
+
+
+    @Override
+    public void startDocument() throws SAXException {
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws 
SAXException {
+    }
+
+    @Override
+    public void endPrefixMapping(String prefix) throws SAXException {
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, 
Attributes atts) throws SAXException {
+        if (uri.equals(MC_NS)) {
+            if (localName.equals("AlternateContent")) {
+                inAlternateContent = true;
+            } else if (localName.equals("Choice")) {
+                inACChoice = true;
+            } else if (localName.equals("Fallback")) {
+                inACFallback = true;
+            }
+        }
+        if (inACFallback) {
+            return;
+        }
+
+        if (uri.equals(W_NS)) {
+            if (localName.equals("p")) {
+                handler.startElement("p");
+            } else if (localName.equals("r")) {
+                inR = true;
+            } else if (localName.equals("t")) {
+                inT = true;
+            } else if (localName.equals("tab")) {
+                handler.characters(TAB, 0, 1);
+            } else if (localName.equals("tbl")) {
+                handler.startElement("table");
+            } else if (localName.equals("tc")) {
+                handler.startElement("td");
+            } else if (localName.equals("tr")) {
+                handler.startElement("tr");
+            } else if (localName.equals("rPr")) {
+                inRPr = true;
+            } else if (inR && inRPr && localName.equals("i")) {
+                //rprs don't have to be inR; ignore those that aren't
+                currFormat.italics = true;
+            } else if (inR && inRPr && localName.equals("b")) {
+                currFormat.bold = true;
+            } else if (localName.equals("delText")) {
+                inDelText = true;
+            } else if (localName.equals("ins")) {
+                editAuthor = atts.getValue(W_NS, "author");
+                editDate = atts.getValue(W_NS, "date");
+                editType = EditType.INSERT;
+            } else if (localName.equals("del")) {
+                editAuthor = atts.getValue(W_NS, "author");
+                editDate = atts.getValue(W_NS, "date");
+                editType = EditType.DELETE;
+            } else if (localName.equals("hyperlink")) {
+                String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, 
"id");
+                if (hyperlinkId != null) {
+                    Relationship relationship = 
relationshipsManager.getRelationship(getName(), hyperlinkId);
+                    if (relationship != null && 
XWPFRelation.HYPERLINK.getRelation().equals(relationship.getContentType())) {
+                        hyperlink = relationship.getTarget();
+                        handler.startElement("a", "href", hyperlink);
+                        hasWrittenAHref = true;
+                    }
+                }
+            }
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws 
SAXException {
+        if (uri.equals(MC_NS)) {
+            if (localName.equals("AlternateContent")) {
+                inAlternateContent = false;
+            } else if (localName.equals("Choice")) {
+                inACChoice = false;
+            } else if (localName.equals("Fallback")) {
+                inACFallback = false;
+            }
+        }
+        if (uri.equals(W_NS)) {
+            if (inACFallback) {
+                return;
+            }
+            if (localName.equals("p")) {
+                handler.endElement("p");
+            } else if (localName.equals("r")) {
+                closeStyleTags();
+                inR = false;
+                hasWrittenFormatting = false;
+            } else if (localName.equals("t")) {
+                inT = false;
+            } else if (localName.equals("tbl")) {
+                handler.endElement("table");
+            } else if (localName.equals("tc")) {
+                handler.endElement("td");
+            } else if (localName.equals("tr")) {
+                handler.endElement("tr");
+            } else if (localName.equals("rPr")) {
+                inRPr = false;
+            } else if (localName.equals("delText")) {
+                inDelText = false;
+            } else if (localName.equals("ins") || localName.equals("del")) {
+                editType = EditType.NONE;
+                editAuthor = null;
+                editDate = null;
+            } else if (localName.equals("hyperlink") && hasWrittenAHref) {
+                handler.endElement("a");
+                hasWrittenAHref = false;
+            }
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws 
SAXException {
+        if (inACFallback) {
+            return;
+        }
+
+        if (inR && !hasWrittenFormatting) {
+            if (currFormat.bold) {
+                handler.startElement("b");
+            }
+            if (currFormat.italics) {
+                handler.startElement("i");
+            }
+            hasWrittenFormatting = true;
+        }
+        if (inT) {
+            handler.characters(ch, start, length);
+        } else if (includeDeletedContent && inDelText) {
+            handler.characters(ch, start, length);
+        }
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws 
SAXException {
+        if (inACFallback) {
+            return;
+        }
+
+        if (inT) {
+            handler.characters(ch, start, length);
+        }
+    }
+
+    @Override
+    public String getPartContentType() {
+        return partName;
+    }
+
+
+
+    void closeStyleTags() throws SAXException {
+        if (hasWrittenFormatting) {
+            if (currFormat.italics) {
+                handler.endElement("i");
+            }
+            if (currFormat.bold) {
+                handler.endElement("b");
+            }
+        }
+
+        currFormat.bold = false;
+        currFormat.italics = false;
+    }
+
+    private class TmpFormatting {
+        boolean italics = false;
+        boolean bold = false;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/CorePropertiesHandler.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/CorePropertiesHandler.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/CorePropertiesHandler.java
new file mode 100644
index 0000000..b0bca08
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/CorePropertiesHandler.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.poi.openxml4j.opc.ContentTypes;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+class CorePropertiesHandler extends PartHandler {
+
+    final static String DC_NS = "http://purl.org/dc/elements/1.1";
+    final static String DC_TERMS_NS = "http://purl.org/dc/terms";
+    final static String CP_NS = 
"http://schemas.openxmlformats.org/package/2006/metadata/core-properties";
+
+    private final Metadata metadata;
+
+    final StringBuilder buffer = new StringBuilder();
+    final Map<String, Map<String, Property>> properties = new HashMap<>();
+
+    public CorePropertiesHandler(Metadata metadata) {
+        this.metadata = metadata;
+        addProperties();
+    }
+
+    void addProperties() {
+        Map<String, Property> dc = properties.get(DC_NS);
+        if (dc == null) {
+            dc = new HashMap<>();
+        }
+        dc.put("creator", TikaCoreProperties.CREATOR);
+        dc.put("title", TikaCoreProperties.TITLE);
+        dc.put("description", TikaCoreProperties.DESCRIPTION);
+        properties.put(DC_NS, dc);
+
+        Map<String, Property> dcTerms = properties.get(DC_TERMS_NS);
+        if (dcTerms == null) {
+            dcTerms = new HashMap<>();
+        }
+        dcTerms.put("created", TikaCoreProperties.CREATED);
+        dcTerms.put("modified", TikaCoreProperties.MODIFIED);
+
+        properties.put(DC_TERMS_NS, dcTerms);
+
+        Map<String, Property> cp = properties.get(CP_NS);
+        if (cp == null) {
+            cp = new HashMap<>();
+        }
+        cp.put("category", OfficeOpenXMLCore.CATEGORY);
+        cp.put("contentStatus", OfficeOpenXMLCore.CONTENT_STATUS);
+        cp.put("lastModifiedBy", OfficeOpenXMLCore.LAST_MODIFIED_BY);
+        cp.put("lastPrinted", OfficeOpenXMLCore.LAST_PRINTED);
+        cp.put("revision", OfficeOpenXMLCore.REVISION);
+        cp.put("subject", OfficeOpenXMLCore.SUBJECT);
+        cp.put("version", OfficeOpenXMLCore.VERSION);
+        properties.put(CP_NS, cp);
+    }
+
+    @Override
+    public void startDocument() throws SAXException {
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        buffer.setLength(0);
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws 
SAXException {
+    }
+
+    @Override
+    public void endPrefixMapping(String prefix) throws SAXException {
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, 
Attributes atts) throws SAXException {
+
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws 
SAXException {
+        Property prop = getProperty(uri, localName);
+        if (prop != null) {
+
+            if (prop.isMultiValuePermitted()) {
+                metadata.add(prop, buffer.toString());
+            } else {
+                metadata.set(prop, buffer.toString());
+            }
+        }
+        buffer.setLength(0);
+
+    }
+
+    private Property getProperty(String uri, String localName) {
+        if (uri.endsWith("/")) {
+            uri = uri.substring(0, uri.length()-1);
+        }
+
+        Map<String, Property> m = properties.get(uri);
+        if (m != null) {
+            return m.get(localName);
+        }
+        return null;
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws 
SAXException {
+        buffer.append(ch, start, length);
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws 
SAXException {
+        buffer.append(ch, start, length);
+    }
+
+    @Override
+    public String getPartContentType() {
+        return ContentTypes.CORE_PROPERTIES_PART;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ExtendedPropertiesHandler.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ExtendedPropertiesHandler.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ExtendedPropertiesHandler.java
new file mode 100644
index 0000000..07e5e23
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ExtendedPropertiesHandler.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.Property;
+
+class ExtendedPropertiesHandler extends CorePropertiesHandler {
+
+    final static String EP_NS = 
"http://schemas.openxmlformats.org/officeDocument/2006/extended-properties";
+
+    public ExtendedPropertiesHandler(Metadata metadata) {
+        super(metadata);
+    }
+
+    @Override
+    void addProperties() {
+        Map<String, Property> ep = properties.get(EP_NS);
+        if (ep == null) {
+            ep = new HashMap<>();
+        }
+        ep.put("AppVersion", OfficeOpenXMLExtended.APP_VERSION);
+        ep.put("Application", OfficeOpenXMLExtended.APPLICATION);
+        ep.put("Comments", OfficeOpenXMLExtended.COMMENTS);
+        ep.put("Company", OfficeOpenXMLExtended.COMPANY);
+        ep.put("DocSecurity", OfficeOpenXMLExtended.DOC_SECURITY);
+        ep.put("HiddenSlides", OfficeOpenXMLExtended.HIDDEN_SLIDES);
+        ep.put("Manager", OfficeOpenXMLExtended.MANAGER);
+        ep.put("Notes", OfficeOpenXMLExtended.NOTES);
+        ep.put("PresentationFormat", 
OfficeOpenXMLExtended.PRESENTATION_FORMAT);
+        ep.put("Template", OfficeOpenXMLExtended.TEMPLATE);
+        ep.put("TotalTime", OfficeOpenXMLExtended.TOTAL_TIME);
+        ep.put("Pages", Office.PAGE_COUNT);
+        ep.put("Words", Office.WORD_COUNT);
+        ep.put("Characters", Office.CHARACTER_COUNT);
+        ep.put("CharactersWithSpaces", Office.CHARACTER_COUNT_WITH_SPACES);
+        ep.put("Paragraphs", Office.PARAGRAPH_COUNT);
+        ep.put("Lines", Office.LINE_COUNT);
+        properties.put(EP_NS, ep);
+    }
+
+    @Override
+    public String getPartContentType() {
+        return 
"application/vnd.openxmlformats-officedocument.extended-properties+xml";
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/PartHandler.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/PartHandler.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/PartHandler.java
new file mode 100644
index 0000000..79bcafe
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/PartHandler.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+abstract class PartHandler extends DefaultHandler {
+
+    private String name;
+
+    public abstract String getPartContentType();
+
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    /**
+     * Override this to flush buffers, etc., if necessary.
+     */
+    void endPart() throws SAXException, TikaException {
+
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Relationship.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Relationship.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Relationship.java
new file mode 100644
index 0000000..19b0dd4
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Relationship.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+
+import org.apache.poi.openxml4j.opc.TargetMode;
+
+/**
+ * Immutable holder for a single relationship entry: a type string, the
+ * target it points at and the mode of that target.
+ */
+class Relationship {
+
+    // RelationshipsManager fills this with the value of the "Type" attribute
+    private final String contentType;
+    private final String target;
+    // null when the two-argument constructor was used
+    private final TargetMode targetMode;
+
+    /**
+     * @param contentType type string of the relationship
+     * @param target target of the relationship
+     * @param targetMode mode of the target; may be null
+     */
+    public Relationship(String contentType, String target, TargetMode targetMode) {
+        this.contentType = contentType;
+        this.target = target;
+        this.targetMode = targetMode;
+    }
+
+    /**
+     * Convenience constructor that leaves the target mode as null.
+     *
+     * @param contentType type string of the relationship
+     * @param target target of the relationship
+     */
+    public Relationship(String contentType, String target) {
+        this(contentType, target, null);
+    }
+
+    /**
+     * @return the mode given at construction time, or null if none was given
+     */
+    public TargetMode getTargetMode() {
+        return targetMode;
+    }
+
+    /**
+     * @return the target given at construction time
+     */
+    public String getTarget() {
+        return target;
+    }
+
+    /**
+     * @return the type string given at construction time
+     */
+    public String getContentType() {
+        return contentType;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsHandler.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsHandler.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsHandler.java
new file mode 100644
index 0000000..211b048
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsHandler.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+
+import org.apache.poi.openxml4j.opc.ContentTypes;
+import org.apache.poi.openxml4j.opc.TargetMode;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * Part handler for relationships parts. Every <code>Relationship</code>
+ * element in the package-relationships namespace is registered with the
+ * shared {@link RelationshipsManager} under the name of the part that is
+ * currently being read.
+ * <p>
+ * All other SAX callbacks are inherited from DefaultHandler, whose
+ * implementations do nothing.
+ */
+class RelationshipsHandler extends PartHandler {
+
+    final static String REL_NS = "http://schemas.openxmlformats.org/package/2006/relationships";
+
+    private final RelationshipsManager relationshipsManager;
+
+    /**
+     * @param relationshipsManager registry that receives every relationship found
+     */
+    public RelationshipsHandler(RelationshipsManager relationshipsManager) {
+        this.relationshipsManager = relationshipsManager;
+    }
+
+    /**
+     * Registers the relationship described by a <code>Relationship</code>
+     * element; every other element is ignored.
+     */
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        if (!REL_NS.equals(uri) || !"Relationship".equals(localName)) {
+            return;
+        }
+        String id = atts.getValue("", "Id");
+        String type = atts.getValue("", "Type");
+        String target = atts.getValue("", "Target");
+        String targetModeString = atts.getValue("", "TargetMode");
+        // The attribute value is written in mixed case ("External") in
+        // relationship markup, so compare without regard to case; a missing
+        // or unrecognized value falls back to INTERNAL.
+        TargetMode targetMode = "External".equalsIgnoreCase(targetModeString)
+                ? TargetMode.EXTERNAL : TargetMode.INTERNAL;
+        relationshipsManager.addRelationship(getName(), id, type, target, targetMode);
+    }
+
+    /**
+     * @return the content type of relationships parts
+     */
+    @Override
+    public String getPartContentType() {
+        return ContentTypes.RELATIONSHIPS_PART;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsManager.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsManager.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsManager.java
new file mode 100644
index 0000000..d1954ac
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsManager.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.poi.openxml4j.opc.TargetMode;
+
+/**
+ * Registry of the relationships found in the package. Relationships are
+ * grouped by the name of the part they belong to, and are looked up by that
+ * part name plus the relationship id.
+ */
+class RelationshipsManager {
+
+    // part name -> (relationship id -> relationship)
+    Map<String, Map<String, Relationship>> map = new HashMap<>();
+
+    /**
+     * Records a relationship read from a relationships part.
+     *
+     * @param relsFileName name of the relationships part, e.g.
+     *                     <code>/word/_rels/document.xml.rels</code>; a null
+     *                     name cannot be tied to any part, so the
+     *                     relationship is ignored
+     * @param id id of the relationship
+     * @param type type of the relationship
+     * @param target target of the relationship
+     * @param targetMode mode of the target
+     */
+    public void addRelationship(String relsFileName, String id, String type, String target, TargetMode targetMode) {
+        if (relsFileName == null) {
+            return;
+        }
+        String packageName = convertRelsFileNameToPackageName(relsFileName);
+        Map<String, Relationship> thisPackageRels = map.get(packageName);
+        if (thisPackageRels == null) {
+            // only a newly created inner map has to be stored
+            thisPackageRels = new HashMap<>();
+            map.put(packageName, thisPackageRels);
+        }
+        thisPackageRels.put(id, new Relationship(type, target, targetMode));
+    }
+
+    /**
+     * @param packageName name of the part that owns the relationship
+     * @param id id of the relationship
+     * @return the matching relationship, or null if the part or the id is unknown
+     */
+    public Relationship getRelationship(String packageName, String id) {
+        Map<String, Relationship> thisPackageRels = map.get(packageName);
+        if (thisPackageRels != null) {
+            return thisPackageRels.get(id);
+        }
+        return null;
+    }
+
+    /**
+     * Maps the name of a relationships part to the name of the part it
+     * describes: <code>/_rels/.rels</code> becomes <code>/</code>; otherwise
+     * the first <code>/_rels/</code> segment is collapsed to <code>/</code>
+     * and a trailing <code>.rels</code> suffix is removed.
+     */
+    private String convertRelsFileNameToPackageName(String relsFileName) {
+        if ("/_rels/.rels".equals(relsFileName)) {
+            return "/";
+        }
+
+        String tmp = relsFileName;
+        tmp = tmp.replaceFirst("/_rels/", "/");
+        // the dot is escaped so that only a literal ".rels" suffix is stripped
+        tmp = tmp.replaceFirst("\\.rels\\Z", "");
+        return tmp;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLHandler.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLHandler.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLHandler.java
new file mode 100644
index 0000000..cf919cc
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLHandler.java
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+class Word2006MLHandler extends DefaultHandler {
+
+    final static String PKG_NS = 
"http://schemas.microsoft.com/office/2006/xmlPackage";;
+
+
+    private final XHTMLContentHandler handler;
+    private final Metadata metadata;
+    private final ParseContext parseContext;
+
+    private final Map<String, PartHandler> partHandlers = new HashMap<>();
+    private final BinaryDataHandler binaryDataHandler;
+    private final RelationshipsManager relationshipsManager = new 
RelationshipsManager();
+    private PartHandler currentPartHandler = null;
+
+    public Word2006MLHandler(XHTMLContentHandler handler, Metadata metadata, 
ParseContext context) {
+        this.handler = handler;
+        this.metadata = metadata;
+        this.parseContext = context;
+
+        addPackageHandler(new RelationshipsHandler(relationshipsManager));
+
+        addPackageHandler(new BodyContentHandler(
+                XWPFRelation.DOCUMENT.getContentType(),
+                relationshipsManager,
+                handler, metadata, context));
+        addPackageHandler(new BodyContentHandler(
+                XWPFRelation.FOOTNOTE.getContentType(),
+                relationshipsManager,
+                handler, metadata, context));
+        addPackageHandler(new BodyContentHandler(
+                
"application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml",
+                relationshipsManager,
+                handler, metadata, context));
+        addPackageHandler(new BodyContentHandler(
+                XWPFRelation.HEADER.getContentType(),
+                relationshipsManager,
+                handler, metadata, context));
+        addPackageHandler(new BodyContentHandler(
+                XWPFRelation.FOOTER.getContentType(),
+                relationshipsManager,
+                handler, metadata, context));
+        addPackageHandler(new BodyContentHandler(
+                
"application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml",
+                relationshipsManager,
+                handler, metadata, context));
+        addPackageHandler(new BodyContentHandler(
+                
"application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml",
+                relationshipsManager,
+                handler, metadata, context));
+        addPackageHandler(new BodyContentHandler(
+                
"application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml",
+                relationshipsManager,
+                handler, metadata, context));
+
+        addPackageHandler(new CorePropertiesHandler(metadata));
+        addPackageHandler(new ExtendedPropertiesHandler(metadata));
+        binaryDataHandler = new BinaryDataHandler(handler, metadata, context);
+    }
+
+    private void addPackageHandler(PartHandler partHandler) {
+        partHandlers.put(partHandler.getPartContentType(), partHandler);
+    }
+
+
+    @Override
+    public void startDocument() throws SAXException {
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws 
SAXException {
+    }
+
+    @Override
+    public void endPrefixMapping(String prefix) throws SAXException {
+
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, 
Attributes atts) throws SAXException {
+        if (uri.equals(PKG_NS) && localName.equals("part")) {
+            //start of a package
+            String name = atts.getValue(PKG_NS, "name");
+            String contentType = atts.getValue(PKG_NS, "contentType");
+            currentPartHandler = partHandlers.get(contentType);
+            //for now treat every unknown part type
+            //as if it contained binary data
+            if (currentPartHandler == null) {
+                currentPartHandler = binaryDataHandler;
+            }
+            if (currentPartHandler != null) {
+                currentPartHandler.setName(name);
+            }
+        } else if (currentPartHandler != null) {
+            currentPartHandler.startElement(uri, localName, qName, atts);
+        }
+
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws 
SAXException {
+        if (uri.equals(PKG_NS) && localName.equals("part")) {
+            //do post processing
+            if (currentPartHandler != null) {
+                try {
+                    currentPartHandler.endPart();
+                } catch (TikaException e) {
+                    throw new SAXException(e);
+                }
+            }
+            //then reset
+            currentPartHandler = null;
+        } else if (currentPartHandler != null) {
+            currentPartHandler.endElement(uri, localName, qName);
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws 
SAXException {
+        if (currentPartHandler != null) {
+            currentPartHandler.characters(ch, start, length);
+        }
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws 
SAXException {
+        if (currentPartHandler != null) {
+            currentPartHandler.characters(ch, start, length);
+        }
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParser.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParser.java
new file mode 100644
index 0000000..4609bf5
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParser.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+
+public class Word2006MLParser extends AbstractParser {
+
+    protected static final Set<MediaType> SUPPORTED_TYPES = 
Collections.singleton(
+                    MediaType.application("vnd.ms-word2006ml"));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context) throws 
IOException, SAXException, TikaException {
+        final XHTMLContentHandler xhtml =
+                new XHTMLContentHandler(handler, metadata);
+
+        xhtml.startDocument();
+
+        try {
+            context.getSAXParser().parse(
+                    new CloseShieldInputStream(stream),
+                    new OfflineContentHandler(new EmbeddedContentHandler(
+                            new Word2006MLHandler(xhtml, metadata, context))));
+        } catch (SAXException e) {
+            throw new TikaException("XML parse error", e);
+        } finally {
+            xhtml.endDocument();
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 
b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 6ed2f6c..fcd5840 100644
--- 
a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ 
b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -42,6 +42,7 @@ org.apache.tika.parser.microsoft.OfficeParser
 org.apache.tika.parser.microsoft.OldExcelParser
 org.apache.tika.parser.microsoft.TNEFParser
 org.apache.tika.parser.microsoft.ooxml.OOXMLParser
+org.apache.tika.parser.microsoft.ooxml.xwpf.Word2006MLParser
 org.apache.tika.parser.microsoft.xml.WordMLParser
 org.apache.tika.parser.microsoft.xml.SpreadsheetMLParser
 org.apache.tika.parser.mp3.Mp3Parser

http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java
new file mode 100644
index 0000000..607e6ef
--- /dev/null
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.List;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.microsoft.MSOfficeParserConfig;
+import org.junit.Test;
+
+
+/**
+ * Tests for the Word 2006 XML parser, run against testWORD_2006ml.xml.
+ */
+public class Word2006MLParserTest extends TikaTest {
+
+    /**
+     * Checks the metadata of the container document and the text extracted
+     * from its body, notes, headers, footers, comments and glossary.
+     */
+    @Test
+    public void basicTest() throws Exception {
+
+        List<Metadata> metadataList = getRecursiveMetadata("testWORD_2006ml.xml");
+
+        assertEquals(5, metadataList.size());
+
+        Metadata m = metadataList.get(0);
+
+        assertEquals("2016-11-23T12:07:00Z", m.get(TikaCoreProperties.CREATED));
+        assertEquals("2016-11-23T12:07:00Z", m.get(TikaCoreProperties.MODIFIED));
+        assertEquals("My Document Title", m.get(TikaCoreProperties.TITLE));
+        assertEquals("This is the Author", m.get(TikaCoreProperties.CREATOR));
+        assertEquals("2", m.get(OfficeOpenXMLCore.REVISION));
+        assertEquals("Allison, Timothy B.", m.get(OfficeOpenXMLCore.LAST_MODIFIED_BY));
+        assertEquals("0", m.get(OfficeOpenXMLExtended.DOC_SECURITY));
+        assertEquals("225", m.get(Office.WORD_COUNT));
+        assertEquals("3", m.get(Office.PARAGRAPH_COUNT));
+        assertEquals("1506", m.get(Office.CHARACTER_COUNT_WITH_SPACES));
+        assertEquals("10", m.get(Office.LINE_COUNT));
+        assertEquals("16.0000", m.get(OfficeOpenXMLExtended.APP_VERSION));
+
+
+        String content = m.get(RecursiveParserWrapper.TIKA_CONTENT);
+
+
+        assertContainsCountTimes("engaging title page", content, 1);
+        assertContainsCountTimes("<p>This is the Author</p>", content, 1);
+        assertContainsCountTimes("<p>This is an engaging title page</p>", content, 1);
+
+        assertContains("<p>My Document Title</p>", content);
+        assertContains("<p>My Document Subtitle</p>", content);
+
+        assertContains("<p>\tHeading1\t3</p>", content);
+
+
+        //TODO: integrate numbering
+        assertContains("Really basic 2.", content);
+
+        assertContainsCountTimes("This is a text box", content, 1);
+
+        assertContains("<p>This is a hyperlink: <a href=\"http://tika.apache.org\">tika</a></p>", content);
+
+        assertContains("<p>This is a link to a local file: <a href=\"file:///C:\\data\\test.png\">test.png</a></p>", content);
+
+        assertContains("<p>This is          10 spaces</p>", content);
+
+        //caption
+        assertContains("<p>Table 1: Table1 Caption</p>", content);
+
+        //embedded table
+        //TODO: figure out how to handle embedded tables in html
+        assertContains("<p>Embedded table r1c1</p>", content);
+
+        //shape
+        assertContainsCountTimes("<p>This is text within a shape", content, 1);
+
+        //sdt rich text
+        assertContains("<p>Rich text content control", content);
+
+        //sdt simple text
+        assertContains("<p>Simple text content control", content);
+
+        //sdt repeating
+        assertContains("Repeating content", content);
+
+        //sdt dropdown
+        //TODO: get options for dropdown
+        assertContains("Drop down1", content);
+
+        //sdt date
+        assertContains("<p>11/16/2016</p>", content);
+
+        //test that <tab/> works
+        assertContains("tab\ttab", content);
+
+        assertContainsCountTimes("serious word art", content, 1);
+        assertContainsCountTimes("Wordartr1c1", content, 1);
+
+        //glossary document contents
+        assertContains("Click or tap to enter a date", content);
+
+        //basic formatting
+        assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over",
+                content);
+
+        //TODO: add chart parsing
+//        assertContains("This is the chart", content);
+
+        assertContains("This is a comment", content);
+
+        assertContains("This is an endnote", content);
+
+        assertContains("this is the footnote", content);
+
+        assertContains("First page header", content);
+
+        assertContains("Even page header", content);
+
+        assertContains("Odd page header", content);
+
+        assertContains("First page footer", content);
+
+        assertContains("Even page footer", content);
+
+        assertContains("Odd page footer", content);
+
+        //test default includes deleted
+        assertContains("frog", content);
+
+        assertContains("Mattmann", content);
+
+        //TODO: extract this...Note that it is in "Backup" not "Choice"!!!
+//        assertContains("This is the chart title", content);
+
+    }
+
+    /**
+     * Asserts that <code>needle</code> occurs exactly
+     * <code>expectedCount</code> times in <code>haystack</code>. Occurrences
+     * are searched for from the character after the start of the previous
+     * match, so overlapping matches are counted.
+     *
+     * @param needle text to look for; must not be empty
+     * @param haystack text to search in
+     * @param expectedCount number of occurrences that must be found
+     */
+    private void assertContainsCountTimes(String needle, String haystack, int expectedCount) {
+        int cnt = 0;
+        // search for the needle that was passed in, not a fixed phrase
+        for (int i = haystack.indexOf(needle); i > -1; i = haystack.indexOf(needle, i + 1)) {
+            cnt++;
+        }
+        assertEquals("found needle >"+ needle+"<"+cnt+" times instead of expected: "+expectedCount,
+                expectedCount, cnt);
+
+    }
+
+    /**
+     * With deleted content switched off in the parser config, the deleted
+     * word "frog" must not appear in the output.
+     */
+    @Test
+    public void testSkipDeleted() throws Exception {
+        ParseContext pc = new ParseContext();
+        MSOfficeParserConfig msOfficeParserConfig = new MSOfficeParserConfig();
+        msOfficeParserConfig.setIncludeDeletedContent(false);
+        pc.set(MSOfficeParserConfig.class, msOfficeParserConfig);
+
+        XMLResult r = getXML("testWORD_2006ml.xml", pc);
+        assertNotContained("frog", r.xml);
+    }
+
+}

Reply via email to