TIKA-2179 -- add detection and parsing for word2006ml files
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/81fad8c9 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/81fad8c9 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/81fad8c9 Branch: refs/heads/master Commit: 81fad8c97e60a3de7d926dc4ce10cbd235549583 Parents: a9a9e08 Author: tballison <[email protected]> Authored: Wed Nov 23 14:04:50 2016 -0500 Committer: tballison <[email protected]> Committed: Wed Nov 23 14:04:50 2016 -0500 ---------------------------------------------------------------------- CHANGES.txt | 2 + .../org/apache/tika/mime/tika-mimetypes.xml | 6 + .../parser/microsoft/MSOfficeParserConfig.java | 38 + .../microsoft/ooxml/xwpf/BinaryDataHandler.java | 120 ++ .../ooxml/xwpf/BodyContentHandler.java | 271 +++ .../ooxml/xwpf/CorePropertiesHandler.java | 144 ++ .../ooxml/xwpf/ExtendedPropertiesHandler.java | 67 + .../microsoft/ooxml/xwpf/PartHandler.java | 43 + .../microsoft/ooxml/xwpf/Relationship.java | 52 + .../ooxml/xwpf/RelationshipsHandler.java | 86 + .../ooxml/xwpf/RelationshipsManager.java | 58 + .../microsoft/ooxml/xwpf/Word2006MLHandler.java | 168 ++ .../microsoft/ooxml/xwpf/Word2006MLParser.java | 67 + .../services/org.apache.tika.parser.Parser | 1 + .../ooxml/xwpf/Word2006MLParserTest.java | 182 ++ .../test-documents/testWORD_2003ml.xml | 1042 +++++++++++ .../test-documents/testWORD_2006ml.xml | 1678 ++++++++++++++++++ .../test-documents/testWORD_2006ml_src.docx | Bin 0 -> 99960 bytes 18 files changed, 4025 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index 0c4afca..3bda350 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,7 @@ Release 1.15 - ?? + * Add mime detection and parser for Word 2006ML format (TIKA-2179). 
+ * Upgrade to POI 3.16-beta1 (TIKA-2116). * Allow configuration of timeout for ForkParser (TIKA-2170). http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml ---------------------------------------------------------------------- diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index fffb9bb..30068da 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -645,6 +645,12 @@ <_comment>Word 2003 xml format, pre-ooxml</_comment> <_comment>glob pattern typically *.doc</_comment> </mime-type> + <mime-type type="application/vnd.ms-word2006ml"> + <root-XML localName="package" namespaceURI="http://schemas.microsoft.com/office/2006/xmlPackage"/> + <sub-class-of type="application/xml"/> + <_comment>Word 2006 xml format, pre-ooxml</_comment> + <_comment>glob pattern typically *.xml</_comment> + </mime-type> <mime-type type="application/rdf+xml"> <root-XML localName="RDF"/> http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOfficeParserConfig.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOfficeParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOfficeParserConfig.java new file mode 100644 index 0000000..8f8086a --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOfficeParserConfig.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
/**
 * Configuration holder for the Microsoft Office parsers.
 *
 * <p>Currently carries a single switch that controls whether deleted
 * (tracked-change) content is included in the extracted output. The switch
 * defaults to {@code true}.</p>
 */
public class MSOfficeParserConfig {

    /** {@code true} (the default) when deleted content should be extracted. */
    private boolean includeDeletedContent = true;

    /**
     * Reports whether the parser should include deleted content.
     *
     * @return the current setting; {@code true} unless changed via
     *         {@link #setIncludeDeletedContent(boolean)}
     */
    public boolean getIncludeDeletedContent() {
        return this.includeDeletedContent;
    }

    /**
     * Sets whether or not the parser should include deleted content.
     * <b>This has not been implemented in all MSOffice parsers yet!!!</b>
     *
     * @param includeDeletedContent {@code true} to include deleted content,
     *                              {@code false} to leave it out
     */
    public void setIncludeDeletedContent(boolean includeDeletedContent) {
        this.includeDeletedContent = includeDeletedContent;
    }
}
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.microsoft.ooxml.xwpf; + + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; + +import org.apache.commons.codec.binary.Base64; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +class BinaryDataHandler extends PartHandler { + + private final XHTMLContentHandler handler; + private final Metadata metadata; + private final ParseContext parseContext; + + private boolean inBinaryData = false; + private StringBuilder buffer = new StringBuilder(); + + final Base64 base64 = new Base64(); + + + public BinaryDataHandler(XHTMLContentHandler handler, Metadata metadata, ParseContext context) { + this.handler = handler; + this.metadata = metadata; + this.parseContext = context; + } + + + @Override + public void startDocument() throws SAXException { + } + + @Override + public void endDocument() throws SAXException { + + } + + @Override + void endPart() throws SAXException, TikaException { + if (hasData()) { + EmbeddedDocumentExtractor embeddedDocumentExtractor = + 
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext); + Metadata embeddedMetadata = new Metadata(); + try (TikaInputStream stream = TikaInputStream.get(getInputStream())) { + embeddedDocumentExtractor.parseEmbedded(stream, handler, embeddedMetadata, false); + } catch (IOException e) { + throw new TikaException("error in finishing part", e); + } + buffer.setLength(0); + } + + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { + + if (uri.equals(Word2006MLHandler.PKG_NS) && localName.equals("binaryData")) { + inBinaryData = true; + } + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + if (uri.equals(Word2006MLHandler.PKG_NS) && localName.equals("binaryData")) { + inBinaryData = false; + } + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + if (inBinaryData) { + buffer.append(ch, start, length); + } + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + + } + + @Override + public String getPartContentType() { + return ""; + } + + boolean hasData() { + return buffer.length() > 0; + } + + private InputStream getInputStream() { + byte[] bytes = base64.decode(buffer.toString()); + return new ByteArrayInputStream(bytes); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BodyContentHandler.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BodyContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BodyContentHandler.java new file mode 100644 index 0000000..ea16191 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BodyContentHandler.java @@ -0,0 +1,271 @@ +/* + 
* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.microsoft.ooxml.xwpf; + + +import org.apache.poi.xwpf.usermodel.XWPFRelation; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.microsoft.MSOfficeParserConfig; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +/** + * This class is intended to handle anything that might contain IBodyElements: + * main document, headers, footers, notes, etc. 
+ */ + +class BodyContentHandler extends PartHandler { + + + private enum EditType{ + NONE, + INSERT, + DELETE + }; + + private final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; + private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006"; + private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"; + + private final static char[] TAB = new char[1]; + + static { + TAB[0] = '\t'; + } + + private final String partName; + private final RelationshipsManager relationshipsManager; + private final XHTMLContentHandler handler; + private final Metadata metadata; + private final ParseContext parseContext; + private final boolean includeDeletedContent; + + private boolean inR = false; + private boolean inT = false; + private boolean inRPr = false; + private boolean inDelText = false; + private boolean inAlternateContent = false; //in alternate content section + private boolean inACChoice = false; //if in alternate, choice or fallback? 
+ private boolean inACFallback = false; + private boolean hasWrittenAHref = false; + private boolean hasWrittenFormatting = false; + private String editAuthor = null; + private String editDate = null; + private EditType editType = EditType.NONE; + private String hyperlink = null; + + private TmpFormatting currFormat = new TmpFormatting(); + + public BodyContentHandler(String partName, RelationshipsManager relationshipsManager, + XHTMLContentHandler handler, Metadata metadata, ParseContext context) { + this.partName = partName; + this.relationshipsManager = relationshipsManager; + this.handler = handler; + this.metadata = metadata; + this.parseContext = context; + MSOfficeParserConfig config = context.get(MSOfficeParserConfig.class); + boolean tmpIncludeDeleted = true; + if (config != null) { + tmpIncludeDeleted = config.getIncludeDeletedContent(); + } + includeDeletedContent = tmpIncludeDeleted; + } + + + @Override + public void startDocument() throws SAXException { + } + + @Override + public void endDocument() throws SAXException { + } + + @Override + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + + @Override + public void endPrefixMapping(String prefix) throws SAXException { + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { + if (uri.equals(MC_NS)) { + if (localName.equals("AlternateContent")) { + inAlternateContent = true; + } else if (localName.equals("Choice")) { + inACChoice = true; + } else if (localName.equals("Fallback")) { + inACFallback = true; + } + } + if (inACFallback) { + return; + } + + if (uri.equals(W_NS)) { + if (localName.equals("p")) { + handler.startElement("p"); + } else if (localName.equals("r")) { + inR = true; + } else if (localName.equals("t")) { + inT = true; + } else if (localName.equals("tab")) { + handler.characters(TAB, 0, 1); + } else if (localName.equals("tbl")) { + handler.startElement("table"); + } else if 
(localName.equals("tc")) { + handler.startElement("td"); + } else if (localName.equals("tr")) { + handler.startElement("tr"); + } else if (localName.equals("rPr")) { + inRPr = true; + } else if (inR && inRPr && localName.equals("i")) { + //rprs don't have to be inR; ignore those that aren't + currFormat.italics = true; + } else if (inR && inRPr && localName.equals("b")) { + currFormat.bold = true; + } else if (localName.equals("delText")) { + inDelText = true; + } else if (localName.equals("ins")) { + editAuthor = atts.getValue(W_NS, "author"); + editDate = atts.getValue(W_NS, "date"); + editType = EditType.INSERT; + } else if (localName.equals("del")) { + editAuthor = atts.getValue(W_NS, "author"); + editDate = atts.getValue(W_NS, "date"); + editType = EditType.DELETE; + } else if (localName.equals("hyperlink")) { + String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id"); + if (hyperlinkId != null) { + Relationship relationship = relationshipsManager.getRelationship(getName(), hyperlinkId); + if (relationship != null && XWPFRelation.HYPERLINK.getRelation().equals(relationship.getContentType())) { + hyperlink = relationship.getTarget(); + handler.startElement("a", "href", hyperlink); + hasWrittenAHref = true; + } + } + } + } + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + if (uri.equals(MC_NS)) { + if (localName.equals("AlternateContent")) { + inAlternateContent = false; + } else if (localName.equals("Choice")) { + inACChoice = false; + } else if (localName.equals("Fallback")) { + inACFallback = false; + } + } + if (uri.equals(W_NS)) { + if (inACFallback) { + return; + } + if (localName.equals("p")) { + handler.endElement("p"); + } else if (localName.equals("r")) { + closeStyleTags(); + inR = false; + hasWrittenFormatting = false; + } else if (localName.equals("t")) { + inT = false; + } else if (localName.equals("tbl")) { + handler.endElement("table"); + } else if (localName.equals("tc")) { 
+ handler.endElement("td"); + } else if (localName.equals("tr")) { + handler.endElement("tr"); + } else if (localName.equals("rPr")) { + inRPr = false; + } else if (localName.equals("delText")) { + inDelText = false; + } else if (localName.equals("ins") || localName.equals("del")) { + editType = EditType.NONE; + editAuthor = null; + editDate = null; + } else if (localName.equals("hyperlink") && hasWrittenAHref) { + handler.endElement("a"); + hasWrittenAHref = false; + } + } + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + if (inACFallback) { + return; + } + + if (inR && !hasWrittenFormatting) { + if (currFormat.bold) { + handler.startElement("b"); + } + if (currFormat.italics) { + handler.startElement("i"); + } + hasWrittenFormatting = true; + } + if (inT) { + handler.characters(ch, start, length); + } else if (includeDeletedContent && inDelText) { + handler.characters(ch, start, length); + } + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + if (inACFallback) { + return; + } + + if (inT) { + handler.characters(ch, start, length); + } + } + + @Override + public String getPartContentType() { + return partName; + } + + + + void closeStyleTags() throws SAXException { + if (hasWrittenFormatting) { + if (currFormat.italics) { + handler.endElement("i"); + } + if (currFormat.bold) { + handler.endElement("b"); + } + } + + currFormat.bold = false; + currFormat.italics = false; + } + + private class TmpFormatting { + boolean italics = false; + boolean bold = false; + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/CorePropertiesHandler.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/CorePropertiesHandler.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/CorePropertiesHandler.java new file mode 100644 index 0000000..b0bca08 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/CorePropertiesHandler.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.microsoft.ooxml.xwpf; + + +import java.util.HashMap; +import java.util.Map; + +import org.apache.poi.openxml4j.opc.ContentTypes; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.OfficeOpenXMLCore; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +class CorePropertiesHandler extends PartHandler { + + final static String DC_NS = "http://purl.org/dc/elements/1.1"; + final static String DC_TERMS_NS = "http://purl.org/dc/terms"; + final static String CP_NS = "http://schemas.openxmlformats.org/package/2006/metadata/core-properties"; + + private final Metadata metadata; + + final StringBuilder buffer = new StringBuilder(); + final Map<String, Map<String, Property>> properties = new HashMap<>(); + + public CorePropertiesHandler(Metadata metadata) { + this.metadata = metadata; + addProperties(); + } + + void addProperties() { + Map<String, Property> dc = properties.get(DC_NS); + if (dc == null) { + dc = new HashMap<>(); + } + dc.put("creator", TikaCoreProperties.CREATOR); + dc.put("title", TikaCoreProperties.TITLE); + dc.put("description", TikaCoreProperties.DESCRIPTION); + properties.put(DC_NS, dc); + + Map<String, Property> dcTerms = properties.get(DC_TERMS_NS); + if (dcTerms == null) { + dcTerms = new HashMap<>(); + } + dcTerms.put("created", TikaCoreProperties.CREATED); + dcTerms.put("modified", TikaCoreProperties.MODIFIED); + + properties.put(DC_TERMS_NS, dcTerms); + + Map<String, Property> cp = properties.get(CP_NS); + if (cp == null) { + cp = new HashMap<>(); + } + cp.put("category", OfficeOpenXMLCore.CATEGORY); + cp.put("contentStatus", OfficeOpenXMLCore.CONTENT_STATUS); + cp.put("lastModifiedBy", OfficeOpenXMLCore.LAST_MODIFIED_BY); + cp.put("lastPrinted", OfficeOpenXMLCore.LAST_PRINTED); + cp.put("revision", OfficeOpenXMLCore.REVISION); + cp.put("subject", OfficeOpenXMLCore.SUBJECT); + 
cp.put("version", OfficeOpenXMLCore.VERSION); + properties.put(CP_NS, cp); + } + + @Override + public void startDocument() throws SAXException { + } + + @Override + public void endDocument() throws SAXException { + buffer.setLength(0); + } + + @Override + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + + @Override + public void endPrefixMapping(String prefix) throws SAXException { + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { + + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + Property prop = getProperty(uri, localName); + if (prop != null) { + + if (prop.isMultiValuePermitted()) { + metadata.add(prop, buffer.toString()); + } else { + metadata.set(prop, buffer.toString()); + } + } + buffer.setLength(0); + + } + + private Property getProperty(String uri, String localName) { + if (uri.endsWith("/")) { + uri = uri.substring(0, uri.length()-1); + } + + Map<String, Property> m = properties.get(uri); + if (m != null) { + return m.get(localName); + } + return null; + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + buffer.append(ch, start, length); + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + buffer.append(ch, start, length); + } + + @Override + public String getPartContentType() { + return ContentTypes.CORE_PROPERTIES_PART; + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ExtendedPropertiesHandler.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ExtendedPropertiesHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ExtendedPropertiesHandler.java new file 
mode 100644 index 0000000..07e5e23 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ExtendedPropertiesHandler.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.microsoft.ooxml.xwpf; + + +import java.util.HashMap; +import java.util.Map; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; +import org.apache.tika.metadata.OfficeOpenXMLExtended; +import org.apache.tika.metadata.Property; + +class ExtendedPropertiesHandler extends CorePropertiesHandler { + + final static String EP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"; + + public ExtendedPropertiesHandler(Metadata metadata) { + super(metadata); + } + + @Override + void addProperties() { + Map<String, Property> ep = properties.get(EP_NS); + if (ep == null) { + ep = new HashMap<>(); + } + ep.put("AppVersion", OfficeOpenXMLExtended.APP_VERSION); + ep.put("Application", OfficeOpenXMLExtended.APPLICATION); + ep.put("Comments", OfficeOpenXMLExtended.COMMENTS); + ep.put("Company", OfficeOpenXMLExtended.COMPANY); + ep.put("DocSecurity", OfficeOpenXMLExtended.DOC_SECURITY); + ep.put("HiddenSlides", 
OfficeOpenXMLExtended.HIDDEN_SLIDES); + ep.put("Manager", OfficeOpenXMLExtended.MANAGER); + ep.put("Notes", OfficeOpenXMLExtended.NOTES); + ep.put("PresentationFormat", OfficeOpenXMLExtended.PRESENTATION_FORMAT); + ep.put("Template", OfficeOpenXMLExtended.TEMPLATE); + ep.put("TotalTime", OfficeOpenXMLExtended.TOTAL_TIME); + ep.put("Pages", Office.PAGE_COUNT); + ep.put("Words", Office.WORD_COUNT); + ep.put("Characters", Office.CHARACTER_COUNT); + ep.put("CharactersWithSpaces", Office.CHARACTER_COUNT_WITH_SPACES); + ep.put("Paragraphs", Office.PARAGRAPH_COUNT); + ep.put("Lines", Office.LINE_COUNT); + properties.put(EP_NS, ep); + } + + @Override + public String getPartContentType() { + return "application/vnd.openxmlformats-officedocument.extended-properties+xml"; + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/PartHandler.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/PartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/PartHandler.java new file mode 100644 index 0000000..79bcafe --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/PartHandler.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml.xwpf; + +import org.apache.tika.exception.TikaException; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +abstract class PartHandler extends DefaultHandler { + + private String name; + + public abstract String getPartContentType(); + + public void setName(String name) { + this.name = name; + } + + public String getName() { + return name; + } + + /** + * Override this to flush buffers, etc if necessary + */ + void endPart() throws SAXException, TikaException { + + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Relationship.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Relationship.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Relationship.java new file mode 100644 index 0000000..19b0dd4 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Relationship.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.microsoft.ooxml.xwpf; + + +import org.apache.poi.openxml4j.opc.TargetMode; + +class Relationship { + + private final String contentType; + + private final String target; + + private final TargetMode targetMode; + + public Relationship(String contentType, String target) { + this(contentType, target, null); + } + + public Relationship(String contentType, String target, TargetMode targetMode) { + this.contentType = contentType; + this.target = target; + this.targetMode = targetMode; + } + + public String getContentType() { + return contentType; + } + + public String getTarget() { + return target; + } + + public TargetMode getTargetMode() { + return targetMode; + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsHandler.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsHandler.java new file mode 100644 index 0000000..211b048 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsHandler.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.microsoft.ooxml.xwpf; + + +import org.apache.poi.openxml4j.opc.ContentTypes; +import org.apache.poi.openxml4j.opc.TargetMode; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +class RelationshipsHandler extends PartHandler { + + final static String REL_NS = "http://schemas.openxmlformats.org/package/2006/relationships"; + + private final RelationshipsManager relationshipsManager; + + public RelationshipsHandler(RelationshipsManager relationshipsManager) { + this.relationshipsManager = relationshipsManager; + } + + + @Override + public void startDocument() throws SAXException { + } + + @Override + public void endDocument() throws SAXException { + } + + @Override + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + + @Override + public void endPrefixMapping(String prefix) throws SAXException { + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { + if (uri.equals(REL_NS)) { + if (localName.equals("Relationship")) { + String id = atts.getValue("", "Id"); + String type = atts.getValue("", "Type"); + String target = atts.getValue("", "Target"); + String targetModeString = atts.getValue("", "TargetMode"); + TargetMode targetMode = "EXTERNAL".equals(targetModeString)? 
TargetMode.EXTERNAL : + TargetMode.INTERNAL; + relationshipsManager.addRelationship(getName(), id, type, target, targetMode); + } + } + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + + } + + @Override + public String getPartContentType() { + return ContentTypes.RELATIONSHIPS_PART; + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsManager.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsManager.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsManager.java new file mode 100644 index 0000000..d1954ac --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsManager.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.microsoft.ooxml.xwpf; + + +import java.util.HashMap; +import java.util.Map; + +import org.apache.poi.openxml4j.opc.TargetMode; + +class RelationshipsManager { + + Map<String, Map<String, Relationship>> map = new HashMap<>(); + + public void addRelationship(String relsFileName, String id, String type, String target, TargetMode targetMode) { + String packageName = convertRelsFileNameToPackageName(relsFileName); + Map<String, Relationship> thisPackageRels = map.get(packageName); + if (thisPackageRels == null) { + thisPackageRels = new HashMap<>(); + } + thisPackageRels.put(id, new Relationship(type, target, targetMode)); + map.put(packageName, thisPackageRels); + } + + public Relationship getRelationship(String packageName, String id) { + Map<String, Relationship> thisPackageRels = map.get(packageName); + if (thisPackageRels != null) { + return thisPackageRels.get(id); + } + return null; + } + + private String convertRelsFileNameToPackageName(String relsFileName) { + if ("/_rels/.rels".equals(relsFileName)) { + return "/"; + } + + String tmp = relsFileName; + tmp = tmp.replaceFirst("\\/_rels\\/", "/"); + tmp = tmp.replaceFirst(".rels\\Z", ""); + return tmp; + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLHandler.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLHandler.java new file mode 100644 index 0000000..cf919cc --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLHandler.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.microsoft.ooxml.xwpf; + + +import java.util.HashMap; +import java.util.Map; + +import org.apache.poi.xwpf.usermodel.XWPFRelation; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +class Word2006MLHandler extends DefaultHandler { + + final static String PKG_NS = "http://schemas.microsoft.com/office/2006/xmlPackage"; + + + private final XHTMLContentHandler handler; + private final Metadata metadata; + private final ParseContext parseContext; + + private final Map<String, PartHandler> partHandlers = new HashMap<>(); + private final BinaryDataHandler binaryDataHandler; + private final RelationshipsManager relationshipsManager = new RelationshipsManager(); + private PartHandler currentPartHandler = null; + + public Word2006MLHandler(XHTMLContentHandler handler, Metadata metadata, ParseContext context) { + this.handler = handler; + this.metadata = metadata; + this.parseContext = context; + + addPackageHandler(new RelationshipsHandler(relationshipsManager)); + + addPackageHandler(new BodyContentHandler( + 
XWPFRelation.DOCUMENT.getContentType(), + relationshipsManager, + handler, metadata, context)); + addPackageHandler(new BodyContentHandler( + XWPFRelation.FOOTNOTE.getContentType(), + relationshipsManager, + handler, metadata, context)); + addPackageHandler(new BodyContentHandler( + "application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml", + relationshipsManager, + handler, metadata, context)); + addPackageHandler(new BodyContentHandler( + XWPFRelation.HEADER.getContentType(), + relationshipsManager, + handler, metadata, context)); + addPackageHandler(new BodyContentHandler( + XWPFRelation.FOOTER.getContentType(), + relationshipsManager, + handler, metadata, context)); + addPackageHandler(new BodyContentHandler( + "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml", + relationshipsManager, + handler, metadata, context)); + addPackageHandler(new BodyContentHandler( + "application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml", + relationshipsManager, + handler, metadata, context)); + addPackageHandler(new BodyContentHandler( + "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml", + relationshipsManager, + handler, metadata, context)); + + addPackageHandler(new CorePropertiesHandler(metadata)); + addPackageHandler(new ExtendedPropertiesHandler(metadata)); + binaryDataHandler = new BinaryDataHandler(handler, metadata, context); + } + + private void addPackageHandler(PartHandler partHandler) { + partHandlers.put(partHandler.getPartContentType(), partHandler); + } + + + @Override + public void startDocument() throws SAXException { + } + + @Override + public void endDocument() throws SAXException { + } + + @Override + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + + @Override + public void endPrefixMapping(String prefix) throws SAXException { + + } + + @Override + public void startElement(String uri, String localName, String qName, 
Attributes atts) throws SAXException { + if (uri.equals(PKG_NS) && localName.equals("part")) { + //start of a package + String name = atts.getValue(PKG_NS, "name"); + String contentType = atts.getValue(PKG_NS, "contentType"); + currentPartHandler = partHandlers.get(contentType); + //for now treat every unknown part type + //as if it contained binary data + if (currentPartHandler == null) { + currentPartHandler = binaryDataHandler; + } + if (currentPartHandler != null) { + currentPartHandler.setName(name); + } + } else if (currentPartHandler != null) { + currentPartHandler.startElement(uri, localName, qName, atts); + } + + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + if (uri.equals(PKG_NS) && localName.equals("part")) { + //do post processing + if (currentPartHandler != null) { + try { + currentPartHandler.endPart(); + } catch (TikaException e) { + throw new SAXException(e); + } + } + //then reset + currentPartHandler = null; + } else if (currentPartHandler != null) { + currentPartHandler.endElement(uri, localName, qName); + } + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + if (currentPartHandler != null) { + currentPartHandler.characters(ch, start, length); + } + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + if (currentPartHandler != null) { + currentPartHandler.characters(ch, start, length); + } + + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParser.java new file mode 100644 index 0000000..4609bf5 --- /dev/null +++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParser.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.microsoft.ooxml.xwpf; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.OfflineContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + + +public class Word2006MLParser extends AbstractParser { + + protected static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton( + MediaType.application("vnd.ms-word2006ml")); + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, + Metadata 
metadata, ParseContext context) throws IOException, SAXException, TikaException { + final XHTMLContentHandler xhtml = + new XHTMLContentHandler(handler, metadata); + + xhtml.startDocument(); + + try { + context.getSAXParser().parse( + new CloseShieldInputStream(stream), + new OfflineContentHandler(new EmbeddedContentHandler( + new Word2006MLHandler(xhtml, metadata, context)))); + } catch (SAXException e) { + throw new TikaException("XML parse error", e); + } finally { + xhtml.endDocument(); + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser index 6ed2f6c..fcd5840 100644 --- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser +++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser @@ -42,6 +42,7 @@ org.apache.tika.parser.microsoft.OfficeParser org.apache.tika.parser.microsoft.OldExcelParser org.apache.tika.parser.microsoft.TNEFParser org.apache.tika.parser.microsoft.ooxml.OOXMLParser +org.apache.tika.parser.microsoft.ooxml.xwpf.Word2006MLParser org.apache.tika.parser.microsoft.xml.WordMLParser org.apache.tika.parser.microsoft.xml.SpreadsheetMLParser org.apache.tika.parser.mp3.Mp3Parser http://git-wip-us.apache.org/repos/asf/tika/blob/81fad8c9/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java new file mode 100644 index 0000000..607e6ef --- /dev/null 
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java @@ -0,0 +1,182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.microsoft.ooxml.xwpf; + +import static org.junit.Assert.assertEquals; + +import java.util.List; + +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; +import org.apache.tika.metadata.OfficeOpenXMLCore; +import org.apache.tika.metadata.OfficeOpenXMLExtended; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.parser.microsoft.MSOfficeParserConfig; +import org.junit.Test; + + +public class Word2006MLParserTest extends TikaTest { + + @Test + public void basicTest() throws Exception { + + + + List<Metadata> metadataList = getRecursiveMetadata("testWORD_2006ml.xml"); + + assertEquals(5, metadataList.size()); + + Metadata m = metadataList.get(0); + + assertEquals("2016-11-23T12:07:00Z", m.get(TikaCoreProperties.CREATED)); + assertEquals("2016-11-23T12:07:00Z", m.get(TikaCoreProperties.MODIFIED)); + assertEquals("My Document Title", 
m.get(TikaCoreProperties.TITLE)); + assertEquals("This is the Author", m.get(TikaCoreProperties.CREATOR)); + assertEquals("2", m.get(OfficeOpenXMLCore.REVISION)); + assertEquals("Allison, Timothy B.", m.get(OfficeOpenXMLCore.LAST_MODIFIED_BY)); + assertEquals("0", m.get(OfficeOpenXMLExtended.DOC_SECURITY)); + assertEquals("225", m.get(Office.WORD_COUNT)); + assertEquals("3", m.get(Office.PARAGRAPH_COUNT)); + assertEquals("1506", m.get(Office.CHARACTER_COUNT_WITH_SPACES)); + assertEquals("10", m.get(Office.LINE_COUNT)); + assertEquals("16.0000", m.get(OfficeOpenXMLExtended.APP_VERSION)); + + + String content = m.get(RecursiveParserWrapper.TIKA_CONTENT); + + + assertContainsCountTimes("engaging title page", content, 1); + assertContainsCountTimes("<p>This is the Author</p>", content, 1); + assertContainsCountTimes("<p>This is an engaging title page</p>", content, 1); + + assertContains("<p>My Document Title</p>", content); + assertContains("<p>My Document Subtitle</p>", content); + + assertContains("<p>\tHeading1\t3</p>", content); + + + //TODO: integrate numbering + assertContains("Really basic 2.", content); + + assertContainsCountTimes("This is a text box", content, 1); + + assertContains("<p>This is a hyperlink: <a href=\"http://tika.apache.org\">tika</a></p>", content); + + assertContains("<p>This is a link to a local file: <a href=\"file:///C:\\data\\test.png\">test.png</a></p>", content); + + assertContains("<p>This is 10 spaces</p>", content); + + //caption + assertContains("<p>Table 1: Table1 Caption</p>", content); + + //embedded table + //TODO: figure out how to handle embedded tables in html + assertContains("<p>Embedded table r1c1</p>", content); + + //shape + assertContainsCountTimes("<p>This is text within a shape", content, 1); + + //sdt rich text + assertContains("<p>Rich text content control", content); + + //sdt simple text + assertContains("<p>Simple text content control", content); + + //sdt repeating + assertContains("Repeating content", 
content); + + //sdt dropdown + //TODO: get options for dropdown + assertContains("Drop down1", content); + + //sdt date + assertContains("<p>11/16/2016</p>", content); + + //test that <tab/> works + assertContains("tab\ttab", content); + + assertContainsCountTimes("serious word art", content, 1); + assertContainsCountTimes("Wordartr1c1", content, 1); + + //glossary document contents + assertContains("Click or tap to enter a date", content); + + //basic formatting + assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over", + content); + + //TODO: add chart parsing +// assertContains("This is the chart", content); + + assertContains("This is a comment", content); + + assertContains("This is an endnote", content); + + assertContains("this is the footnote", content); + + assertContains("First page header", content); + + assertContains("Even page header", content); + + assertContains("Odd page header", content); + + assertContains("First page footer", content); + + assertContains("Even page footer", content); + + assertContains("Odd page footer", content); + + //test default includes deleted + assertContains("frog", content); + + assertContains("Mattmann", content); + + //TODO: extract this...Note that it is in "Backup" not "Choice"!!! 
+// assertContains("This is the chart title", content); + + + + } + + private void assertContainsCountTimes(String needle, String haystack, int expectedCount) { + int i = haystack.indexOf("engaging title page"); + int cnt = 0; + while (i > -1) { + cnt++; + i = haystack.indexOf("engaging title page", i+1); + } + assertEquals("found needle >"+ needle+"<"+cnt+" times instead of expected: "+expectedCount, + expectedCount, cnt); + + } + + @Test + public void testSkipDeleted() throws Exception { + ParseContext pc = new ParseContext(); + MSOfficeParserConfig msOfficeParserConfig = new MSOfficeParserConfig(); + msOfficeParserConfig.setIncludeDeletedContent(false); + pc.set(MSOfficeParserConfig.class, msOfficeParserConfig); + + XMLResult r = getXML("testWORD_2006ml.xml", pc); + assertNotContained("frog", r.xml); + } + +}
