Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,231 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.iwork;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.util.HashMap;
import java.util.Map;

/**
 * SAX handler that translates the XML of a Numbers document into XHTML
 * events and metadata entries.
 * <p>
 * Every {@code ls:workspace} element is counted as a sheet and reported as a
 * {@code div}. Inside a workspace, each {@code sf:tabular-model} becomes a
 * {@code table}; cell elements ({@code sf:ct}, {@code sf:n}, {@code sf:rn},
 * {@code sf:proxied-cell-ref}) become {@code td} elements, and a new
 * {@code tr} is started whenever the column count announced by
 * {@code sf:grid} has been reached. Children of {@code sf:metadata} are
 * copied into the supplied {@link Metadata}.
 */
class NumbersContentHandler extends DefaultHandler {

    private final XHTMLContentHandler xhtml;
    private final Metadata metadata;

    /** True between the start and the end of an {@code ls:workspace} element. */
    private boolean inSheet = false;

    /** True while inside {@code sf:p}; character data is only forwarded then. */
    private boolean parseText = false;

    private boolean inMetadata = false;
    /** Metadata key of the property element currently open inside {@code sf:metadata}. */
    private Property metadataKey;
    /** Qualified name of the element that established {@link #metadataKey}. */
    private String metadataPropertyQName;

    private boolean inTable = false;
    private int numberOfSheets = 0;
    /** Column count of the current grid; -1 until an {@code sf:grid} has been seen. */
    private int numberOfColumns = -1;
    private int currentColumn = 0;

    /** Menu item ID -> display text, filled from {@code sf:t}/{@code sf:ct} pairs. */
    private Map<String, String> menuItems = new HashMap<String, String>();
    private String currentMenuItemId;

    NumbersContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
        this.xhtml = xhtml;
        this.metadata = metadata;
    }

    /** Records the number of workspaces seen as the page count. */
    @Override
    public void endDocument() throws SAXException {
        metadata.set(Metadata.PAGE_COUNT, String.valueOf(numberOfSheets));
    }

    /**
     * Dispatches on the qualified element name. Sheet, text and metadata
     * elements are handled everywhere; table, menu and chart elements are
     * only handled while inside an {@code ls:workspace}.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        if ("ls:workspace".equals(qName)) {
            inSheet = true;
            numberOfSheets++;
            xhtml.startElement("div");
            String sheetName = attributes.getValue("ls:workspace-name");
            metadata.add("sheetNames", sheetName);
        }

        if ("sf:text".equals(qName)) {
            xhtml.startElement("p");
        }

        if ("sf:p".equals(qName)) {
            parseText = true;
        }

        if ("sf:metadata".equals(qName)) {
            inMetadata = true;
            return;
        }

        // The first element opened inside sf:metadata names the property;
        // nested sf:string elements then carry its value(s).
        if (inMetadata && metadataKey == null) {
            metadataKey = resolveMetadataKey(localName);
            metadataPropertyQName = qName;
        }

        if (inMetadata && metadataKey != null && "sf:string".equals(qName)) {
            metadata.add(metadataKey, attributes.getValue("sfa:string"));
        }

        if (!inSheet) {
            return;
        }

        if ("sf:tabular-model".equals(qName)) {
            String tableName = attributes.getValue("sf:name");
            xhtml.startElement("div");
            xhtml.characters(tableName);
            xhtml.endElement("div");
            inTable = true;
            xhtml.startElement("table");
            xhtml.startElement("tr");
            currentColumn = 0;
        }

        if ("sf:menu-choices".equals(qName)) {
            menuItems = new HashMap<String, String>();
        }

        if (inTable && "sf:grid".equals(qName)) {
            // A missing or malformed count keeps the previous value instead
            // of aborting the whole parse with a NumberFormatException.
            numberOfColumns = parseColumnCount(attributes.getValue("sf:numcols"), numberOfColumns);
        }

        if ("sf:t".equals(qName)) {
            currentMenuItemId = attributes.getValue("sfa:ID");
        }

        if (currentMenuItemId != null && "sf:ct".equals(qName)) {
            menuItems.put(currentMenuItemId, attributes.getValue("sfa:s"));
        }

        if (inTable) {
            if ("sf:ct".equals(qName)) {
                emitCell(attributes.getValue("sfa:s"));
            } else if ("sf:n".equals(qName) || "sf:rn".equals(qName)) {
                emitCell(attributes.getValue("sf:v"));
            } else if ("sf:proxied-cell-ref".equals(qName)) {
                // The cell refers to a menu item recorded earlier by its ID.
                emitCell(menuItems.get(attributes.getValue("sfa:IDREF")));
            }
        }

        if ("sf:chart-name".equals(qName)) {
            // Extract chart name:
            xhtml.startElement("div", "class", "chart");
            xhtml.startElement("h1");
            xhtml.characters(attributes.getValue("sfa:string"));
            xhtml.endElement("h1");
            xhtml.endElement("div");
        }
    }

    /** Forwards character data to the XHTML output while inside {@code sf:p}. */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (parseText && length > 0) {
            xhtml.characters(ch, start, length);
        }
    }

    /** Closes whatever {@link #startElement} opened for the same element name. */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if ("ls:workspace".equals(qName)) {
            inSheet = false;
            xhtml.endElement("div");
        }

        if ("sf:text".equals(qName)) {
            xhtml.endElement("p");
        }

        if ("sf:p".equals(qName)) {
            parseText = false;
        }

        if ("sf:metadata".equals(qName)) {
            inMetadata = false;
        }

        if (inMetadata && qName.equals(metadataPropertyQName)) {
            metadataPropertyQName = null;
            metadataKey = null;
        }

        if (!inSheet) {
            return;
        }

        if ("sf:tabular-model".equals(qName)) {
            inTable = false;
            xhtml.endElement("tr");
            xhtml.endElement("table");
        }

        if (currentMenuItemId != null && "sf:t".equals(qName)) {
            currentMenuItemId = null;
        }
    }

    /**
     * Writes one table cell, first starting a new row when the current row
     * already holds {@link #numberOfColumns} cells.
     *
     * @param value the cell text, may be {@code null}
     */
    private void emitCell(String value) throws SAXException {
        if (currentColumn >= numberOfColumns) {
            currentColumn = 0;
            xhtml.endElement("tr");
            xhtml.startElement("tr");
        }
        xhtml.element("td", value);
        currentColumn++;
    }

    /**
     * Parses a column count attribute.
     *
     * @param raw      the attribute value, may be {@code null}
     * @param fallback the value to return when {@code raw} is absent or not an integer
     * @return the parsed count, or {@code fallback}
     */
    private static int parseColumnCount(String raw, int fallback) {
        if (raw == null) {
            return fallback;
        }
        try {
            return Integer.parseInt(raw.trim());
        } catch (NumberFormatException ignored) {
            // A broken attribute must not abort text extraction.
            return fallback;
        }
    }

    /**
     * Maps the local name of a metadata element to a metadata property.
     *
     * @param localName the local name of the element inside {@code sf:metadata}
     * @return a core property for {@code authors}, {@code title} and
     *         {@code comment}; otherwise an internal text property named
     *         after {@code localName}
     */
    private Property resolveMetadataKey(String localName) {
        if ("authors".equals(localName)) {
            return TikaCoreProperties.CREATOR;
        }
        if ("title".equals(localName)) {
            return TikaCoreProperties.TITLE;
        }
        if ("comment".equals(localName)) {
            return TikaCoreProperties.COMMENTS;
        }
        return Property.internalText(localName);
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,448 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.iwork;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * SAX handler that translates the XML of a Pages document into XHTML events
 * and metadata entries.
 * <p>
 * Each {@code sf:page-start} or {@code sl:page-group} opens a new
 * {@code div}, preceded by the footer of the previous page and followed by
 * the header of the new one. Paragraphs ({@code sf:p}) are emitted as
 * {@code p} elements once a page has started. Header, footer, footnote and
 * annotation text is collected as it is encountered and written out where
 * it is referenced. Tabular attachments are buffered per attachment ID and
 * written when an {@code sf:attachment-ref} points at them.
 */
class PagesContentHandler extends DefaultHandler {

    /** Row width used when the active table announced no usable {@code sf:numcols}. */
    private static final int DEFAULT_ROW_WIDTH = 3;

    private final XHTMLContentHandler xhtml;
    private final Metadata metadata;

    /** The (interesting) part of the document we're in. Should be more structured... */
    private enum DocumentPart {
        METADATA, PARSABLE_TEXT,
        HEADERS, HEADER_ODD, HEADER_EVEN, HEADER_FIRST,
        FOOTERS, FOOTER_ODD, FOOTER_EVEN, FOOTER_FIRST,
        FOOTNOTES, ANNOTATIONS;
    }

    private DocumentPart inPart = null;
    /** True while inside {@code sf:ghost-text}; such text is not emitted. */
    private boolean ghostText;

    private boolean parseProperty = false;
    private int pageCount = 0;
    private int slPageCount = 0;

    private HeaderFooter headers = null;
    private HeaderFooter footers = null;
    private Footnotes footnotes = null;
    private Annotations annotations = null;

    /** Attachment ID -> rows of cell texts. */
    private Map<String, List<List<String>>> tableData =
            new HashMap<String, List<List<String>>>();
    private String activeTableId;
    /** Column count announced by {@code sf:grid} for the active table; 0 when unknown. */
    private int numberOfColumns = 0;
    private List<String> activeRow = new ArrayList<String>();

    private String metaDataLocalName;
    private String metaDataQName;

    PagesContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
        this.xhtml = xhtml;
        this.metadata = metadata;
    }

    /** Records the page count and closes the last page, writing its footer first. */
    @Override
    public void endDocument() throws SAXException {
        metadata.set(Metadata.PAGE_COUNT, String.valueOf(pageCount));
        if (pageCount > 0) {
            doFooter();
            xhtml.endElement("div");
        }
    }

    /**
     * Dispatches on the qualified element name; see the class comment for
     * the elements that are recognised.
     */
    @Override
    public void startElement(
            String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        if (parseProperty) {
            storeMetadataValue(qName, attributes);
        }

        if ("sl:publication-info".equals(qName)) {
            inPart = DocumentPart.METADATA;
        } else if ("sf:metadata".equals(qName)) {
            inPart = DocumentPart.METADATA;
        } else if ("sf:page-start".equals(qName) || "sl:page-group".equals(qName)) {
            if (pageCount > 0) {
                doFooter();
                xhtml.endElement("div");
            }
            xhtml.startElement("div");
            if ("sl:page-group".equals(qName)) {
                slPageCount++;
            } else {
                pageCount++;
            }
            doHeader();
        } else if ("sf:p".equals(qName)) {
            if (pageCount + slPageCount > 0) {
                inPart = DocumentPart.PARSABLE_TEXT;
                xhtml.startElement("p");
            }
        } else if ("sf:attachment".equals(qName)) {
            String kind = attributes.getValue("sf:kind");
            if ("tabular-attachment".equals(kind)) {
                activeTableId = attributes.getValue("sfa:ID");
                tableData.put(activeTableId, new ArrayList<List<String>>());
                // Do not inherit the column count of a previous table.
                numberOfColumns = 0;
            }
        } else if ("sf:attachment-ref".equals(qName)) {
            String idRef = attributes.getValue("sfa:IDREF");
            outputTable(idRef);
        } else if ("sf:headers".equals(qName)) {
            headers = new HeaderFooter(qName);
            inPart = DocumentPart.HEADERS;
        } else if ("sf:footers".equals(qName)) {
            footers = new HeaderFooter(qName);
            inPart = DocumentPart.FOOTERS;
        } else if ("sf:header".equals(qName)) {
            if (headers == null) {
                // sf:header without an enclosing sf:headers: create the holder on demand.
                headers = new HeaderFooter("sf:headers");
            }
            inPart = headers.identifyPart(attributes.getValue("sf:name"));
        } else if ("sf:footer".equals(qName)) {
            if (footers == null) {
                // sf:footer without an enclosing sf:footers: create the holder on demand.
                footers = new HeaderFooter("sf:footers");
            }
            inPart = footers.identifyPart(attributes.getValue("sf:name"));
        } else if ("sf:page-number".equals(qName)) {
            boolean inFooter = inPart == DocumentPart.FOOTER_ODD
                    || inPart == DocumentPart.FOOTER_FIRST
                    || inPart == DocumentPart.FOOTER_EVEN;
            // Anything that is not a footer is attributed to the headers, as
            // before; the holder may be absent when no such section exists.
            HeaderFooter target = inFooter ? footers : headers;
            if (target != null) {
                target.hasAutoPageNumber = true;
                target.autoPageNumberFormat = attributes.getValue("sf:format");
            }

            xhtml.characters(Integer.toString(this.pageCount));
        } else if ("sf:footnotes".equals(qName)) {
            footnotes = new Footnotes();
            inPart = DocumentPart.FOOTNOTES;
        } else if ("sf:footnote-mark".equals(qName)) {
            if (footnotes != null) {
                footnotes.recordMark(attributes.getValue("sf:mark"));
            }
        } else if ("sf:footnote".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
            // What about non auto-numbered?
            String footnoteMark = attributes.getValue("sf:autonumber");
            if (footnotes != null) {
                String footnoteText = footnotes.footnotes.get(footnoteMark);
                if (footnoteText != null) {
                    xhtml.startElement("div", "style", "footnote");
                    xhtml.characters("Footnote:"); // As shown in Pages
                    xhtml.characters(footnoteText);
                    xhtml.endElement("div");
                }
            }
        } else if ("sf:annotations".equals(qName)) {
            annotations = new Annotations();
            inPart = DocumentPart.ANNOTATIONS;
        } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
            annotations.start(attributes.getValue("sf:target"));
        } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
            xhtml.startElement("div", "style", "annotated");

            String annotationText = null;
            if (annotations != null) {
                annotationText = annotations.annotations.get(attributes.getValue("sfa:ID"));
            }
            if (annotationText != null) {
                xhtml.startElement("div", "style", "annotation");
                xhtml.characters(annotationText);
                xhtml.endElement("div");
            }
        } else if ("sf:ghost-text".equals(qName)) {
            ghostText = true;
        }

        if (activeTableId != null) {
            parseTableData(qName, attributes);
        }

        if (inPart == DocumentPart.METADATA) {
            metaDataLocalName = localName;
            metaDataQName = qName;
            parseProperty = true;
        }
    }

    /** Closes the state or XHTML element opened for the same element name. */
    @Override
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        if (metaDataLocalName != null && metaDataLocalName.equals(localName)) {
            metaDataLocalName = null;
            parseProperty = false;
        }

        if ("sl:publication-info".equals(qName)) {
            inPart = null;
        } else if ("sf:metadata".equals(qName)) {
            inPart = null;
        } else if ("sf:p".equals(qName) && (pageCount + slPageCount) > 0) {
            inPart = null;
            xhtml.endElement("p");
        } else if ("sf:attachment".equals(qName)) {
            // Keep the cells of an incomplete last row with their own table.
            flushActiveRow();
            activeTableId = null;
        } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
            annotations.end();
        } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
            xhtml.endElement("div");
        } else if ("sf:ghost-text".equals(qName)) {
            ghostText = false;
        }
    }

    /**
     * Emits body text directly (unless it is ghost text) and stores text
     * belonging to headers, footers, footnotes and annotations for later.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (length <= 0 || inPart == null) {
            return;
        }
        if (inPart == DocumentPart.PARSABLE_TEXT) {
            if (!ghostText) {
                xhtml.characters(ch, start, length);
            }
            return;
        }

        String str = new String(ch, start, length);
        switch (inPart) {
            case HEADER_FIRST:
                if (headers != null) {
                    headers.defaultFirst = str;
                }
                break;
            case HEADER_EVEN:
                if (headers != null) {
                    headers.defaultEven = str;
                }
                break;
            case HEADER_ODD:
                if (headers != null) {
                    headers.defaultOdd = str;
                }
                break;
            case FOOTER_FIRST:
                if (footers != null) {
                    footers.defaultFirst = str;
                }
                break;
            case FOOTER_EVEN:
                if (footers != null) {
                    footers.defaultEven = str;
                }
                break;
            case FOOTER_ODD:
                if (footers != null) {
                    footers.defaultOdd = str;
                }
                break;
            case FOOTNOTES:
                if (footnotes != null) {
                    footnotes.text(str);
                }
                break;
            case ANNOTATIONS:
                if (annotations != null) {
                    annotations.text(str);
                }
                break;
            default:
                // METADATA, HEADERS and FOOTERS carry no character data of interest.
                break;
        }
    }

    /**
     * Stores the value of a primitive metadata element, if the element is
     * one, under the key resolved for the enclosing metadata element.
     */
    private void storeMetadataValue(String qName, Attributes attributes) {
        String value = parsePrimitiveElementValue(qName, attributes);
        if (value == null) {
            return;
        }
        Object metaDataKey = resolveMetaDataKey(metaDataLocalName);
        if (metaDataKey instanceof Property) {
            metadata.set((Property) metaDataKey, value);
        } else {
            metadata.add((String) metaDataKey, value);
        }
    }

    /**
     * Collects table structure and cells for the active tabular attachment:
     * {@code sf:grid} supplies the column count, and each {@code sf:ct}
     * contributes one cell. A row is completed once it holds as many cells
     * as the grid has columns ({@link #DEFAULT_ROW_WIDTH} when unknown).
     */
    private void parseTableData(String qName, Attributes attributes) {
        if ("sf:grid".equals(qName)) {
            numberOfColumns = parseColumnCount(attributes.getValue("sf:numcols"));
        } else if ("sf:ct".equals(qName)) {
            activeRow.add(attributes.getValue("sfa:s"));

            int rowWidth = numberOfColumns > 0 ? numberOfColumns : DEFAULT_ROW_WIDTH;
            if (activeRow.size() >= rowWidth) {
                flushActiveRow();
            }
        }
    }

    /** Appends the pending row, if it has cells, to the active table and starts a new row. */
    private void flushActiveRow() {
        if (activeTableId == null || activeRow.isEmpty()) {
            return;
        }
        List<List<String>> rows = tableData.get(activeTableId);
        if (rows != null) {
            rows.add(activeRow);
            activeRow = new ArrayList<String>();
        }
    }

    /**
     * Parses a column count attribute.
     *
     * @param raw the attribute value, may be {@code null}
     * @return the parsed count, or 0 when {@code raw} is absent or not an integer
     */
    private static int parseColumnCount(String raw) {
        if (raw == null) {
            return 0;
        }
        try {
            return Integer.parseInt(raw.trim());
        } catch (NumberFormatException ignored) {
            // A broken attribute must not abort text extraction.
            return 0;
        }
    }

    /** Writes the buffered table with the given attachment ID, if there is one. */
    private void outputTable(String idRef) throws SAXException {
        List<List<String>> rows = this.tableData.get(idRef);
        if (rows == null) {
            return;
        }
        xhtml.startElement("table");
        for (List<String> row : rows) {
            xhtml.startElement("tr");
            for (String cell : row) {
                xhtml.element("td", cell);
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("table");
    }

    /**
     * Returns a resolved key that is common in other document types or
     * returns the specified metaDataLocalName if no common key could be found.
     * The key could be a simple String key, or could be a {@link Property}.
     * The lookup is driven by the qualified name of the current metadata
     * element.
     *
     * @param metaDataLocalName The localname of the element containing metadata
     * @return a resolved key that is common in other document types
     */
    private Object resolveMetaDataKey(String metaDataLocalName) {
        Object metaDataKey = metaDataLocalName;
        if ("sf:authors".equals(metaDataQName)) {
            metaDataKey = TikaCoreProperties.CREATOR;
        } else if ("sf:title".equals(metaDataQName)) {
            metaDataKey = TikaCoreProperties.TITLE;
        } else if ("sl:SLCreationDateProperty".equals(metaDataQName)) {
            metaDataKey = TikaCoreProperties.CREATED;
        } else if ("sl:SLLastModifiedDateProperty".equals(metaDataQName)) {
            metaDataKey = Metadata.LAST_MODIFIED;
        } else if ("sl:language".equals(metaDataQName)) {
            metaDataKey = TikaCoreProperties.LANGUAGE;
        }
        return metaDataKey;
    }

    /**
     * Returns the value of a primitive element e.g.:
     * {@code <sl:number sfa:number="0" sfa:type="f"/>} - the number attribute,
     * {@code <sl:string sfa:string="en"/>} - the string attribute.
     * <p>
     * Returns <code>null</code> if the value could not be extracted from
     * the list of attributes.
     *
     * @param qName      The fully qualified name of the element containing
     *                   the value to extract
     * @param attributes The list of attributes of which one contains the
     *                   value to be extracted
     * @return the value of a primitive element
     */
    private String parsePrimitiveElementValue(
            String qName, Attributes attributes) {
        if ("sl:string".equals(qName) || "sf:string".equals(qName)) {
            return attributes.getValue("sfa:string");
        } else if ("sl:number".equals(qName)) {
            return attributes.getValue("sfa:number");
        } else if ("sl:date".equals(qName)) {
            return attributes.getValue("sf:val");
        }

        return null;
    }

    private void doHeader() throws SAXException {
        if (headers != null) {
            headers.output("header");
        }
    }

    private void doFooter() throws SAXException {
        if (footers != null) {
            footers.output("footer");
        }
    }

    /**
     * Represents the Headers or Footers in a document
     */
    private class HeaderFooter {
        private String type; // sf:headers or sf:footers
        private String defaultOdd;
        private String defaultEven;
        private String defaultFirst;
        private boolean hasAutoPageNumber;
        private String autoPageNumberFormat;
        // TODO Can there be custom ones?

        private HeaderFooter(String type) {
            this.type = type;
        }

        /**
         * Maps the {@code sf:name} of a header or footer element to the
         * document part it defines.
         *
         * @return the matching part, or {@code null} for an unknown name
         */
        private DocumentPart identifyPart(String name) {
            if ("SFWPDefaultOddHeaderIdentifier".equals(name)) {
                return DocumentPart.HEADER_ODD;
            }
            if ("SFWPDefaultEvenHeaderIdentifier".equals(name)) {
                return DocumentPart.HEADER_EVEN;
            }
            if ("SFWPDefaultFirstHeaderIdentifier".equals(name)) {
                return DocumentPart.HEADER_FIRST;
            }

            if ("SFWPDefaultOddFooterIdentifier".equals(name)) {
                return DocumentPart.FOOTER_ODD;
            }
            if ("SFWPDefaultEvenFooterIdentifier".equals(name)) {
                return DocumentPart.FOOTER_EVEN;
            }
            if ("SFWPDefaultFirstFooterIdentifier".equals(name)) {
                return DocumentPart.FOOTER_FIRST;
            }

            return null;
        }

        /**
         * Writes the text that applies to the current page as a {@code div}
         * whose {@code class} attribute is {@code what}. The first-page text
         * is used on page 1 and the even-page text on even pages when they
         * are defined; otherwise the odd-page text. Nothing is written when
         * no text applies.
         *
         * @param what the class attribute value, "header" or "footer"
         */
        private void output(String what) throws SAXException {
            String text;
            if (pageCount == 1 && defaultFirst != null) {
                text = defaultFirst;
            } else if (pageCount % 2 == 0 && defaultEven != null) {
                text = defaultEven;
            } else {
                text = defaultOdd;
            }
            if (text == null) {
                return;
            }

            xhtml.startElement("div", "class", what);
            xhtml.characters(text);
            if (hasAutoPageNumber) {
                if (autoPageNumberFormat == null) { // raw number
                    xhtml.characters("\t" + pageCount);
                } else if (autoPageNumberFormat.equals("upper-roman")) {
                    xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumerals(pageCount));
                } else if (autoPageNumberFormat.equals("lower-roman")) {
                    xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumeralsLower(pageCount));
                } else if (autoPageNumberFormat.equals("upper-alpha")) {
                    xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumeric(pageCount));
                } else if (autoPageNumberFormat.equals("lower-alpha")) {
                    xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumericLower(pageCount));
                }
            }
            xhtml.endElement("div");
        }
    }

    /**
     * Represents Footnotes in a document. The way these work
     * in the file format isn't very clean...
     */
    private static class Footnotes {
        /** Mark -> Text */
        Map<String, String> footnotes = new HashMap<String, String>();
        String lastSeenMark = null;

        /**
         * Normally happens before the text of the mark
         */
        private void recordMark(String mark) {
            lastSeenMark = mark;
        }

        /** Appends text to the footnote of the most recently recorded mark. */
        private void text(String text) {
            if (lastSeenMark != null) {
                if (footnotes.containsKey(lastSeenMark)) {
                    text = footnotes.get(lastSeenMark) + text;
                }
                footnotes.put(lastSeenMark, text);
            }
        }
    }

    /**
     * Represents Annotations in a document. We currently
     * just grab all the sf:p text in each one
     */
    private static class Annotations {
        /** ID -> Text */
        Map<String, String> annotations = new HashMap<String, String>();
        String currentID = null;
        StringBuilder currentText = null;

        private void start(String id) {
            currentID = id;
            currentText = new StringBuilder();
        }

        private void text(String text) {
            if (text != null && text.length() > 0 && currentText != null) {
                currentText.append(text);
            }
        }

        /** Stores the collected text, if any, and always clears the per-annotation state. */
        private void end() {
            if (currentText != null && currentText.length() > 0) {
                annotations.put(currentID, currentText.toString());
            }
            currentID = null;
            currentText = null;
        }
    }

}
Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements.
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */
package org.apache.tika.parser.pkg;

import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Set;

import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipUtils;
import org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream;
import org.apache.commons.compress.compressors.snappy.FramedSnappyCompressorInputStream;
import org.apache.commons.compress.compressors.snappy.SnappyCompressorInputStream;
import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
import org.apache.commons.compress.compressors.z.ZCompressorInputStream;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Parser for various compression formats. The compressed stream is opened
 * with a {@link CompressorStreamFactory} and its uncompressed content is
 * handed to an {@link EmbeddedDocumentExtractor} as a single embedded
 * document.
 */
public class CompressorParser extends AbstractParser {

    /** Serial version UID */
    private static final long serialVersionUID = 2793565792967222459L;

    private static final MediaType BZIP = MediaType.application("x-bzip");
    private static final MediaType BZIP2 = MediaType.application("x-bzip2");
    private static final MediaType GZIP = MediaType.application("gzip");
    private static final MediaType GZIP_ALT = MediaType.application("x-gzip");
    private static final MediaType COMPRESS = MediaType.application("x-compress");
    private static final MediaType XZ = MediaType.application("x-xz");
    private static final MediaType PACK = MediaType.application("x-java-pack200");
    private static final MediaType SNAPPY = MediaType.application("x-snappy-framed");
    private static final MediaType ZLIB = MediaType.application("zlib");

    private static final Set<MediaType> SUPPORTED_TYPES =
            MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, COMPRESS, XZ, PACK, ZLIB);

    /**
     * Maps an opened compressor stream to the media type of its format.
     *
     * @param stream the stream created by the compressor factory
     * @return the media type for the stream's class, or
     *         {@link MediaType#OCTET_STREAM} for a class not listed here
     */
    static MediaType getMediaType(CompressorInputStream stream) {
        // TODO Add support for the remaining CompressorInputStream formats:
        //   LZMACompressorInputStream
        //   LZWInputStream -> UnshrinkingInputStream
        if (stream instanceof BZip2CompressorInputStream) {
            return BZIP2;
        } else if (stream instanceof GzipCompressorInputStream) {
            return GZIP;
        } else if (stream instanceof XZCompressorInputStream) {
            return XZ;
        } else if (stream instanceof DeflateCompressorInputStream) {
            return ZLIB;
        } else if (stream instanceof ZCompressorInputStream) {
            return COMPRESS;
        } else if (stream instanceof Pack200CompressorInputStream) {
            return PACK;
        } else if (stream instanceof FramedSnappyCompressorInputStream
                || stream instanceof SnappyCompressorInputStream) {
            // TODO Add unit tests for this format
            return SNAPPY;
        } else {
            return MediaType.OCTET_STREAM;
        }
    }

    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Opens the compressed stream, records its media type when it is
     * recognised, and passes the uncompressed content to the embedded
     * document extractor found in the context (a
     * {@link ParsingEmbeddedDocumentExtractor} by default).
     *
     * @throws TikaException if no compressor stream can be created for the input
     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // At the end we want to close the compression stream to release
        // any associated resources, but the underlying document stream
        // should not be closed
        stream = new CloseShieldInputStream(stream);

        // Ensure that the stream supports the mark feature
        stream = new BufferedInputStream(stream);

        CompressorInputStream cis;
        try {
            CompressorParserOptions options =
                    context.get(CompressorParserOptions.class, new CompressorParserOptions() {
                        public boolean decompressConcatenated(Metadata metadata) {
                            return false;
                        }
                    });
            CompressorStreamFactory factory =
                    new CompressorStreamFactory(options.decompressConcatenated(metadata));
            cis = factory.createCompressorInputStream(stream);
        } catch (CompressorException e) {
            throw new TikaException("Unable to uncompress document stream", e);
        }

        MediaType type = getMediaType(cis);
        if (!type.equals(MediaType.OCTET_STREAM)) {
            metadata.set(CONTENT_TYPE, type.toString());
        }

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);

        // Everything that can throw after the compressor stream exists runs
        // inside the try, so the stream is closed on every path.
        try {
            xhtml.startDocument();

            Metadata entrydata = new Metadata();
            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
            if (name != null) {
                entrydata.set(Metadata.RESOURCE_NAME_KEY, uncompressedName(name));
            }

            // Use the delegate parser to parse the compressed document
            EmbeddedDocumentExtractor extractor = context.get(
                    EmbeddedDocumentExtractor.class,
                    new ParsingEmbeddedDocumentExtractor(context));
            if (extractor.shouldParseEmbedded(entrydata)) {
                extractor.parseEmbedded(cis, xhtml, entrydata, true);
            }
        } finally {
            cis.close();
        }

        xhtml.endDocument();
    }

    /**
     * Derives the name of the uncompressed document from the name of the
     * compressed resource: {@code .tbz}/{@code .tbz2} become {@code .tar};
     * {@code .bz}, {@code .bz2}, {@code .xz}, {@code .zlib} and
     * {@code .pack} are stripped; any other non-empty name is delegated to
     * {@link GzipUtils#getUncompressedFilename(String)}.
     *
     * @param name the resource name, not {@code null}
     * @return the derived name; an empty name is returned unchanged
     */
    private static String uncompressedName(String name) {
        if (name.endsWith(".tbz")) {
            return name.substring(0, name.length() - 4) + ".tar";
        } else if (name.endsWith(".tbz2")) {
            return name.substring(0, name.length() - 5) + ".tar";
        } else if (name.endsWith(".bz")) {
            return name.substring(0, name.length() - 3);
        } else if (name.endsWith(".bz2")) {
            return name.substring(0, name.length() - 4);
        } else if (name.endsWith(".xz")) {
            return name.substring(0, name.length() - 3);
        } else if (name.endsWith(".zlib")) {
            return name.substring(0, name.length() - 5);
        } else if (name.endsWith(".pack")) {
            return name.substring(0, name.length() - 5);
        } else if (name.length() > 0) {
            return GzipUtils.getUncompressedFilename(name);
        }
        return name;
    }

}
Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/CompressorParserOptions.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/CompressorParserOptions.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/CompressorParserOptions.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/CompressorParserOptions.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pkg; + +import org.apache.tika.metadata.Metadata; + +/** + * Options for the {@link CompressorParser}. An implementation is supplied + * by registering it in the {@link org.apache.tika.parser.ParseContext} + * under this interface's class. + */ +public interface CompressorParserOptions { + + /** + * Decides, for the document being parsed, whether concatenated + * compressed streams should be decompressed. + * + * @param metadata document metadata + * @return whether to decompress concatenated streams or not + */ + boolean decompressConcatenated(Metadata metadata); +} Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,287 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pkg; + +import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Date; +import java.util.Set; + +import org.apache.commons.compress.PasswordRequiredException; +import org.apache.commons.compress.archivers.ArchiveEntry; +import org.apache.commons.compress.archivers.ArchiveException; +import org.apache.commons.compress.archivers.ArchiveInputStream; +import org.apache.commons.compress.archivers.ArchiveStreamFactory; +import org.apache.commons.compress.archivers.StreamingNotSupportedException; +import org.apache.commons.compress.archivers.ar.ArArchiveInputStream; +import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream; +import org.apache.commons.compress.archivers.dump.DumpArchiveInputStream; +import org.apache.commons.compress.archivers.jar.JarArchiveInputStream; +import org.apache.commons.compress.archivers.sevenz.SevenZFile; +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; +import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException; +import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException.Feature; +import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; +import org.apache.commons.io.input.CloseShieldInputStream; +import 
org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +/** + * Parser for various packaging formats. Package entries will be written to + * the XHTML event stream as <div class="package-entry"> elements that + * contain the (optional) entry name as a <h1> element and the full + * structured body content of the parsed entry. + * <p> + * User must have JCE Unlimited Strength jars installed for encryption to + * work with 7Z files (see: COMPRESS-299 and TIKA-1521). If the jars + * are not installed, an IOException will be thrown, and potentially + * wrapped in a TikaException. 
+ */ +public class PackageParser extends AbstractParser { + + /** Serial version UID */ + private static final long serialVersionUID = -5331043266963888708L; + + private static final MediaType ZIP = MediaType.APPLICATION_ZIP; + private static final MediaType JAR = MediaType.application("java-archive"); + private static final MediaType AR = MediaType.application("x-archive"); + private static final MediaType CPIO = MediaType.application("x-cpio"); + private static final MediaType DUMP = MediaType.application("x-tika-unix-dump"); + private static final MediaType TAR = MediaType.application("x-tar"); + private static final MediaType SEVENZ = MediaType.application("x-7z-compressed"); + + private static final Set<MediaType> SUPPORTED_TYPES = + MediaType.set(ZIP, JAR, AR, CPIO, DUMP, TAR, SEVENZ); + + static MediaType getMediaType(ArchiveInputStream stream) { + if (stream instanceof JarArchiveInputStream) { + return JAR; + } else if (stream instanceof ZipArchiveInputStream) { + return ZIP; + } else if (stream instanceof ArArchiveInputStream) { + return AR; + } else if (stream instanceof CpioArchiveInputStream) { + return CPIO; + } else if (stream instanceof DumpArchiveInputStream) { + return DUMP; + } else if (stream instanceof TarArchiveInputStream) { + return TAR; + } else if (stream instanceof SevenZWrapper) { + return SEVENZ; + } else { + return MediaType.OCTET_STREAM; + } + } + + static boolean isZipArchive(MediaType type) { + return type.equals(ZIP) || type.equals(JAR); + } + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + + // Ensure that the stream supports the mark feature + if (! 
TikaInputStream.isTikaInputStream(stream)) + stream = new BufferedInputStream(stream); + + + TemporaryResources tmp = new TemporaryResources(); + ArchiveInputStream ais = null; + try { + ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory()); + // At the end we want to close the archive stream to release + // any associated resources, but the underlying document stream + // should not be closed + ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream)); + + } catch (StreamingNotSupportedException sne) { + // Most archive formats work on streams, but a few need files + if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) { + // Rework as a file, and wrap + stream.reset(); + TikaInputStream tstream = TikaInputStream.get(stream, tmp); + + // Seven Zip suports passwords, was one given? + String password = null; + PasswordProvider provider = context.get(PasswordProvider.class); + if (provider != null) { + password = provider.getPassword(metadata); + } + + SevenZFile sevenz; + if (password == null) { + sevenz = new SevenZFile(tstream.getFile()); + } else { + sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked")); + } + + // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty + ais = new SevenZWrapper(sevenz); + } else { + tmp.close(); + throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne); + } + } catch (ArchiveException e) { + tmp.close(); + throw new TikaException("Unable to unpack document stream", e); + } + + MediaType type = getMediaType(ais); + if (!type.equals(MediaType.OCTET_STREAM)) { + metadata.set(CONTENT_TYPE, type.toString()); + } + // Use the delegate parser to parse the contained document + EmbeddedDocumentExtractor extractor = context.get( + EmbeddedDocumentExtractor.class, + new ParsingEmbeddedDocumentExtractor(context)); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + 
xhtml.startDocument(); + + try { + ArchiveEntry entry = ais.getNextEntry(); + while (entry != null) { + if (!entry.isDirectory()) { + parseEntry(ais, entry, extractor, xhtml); + } + entry = ais.getNextEntry(); + } + } catch (UnsupportedZipFeatureException zfe) { + // If it's an encrypted document of unknown password, report as such + if (zfe.getFeature() == Feature.ENCRYPTION) { + throw new EncryptedDocumentException(zfe); + } + // Otherwise fall through to raise the exception as normal + } catch (PasswordRequiredException pre) { + throw new EncryptedDocumentException(pre); + } finally { + ais.close(); + tmp.close(); + } + + xhtml.endDocument(); + } + + private void parseEntry( + ArchiveInputStream archive, ArchiveEntry entry, + EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml) + throws SAXException, IOException, TikaException { + String name = entry.getName(); + if (archive.canReadEntryData(entry)) { + // Fetch the metadata on the entry contained in the archive + Metadata entrydata = handleEntryMetadata(name, null, + entry.getLastModifiedDate(), entry.getSize(), xhtml); + + // Recurse into the entry if desired + if (extractor.shouldParseEmbedded(entrydata)) { + // For detectors to work, we need a mark/reset supporting + // InputStream, which ArchiveInputStream isn't, so wrap + TemporaryResources tmp = new TemporaryResources(); + try { + TikaInputStream tis = TikaInputStream.get(archive, tmp); + extractor.parseEmbedded(tis, xhtml, entrydata, true); + } finally { + tmp.dispose(); + } + } + } else if (name != null && name.length() > 0) { + xhtml.element("p", name); + } + } + + protected static Metadata handleEntryMetadata( + String name, Date createAt, Date modifiedAt, + Long size, XHTMLContentHandler xhtml) + throws SAXException, IOException, TikaException { + Metadata entrydata = new Metadata(); + if (createAt != null) { + entrydata.set(TikaCoreProperties.CREATED, createAt); + } + if (modifiedAt != null) { + entrydata.set(TikaCoreProperties.MODIFIED, 
modifiedAt); + } + if (size != null) { + entrydata.set(Metadata.CONTENT_LENGTH, Long.toString(size)); + } + if (name != null && name.length() > 0) { + name = name.replace("\\", "/"); + entrydata.set(Metadata.RESOURCE_NAME_KEY, name); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", name); + xhtml.startElement("div", attributes); + xhtml.endElement("div"); + + entrydata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, name); + } + return entrydata; + } + + // Pending a fix for COMPRESS-269, we have to wrap ourselves + private static class SevenZWrapper extends ArchiveInputStream { + private SevenZFile file; + private SevenZWrapper(SevenZFile file) { + this.file = file; + } + + @Override + public int read() throws IOException { + return file.read(); + } + @Override + public int read(byte[] b) throws IOException { + return file.read(b); + } + @Override + public int read(byte[] b, int off, int len) throws IOException { + return file.read(b, off, len); + } + + @Override + public ArchiveEntry getNextEntry() throws IOException { + return file.getNextEntry(); + } + + @Override + public void close() throws IOException { + file.close(); + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or 
more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pkg; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; + +import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import com.github.junrar.Archive; +import com.github.junrar.exception.RarException; +import com.github.junrar.rarfile.FileHeader; + +/** + * Parser for Rar files. 
+ */ +public class RarParser extends AbstractParser { + private static final long serialVersionUID = 6157727985054451501L; + + private static final Set<MediaType> SUPPORTED_TYPES = Collections + .singleton(MediaType.application("x-rar-compressed")); + + @Override + public Set<MediaType> getSupportedTypes(ParseContext arg0) { + return SUPPORTED_TYPES; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + EmbeddedDocumentExtractor extractor = context.get( + EmbeddedDocumentExtractor.class, + new ParsingEmbeddedDocumentExtractor(context)); + + Archive rar = null; + try (TemporaryResources tmp = new TemporaryResources()) { + TikaInputStream tis = TikaInputStream.get(stream, tmp); + rar = new Archive(tis.getFile()); + + if (rar.isEncrypted()) { + throw new EncryptedDocumentException(); + } + + //Without this BodyContentHandler does not work + xhtml.element("div", " "); + + FileHeader header = rar.nextFileHeader(); + while (header != null && !Thread.currentThread().isInterrupted()) { + if (!header.isDirectory()) { + try (InputStream subFile = rar.getInputStream(header)) { + Metadata entrydata = PackageParser.handleEntryMetadata( + "".equals(header.getFileNameW()) ? 
header.getFileNameString() : header.getFileNameW(), + header.getCTime(), header.getMTime(), + header.getFullUnpackSize(), + xhtml + ); + + if (extractor.shouldParseEmbedded(entrydata)) { + extractor.parseEmbedded(subFile, handler, entrydata, true); + } + } + } + + header = rar.nextFileHeader(); + } + + } catch (RarException e) { + throw new TikaException("RarParser Exception", e); + } finally { + if (rar != null) + rar.close(); + + } + + xhtml.endDocument(); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,413 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.pkg; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Enumeration; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Locale; +import java.util.Set; +import java.util.regex.Pattern; + +import org.apache.commons.compress.archivers.ArchiveException; +import org.apache.commons.compress.archivers.ArchiveInputStream; +import org.apache.commons.compress.archivers.ArchiveStreamFactory; +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; +import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; +import org.apache.commons.compress.archivers.zip.ZipFile; +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.CompressorInputStream; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.apache.commons.io.IOUtils; +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.openxml4j.opc.PackageAccess; +import org.apache.poi.openxml4j.opc.PackagePart; +import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; +import org.apache.poi.openxml4j.opc.PackageRelationshipTypes; +import org.apache.tika.detect.Detector; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.iwork.IWorkPackageParser; +import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * A detector that works on Zip documents and other archive and compression + * formats to figure out exactly what the file is. 
+ */ +public class ZipContainerDetector implements Detector { + private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE); + + // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes + private static final String VISIO_DOCUMENT = + "http://schemas.microsoft.com/visio/2010/relationships/document"; + // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes + private static final String STRICT_CORE_DOCUMENT = + "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument"; + + /** Serial version UID */ + private static final long serialVersionUID = 2891763938430295453L; + + public MediaType detect(InputStream input, Metadata metadata) + throws IOException { + // Check if we have access to the document + if (input == null) { + return MediaType.OCTET_STREAM; + } + + TemporaryResources tmp = new TemporaryResources(); + try { + TikaInputStream tis = TikaInputStream.get(input, tmp); + + byte[] prefix = new byte[1024]; // enough for all known formats + int length = tis.peek(prefix); + + MediaType type = detectArchiveFormat(prefix, length); + if (PackageParser.isZipArchive(type) + && TikaInputStream.isTikaInputStream(input)) { + return detectZipFormat(tis); + } else if (!type.equals(MediaType.OCTET_STREAM)) { + return type; + } else { + return detectCompressorFormat(prefix, length); + } + } finally { + try { + tmp.dispose(); + } catch (TikaException e) { + // ignore + } + } + } + + private static MediaType detectCompressorFormat(byte[] prefix, int length) { + try { + CompressorStreamFactory factory = new CompressorStreamFactory(); + CompressorInputStream cis = factory.createCompressorInputStream( + new ByteArrayInputStream(prefix, 0, length)); + try { + return CompressorParser.getMediaType(cis); + } finally { + IOUtils.closeQuietly(cis); + } + } catch (CompressorException e) { + return MediaType.OCTET_STREAM; + } + } + 
+ private static MediaType detectArchiveFormat(byte[] prefix, int length) { + try { + ArchiveStreamFactory factory = new ArchiveStreamFactory(); + ArchiveInputStream ais = factory.createArchiveInputStream( + new ByteArrayInputStream(prefix, 0, length)); + try { + if ((ais instanceof TarArchiveInputStream) + && !TarArchiveInputStream.matches(prefix, length)) { + // ArchiveStreamFactory is too relaxed, see COMPRESS-117 + return MediaType.OCTET_STREAM; + } else { + return PackageParser.getMediaType(ais); + } + } finally { + IOUtils.closeQuietly(ais); + } + } catch (ArchiveException e) { + return MediaType.OCTET_STREAM; + } + } + + private static MediaType detectZipFormat(TikaInputStream tis) { + try { + ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()? + try { + MediaType type = detectOpenDocument(zip); + if (type == null) { + type = detectOPCBased(zip, tis); + } + if (type == null) { + type = detectIWork(zip); + } + if (type == null) { + type = detectJar(zip); + } + if (type == null) { + type = detectKmz(zip); + } + if (type == null) { + type = detectIpa(zip); + } + if (type != null) { + return type; + } + } finally { + // TODO: shouldn't we record the open + // container so it can be later + // reused...? + // tis.setOpenContainer(zip); + try { + zip.close(); + } catch (IOException e) { + // ignore + } + } + } catch (IOException e) { + // ignore + } + // Fallback: it's still a zip file, we just don't know what kind of one + return MediaType.APPLICATION_ZIP; + } + + /** + * OpenDocument files, along with EPub files and ASiC ones, have a + * mimetype entry in the root of their Zip file. This entry contains + * the mimetype of the overall file, stored as a single string. 
+ */ + private static MediaType detectOpenDocument(ZipFile zip) { + try { + ZipArchiveEntry mimetype = zip.getEntry("mimetype"); + if (mimetype != null) { + try (InputStream stream = zip.getInputStream(mimetype)) { + return MediaType.parse(IOUtils.toString(stream, UTF_8)); + } + } else { + return null; + } + } catch (IOException e) { + return null; + } + } + + private static MediaType detectOPCBased(ZipFile zip, TikaInputStream stream) { + try { + if (zip.getEntry("_rels/.rels") != null + || zip.getEntry("[Content_Types].xml") != null) { + // Use POI to open and investigate it for us + OPCPackage pkg = OPCPackage.open(stream.getFile().getPath(), PackageAccess.READ); + stream.setOpenContainer(pkg); + + // Is at an OOXML format? + MediaType type = detectOfficeOpenXML(pkg); + if (type != null) return type; + + // Is it XPS format? + type = detectXPSOPC(pkg); + if (type != null) return type; + + // Is it an AutoCAD format? + type = detectAutoCADOPC(pkg); + if (type != null) return type; + + // We don't know what it is, sorry + return null; + } else { + return null; + } + } catch (IOException e) { + return null; + } catch (RuntimeException e) { + return null; + } catch (InvalidFormatException e) { + return null; + } + } + /** + * Detects the type of an OfficeOpenXML (OOXML) file from + * opened Package + */ + public static MediaType detectOfficeOpenXML(OPCPackage pkg) { + // Check for the normal Office core document + PackageRelationshipCollection core = + pkg.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT); + // Otherwise check for some other Office core document types + if (core.size() == 0) { + core = pkg.getRelationshipsByType(STRICT_CORE_DOCUMENT); + } + if (core.size() == 0) { + core = pkg.getRelationshipsByType(VISIO_DOCUMENT); + } + + // If we didn't find a single core document of any type, skip detection + if (core.size() != 1) { + // Invalid OOXML Package received + return null; + } + + // Get the type of the core document part + PackagePart 
corePart = pkg.getPart(core.getRelationship(0)); + String coreType = corePart.getContentType(); + + // Turn that into the type of the overall document + String docType = coreType.substring(0, coreType.lastIndexOf('.')); + + // The Macro Enabled formats are a little special + if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabled")) { + docType = docType.toLowerCase(Locale.ROOT) + ".12"; + } + + if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabledtemplate")) { + docType = MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12"); + } + + // Build the MediaType object and return + return MediaType.parse(docType); + } + /** + * Detects Open XML Paper Specification (XPS) + */ + private static MediaType detectXPSOPC(OPCPackage pkg) { + PackageRelationshipCollection xps = + pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation"); + if (xps.size() == 1) { + return MediaType.application("vnd.ms-xpsdocument"); + } else { + // Non-XPS Package received + return null; + } + } + /** + * Detects AutoCAD formats that live in OPC packaging + */ + private static MediaType detectAutoCADOPC(OPCPackage pkg) { + PackageRelationshipCollection dwfxSeq = + pkg.getRelationshipsByType("http://schemas.autodesk.com/dwfx/2007/relationships/documentsequence"); + if (dwfxSeq.size() == 1) { + return MediaType.parse("model/vnd.dwfx+xps"); + } else { + // Non-AutoCAD Package received + return null; + } + } + + private static MediaType detectIWork(ZipFile zip) { + if (zip.getEntry(IWorkPackageParser.IWORK_COMMON_ENTRY) != null) { + // Locate the appropriate index file entry, and reads from that + // the root element of the document. That is used to the identify + // the correct type of the keynote container. 
+ for (String entryName : IWorkPackageParser.IWORK_CONTENT_ENTRIES) { + IWORKDocumentType type = IWORKDocumentType.detectType(zip.getEntry(entryName), zip); + if (type != null) { + return type.getType(); + } + } + + // Not sure, fallback to the container type + return MediaType.application("vnd.apple.iwork"); + } else { + return null; + } + } + + private static MediaType detectJar(ZipFile zip) { + if (zip.getEntry("META-INF/MANIFEST.MF") != null) { + // It's a Jar file, or something based on Jar + + // Is it an Android APK? + if (zip.getEntry("AndroidManifest.xml") != null) { + return MediaType.application("vnd.android.package-archive"); + } + + // Check for WAR and EAR + if (zip.getEntry("WEB-INF/") != null) { + return MediaType.application("x-tika-java-web-archive"); + } + if (zip.getEntry("META-INF/application.xml") != null) { + return MediaType.application("x-tika-java-enterprise-archive"); + } + + // Looks like a regular Jar Archive + return MediaType.application("java-archive"); + } else { + // Some Android APKs miss the default Manifest + if (zip.getEntry("AndroidManifest.xml") != null) { + return MediaType.application("vnd.android.package-archive"); + } + + return null; + } + } + + private static MediaType detectKmz(ZipFile zip) { + boolean kmlFound = false; + + Enumeration<ZipArchiveEntry> entries = zip.getEntries(); + while (entries.hasMoreElements()) { + ZipArchiveEntry entry = entries.nextElement(); + String name = entry.getName(); + if (!entry.isDirectory() + && name.indexOf('/') == -1 && name.indexOf('\\') == -1) { + if (name.endsWith(".kml") && !kmlFound) { + kmlFound = true; + } else { + return null; + } + } + } + + if (kmlFound) { + return MediaType.application("vnd.google-earth.kmz"); + } else { + return null; + } + } + + /** + * To be considered as an IPA file, it needs to match all of these + */ + private static HashSet<Pattern> ipaEntryPatterns = new HashSet<Pattern>() { + private static final long serialVersionUID = 6545295886322115362L; + { + 
add(Pattern.compile("^Payload/$")); + add(Pattern.compile("^Payload/.*\\.app/$")); + add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/$")); + add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/CodeResources$")); + add(Pattern.compile("^Payload/.*\\.app/Info\\.plist$")); + add(Pattern.compile("^Payload/.*\\.app/PkgInfo$")); + }}; + @SuppressWarnings("unchecked") + private static MediaType detectIpa(ZipFile zip) { + // Note - consider generalising this logic, if another format needs many regexp matching + Set<Pattern> tmpPatterns = (Set<Pattern>)ipaEntryPatterns.clone(); + + Enumeration<ZipArchiveEntry> entries = zip.getEntries(); + while (entries.hasMoreElements()) { + ZipArchiveEntry entry = entries.nextElement(); + String name = entry.getName(); + + Iterator<Pattern> ip = tmpPatterns.iterator(); + while (ip.hasNext()) { + if (ip.next().matcher(name).matches()) { + ip.remove(); + } + } + if (tmpPatterns.isEmpty()) { + // We've found everything we need to find + return MediaType.application("x-itunes-ipa"); + } + } + + // If we get here, not all required entries were found + return null; + } +} Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector Wed Jan 6 03:50:50 2016 @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.tika.parser.pkg.ZipContainerDetector Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Jan 6 03:50:50 2016 @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +org.apache.tika.parser.pkg.CompressorParser +org.apache.tika.parser.pkg.PackageParser +org.apache.tika.parser.pkg.RarParser +org.apache.tika.parser.iwork.IWorkPackageParser + Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.iwork; + +import static org.junit.Assert.assertEquals; + +import org.junit.Test; + +/** + * Test class for the <code>AutoPageNumberUtils</code> helper class. + */ +public class AutoPageNumberUtilsTest { + + /** + * Check upper-case alpha-numeric numbers are generated based on the + * input page number. + */ + @Test + public void testAlphaUpper() { + assertEquals("A", AutoPageNumberUtils.asAlphaNumeric(1)); + assertEquals("Z", AutoPageNumberUtils.asAlphaNumeric(26)); + assertEquals("AA", AutoPageNumberUtils.asAlphaNumeric(27)); + assertEquals("ZZ", AutoPageNumberUtils.asAlphaNumeric(52)); + assertEquals("AAA", AutoPageNumberUtils.asAlphaNumeric(53)); + assertEquals("ZZZ", AutoPageNumberUtils.asAlphaNumeric(78)); + } + + /** + * Check lower-case alpha-numeric numbers are generated based on the + * input page number. + */ + @Test + public void testAlphaLower() { + assertEquals("a", AutoPageNumberUtils.asAlphaNumericLower(1)); + assertEquals("z", AutoPageNumberUtils.asAlphaNumericLower(26)); + assertEquals("aa", AutoPageNumberUtils.asAlphaNumericLower(27)); + assertEquals("zz", AutoPageNumberUtils.asAlphaNumericLower(52)); + assertEquals("aaa", AutoPageNumberUtils.asAlphaNumericLower(53)); + assertEquals("zzz", AutoPageNumberUtils.asAlphaNumericLower(78)); + } + + /** + * Check upper-case Roman numerals numbers are generated based on the + * input page number. 
+ */ + @Test + public void testRomanUpper() { + assertEquals("I", AutoPageNumberUtils.asRomanNumerals(1)); + assertEquals("XXVI", AutoPageNumberUtils.asRomanNumerals(26)); + assertEquals("XXVII", AutoPageNumberUtils.asRomanNumerals(27)); + } + + /** + * Check lower-case Roman numerals numbers are generated based on the + * input page number. + */ + @Test + public void testRomanLower() { + assertEquals("i", AutoPageNumberUtils.asRomanNumeralsLower(1)); + assertEquals("xxvi", AutoPageNumberUtils.asRomanNumeralsLower(26)); + assertEquals("xxvii", AutoPageNumberUtils.asRomanNumeralsLower(27)); + } + +}
