Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.odf; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.DublinCore; +import org.apache.tika.metadata.MSOffice; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; +import org.apache.tika.metadata.OfficeOpenXMLCore; +import org.apache.tika.metadata.PagedText; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.xml.AttributeDependantMetadataHandler; +import org.apache.tika.parser.xml.AttributeMetadataHandler; +import org.apache.tika.parser.xml.ElementMetadataHandler; +import org.apache.tika.parser.xml.MetadataHandler; +import org.apache.tika.parser.xml.XMLParser; +import org.apache.tika.sax.TeeContentHandler; +import org.apache.tika.sax.xpath.CompositeMatcher; +import org.apache.tika.sax.xpath.Matcher; +import org.apache.tika.sax.xpath.MatchingContentHandler; +import org.apache.tika.sax.xpath.XPathParser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Parser for OpenDocument <code>meta.xml</code> files. + */ +public class OpenDocumentMetaParser extends XMLParser { + /** + * Serial version UID + */ + private static final long serialVersionUID = -8739250869531737584L; + + private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0"; + private static final XPathParser META_XPATH = new XPathParser("meta", META_NS); + + /** + * @see OfficeOpenXMLCore#SUBJECT + * @deprecated use OfficeOpenXMLCore#SUBJECT + */ + @Deprecated + private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR = + Property.composite(Office.INITIAL_AUTHOR, + new Property[]{Property.externalText("initial-creator")}); + + private static ContentHandler getDublinCoreHandler( + Metadata metadata, Property property, String element) { + return new ElementMetadataHandler( + DublinCore.NAMESPACE_URI_DC, element, + metadata, property); + } + + private static ContentHandler getMeta( + ContentHandler ch, Metadata md, Property property, String element) { + Matcher matcher = new CompositeMatcher( + META_XPATH.parse("//meta:" + element), + META_XPATH.parse("//meta:" + element + "//text()")); + ContentHandler branch = + new MatchingContentHandler(new MetadataHandler(md, property), matcher); + return new TeeContentHandler(ch, branch); + } + + private static ContentHandler getUserDefined( + ContentHandler ch, Metadata md) { + Matcher matcher = new CompositeMatcher( + META_XPATH.parse("//meta:user-defined/@meta:name"), + META_XPATH.parse("//meta:user-defined//text()")); + // eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1 + ContentHandler branch = new MatchingContentHandler( + new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX), + matcher); + return new TeeContentHandler(ch, branch); + } + + @Deprecated + private static ContentHandler getStatistic( + ContentHandler ch, Metadata md, String name, String attribute) { + Matcher matcher = + META_XPATH.parse("//meta:document-statistic/@meta:" + attribute); + ContentHandler branch = new MatchingContentHandler( + new AttributeMetadataHandler(META_NS, attribute, md, name), matcher); + return new TeeContentHandler(ch, branch); + } + + private static ContentHandler getStatistic( + ContentHandler ch, Metadata md, Property property, String attribute) { + Matcher matcher = + META_XPATH.parse("//meta:document-statistic/@meta:" + attribute); + ContentHandler branch = new MatchingContentHandler( + new AttributeMetadataHandler(META_NS, attribute, md, property), matcher); + return new TeeContentHandler(ch, branch); + } + + protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) { + // We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date + // Process the Dublin Core Attributes + ch = new TeeContentHandler(super.getContentHandler(ch, md, context), + getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"), + getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"), + getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"), + getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"), + getDublinCoreHandler(md, TikaCoreProperties.CONTRIBUTOR, "contributor"), + getDublinCoreHandler(md, TikaCoreProperties.TYPE, "type"), + getDublinCoreHandler(md, TikaCoreProperties.FORMAT, "format"), + getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"), + getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"), + getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights")); + + // Process the OO Meta Attributes + ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date"); + // ODF uses dc:date for modified + ch = new TeeContentHandler(ch, new ElementMetadataHandler( + DublinCore.NAMESPACE_URI_DC, "date", + md, TikaCoreProperties.MODIFIED)); + + // ODF uses dc:subject for description + ch = new TeeContentHandler(ch, new ElementMetadataHandler( + DublinCore.NAMESPACE_URI_DC, "subject", + md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT)); + ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword"); + + ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration"); + ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles"); + ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator"); + ch = getMeta(ch, md, Property.externalText("generator"), "generator"); + + // Process the user defined Meta Attributes + ch = getUserDefined(ch, md); + + // Process the OO Statistics Attributes + ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count"); + ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count"); + ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count"); + ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count"); + ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count"); + ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count"); + ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count"); + ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count"); + + // Legacy, Tika-1.0 style attributes + // TODO Remove these in Tika 2.0 + ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count"); + ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count"); + ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count"); + ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count"); + ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count"); + ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count"); + ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count"); + + // Legacy Statistics Attributes, replaced with real keys above + // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770) + ch = getStatistic(ch, md, "nbPage", "page-count"); + ch = getStatistic(ch, md, "nbPara", "paragraph-count"); + ch = getStatistic(ch, md, "nbWord", "word-count"); + ch = getStatistic(ch, md, "nbCharacter", "character-count"); + ch = getStatistic(ch, md, "nbTab", "table-count"); + ch = getStatistic(ch, md, "nbObject", "object-count"); + ch = getStatistic(ch, md, "nbImg", "image-count"); + + // Normalise the rest + ch = new NSNormalizerContentHandler(ch); + return ch; + } + + @Override + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + super.parse(stream, handler, metadata, context); + // Copy subject to description for OO2 + String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT); + if (odfSubject != null && !odfSubject.equals("") && + (metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) { + metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject); + } + } + +}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.odf; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.Enumeration; +import java.util.HashSet; +import java.util.Set; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; +import java.util.zip.ZipInputStream; + +import org.apache.commons.io.IOUtils; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.EndDocumentShieldingContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * OpenOffice parser + */ +public class OpenDocumentParser extends AbstractParser { + + /** + * Serial version UID + */ + private static final long serialVersionUID = -6410276875438618287L; + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( + MediaType.application("vnd.sun.xml.writer"), + MediaType.application("vnd.oasis.opendocument.text"), + MediaType.application("vnd.oasis.opendocument.graphics"), + MediaType.application("vnd.oasis.opendocument.presentation"), + MediaType.application("vnd.oasis.opendocument.spreadsheet"), + MediaType.application("vnd.oasis.opendocument.chart"), + MediaType.application("vnd.oasis.opendocument.image"), + MediaType.application("vnd.oasis.opendocument.formula"), + MediaType.application("vnd.oasis.opendocument.text-master"), + MediaType.application("vnd.oasis.opendocument.text-web"), + MediaType.application("vnd.oasis.opendocument.text-template"), + MediaType.application("vnd.oasis.opendocument.graphics-template"), + MediaType.application("vnd.oasis.opendocument.presentation-template"), + MediaType.application("vnd.oasis.opendocument.spreadsheet-template"), + MediaType.application("vnd.oasis.opendocument.chart-template"), + MediaType.application("vnd.oasis.opendocument.image-template"), + MediaType.application("vnd.oasis.opendocument.formula-template"), + MediaType.application("x-vnd.oasis.opendocument.text"), + MediaType.application("x-vnd.oasis.opendocument.graphics"), + MediaType.application("x-vnd.oasis.opendocument.presentation"), + MediaType.application("x-vnd.oasis.opendocument.spreadsheet"), + MediaType.application("x-vnd.oasis.opendocument.chart"), + MediaType.application("x-vnd.oasis.opendocument.image"), + MediaType.application("x-vnd.oasis.opendocument.formula"), + MediaType.application("x-vnd.oasis.opendocument.text-master"), + MediaType.application("x-vnd.oasis.opendocument.text-web"), + MediaType.application("x-vnd.oasis.opendocument.text-template"), + MediaType.application("x-vnd.oasis.opendocument.graphics-template"), + MediaType.application("x-vnd.oasis.opendocument.presentation-template"), + MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"), + MediaType.application("x-vnd.oasis.opendocument.chart-template"), + MediaType.application("x-vnd.oasis.opendocument.image-template"), + MediaType.application("x-vnd.oasis.opendocument.formula-template")))); + + private static final String META_NAME = "meta.xml"; + + private Parser meta = new OpenDocumentMetaParser(); + + private Parser content = new OpenDocumentContentParser(); + + public Parser getMetaParser() { + return meta; + } + + public void setMetaParser(Parser meta) { + this.meta = meta; + } + + public Parser getContentParser() { + return content; + } + + public void setContentParser(Parser content) { + this.content = content; + } + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse( + InputStream stream, ContentHandler baseHandler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + + // Open the Zip stream + // Use a File if we can, and an already open zip is even better + ZipFile zipFile = null; + ZipInputStream zipStream = null; + if (stream instanceof TikaInputStream) { + TikaInputStream tis = (TikaInputStream) stream; + Object container = ((TikaInputStream) stream).getOpenContainer(); + if (container instanceof ZipFile) { + zipFile = (ZipFile) container; + } else if (tis.hasFile()) { + zipFile = new ZipFile(tis.getFile()); + } else { + zipStream = new ZipInputStream(stream); + } + } else { + zipStream = new ZipInputStream(stream); + } + + // Prepare to handle the content + XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata); + + // As we don't know which of the metadata or the content + // we'll hit first, catch the endDocument call initially + EndDocumentShieldingContentHandler handler = + new EndDocumentShieldingContentHandler(xhtml); + + // If we can, process the metadata first, then the + // rest of the file afterwards + // Only possible to guarantee that when opened from a file not a stream + ZipEntry entry = null; + if (zipFile != null) { + entry = zipFile.getEntry(META_NAME); + handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler); + + Enumeration<? extends ZipEntry> entries = zipFile.entries(); + while (entries.hasMoreElements()) { + entry = entries.nextElement(); + if (!META_NAME.equals(entry.getName())) { + handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler); + } + } + zipFile.close(); + } else { + do { + entry = zipStream.getNextEntry(); + handleZipEntry(entry, zipStream, metadata, context, handler); + } while (entry != null); + zipStream.close(); + } + + // Only now call the end document + if (handler.getEndDocumentWasCalled()) { + handler.reallyEndDocument(); + } + } + + private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata, + ParseContext context, EndDocumentShieldingContentHandler handler) + throws IOException, SAXException, TikaException { + if (entry == null) return; + + if (entry.getName().equals("mimetype")) { + String type = IOUtils.toString(zip, UTF_8); + metadata.set(Metadata.CONTENT_TYPE, type); + } else if (entry.getName().equals(META_NAME)) { + meta.parse(zip, new DefaultHandler(), metadata, context); + } else if (entry.getName().endsWith("content.xml")) { + if (content instanceof OpenDocumentContentParser) { + ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context); + } else { + // Foreign content parser was set: + content.parse(zip, handler, metadata, context); + } + } else if (entry.getName().endsWith("styles.xml")) { + if (content instanceof OpenDocumentContentParser) { + ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context); + } else { + // Foreign content parser was set: + content.parse(zip, handler, metadata, context); + } + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.opendocument; + +import org.apache.tika.parser.odf.OpenDocumentParser; + +/** + * OpenOffice parser + * + * @deprecated Use the {@link OpenDocumentParser} class instead. + * This class will be removed in Apache Tika 1.0. + */ +public class OpenOfficeParser extends OpenDocumentParser { +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.rtf; + +import java.nio.charset.Charset; + +/* Holds all state associated with current RTF group, ie { + * ... }. */ + +class GroupState { + public int depth; + public boolean bold; + public boolean italic; + // True if we are skipping all text in current group, + // eg if group leads with a \*: + public boolean ignore; + // Default is 1 if no uc control has been seen yet: + public int ucSkip = 1; + public int list; + public int listLevel; + public Charset fontCharset; + //in objdata + public boolean objdata; + //depth in pict, 1 = at pict level + public int pictDepth; + //in picprop key/value pair + public boolean sp; + //in picprop's name + public boolean sn; + //in picprop's value + public boolean sv; + //in embedded object or not + public boolean object; + + // Create default (root) GroupState + public GroupState() { + } + + // Create new GroupState, inheriting all properties from current one, adding 1 to the depth + public GroupState(GroupState other) { + bold = other.bold; + italic = other.italic; + ignore = other.ignore; + ucSkip = other.ucSkip; + list = other.list; + listLevel = other.listLevel; + fontCharset = other.fontCharset; + depth = 1 + other.depth; + pictDepth = other.pictDepth > 0 ? other.pictDepth + 1 : 0; + //do not inherit object, sn, sv or sp + + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.rtf; + +/** + * Contains the information for a single list in the list or list override tables. + */ +public class ListDescriptor { + public final static int NUMBER_TYPE_BULLET = 23; + + public int id; + // We record this but don't make use if it today: + public int templateID; + // We record this but don't make use if it today: + public boolean isStyle; + public int[] numberType = new int[9]; + + public boolean isUnordered(int level) { + return numberType[level] == NUMBER_TYPE_BULLET; + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,287 @@ +package org.apache.tika.parser.rtf; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.commons.io.FilenameUtils; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.detect.Detector; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.RTFMetadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.mime.MimeType; +import org.apache.tika.mime.MimeTypeException; +import org.apache.tika.mime.MimeTypes; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * This class buffers data from embedded objects and pictures. + * <p/> + * <p/> + * <p/> + * When the parser has finished an object or picture and called + * {@link #handleCompletedObject()}, this will write the object + * to the {@link #handler}. + * <p/> + * <p/> + * <p/> + * This (in combination with TextExtractor) expects basically a flat parse. It will pull out + * all pict whether they are tied to objdata or are intended + * to be standalone. + * <p/> + * <p/> + * This tries to pull metadata around a pict that is encoded + * with {sp {sn} {sv}} types of data. This information + * sometimes contains the name and even full file path of the original file. + */ +class RTFEmbObjHandler { + + private static final String EMPTY_STRING = ""; + private final ContentHandler handler; + + + private final ParseContext context; + private final ByteArrayOutputStream os; + //high hex cached for writing hexpair chars (data) + private int hi = -1; + private int thumbCount = 0; + //don't need atomic, do need mutable + private AtomicInteger unknownFilenameCount = new AtomicInteger(); + private boolean inObject = false; + private String sv = EMPTY_STRING; + private String sn = EMPTY_STRING; + private StringBuilder sb = new StringBuilder(); + private Metadata metadata; + private EMB_STATE state = EMB_STATE.NADA; + protected RTFEmbObjHandler(ContentHandler handler, Metadata metadata, ParseContext context) { + this.handler = handler; + this.context = context; + os = new ByteArrayOutputStream(); + } + + protected void startPict() { + state = EMB_STATE.PICT; + metadata = new Metadata(); + } + + protected void startObjData() { + state = EMB_STATE.OBJDATA; + metadata = new Metadata(); + } + + protected void startSN() { + sb.setLength(0); + sb.append(RTFMetadata.RTF_PICT_META_PREFIX); + } + + protected void endSN() { + sn = sb.toString(); + } + + protected void startSV() { + sb.setLength(0); + } + + protected void endSV() { + sv = sb.toString(); + } + + //end metadata pair + protected void endSP() { + metadata.add(sn, sv); + } + + protected boolean getInObject() { + return inObject; + } + + protected void setInObject(boolean v) { + inObject = v; + } + + protected void writeMetadataChar(char c) { + sb.append(c); + } + + protected void writeHexChar(int b) throws IOException, TikaException { + //if not hexchar, ignore + //white space is common + if (TextExtractor.isHexChar(b)) { + if (hi == -1) { + hi = 16 * TextExtractor.hexValue(b); + } else { + long sum = hi + TextExtractor.hexValue(b); + if (sum > Integer.MAX_VALUE || sum < 0) { + throw new IOException("hex char to byte overflow"); + } + + os.write((int) sum); + + hi = -1; + } + return; + } + if (b == -1) { + throw new TikaException("hit end of stream before finishing byte pair"); + } + } + + protected void writeBytes(InputStream is, int len) throws IOException, TikaException { + if (len < 0 || len > RTFParser.getMaxBytesForEmbeddedObject()) { + throw new IOException("length of bytes to read out of bounds: " + len); + } + + byte[] bytes = new byte[len]; + int bytesRead = is.read(bytes); + if (bytesRead < len) { + throw new TikaException("unexpected end of file: need " + len + + " bytes of binary data, found " + (len - bytesRead)); + } + os.write(bytes); + } + + /** + * Call this when the objdata/pict has completed + * + * @throws IOException + * @throws SAXException + * @throws TikaException + */ + protected void handleCompletedObject() throws IOException, SAXException, TikaException { + EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class); + + if (embeddedExtractor == null) { + embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); + } + + byte[] bytes = os.toByteArray(); + if (state == EMB_STATE.OBJDATA) { + RTFObjDataParser objParser = new RTFObjDataParser(); + try { + byte[] objBytes = objParser.parse(bytes, metadata, unknownFilenameCount); + extractObj(objBytes, handler, embeddedExtractor, metadata); + } catch (IOException e) { + //swallow. If anything goes wrong, ignore. + } + } else if (state == EMB_STATE.PICT) { + String filePath = metadata.get(RTFMetadata.RTF_PICT_META_PREFIX + "wzDescription"); + if (filePath != null && filePath.length() > 0) { + metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filePath); + metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(filePath)); + } + metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject)); + extractObj(bytes, handler, embeddedExtractor, metadata); + + } else if (state == EMB_STATE.NADA) { + //swallow...no start for pict or embed?! + } + reset(); + } + + private void extractObj(byte[] bytes, ContentHandler handler, + EmbeddedDocumentExtractor embeddedExtractor, Metadata metadata) + throws SAXException, IOException, TikaException { + + if (bytes == null) { + return; + } + + metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(bytes.length)); + + if (embeddedExtractor.shouldParseEmbedded(metadata)) { + TikaInputStream stream = TikaInputStream.get(bytes); + if (metadata.get(Metadata.RESOURCE_NAME_KEY) == null) { + String extension = getExtension(stream, metadata); + stream.reset(); + if (inObject && state == EMB_STATE.PICT) { + metadata.set(Metadata.RESOURCE_NAME_KEY, "thumbnail_" + thumbCount++ + extension); + metadata.set(RTFMetadata.THUMBNAIL, "true"); + } else { + metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + + extension); + } + } + try { + embeddedExtractor.parseEmbedded( + stream, + new EmbeddedContentHandler(handler), + metadata, false); + } finally { + stream.close(); + } + } + } + + private String getExtension(TikaInputStream is, Metadata metadata) { + String cType = metadata.get(Metadata.CONTENT_TYPE); + TikaConfig config = getConfig(); + if (cType == null) { + Detector detector = config.getDetector(); + try { + MediaType mediaType = detector.detect(is, metadata); + MimeTypes types = config.getMimeRepository(); + MimeType mime = types.forName(mediaType.toString()); + metadata.set(Metadata.CONTENT_TYPE, mediaType.getSubtype()); + return mime.getExtension(); + } catch (IOException e) { + //swallow + } catch (MimeTypeException e) { + + } + } + return ".bin"; + } + + private TikaConfig getConfig() { + TikaConfig config = context.get(TikaConfig.class); + if (config == null) { + config = TikaConfig.getDefaultConfig(); + } + return config; + } + + /** + * reset state after each object. + * Do not reset unknown file number. + */ + protected void reset() { + state = EMB_STATE.NADA; + os.reset(); + metadata = new Metadata(); + hi = -1; + sv = EMPTY_STRING; + sn = EMPTY_STRING; + sb.setLength(0); + } + + private enum EMB_STATE { + PICT, //recording pict data + OBJDATA, //recording objdata + NADA + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,315 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.tika.parser.rtf; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.util.Locale; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.commons.io.FilenameUtils; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.DocumentEntry; +import org.apache.poi.poifs.filesystem.DocumentInputStream; +import org.apache.poi.poifs.filesystem.Entry; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.Ole10Native; +import org.apache.poi.poifs.filesystem.Ole10NativeException; +import org.apache.poi.util.IOUtils; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.RTFMetadata; +import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType; + +/** + * Many thanks to Simon Mourier for: + * http://stackoverflow.com/questions/14779647/extract-embedded-image-object-in-rtf + * and for granting permission to use his code in Tika. + */ +class RTFObjDataParser { + + private final static int[] INT_LE_POWS = new int[]{ + 1, 256, 65536, 16777216 + }; + + private final static String WIN_ASCII = "WINDOWS-1252"; + + /** + * Parses the embedded object/pict string + * + * @param bytes actual bytes (already converted from the + * hex pair string stored in the embedded object data into actual bytes or read + * as raw binary bytes) + * @return a SimpleRTFEmbObj or null + * @throws IOException if there are any surprise surprises during parsing + */ + + /** + * @param bytes + * @param metadata incoming metadata + * @param unknownFilenameCount + * @return byte[] for contents of obj data + * @throws IOException + */ + protected byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger unknownFilenameCount) + throws IOException { + ByteArrayInputStream is = new ByteArrayInputStream(bytes); + long version = readUInt(is); + metadata.add(RTFMetadata.EMB_APP_VERSION, Long.toString(version)); + + long formatId = readUInt(is); + //2 is an embedded object. 1 is a link. + if (formatId != 2L) { + return null; + } + String className = readLengthPrefixedAnsiString(is).trim(); + String topicName = readLengthPrefixedAnsiString(is).trim(); + String itemName = readLengthPrefixedAnsiString(is).trim(); + + if (className != null && className.length() > 0) { + metadata.add(RTFMetadata.EMB_CLASS, className); + } + if (topicName != null && topicName.length() > 0) { + metadata.add(RTFMetadata.EMB_TOPIC, topicName); + } + if (itemName != null && itemName.length() > 0) { + metadata.add(RTFMetadata.EMB_ITEM, itemName); + } + + long dataSz = readUInt(is); + + //readBytes tests for reading too many bytes + byte[] embObjBytes = readBytes(is, dataSz); + + if (className.toLowerCase(Locale.ROOT).equals("package")) { + return handlePackage(embObjBytes, metadata); + } else if (className.toLowerCase(Locale.ROOT).equals("pbrush")) { + //simple bitmap bytes + return embObjBytes; + } else { + ByteArrayInputStream embIs = new ByteArrayInputStream(embObjBytes); + if (NPOIFSFileSystem.hasPOIFSHeader(embIs)) { + try { + return handleEmbeddedPOIFS(embIs, metadata, unknownFilenameCount); + } catch (IOException e) { + //swallow + } + } + } + return embObjBytes; + } + + + //will throw IOException if not actually POIFS + //can return null byte[] + private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, + AtomicInteger unknownFilenameCount) + throws IOException { + + byte[] ret = null; + try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) { + + DirectoryNode root = fs.getRoot(); + + if (root == null) { + return ret; + } + + if (root.hasEntry("Package")) { + Entry ooxml = root.getEntry("Package"); + TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml)); + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + + IOUtils.copy(stream, out); + ret = out.toByteArray(); + } else { + //try poifs + POIFSDocumentType type = POIFSDocumentType.detectType(root); + if (type == POIFSDocumentType.OLE10_NATIVE) { + try { + // Try to un-wrap the OLE10Native record: + Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root); + ret = ole.getDataBuffer(); + } catch (Ole10NativeException ex) { + // Not a valid OLE10Native record, skip it + } + } else if (type == POIFSDocumentType.COMP_OBJ) { + + DocumentEntry contentsEntry; + try { + contentsEntry = (DocumentEntry) root.getEntry("CONTENTS"); + } catch (FileNotFoundException ioe) { + contentsEntry = (DocumentEntry) root.getEntry("Contents"); + } + + try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) { + ret = new byte[contentsEntry.getSize()]; + inp.readFully(ret); + } + } else { + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + is.reset(); + IOUtils.copy(is, out); + ret = out.toByteArray(); + metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension()); + metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); + } + } + } + return ret; + } + + + /** + * can return null if there is a linked object + * instead of an embedded file + */ + private byte[] handlePackage(byte[] pkgBytes, Metadata metadata) throws IOException { + //now parse the package header + ByteArrayInputStream is = new ByteArrayInputStream(pkgBytes); + readUShort(is); + + String displayName = readAnsiString(is); + + //should we add this to the metadata? + readAnsiString(is); //iconFilePath + readUShort(is); //iconIndex + int type = readUShort(is); //type + + //1 is link, 3 is embedded object + //this only handles embedded objects + if (type != 3) { + return null; + } + //should we really be ignoring this filePathLen? + readUInt(is); //filePathLen + + String ansiFilePath = readAnsiString(is); //filePath + long bytesLen = readUInt(is); + byte[] objBytes = initByteArray(bytesLen); + is.read(objBytes); + StringBuilder unicodeFilePath = new StringBuilder(); + + try { + long unicodeLen = readUInt(is); + + for (int i = 0; i < unicodeLen; i++) { + int lo = is.read(); + int hi = is.read(); + int sum = lo + 256 * hi; + if (hi == -1 || lo == -1) { + //stream ran out; empty SB and stop + unicodeFilePath.setLength(0); + break; + } + unicodeFilePath.append((char) sum); + } + } catch (IOException e) { + //swallow; the unicode file path is optional and might not happen + unicodeFilePath.setLength(0); + } + String fileNameToUse = ""; + String pathToUse = ""; + if (unicodeFilePath.length() > 0) { + String p = unicodeFilePath.toString(); + fileNameToUse = p; + pathToUse = p; + } else { + fileNameToUse = displayName == null ? "" : displayName; + pathToUse = ansiFilePath == null ? "" : ansiFilePath; + } + metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(fileNameToUse)); + metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pathToUse); + + return objBytes; + } + + + private int readUShort(InputStream is) throws IOException { + int lo = is.read(); + int hi = is.read() * 256; + if (lo == -1 || hi == -1) { + throw new IOException("Hit end of stream before reading little endian unsigned short."); + } + return hi + lo; + } + + private long readUInt(InputStream is) throws IOException { + long sum = 0; + for (int i = 0; i < 4; i++) { + int v = is.read(); + if (v == -1) { + throw new IOException("Hit end of stream before finishing little endian unsigned int."); + } + sum += v * (long) INT_LE_POWS[i]; + } + return sum; + } + + private String readAnsiString(InputStream is) throws IOException { + StringBuilder sb = new StringBuilder(); + int c = is.read(); + while (c > 0) { + sb.append((char) c); + c = is.read(); + } + if (c == -1) { + throw new IOException("Hit end of stream before end of AnsiString"); + } + return sb.toString(); + } + + private String readLengthPrefixedAnsiString(InputStream is) throws IOException { + long len = readUInt(is); + byte[] bytes = readBytes(is, len); + try { + return new String(bytes, WIN_ASCII); + } catch (UnsupportedEncodingException e) { + //shouldn't ever happen + throw new IOException("Unsupported encoding"); + } + } + + + private byte[] readBytes(InputStream is, long len) throws IOException { + //initByteArray tests for "reading of too many bytes" + byte[] bytes = initByteArray(len); + int read = is.read(bytes); + if (read != len) { + throw new IOException("Hit end of stream before reading all bytes"); + } + + return bytes; + } + + private byte[] initByteArray(long len) throws IOException { + if (len < 0 || len > RTFParser.getMaxBytesForEmbeddedObject()) { + throw new IOException("Requested length for reading bytes is out of bounds: " + len); + } + return new byte[(int) len]; + + } +} + Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.rtf; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; + +import org.apache.commons.io.input.TaggedInputStream; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * RTF parser + */ +public class RTFParser extends AbstractParser { + + /** + * Serial version UID + */ + private static final long serialVersionUID = -4165069489372320313L; + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.singleton(MediaType.application("rtf")); + /** + * maximum number of bytes per embedded object/pict (default: 20MB) + */ + private static int EMB_OBJ_MAX_BYTES = 20 * 1024 * 1024; //20MB + + /** + * See {@link #setMaxBytesForEmbeddedObject(int)}. + * + * @return maximum number of bytes allowed for an embedded object. + */ + public static int getMaxBytesForEmbeddedObject() { + return EMB_OBJ_MAX_BYTES; + } + + /** + * Bytes for embedded objects are currently cached in memory. + * If something goes wrong during the parsing of an embedded object, + * it is possible that a read length may be crazily too long + * and cause a heap crash. + * + * @param max maximum number of bytes to allow for embedded objects. If + * the embedded object has more than this number of bytes, skip it. + */ + public static void setMaxBytesForEmbeddedObject(int max) { + EMB_OBJ_MAX_BYTES = max; + } + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + metadata.set(Metadata.CONTENT_TYPE, "application/rtf"); + TaggedInputStream tagged = new TaggedInputStream(stream); + try { + XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata); + RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(xhtmlHandler, metadata, context); + final TextExtractor ert = new TextExtractor(xhtmlHandler, metadata, embObjHandler); + ert.extract(stream); + } catch (IOException e) { + tagged.throwIfCauseOf(e); + throw new TikaException("Error parsing an RTF document", e); + } + } +}
