Author: tallison Date: Wed Apr 16 18:04:20 2014 New Revision: 1588005 URL: http://svn.apache.org/r1588005 Log: TIKA-1010 extract embedded documents from RTF
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFEmbeddedFiles.rtf tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFEmbeddedLink.rtf tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFRegularImages.rtf Modified: tika/trunk/CHANGES.txt tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java tika/trunk/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Modified: tika/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1588005&r1=1588004&r2=1588005&view=diff ============================================================================== --- tika/trunk/CHANGES.txt (original) +++ tika/trunk/CHANGES.txt Wed Apr 16 18:04:20 2014 @@ -1,5 +1,7 @@ Release 1.6 - ??/??/2014 + * Extract attachments from RTF files (TIKA-1010) + * Support Outlook Personal Folders File Format *.pst (TIKA-623) * Added mime entries for additional Ogg based formats (TIKA-1259) Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java?rev=1588005&r1=1588004&r2=1588005&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java (original) +++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java Wed Apr 16 18:04:20 2014 @@ -21,6 +21,7 @@ import java.util.HashSet; public class FilenameUtils { + /** * Reserved characters */ @@ -34,12 +35,14 @@ public class FilenameUtils { private final static HashSet<Character> RESERVED = new HashSet<Character>(38); + static { for (int i=0; i<RESERVED_FILENAME_CHARACTERS.length; ++i) { RESERVED.add(RESERVED_FILENAME_CHARACTERS[i]); } } + /** * Scans the given file name for reserved characters on different OSs and * file systems and returns a sanitized version of the name with the @@ -70,4 +73,40 @@ public class FilenameUtils { return sb.toString(); } + + /** + * This is a duplication of the algorithm and functionality + * available in commons io FilenameUtils. If Java's File were + * able to handle Windows file paths correctly in Linux, + * we wouldn't need this. + * <p> + * The goal of this is to get a filename from a path. + * The package parsers and some other embedded doc + * extractors could put anything into Metadata.RESOURCE_NAME_KEY. + * <p> + * If a careless client used that filename as if it were a + * filename and not a path when writing embedded files, + * bad things could happen. Consider: "../../../my_ppt.ppt". + * <p> + * Consider using this in combination with {@link #normalize(String)}. 
+ * + * @param path path to strip + * @return empty string or a filename, never null + */ + public static String getName(final String path) { + + if (path == null || path.length() == 0) { + return ""; + } + int unix = path.lastIndexOf("/"); + int windows = path.lastIndexOf("\\"); + //some macintosh file names are stored with : as the delimiter + //also necessary to properly handle C:somefilename + int colon = path.lastIndexOf(":"); + String cand = path.substring(Math.max(colon, Math.max(unix, windows))+1); + if (cand.equals("..") || cand.equals(".")){ + return ""; + } + return cand; + } } Added: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java?rev=1588005&view=auto ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java (added) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java Wed Apr 16 18:04:20 2014 @@ -0,0 +1,46 @@ +package org.apache.tika.metadata; /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; public interface +RTFMetadata { + public static final String PREFIX_RTF_META = "rtf_meta"; + + + public static final String RTF_PICT_META_PREFIX = "rtf_pict:"; + + /** + * if set to true, this means that an image file is probably a "thumbnail" + * any time a pict/emf/wmf is in an object + */ + Property THUMBNAIL = Property.internalBoolean(PREFIX_RTF_META+ + Metadata.NAMESPACE_PREFIX_DELIMITER+"thumbnail"); + + /** + * if an application and version is given as part of the + * embedded object, this is the literal string + */ + Property EMB_APP_VERSION = Property.internalText(PREFIX_RTF_META+ + Metadata.NAMESPACE_PREFIX_DELIMITER+"emb_app_version"); + + Property EMB_CLASS = Property.internalText(PREFIX_RTF_META+ + Metadata.NAMESPACE_PREFIX_DELIMITER+"emb_class"); + + Property EMB_TOPIC = Property.internalText(PREFIX_RTF_META+ + Metadata.NAMESPACE_PREFIX_DELIMITER+"emb_topic"); + + Property EMB_ITEM = Property.internalText(PREFIX_RTF_META+ + Metadata.NAMESPACE_PREFIX_DELIMITER+"emb_item"); + +} Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java?rev=1588005&r1=1588004&r2=1588005&view=diff ============================================================================== --- tika/trunk/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java (original) +++ tika/trunk/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java Wed Apr 16 18:04:20 2014 @@ -94,5 +94,24 @@ public class FilenameUtilsTest { assertEquals(EXPECTED_NAME, FilenameUtils.normalize(TEST_NAME)); } + @Test + public void testGetName() throws Exception { + testFilenameEquality("quick.ppt", "C:\\the\\quick.ppt"); + testFilenameEquality("quick.ppt", "/the/quick.ppt"); + testFilenameEquality("", "/the/quick/"); + testFilenameEquality("", "~/the/quick////\\\\//"); + 
testFilenameEquality("~~quick", "~~quick"); + testFilenameEquality("quick.ppt", "quick.ppt"); + testFilenameEquality("", "////"); + testFilenameEquality("", "C:////"); + testFilenameEquality("", ".."); + testFilenameEquality("quick", "C:////../the/D:/quick"); + testFilenameEquality("file.ppt", "path:to:file.ppt" ); + testFilenameEquality("HW.txt", "_1457338542/HW.txt" ); + } + + private void testFilenameEquality(String expected, String path) { + assertEquals(expected, FilenameUtils.getName(path)); + } } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java?rev=1588005&r1=1588004&r2=1588005&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java Wed Apr 16 18:04:20 2014 @@ -33,6 +33,18 @@ class GroupState { public int list; public int listLevel; public Charset fontCharset; + //in objdata + public boolean objdata; + //depth in pict, 1 = at pict level + public int pictDepth; + //in picprop key/value pair + public boolean sp; + //in picprop's name + public boolean sn; + //in picprop's value + public boolean sv; + //in embedded object or not + public boolean object; // Create default (root) GroupState public GroupState() { @@ -48,5 +60,8 @@ class GroupState { listLevel = other.listLevel; fontCharset = other.fontCharset; depth = 1+other.depth; + pictDepth = other.pictDepth > 0 ? 
other.pictDepth + 1 : 0; + //do not inherit object, sn, sv or sp + } } Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java?rev=1588005&view=auto ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java (added) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java Wed Apr 16 18:04:20 2014 @@ -0,0 +1,276 @@ +package org.apache.tika.parser.rtf; /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import +java.util.concurrent.atomic.AtomicInteger; import org.apache.tika.config.TikaConfig; import +org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; import +org.apache.tika.extractor.EmbeddedDocumentExtractor; import +org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; import org.apache.tika.io.FilenameUtils; import +org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import +org.apache.tika.metadata.RTFMetadata; import org.apache.tika.mime.MediaType; import +org.apache.tika.mime.MimeType; import org.apache.tika.mime.MimeTypeException; import +org.apache.tika.mime.MimeTypes; import org.apache.tika.parser.ParseContext; import +org.apache.tika.sax.EmbeddedContentHandler; import org.xml.sax.ContentHandler; import +org.xml.sax.SAXException; /** + * This class buffers data from embedded objects and pictures. + * + * <p/> + * + * When the parser has finished an object or picture and called + * {@link #handleCompletedObject()}, this will write the object + * to the {@link #handler}. + * + * <p/> + * + * This (in combination with TextExtractor) expects basically a flat parse. It will pull out + * all pict whether they are tied to objdata or are intended + * to be standalone. + * + * <p/> + * This tries to pull metadata around a pict that is encoded + * with {sp {sn} {sv}} types of data. This information + * sometimes contains the name and even full file path of the original file. 
+ * + */ class RTFEmbObjHandler { + + private static final String EMPTY_STRING = ""; + + private enum EMB_STATE { + PICT, //recording pict data + OBJDATA, //recording objdata + NADA + }; + + //high hex cached for writing hexpair chars (data) + private int hi = -1; + + private int thumbCount = 0; + //don't need atomic, do need mutable + private AtomicInteger unknownFilenameCount = new AtomicInteger(); + + private boolean inObject = false; + + private String sv = EMPTY_STRING; + private String sn = EMPTY_STRING; + + private StringBuilder sb = new StringBuilder(); + + private final ContentHandler handler; + private Metadata metadata; + private final ParseContext context; + + private final ByteArrayOutputStream os; + private EMB_STATE state = EMB_STATE.NADA; + + protected RTFEmbObjHandler(ContentHandler handler, Metadata metadata, ParseContext context) { + this.handler = handler; + this.context = context; + os = new ByteArrayOutputStream(); + } + protected void startPict() { + state = EMB_STATE.PICT; + metadata = new Metadata(); + } + + protected void startObjData() { + state = EMB_STATE.OBJDATA; + metadata = new Metadata(); + } + + protected void startSN() { + sb.setLength(0); + sb.append(RTFMetadata.RTF_PICT_META_PREFIX); + } + + protected void endSN() { + sn = sb.toString(); + } + + protected void startSV() { + sb.setLength(0); + } + + protected void endSV() { + sv = sb.toString(); + } + + //end metadata pair + protected void endSP() { + metadata.add(sn, sv); + } + + protected void setInObject(boolean v) { + inObject = v; + } + + protected boolean getInObject() { + return inObject; + } + + protected void writeMetadataChar(char c) { + sb.append(c); + } + + protected void writeHexChar(int b) throws IOException, TikaException { + //if not hexchar, ignore + //white space is common + if (TextExtractor.isHexChar(b)) { + if (hi == -1) { + hi = 16*TextExtractor.hexValue(b); + } else { + long sum = hi+TextExtractor.hexValue(b); + if (sum > Integer.MAX_VALUE || sum < 0) { + 
throw new IOException("hex char to byte overflow"); + } + + os.write((int)sum); + + hi = -1; + } + return; + } + if (b == -1) { + throw new TikaException("hit end of stream before finishing byte pair"); + } + } + + + protected void writeBytes(InputStream is, int len) throws IOException, TikaException { + if (len < 0 || len > RTFParser.getMaxBytesForEmbeddedObject()) { + throw new IOException("length of bytes to read out of bounds: " + len); + } + + byte[] bytes = new byte[len]; + int bytesRead = is.read(bytes); + if (bytesRead < len) { + throw new TikaException("unexpected end of file: need " + len + + " bytes of binary data, found " + (len-bytesRead)); + } + os.write(bytes); + } + + /** + * Call this when the objdata/pict has completed + * @throws IOException + * @throws SAXException + * @throws TikaException + */ + protected void handleCompletedObject() throws IOException, SAXException, TikaException { + EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class); + + if (embeddedExtractor == null) { + embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); + } + + byte[] bytes = os.toByteArray(); + if (state == EMB_STATE.OBJDATA) { + RTFObjDataParser objParser = new RTFObjDataParser(); + try{ + byte[] objBytes = objParser.parse(bytes, metadata, unknownFilenameCount); + extractObj(objBytes, handler, embeddedExtractor, metadata); + } catch (IOException e) { + //swallow. If anything goes wrong, ignore. + } + } else if (state == EMB_STATE.PICT) { + String filePath = metadata.get(RTFMetadata.RTF_PICT_META_PREFIX+"wzDescription"); + if (filePath != null && filePath.length() > 0){ + metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filePath); + metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(filePath)); + } + metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject)); + extractObj(bytes, handler, embeddedExtractor, metadata); + + } else if (state == EMB_STATE.NADA) { + //swallow...no start for pict or embed?! 
+ } + reset(); + } + + private void extractObj(byte[] bytes, ContentHandler handler, + EmbeddedDocumentExtractor embeddedExtractor, Metadata metadata) + throws SAXException, IOException, TikaException { + + if (bytes == null) { + return; + } + + metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(bytes.length)); + + if (embeddedExtractor.shouldParseEmbedded(metadata)) { + TikaInputStream stream = TikaInputStream.get(bytes); + if (metadata.get(Metadata.RESOURCE_NAME_KEY) == null) { + String extension = getExtension(stream, metadata); + stream.reset(); + if (inObject && state == EMB_STATE.PICT) { + metadata.set(Metadata.RESOURCE_NAME_KEY, "thumbnail_"+thumbCount++ + extension); + metadata.set(RTFMetadata.THUMBNAIL, "true"); + } else { + metadata.set(Metadata.RESOURCE_NAME_KEY, "file_"+unknownFilenameCount.getAndIncrement() + +extension); + } + } + try { + embeddedExtractor.parseEmbedded( + stream, + new EmbeddedContentHandler(handler), + metadata, false); + } finally { + stream.close(); + } + } + } + + private String getExtension(TikaInputStream is, Metadata metadata) { + String cType = metadata.get(Metadata.CONTENT_TYPE); + TikaConfig config = getConfig(); + if (cType == null) { + Detector detector = config.getDetector(); + try { + MediaType mediaType = detector.detect(is, metadata); + MimeTypes types = config.getMimeRepository(); + MimeType mime = types.forName(mediaType.toString()); + metadata.set(Metadata.CONTENT_TYPE, mediaType.getSubtype()); + return mime.getExtension(); + } catch (IOException e) { + //swallow + } catch (MimeTypeException e) { + + } + } + return ".bin"; + } + + private TikaConfig getConfig() { + TikaConfig config = context.get(TikaConfig.class); + if (config == null) { + config = TikaConfig.getDefaultConfig(); + } + return config; + } + + /** + * reset state after each object. + * Do not reset unknown file number. 
+ */ + protected void reset() { + state = EMB_STATE.NADA; + os.reset(); + metadata = new Metadata(); + hi = -1; + sv = EMPTY_STRING; + sn = EMPTY_STRING; + sb.setLength(0); + } +} Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java?rev=1588005&view=auto ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java (added) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java Wed Apr 16 18:04:20 2014 @@ -0,0 +1,312 @@ +package org.apache.tika.parser.rtf; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.DocumentEntry; +import org.apache.poi.poifs.filesystem.DocumentInputStream; +import org.apache.poi.poifs.filesystem.Entry; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.Ole10Native; +import org.apache.poi.poifs.filesystem.Ole10NativeException; +import org.apache.poi.util.IOUtils; +import org.apache.tika.io.FilenameUtils; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.RTFMetadata; +import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType; + +/** + * Many thanks to Simon Mourier for: + * http://stackoverflow.com/questions/14779647/extract-embedded-image-object-in-rtf + * and for granting permission to use his code in Tika. 
+ * + */ +class RTFObjDataParser { + + private final static int[] INT_LE_POWS = new int[]{ + 1, 256, 65536, 16777216 + }; + + private final static String WIN_ASCII = "WINDOWS-1252"; + + /** + * Parses the embedded object/pict string + * + * @param bytes actual bytes (already converted from the + * hex pair string stored in the embedded object data into actual bytes or read + * as raw binary bytes) + * @return a SimpleRTFEmbObj or null + * @throws IOException if there are any surprise surprises during parsing + */ + + /** + * + * @param bytes + * @param metadata incoming metadata + * @param unknownFilenameCount + * @return byte[] for contents of obj data + * @throws IOException + */ + protected byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger unknownFilenameCount) + throws IOException { + ByteArrayInputStream is = new ByteArrayInputStream(bytes); + long version = readUInt(is); + metadata.add(RTFMetadata.EMB_APP_VERSION, Long.toString(version)); + + long formatId = readUInt(is); + //2 is an embedded object. 1 is a link. 
+ if (formatId != 2L) { + return null; + } + String className = readLengthPrefixedAnsiString(is).trim(); + String topicName = readLengthPrefixedAnsiString(is).trim(); + String itemName = readLengthPrefixedAnsiString(is).trim(); + + if (className != null && className.length() > 0) { + metadata.add(RTFMetadata.EMB_CLASS, className); + } + if (topicName != null && topicName.length() > 0) { + metadata.add(RTFMetadata.EMB_TOPIC, topicName); + } + if (itemName != null && itemName.length() > 0) { + metadata.add(RTFMetadata.EMB_ITEM, itemName); + } + + long dataSz = readUInt(is); + + //readBytes tests for reading too many bytes + byte[] embObjBytes = readBytes(is, dataSz); + + if (className.toLowerCase().equals("package")){ + return handlePackage(embObjBytes, metadata); + } else if (className.toLowerCase().equals("pbrush")) { + //simple bitmap bytes + return embObjBytes; + } else { + ByteArrayInputStream embIs = new ByteArrayInputStream(embObjBytes); + if (NPOIFSFileSystem.hasPOIFSHeader(embIs)){ + try{ + return handleEmbeddedPOIFS(embIs, metadata, unknownFilenameCount); + } catch (IOException e) { + //swallow + } + } + } + return embObjBytes; + } + + + //will throw IOException if not actually POIFS + //can return null byte[] + private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, + AtomicInteger unknownFilenameCount) + throws IOException { + + NPOIFSFileSystem fs = null; + byte[] ret = null; + try { + + fs = new NPOIFSFileSystem(is); + + DirectoryNode root = fs.getRoot(); + + if (root == null) { + return ret; + } + + if (root.hasEntry("Package")){ + Entry ooxml = root.getEntry("Package"); + TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml)); + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + + IOUtils.copy(stream, out); + ret = out.toByteArray(); + } else { + //try poifs + POIFSDocumentType type = POIFSDocumentType.detectType(root); + if (type == POIFSDocumentType.OLE10_NATIVE) { + try { + // Try to 
un-wrap the OLE10Native record: + Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode)root); + ret = ole.getDataBuffer(); + } catch (Ole10NativeException ex) { + // Not a valid OLE10Native record, skip it + } + } else if (type == POIFSDocumentType.COMP_OBJ) { + + DocumentEntry contentsEntry; + try { + contentsEntry = (DocumentEntry)root.getEntry("CONTENTS"); + } catch (FileNotFoundException ioe) { + contentsEntry = (DocumentEntry)root.getEntry("Contents"); + } + + DocumentInputStream inp = null; + try { + inp = new DocumentInputStream(contentsEntry); + ret = new byte[contentsEntry.getSize()]; + inp.readFully(ret); + } finally { + if (inp != null) { + inp.close(); + } + } + } else { + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + is.reset(); + IOUtils.copy(is, out); + ret = out.toByteArray(); + metadata.set(Metadata.RESOURCE_NAME_KEY, "file_"+unknownFilenameCount.getAndIncrement() + "."+type.getExtension()); + metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); + } + } + } finally { + if (fs != null) { + fs.close(); + } + } + return ret; + } + + + + /** + * can return null if there is a linked object + * instead of an embedded file + */ + private byte[] handlePackage(byte[] pkgBytes, Metadata metadata) throws IOException { + //now parse the package header + ByteArrayInputStream is = new ByteArrayInputStream(pkgBytes); + readUShort(is); + + String displayName = readAnsiString(is); + + //should we add this to the metadata? + readAnsiString(is); //iconFilePath + readUShort(is); //iconIndex + int type = readUShort(is); //type + + //1 is link, 3 is embedded object + //this only handles embedded objects + if (type != 3) { + return null; + } + //should we really be ignoring this filePathLen? 
+ readUInt(is); //filePathLen + + String ansiFilePath = readAnsiString(is); //filePath + long bytesLen = readUInt(is); + byte[] objBytes = initByteArray(bytesLen); + is.read(objBytes); + StringBuilder unicodeFilePath = new StringBuilder(); + + try { + long unicodeLen = readUInt(is); + + for (int i = 0; i < unicodeLen; i++){ + int lo = is.read(); + int hi = is.read(); + int sum = lo+256*hi; + if (hi == -1 || lo == -1){ + //stream ran out; empty SB and stop + unicodeFilePath.setLength(0); + break; + } + unicodeFilePath.append((char)sum); + } + } catch (IOException e) { + //swallow; the unicode file path is optional and might not happen + unicodeFilePath.setLength(0); + } + String fileNameToUse = ""; + String pathToUse = ""; + if (unicodeFilePath.length() > 0){ + String p = unicodeFilePath.toString(); + fileNameToUse = p; + pathToUse = p; + } else { + fileNameToUse = displayName == null ? "" : displayName; + pathToUse = ansiFilePath == null ? "" : ansiFilePath; + } + metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(fileNameToUse)); + metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pathToUse); + + return objBytes; + } + + + private int readUShort(InputStream is) throws IOException { + int lo = is.read(); + int hi = is.read()*256; + if (lo == -1 || hi == -1) { + throw new IOException("Hit end of stream before reading little endian unsigned short."); + } + return hi+lo; + } + + private long readUInt(InputStream is) throws IOException { + long sum = 0; + for (int i = 0; i < 4; i++){ + int v = is.read(); + if (v == -1) { + throw new IOException("Hit end of stream before finishing little endian unsigned int."); + } + sum += v*(long)INT_LE_POWS[i]; + } + return sum; + } + + private String readAnsiString(InputStream is) throws IOException { + StringBuilder sb = new StringBuilder(); + int c = is.read(); + while (c > 0) { + sb.append((char)c); + c = is.read(); + } + if (c == -1) { + throw new IOException("Hit end of stream before end of AnsiString"); + } + return 
sb.toString(); + } + + private String readLengthPrefixedAnsiString(InputStream is) throws IOException { + long len = readUInt(is); + byte[] bytes = readBytes(is, len); + try { + return new String(bytes, WIN_ASCII); + } catch (UnsupportedEncodingException e) { + //shouldn't ever happen + throw new IOException("Unsupported encoding"); + } + } + + + private byte[] readBytes(InputStream is, long len) throws IOException { + //initByteArray tests for "reading of too many bytes" + byte[] bytes = initByteArray(len); + int read = is.read(bytes); + if (read != len) { + throw new IOException("Hit end of stream before reading all bytes"); + } + + return bytes; + } + + private byte[] initByteArray(long len) throws IOException { + if (len < 0 || len > RTFParser.getMaxBytesForEmbeddedObject()) { + throw new IOException("Requested length for reading bytes is out of bounds: " + len); + } + return new byte[(int)len]; + + } +} + Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=1588005&r1=1588004&r2=1588005&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java Wed Apr 16 18:04:20 2014 @@ -46,13 +46,43 @@ public class RTFParser extends AbstractP return SUPPORTED_TYPES; } + /** maximum number of bytes per embedded object/pict (default: 20MB)*/ + private static int EMB_OBJ_MAX_BYTES = 20*1024*1024; //20MB + + /** + * Bytes for embedded objects are currently cached in memory. + * If something goes wrong during the parsing of an embedded object, + * it is possible that a read length may be crazily too long + * and cause a heap crash. + * + * @param max maximum number of bytes to allow for embedded objects. 
If + * the embedded object has more than this number of bytes, skip it. + */ + public static void setMaxBytesForEmbeddedObject(int max) { + EMB_OBJ_MAX_BYTES = max; + } + + /** + * See {@link #setMaxBytesForEmbeddedObject(int)}. + * + * @return maximum number of bytes allowed for an embedded object. + * + */ + public static int getMaxBytesForEmbeddedObject() { + return EMB_OBJ_MAX_BYTES; + } + public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { TaggedInputStream tagged = new TaggedInputStream(stream); try { - final TextExtractor ert = new TextExtractor(new XHTMLContentHandler(handler, metadata), metadata); + RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(handler, + metadata, context); + final TextExtractor ert = + new TextExtractor(new XHTMLContentHandler(handler, + metadata), metadata, embObjHandler); ert.extract(stream); metadata.add(Metadata.CONTENT_TYPE, "application/rtf"); } catch (IOException e) { Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1588005&r1=1588004&r2=1588005&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java Wed Apr 16 18:04:20 2014 @@ -130,6 +130,7 @@ final class TextExtractor { private static final Charset GB2312 = getCharset("GB2312"); private static final Charset MS949 = getCharset("ms949"); + private int written = 0; // Hold pending bytes (encoded in the current charset) // for text output: private byte[] pendingBytes = new byte[16]; @@ -205,6 +206,7 @@ final class TextExtractor { private final XHTMLContentHandler out; private final Metadata 
metadata; + private final RTFEmbObjHandler embObjHandler; // Used when extracting CREATION date: private int year, month, day, hour, minute; @@ -327,9 +329,11 @@ final class TextExtractor { ANSICPG_MAP.put(57011, WINDOWS_57011); // Punjabi } - public TextExtractor(XHTMLContentHandler out, Metadata metadata) { + public TextExtractor(XHTMLContentHandler out, Metadata metadata, + RTFEmbObjHandler embObjHandler) { this.metadata = metadata; this.out = out; + this.embObjHandler = embObjHandler; } public boolean isIgnoringLists() { @@ -340,7 +344,7 @@ final class TextExtractor { this.ignoreLists = ignore; } - private static boolean isHexChar(int ch) { + protected static boolean isHexChar(int ch) { return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F'); @@ -355,7 +359,7 @@ final class TextExtractor { return ch >= '0' && ch <= '9'; } - private static int hexValue(int ch) { + protected static int hexValue(int ch) { if (ch >= '0' && ch <= '9') { return ch - '0'; } else if (ch >= 'a' && ch <= 'z') { @@ -384,16 +388,19 @@ final class TextExtractor { if (pendingCharCount != 0) { pushChars(); } - - // Save the byte in pending buffer: - if (pendingByteCount == pendingBytes.length) { - // Gradual but exponential growth: - final byte[] newArray = new byte[(int) (pendingBytes.length*1.25)]; - System.arraycopy(pendingBytes, 0, newArray, 0, pendingBytes.length); - pendingBytes = newArray; - pendingByteBuffer = ByteBuffer.wrap(pendingBytes); - } - pendingBytes[pendingByteCount++] = (byte) b; + if (groupState.pictDepth > 0) { + embObjHandler.writeMetadataChar((char)b); + } else { + // Save the byte in pending buffer: + if (pendingByteCount == pendingBytes.length) { + // Gradual but exponential growth: + final byte[] newArray = new byte[(int) (pendingBytes.length*1.25)]; + System.arraycopy(pendingBytes, 0, newArray, 0, pendingBytes.length); + pendingBytes = newArray; + pendingByteBuffer = ByteBuffer.wrap(pendingBytes); + } + 
pendingBytes[pendingByteCount++] = (byte) b; + } } // Buffers a byte as part of a control word: @@ -417,6 +424,8 @@ final class TextExtractor { if (inHeader || fieldState == 1) { pendingBuffer.append(ch); + } else if (groupState.sn == true || groupState.sv == true) { + embObjHandler.writeMetadataChar(ch); } else { if (pendingCharCount == pendingChars.length) { // Gradual but exponential growth: @@ -467,7 +476,12 @@ final class TextExtractor { // parsed document closing brace break; } - } else if (b != '\r' && b != '\n' && (!groupState.ignore || nextMetaData != null)) { + } else if (groupState.objdata == true || + groupState.pictDepth == 1) { + embObjHandler.writeHexChar(b); + } else if (b != '\r' && b != '\n' + && (!groupState.ignore || nextMetaData != null || + groupState.sn == true || groupState.sv == true)) { // Linefeed and carriage return are not // significant if (ansiSkip != 0) { @@ -924,7 +938,7 @@ final class TextExtractor { // in the header can be unicode escaped as well: if (equals("u")) { // Unicode escape - if (!groupState.ignore) { + if (!groupState.ignore || groupState.sv || groupState.sn) { final char utf16CodeUnit = (char) (param & 0xffff); addOutputChar(utf16CodeUnit); } @@ -938,14 +952,25 @@ final class TextExtractor { groupState.ucSkip = (int) param; } else if (equals("bin")) { if (param >= 0) { - int bytesToRead = param; - byte[] tmpArray = new byte[Math.min(1024, bytesToRead)]; - while (bytesToRead > 0) { - int r = in.read(tmpArray, 0, Math.min(bytesToRead, tmpArray.length)); - if (r < 0) { - throw new TikaException("unexpected end of file: need " + param + " bytes of binary data, found " + (param-bytesToRead)); + if (groupState.pictDepth == 1) { + try{ + embObjHandler.writeBytes(in, param); + } catch (IOException e) { + //param was out of bounds or something went wrong during writing. 
+ //skip this obj and move on + //TODO: log.warn + embObjHandler.reset(); + } + } else { + int bytesToRead = param; + byte[] tmpArray = new byte[Math.min(1024, bytesToRead)]; + while (bytesToRead > 0) { + int r = in.read(tmpArray, 0, Math.min(bytesToRead, tmpArray.length)); + if (r < 0) { + throw new TikaException("unexpected end of file: need " + param + " bytes of binary data, found " + (param-bytesToRead)); + } + bytesToRead -= r; } - bytesToRead -= r; } } else { // log some warning? @@ -1156,11 +1181,27 @@ final class TextExtractor { // TODO: we should produce a table output here? //addOutputChar(' '); endParagraph(true); + } else if (equals("sp")) { + groupState.sp = true; + } else if (equals("sn")) { + embObjHandler.startSN(); + groupState.sn = true; + } else if (equals("sv")) { + embObjHandler.startSV(); + groupState.sv = true; + } else if (equals("object")) { + pushText(); + embObjHandler.setInObject(true); + groupState.object = true; + } else if (equals("objdata")) { + groupState.objdata = true; + embObjHandler.startObjData(); } else if (equals("pict")) { pushText(); // TODO: create img tag? but can that support // embedded image data? 
- groupState.ignore = true; + groupState.pictDepth = 1; + embObjHandler.startPict(); } else if (equals("line")) { if (!ignored) { addOutputChar('\n'); @@ -1309,6 +1350,25 @@ final class TextExtractor { assert groupState.depth > 0; ansiSkip = 0; + + if (groupState.objdata == true) { + embObjHandler.handleCompletedObject(); + groupState.objdata = false; + } else if (groupState.pictDepth > 0) { + if (groupState.sn == true) { + embObjHandler.endSN(); + } else if (groupState.sv == true) { + embObjHandler.endSV(); + } else if (groupState.sp == true) { + embObjHandler.endSP(); + } else if (groupState.pictDepth == 1) { + embObjHandler.handleCompletedObject(); + } + } + + if (groupState.object == true) { + embObjHandler.setInObject(false); + } // Be robust if RTF doc is corrupt (has too many // closing }s): Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1588005&r1=1588004&r2=1588005&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Wed Apr 16 18:04:20 2014 @@ -19,22 +19,41 @@ package org.apache.tika.parser.rtf; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertNotNull; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + import org.apache.tika.Tika; import org.apache.tika.TikaTest; +import org.apache.tika.config.TikaConfig; +import 
org.apache.tika.detect.Detector; +import org.apache.tika.extractor.ContainerExtractor; +import org.apache.tika.extractor.ParserContainerExtractor; +import org.apache.tika.io.FilenameUtils; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; import org.apache.tika.metadata.OfficeOpenXMLCore; +import org.apache.tika.metadata.RTFMetadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.WriteOutContentHandler; import org.junit.Test; +import org.xml.sax.ContentHandler; /** * Junit test class for the Tika {@link RTFParser} @@ -356,7 +375,24 @@ public class RTFParserTest extends TikaT // TIKA-782 @Test public void testBinControlWord() throws Exception { - assertTrue(getXML("testBinControlWord.rtf").xml.indexOf("\u00ff\u00ff\u00ff\u00ff") == -1); + ByteCopyingHandler embHandler = new ByteCopyingHandler(); + TikaInputStream tis = null; + try { + ContainerExtractor ex = new ParserContainerExtractor(); + tis = TikaInputStream.get(getResourceAsStream("/test-documents/testBinControlWord.rtf")); + assertEquals(true, ex.isSupported(tis)); + ex.extract(tis, ex, embHandler); + } finally { + tis.close(); + } + assertEquals(1, embHandler.bytes.size()); + + byte[] bytes = embHandler.bytes.get(0); + assertEquals(10, bytes.length); + //} + assertEquals(125, (int)bytes[4]); + //make sure that at least the last value is correct + assertEquals(-1, (int)bytes[9]); } // TIKA-999 @@ -377,6 +413,167 @@ public class RTFParserTest extends TikaT assertContains("Body", content); } + //TIKA-1010 + @Test + public void testEmbeddedMonster() throws Exception { + Set<MediaType> skipTypes = new HashSet<MediaType>(); + skipTypes.add(MediaType.parse("application/x-emf")); + 
skipTypes.add(MediaType.parse("application/x-msmetafile")); + + + List<String> trueNames = new ArrayList<String>(); + trueNames.add("file_0.doc"); + trueNames.add("Hw.txt"); + trueNames.add("file_1.xlsx"); + trueNames.add("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip"); + trueNames.add("html-within-zip.zip"); + trueNames.add("text.html"); + trueNames.add("testHTML_utf8_\u666E\u6797\u65AF\u987F.html"); + trueNames.add("testJPEG_\u666E\u6797\u65AF\u987F.jpg"); + trueNames.add("file_2.xls"); + trueNames.add("testMSG_\u666E\u6797\u65AF\u987F.msg"); + trueNames.add("file_3.pdf"); + trueNames.add("file_4.ppt"); + trueNames.add("file_5.pptx"); + trueNames.add("thumbnail_0.jpeg"); + trueNames.add("file_6.doc"); + trueNames.add("file_7.doc"); + trueNames.add("file_8.docx"); + trueNames.add("testJPEG_\u666E\u6797\u65AF\u987F.jpg"); + + List<String> trueTypes = new ArrayList<String>(); + trueTypes.add("application/msword"); + trueTypes.add("text/plain"); + trueTypes.add("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); + trueTypes.add("application/zip"); + trueTypes.add("application/zip"); + trueTypes.add("text/html"); + trueTypes.add("text/html"); + trueTypes.add("image/jpeg"); + trueTypes.add("application/vnd.ms-excel"); + trueTypes.add("application/vnd.ms-outlook"); + trueTypes.add("application/pdf"); + trueTypes.add("application/vnd.ms-powerpoint"); + trueTypes.add("application/vnd.openxmlformats-officedocument.presentationml.presentation"); + trueTypes.add("image/jpeg"); + trueTypes.add("application/msword"); + trueTypes.add("application/msword"); + trueTypes.add("application/vnd.openxmlformats-officedocument.wordprocessingml.document"); + trueTypes.add("image/jpeg"); + + TrackingHandler tracker = new TrackingHandler(skipTypes); + TikaInputStream tis = null; + try { + ContainerExtractor ex = new ParserContainerExtractor(); + tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf")); + assertEquals(true, 
ex.isSupported(tis)); + ex.extract(tis, ex, tracker); + + } finally { + tis.close(); + } + + assertEquals(trueNames.size(), tracker.filenames.size()); + assertEquals(trueTypes.size(), tracker.mediaTypes.size()); + for (int i = 0; i < tracker.filenames.size(); i++) { + String expectedName = trueNames.get(i); + if (expectedName == null) { + assertNull(tracker.filenames.get(i)); + } else { + assertNotNull(tracker.filenames.get(i)); + //necessary to getName() because MSOffice extractor includes + //directory: _1457338524/HW.txt + assertEquals("filename equals ", + expectedName, FilenameUtils.getName(tracker.filenames.get(i))); + } + assertEquals(trueTypes.get(i), tracker.mediaTypes.get(i).toString()); + } + + tracker = new TrackingHandler(); + tis = null; + try { + ContainerExtractor ex = new ParserContainerExtractor(); + tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf")); + assertEquals(true, ex.isSupported(tis)); + ex.extract(tis, ex, tracker); + + } finally { + tis.close(); + } + assertEquals(47, tracker.filenames.size()); + assertEquals("thumbnail_26.emf", tracker.filenames.get(45)); + assertEquals("thumbnail_27.wmf", tracker.filenames.get(46)); + } + + //TIKA-1010 test regular (not "embedded") images/picts + public void testRegularImages() throws Exception { + Parser base = new AutoDetectParser(); + ParseContext ctx = new ParseContext(); + RecursiveMetadataParser parser = new RecursiveMetadataParser(base, false); + ctx.set(org.apache.tika.parser.Parser.class, parser); + TikaInputStream tis = null; + ContentHandler handler = new BodyContentHandler(); + Metadata rootMetadata = new Metadata(); + rootMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf"); + try { + tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf")); + parser.parse(tis, handler, rootMetadata, ctx); + } finally { + tis.close(); + } + List<Metadata> metadatas = parser.getAllMetadata(); + + Metadata meta_jpg_exif 
= metadatas.get(0);//("testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg"); + Metadata meta_jpg = metadatas.get(2);//("testJPEG_\u666E\u6797\u65AF\u987F.jpg"); + + assertTrue(meta_jpg_exif != null); + assertTrue(meta_jpg != null); + assertTrue(Arrays.asList(meta_jpg_exif.getValues("dc:subject")).contains("serbor")); + assertTrue(meta_jpg.get("Comments").contains("Licensed to the Apache")); + //make sure old metadata doesn't linger between objects + assertFalse(Arrays.asList(meta_jpg.getValues("dc:subject")).contains("serbor")); + assertEquals("false", meta_jpg.get(RTFMetadata.THUMBNAIL)); + assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL)); + + assertEquals(40, meta_jpg.names().length); + assertEquals(105, meta_jpg.names().length); + } + + //TIKA-1010 test linked embedded doc + @Test + public void testEmbeddedLinkedDocument() throws Exception { + Set<MediaType> skipTypes = new HashSet<MediaType>(); + skipTypes.add(MediaType.parse("application/x-emf")); + skipTypes.add(MediaType.parse("application/x-msmetafile")); + + TrackingHandler tracker = new TrackingHandler(skipTypes); + TikaInputStream tis = null; + try { + ContainerExtractor ex = new ParserContainerExtractor(); + tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf")); + assertEquals(true, ex.isSupported(tis)); + ex.extract(tis, ex, tracker); + + } finally { + tis.close(); + } + //should gracefully skip link and not throw NPE, IOEx, etc + assertEquals(0, tracker.filenames.size()); + + tracker = new TrackingHandler(); + tis = null; + try { + ContainerExtractor ex = new ParserContainerExtractor(); + tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf")); + assertEquals(true, ex.isSupported(tis)); + ex.extract(tis, ex, tracker); + } finally { + tis.close(); + } + //should gracefully skip link and not throw NPE, IOEx, etc + assertEquals(2, tracker.filenames.size()); + } + private Result getResult(String filename) throws Exception { File 
file = getResourceAsFile("/test-documents/" + filename);