Repository: tika Updated Branches: refs/heads/master 27e026eff -> b9befb427
TIKA-2247 and TIKA-2246 -- add parsers for EMF/WMF Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/b9befb42 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/b9befb42 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/b9befb42 Branch: refs/heads/master Commit: b9befb4272cf8b2bda3b3ea25b0511bbabfdeded Parents: 27e026e Author: tballison <[email protected]> Authored: Mon Feb 6 14:31:09 2017 -0500 Committer: tballison <[email protected]> Committed: Mon Feb 6 14:31:09 2017 -0500 ---------------------------------------------------------------------- CHANGES.txt | 2 + .../apache/tika/parser/microsoft/EMFParser.java | 163 +++++++++++++++++++ .../apache/tika/parser/microsoft/WMFParser.java | 98 +++++++++++ .../services/org.apache.tika.parser.Parser | 4 +- .../tika/parser/microsoft/EMFParserTest.java | 66 ++++++++ .../tika/parser/microsoft/WMFParserTest.java | 42 +++++ .../apache/tika/parser/rtf/RTFParserTest.java | 40 ++--- .../testEXCEL_embeddedPDF_mac.xls | Bin 0 -> 69632 bytes .../testEXCEL_embeddedPDF_mac.xlsx | Bin 0 -> 80578 bytes .../testEXCEL_embeddedPDF_windows.xls | Bin 0 -> 61952 bytes .../testEXCEL_embeddedPDF_windows.xlsx | Bin 0 -> 49843 bytes 11 files changed, 394 insertions(+), 21 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index bfe817d..b8e2dec 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,7 @@ Release 1.15 - ?? + * Add parsers for EMF/WMF files (TIKA-2246/TIKA-2247). 
+ * Official mime types for BMP, EMF and WMF have been registered with IANA, so switch to these (image/bmp image/emf image/wmf) (TIKA-2250) http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java new file mode 100644 index 0000000..be4bc14 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java @@ -0,0 +1,163 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.microsoft; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; + +import org.apache.poi.hemf.extractor.HemfExtractor; +import org.apache.poi.hemf.record.AbstractHemfComment; +import org.apache.poi.hemf.record.HemfCommentPublic; +import org.apache.poi.hemf.record.HemfCommentRecord; +import org.apache.poi.hemf.record.HemfRecord; +import org.apache.poi.hemf.record.HemfRecordType; +import org.apache.poi.hemf.record.HemfText; +import org.apache.poi.util.RecordFormatException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Extracts files embedded in EMF and offers a + * very rough capability to extract text if there + * is text stored in the EMF. + * <p/> + * To improve text extraction, we'd have to implement + * quite a bit more at the POI level. We'd want to track changes + * in font and use that information for identifying character sets, + * inserting spaces and new lines. 
+ */ +public class EMFParser extends AbstractParser { + + private static final MediaType MEDIA_TYPE = MediaType.image("emf"); + private static final MediaType WMF_MEDIA_TYPE = MediaType.image("wmf"); + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.singleton(MEDIA_TYPE); + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + + EmbeddedDocumentExtractor embeddedDocumentExtractor = null; + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + try { + HemfExtractor ex = new HemfExtractor(stream); + long lastY = -1; + long lastX = -1; + long fudgeFactorX = 1000;//derive this from the font or frame/bounds information + StringBuilder buffer = new StringBuilder(); + for (HemfRecord record : ex) { + if (record.getRecordType() == HemfRecordType.comment) { + AbstractHemfComment comment = ((HemfCommentRecord) record).getComment(); + if (comment instanceof HemfCommentPublic.MultiFormats) { + if (embeddedDocumentExtractor == null) { + embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + } + handleMultiFormats((HemfCommentPublic.MultiFormats)comment, xhtml, embeddedDocumentExtractor); + } else if (comment instanceof HemfCommentPublic.WindowsMetafile) { + if (embeddedDocumentExtractor == null) { + embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + } + handleWMF((HemfCommentPublic.WindowsMetafile)comment, xhtml, embeddedDocumentExtractor); + } + } else if (record.getRecordType().equals(HemfRecordType.exttextoutw)) { + HemfText.ExtTextOutW extTextOutW = (HemfText.ExtTextOutW) record; + if (lastY > -1 && lastY != extTextOutW.getY()) { + xhtml.startElement("p"); + xhtml.characters(buffer.toString()); + 
xhtml.endElement("p"); + buffer.setLength(0); + lastX = -1; + } + if (lastX > -1 && extTextOutW.getX() - lastX > fudgeFactorX) { + buffer.append(" "); + } + String txt = extTextOutW.getText(); + buffer.append(txt); + lastY = extTextOutW.getY(); + lastX = extTextOutW.getX(); + } + } + if (buffer.length() > 0) { + xhtml.startElement("p"); + xhtml.characters(buffer.toString()); + xhtml.endElement("p"); + } + } catch (RecordFormatException e) { //POI's hemfparser can throw these for "parse exceptions" + throw new TikaException(e.getMessage()); + } catch (RuntimeException e) { //convert Runtime to RecordFormatExceptions + throw new TikaException(e.getMessage()); + } + xhtml.endDocument(); + } + + private void handleWMF(HemfCommentPublic.WindowsMetafile comment, ContentHandler contentHandler, + EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, SAXException, TikaException { + Metadata embeddedMetadata = new Metadata(); + embeddedMetadata.set(Metadata.CONTENT_TYPE, WMF_MEDIA_TYPE.toString()); + if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { + try (InputStream is = TikaInputStream.get(comment.getWmfInputStream())) { + embeddedDocumentExtractor.parseEmbedded(is, + new EmbeddedContentHandler(contentHandler), embeddedMetadata, false); + + } + + } + + } + + private void handleMultiFormats(HemfCommentPublic.MultiFormats comment, ContentHandler handler, + EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, TikaException, SAXException { + for (HemfCommentPublic.HemfMultiFormatsData data : + ((HemfCommentPublic.MultiFormats) comment).getData()) { + handleEmbedded(data.getData(), embeddedDocumentExtractor, handler); + } + } + + private static void handleEmbedded(byte[] data, + EmbeddedDocumentExtractor embeddedDocumentExtractor, + ContentHandler handler) throws TikaException, SAXException { + try (InputStream is = TikaInputStream.get(data)) { + Metadata embeddedMetadata = new Metadata(); + if 
(embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { + embeddedDocumentExtractor.parseEmbedded(is, + new EmbeddedContentHandler(handler), embeddedMetadata, false); + } + } catch (IOException e) { + + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java new file mode 100644 index 0000000..68388a3 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java @@ -0,0 +1,98 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.microsoft; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.util.Collections; +import java.util.Set; + +import org.apache.poi.hwmf.record.HwmfFont; +import org.apache.poi.hwmf.record.HwmfRecord; +import org.apache.poi.hwmf.record.HwmfRecordType; +import org.apache.poi.hwmf.record.HwmfText; +import org.apache.poi.hwmf.usermodel.HwmfPicture; +import org.apache.poi.util.LocaleUtil; +import org.apache.poi.util.RecordFormatException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * This parser offers a very rough capability to extract text if there + * is text stored in the WMF files. + */ +public class WMFParser extends AbstractParser { + + private static final MediaType MEDIA_TYPE = MediaType.image("wmf"); + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.singleton(MEDIA_TYPE); + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + try { + HwmfPicture picture = new HwmfPicture(stream); + //TODO: make x/y info public in POI so that we can use it here + //to determine when to keep two text parts on the same line + for (HwmfRecord record : picture.getRecords()) { + Charset charset = LocaleUtil.CHARSET_1252; + //this is pure hackery for specifying the font + //TODO: do what Graphics does by maintaining the stack, etc.! 
+ //This fix should be done within POI + if (record.getRecordType().equals(HwmfRecordType.createFontIndirect)) { + HwmfFont font = ((HwmfText.WmfCreateFontIndirect) record).getFont(); + charset = (font.getCharSet().getCharset() == null) ? LocaleUtil.CHARSET_1252 : font.getCharSet().getCharset(); + } + if (record.getRecordType().equals(HwmfRecordType.extTextOut)) { + HwmfText.WmfExtTextOut textOut = (HwmfText.WmfExtTextOut) record; + xhtml.startElement("p"); + xhtml.characters(textOut.getText(charset)); + xhtml.endElement("p"); + } else if (record.getRecordType().equals(HwmfRecordType.textOut)) { + HwmfText.WmfTextOut textOut = (HwmfText.WmfTextOut) record; + xhtml.startElement("p"); + xhtml.characters(textOut.getText(charset)); + xhtml.endElement("p"); + } + } + } catch (RecordFormatException e) { //POI's hwmfparser can throw these for "parse exceptions" + throw new TikaException(e.getMessage()); + } catch (RuntimeException e) { //convert Runtime to RecordFormatExceptions + throw new TikaException(e.getMessage()); + } catch (AssertionError e) { //POI's hwmfparser can throw these for parse exceptions + throw new TikaException(e.getMessage()); + } + xhtml.endDocument(); + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser index 602ee2c..4b1c5bf 100644 --- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser +++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser @@ -36,6 +36,8 @@ org.apache.tika.parser.jpeg.JpegParser org.apache.tika.parser.mail.RFC822Parser org.apache.tika.parser.mbox.MboxParser org.apache.tika.parser.mbox.OutlookPSTParser 
+org.apache.tika.parser.microsoft.EMFParser +org.apache.tika.parser.microsoft.WMFParser org.apache.tika.parser.microsoft.JackcessParser org.apache.tika.parser.microsoft.MSOwnerFileParser org.apache.tika.parser.microsoft.OfficeParser @@ -76,4 +78,4 @@ org.apache.tika.parser.geo.topic.GeoParser org.apache.tika.parser.external.CompositeExternalParser org.apache.tika.parser.journal.JournalParser org.apache.tika.parser.image.ICNSParser -org.apache.tika.parser.dbf.DBFParser \ No newline at end of file +org.apache.tika.parser.dbf.DBFParser http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java new file mode 100644 index 0000000..e6d2db3 --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java @@ -0,0 +1,66 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft; + +import static org.junit.Assert.assertEquals; + +import java.util.List; + +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.RecursiveParserWrapper; +import org.junit.Test; + +public class EMFParserTest extends TikaTest { + + @Test + public void testTextExtractionWindows() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_embeddedPDF_windows.xls"); + Metadata emfMetadata = metadataList.get(1); + assertEquals("image/emf", emfMetadata.get(Metadata.CONTENT_TYPE)); + assertContains("<p>testPDF.pdf</p>", emfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + + //this is just the usual embedded pdf + Metadata pdfMetadata = metadataList.get(2); + assertEquals("application/pdf", pdfMetadata.get(Metadata.CONTENT_TYPE)); + assertContains("is a toolkit for detecting", pdfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + + } + + @Test + public void testTextExtractionMac() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_embeddedPDF_mac.xls"); + Metadata emfMetadata = metadataList.get(2); + assertEquals("image/emf", emfMetadata.get(Metadata.CONTENT_TYPE)); + assertContains("is a toolkit for detecting", emfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + //test that a space was inserted before url + assertContains("Tika http://incubator.apache.org/tika/", emfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + } + + @Test + public void testPDFExtraction() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_embeddedPDF_mac.xls"); + //this pdf has to be extracted from within the EMF + //it does not exist as a standalone pdf file inside the _mac.xls file. 
+ Metadata pdfMetadata = metadataList.get(1); + assertEquals("application/pdf", pdfMetadata.get(Metadata.CONTENT_TYPE)); + assertContains("is a toolkit for detecting", pdfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + } + + +} + http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java new file mode 100644 index 0000000..42fb220 --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java @@ -0,0 +1,42 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft; + +import static org.junit.Assert.assertEquals; + +import java.util.List; + +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.RecursiveParserWrapper; +import org.junit.Test; + +public class WMFParserTest extends TikaTest { + + @Test + public void testTextExtractionWindows() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("testXLSX_Thumbnail.xlsx"); + Metadata wmfMetadata = metadataList.get(1); + assertEquals("image/wmf", wmfMetadata.get(Metadata.CONTENT_TYPE)); + assertContains("This file contains an embedded thumbnail", + wmfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + } + + //TODO fix wmf text extraction in "testRTFEmbeddedFiles.rtf" + //Chinese is garbled. +} + http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java index 68388b5..b957b8c 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java @@ -412,29 +412,29 @@ public class RTFParserTest extends TikaTest { public void testEmbeddedMonster() throws Exception { Map<Integer, Pair> expected = new HashMap<>(); - expected.put(2, new Pair("Hw.txt","text/plain; charset=ISO-8859-1")); - expected.put(3, new Pair("file_0.doc", "application/msword")); - expected.put(6, new Pair("file_1.xlsx", + expected.put(3, new Pair("Hw.txt","text/plain; charset=ISO-8859-1")); + expected.put(4, new Pair("file_0.doc", "application/msword")); + expected.put(7, new Pair("file_1.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")); - expected.put(9, new Pair("text.html", 
"text/html; charset=windows-1252")); - expected.put(10, new Pair("html-within-zip.zip", "application/zip")); - expected.put(11, new Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip")); - expected.put(14, new Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html", "text/html; charset=UTF-8")); - expected.put(17, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg")); - expected.put(20, new Pair("file_2.xls", "application/vnd.ms-excel")); - expected.put(23, new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg", "application/vnd.ms-outlook")); - expected.put(26, new Pair("file_3.pdf", "application/pdf")); - expected.put(29, new Pair("file_4.ppt", "application/vnd.ms-powerpoint")); - expected.put(33, new Pair("file_5.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation")); - expected.put(32, new Pair("thumbnail.jpeg", "image/jpeg")); - expected.put(36, new Pair("file_6.doc", "application/msword")); - expected.put(39, new Pair("file_7.doc", "application/msword")); - expected.put(42, new Pair("file_8.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document")); - expected.put(45, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg")); + expected.put(10, new Pair("text.html", "text/html; charset=windows-1252")); + expected.put(11, new Pair("html-within-zip.zip", "application/zip")); + expected.put(12, new Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip")); + expected.put(15, new Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html", "text/html; charset=UTF-8")); + expected.put(18, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg")); + expected.put(21, new Pair("file_2.xls", "application/vnd.ms-excel")); + expected.put(24, new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg", "application/vnd.ms-outlook")); + expected.put(27, new Pair("file_3.pdf", "application/pdf")); + expected.put(30, new Pair("file_4.ppt", "application/vnd.ms-powerpoint")); + expected.put(34, 
new Pair("file_5.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation")); + expected.put(33, new Pair("thumbnail.jpeg", "image/jpeg")); + expected.put(37, new Pair("file_6.doc", "application/msword")); + expected.put(40, new Pair("file_7.doc", "application/msword")); + expected.put(43, new Pair("file_8.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document")); + expected.put(46, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg")); List<Metadata> metadataList = getRecursiveMetadata("testRTFEmbeddedFiles.rtf"); - assertEquals(48, metadataList.size()); + assertEquals(49, metadataList.size()); for (Map.Entry<Integer, Pair> e : expected.entrySet()) { Metadata metadata = metadataList.get(e.getKey()); Pair p = e.getValue(); @@ -448,7 +448,7 @@ public class RTFParserTest extends TikaTest { assertEquals(p.mimeType, metadata.get(Metadata.CONTENT_TYPE)); } assertEquals("C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_普林斯顿.jpg", - metadataList.get(45).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)); + metadataList.get(46).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)); } //TIKA-1010 test regular (not "embedded") images/picts http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_mac.xls ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_mac.xls b/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_mac.xls new file mode 100644 index 0000000..aee4277 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_mac.xls differ http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_mac.xlsx ---------------------------------------------------------------------- diff --git 
a/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_mac.xlsx b/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_mac.xlsx new file mode 100644 index 0000000..32c83c1 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_mac.xlsx differ http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_windows.xls ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_windows.xls b/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_windows.xls new file mode 100644 index 0000000..546deef Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_windows.xls differ http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_windows.xlsx ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_windows.xlsx b/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_windows.xlsx new file mode 100644 index 0000000..903c2f5 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_windows.xlsx differ
