tika git commit: TIKA-2247 and TIKA-2246 -- add parsers for EMF/WMF

tallison Mon, 06 Feb 2017 11:31:46 -0800

Repository: tika
Updated Branches:
  refs/heads/master 27e026eff -> b9befb427



TIKA-2247 and TIKA-2246 -- add parsers for EMF/WMF


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/b9befb42
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/b9befb42
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/b9befb42

Branch: refs/heads/master
Commit: b9befb4272cf8b2bda3b3ea25b0511bbabfdeded
Parents: 27e026e
Author: tballison <[email protected]>
Authored: Mon Feb 6 14:31:09 2017 -0500
Committer: tballison <[email protected]>
Committed: Mon Feb 6 14:31:09 2017 -0500

----------------------------------------------------------------------
 CHANGES.txt                                     |   2 +
 .../apache/tika/parser/microsoft/EMFParser.java | 163 +++++++++++++++++++
 .../apache/tika/parser/microsoft/WMFParser.java |  98 +++++++++++
 .../services/org.apache.tika.parser.Parser      |   4 +-
 .../tika/parser/microsoft/EMFParserTest.java    |  66 ++++++++
 .../tika/parser/microsoft/WMFParserTest.java    |  42 +++++
 .../apache/tika/parser/rtf/RTFParserTest.java   |  40 ++---
 .../testEXCEL_embeddedPDF_mac.xls               | Bin 0 -> 69632 bytes
 .../testEXCEL_embeddedPDF_mac.xlsx              | Bin 0 -> 80578 bytes
 .../testEXCEL_embeddedPDF_windows.xls           | Bin 0 -> 61952 bytes
 .../testEXCEL_embeddedPDF_windows.xlsx          | Bin 0 -> 49843 bytes
 11 files changed, 394 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index bfe817d..b8e2dec 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.15 - ??
 
+  * Add parsers for EMF/WMF files (TIKA-2246/TIKA-2247).
+
   * Official mime types for BMP, EMF and WMF have been registered with
     IANA, so switch to these (image/bmp image/emf image/wmf) (TIKA-2250)
 

http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
new file mode 100644
index 0000000..be4bc14
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
@@ -0,0 +1,163 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.poi.hemf.extractor.HemfExtractor;
+import org.apache.poi.hemf.record.AbstractHemfComment;
+import org.apache.poi.hemf.record.HemfCommentPublic;
+import org.apache.poi.hemf.record.HemfCommentRecord;
+import org.apache.poi.hemf.record.HemfRecord;
+import org.apache.poi.hemf.record.HemfRecordType;
+import org.apache.poi.hemf.record.HemfText;
+import org.apache.poi.util.RecordFormatException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Extracts files embedded in EMF and offers a
+ * very rough capability to extract text if there
+ * is text stored in the EMF.
+ * <p/>
+ * To improve text extraction, we'd have to implement
+ * quite a bit more at the POI level.  We'd want to track changes
+ * in font and use that information for identifying character sets,
+ * inserting spaces and new lines.
+ */
+public class EMFParser extends AbstractParser {
+
+    private static final MediaType MEDIA_TYPE = MediaType.image("emf");
+    private static final MediaType WMF_MEDIA_TYPE = MediaType.image("wmf");
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MEDIA_TYPE);
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        EmbeddedDocumentExtractor embeddedDocumentExtractor = null;
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        try {
+            HemfExtractor ex = new HemfExtractor(stream);
+            long lastY = -1;
+            long lastX = -1;
+            long fudgeFactorX = 1000;//derive this from the font or 
frame/bounds information
+            StringBuilder buffer = new StringBuilder();
+            for (HemfRecord record : ex) {
+                if (record.getRecordType() == HemfRecordType.comment) {
+                    AbstractHemfComment comment = ((HemfCommentRecord) 
record).getComment();
+                    if (comment instanceof HemfCommentPublic.MultiFormats) {
+                        if (embeddedDocumentExtractor == null) {
+                            embeddedDocumentExtractor = 
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+                        }
+                        
handleMultiFormats((HemfCommentPublic.MultiFormats)comment, xhtml, 
embeddedDocumentExtractor);
+                    } else if (comment instanceof  
HemfCommentPublic.WindowsMetafile) {
+                        if (embeddedDocumentExtractor == null) {
+                            embeddedDocumentExtractor = 
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+                        }
+                        handleWMF((HemfCommentPublic.WindowsMetafile)comment, 
xhtml, embeddedDocumentExtractor);
+                    }
+                } else if 
(record.getRecordType().equals(HemfRecordType.exttextoutw)) {
+                    HemfText.ExtTextOutW extTextOutW = (HemfText.ExtTextOutW) 
record;
+                    if (lastY > -1 && lastY != extTextOutW.getY()) {
+                        xhtml.startElement("p");
+                        xhtml.characters(buffer.toString());
+                        xhtml.endElement("p");
+                        buffer.setLength(0);
+                        lastX = -1;
+                    }
+                    if (lastX > -1 && extTextOutW.getX() - lastX > 
fudgeFactorX) {
+                        buffer.append(" ");
+                    }
+                    String txt = extTextOutW.getText();
+                    buffer.append(txt);
+                    lastY = extTextOutW.getY();
+                    lastX = extTextOutW.getX();
+                }
+            }
+            if (buffer.length() > 0) {
+                xhtml.startElement("p");
+                xhtml.characters(buffer.toString());
+                xhtml.endElement("p");
+            }
+        } catch (RecordFormatException e) { //POI's hemfparser can throw these 
for "parse exceptions"
+            throw new TikaException(e.getMessage());
+        } catch (RuntimeException e) { //convert Runtime to 
RecordFormatExceptions
+            throw new TikaException(e.getMessage());
+        }
+        xhtml.endDocument();
+    }
+
+    private void handleWMF(HemfCommentPublic.WindowsMetafile comment, 
ContentHandler contentHandler,
+                           EmbeddedDocumentExtractor 
embeddedDocumentExtractor) throws IOException, SAXException, TikaException {
+        Metadata embeddedMetadata = new Metadata();
+        embeddedMetadata.set(Metadata.CONTENT_TYPE, WMF_MEDIA_TYPE.toString());
+        if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
+            try (InputStream is = 
TikaInputStream.get(comment.getWmfInputStream())) {
+                embeddedDocumentExtractor.parseEmbedded(is,
+                        new EmbeddedContentHandler(contentHandler), 
embeddedMetadata, false);
+
+            }
+
+        }
+
+    }
+
+    private void handleMultiFormats(HemfCommentPublic.MultiFormats comment, 
ContentHandler handler,
+                                    EmbeddedDocumentExtractor 
embeddedDocumentExtractor) throws IOException, TikaException, SAXException {
+        for (HemfCommentPublic.HemfMultiFormatsData data :
+                ((HemfCommentPublic.MultiFormats) comment).getData()) {
+            handleEmbedded(data.getData(), embeddedDocumentExtractor, handler);
+        }
+    }
+
+    private static void handleEmbedded(byte[] data,
+                                       EmbeddedDocumentExtractor 
embeddedDocumentExtractor,
+                                       ContentHandler handler) throws 
TikaException, SAXException {
+        try (InputStream is = TikaInputStream.get(data)) {
+            Metadata embeddedMetadata = new Metadata();
+            if 
(embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
+                embeddedDocumentExtractor.parseEmbedded(is,
+                        new EmbeddedContentHandler(handler), embeddedMetadata, 
false);
+            }
+        } catch (IOException e) {
+
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
new file mode 100644
index 0000000..68388a3
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.poi.hwmf.record.HwmfFont;
+import org.apache.poi.hwmf.record.HwmfRecord;
+import org.apache.poi.hwmf.record.HwmfRecordType;
+import org.apache.poi.hwmf.record.HwmfText;
+import org.apache.poi.hwmf.usermodel.HwmfPicture;
+import org.apache.poi.util.LocaleUtil;
+import org.apache.poi.util.RecordFormatException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * This parser offers a very rough capability to extract text if there
+ * is text stored in the WMF files.
+ */
+public class WMFParser extends AbstractParser {
+
+    private static final MediaType MEDIA_TYPE = MediaType.image("wmf");
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MEDIA_TYPE);
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        try {
+            HwmfPicture picture = new HwmfPicture(stream);
+            //TODO: make x/y info public in POI so that we can use it here
+            //to determine when to keep two text parts on the same line
+            for (HwmfRecord record : picture.getRecords()) {
+                Charset charset = LocaleUtil.CHARSET_1252;
+                //this is pure hackery for specifying the font
+                //TODO: do what Graphics does by maintaining the stack, etc.!
+                //This fix should be done within POI
+                if 
(record.getRecordType().equals(HwmfRecordType.createFontIndirect)) {
+                    HwmfFont font = ((HwmfText.WmfCreateFontIndirect) 
record).getFont();
+                    charset = (font.getCharSet().getCharset() == null) ? 
LocaleUtil.CHARSET_1252 : font.getCharSet().getCharset();
+                }
+                if (record.getRecordType().equals(HwmfRecordType.extTextOut)) {
+                    HwmfText.WmfExtTextOut textOut = (HwmfText.WmfExtTextOut) 
record;
+                    xhtml.startElement("p");
+                    xhtml.characters(textOut.getText(charset));
+                    xhtml.endElement("p");
+                } else if 
(record.getRecordType().equals(HwmfRecordType.textOut)) {
+                    HwmfText.WmfTextOut textOut = (HwmfText.WmfTextOut) record;
+                    xhtml.startElement("p");
+                    xhtml.characters(textOut.getText(charset));
+                    xhtml.endElement("p");
+                }
+            }
+        } catch (RecordFormatException e) { //POI's hwmfparser can throw these 
for "parse exceptions"
+            throw new TikaException(e.getMessage());
+        } catch (RuntimeException e) { //convert Runtime to 
RecordFormatExceptions
+            throw new TikaException(e.getMessage());
+        } catch (AssertionError e) { //POI's hwmfparser can throw these for 
parse exceptions
+            throw new TikaException(e.getMessage());
+        }
+        xhtml.endDocument();
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 
b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 602ee2c..4b1c5bf 100644
--- 
a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ 
b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -36,6 +36,8 @@ org.apache.tika.parser.jpeg.JpegParser
 org.apache.tika.parser.mail.RFC822Parser
 org.apache.tika.parser.mbox.MboxParser
 org.apache.tika.parser.mbox.OutlookPSTParser
+org.apache.tika.parser.microsoft.EMFParser
+org.apache.tika.parser.microsoft.WMFParser
 org.apache.tika.parser.microsoft.JackcessParser
 org.apache.tika.parser.microsoft.MSOwnerFileParser
 org.apache.tika.parser.microsoft.OfficeParser
@@ -76,4 +78,4 @@ org.apache.tika.parser.geo.topic.GeoParser
 org.apache.tika.parser.external.CompositeExternalParser
 org.apache.tika.parser.journal.JournalParser
 org.apache.tika.parser.image.ICNSParser
-org.apache.tika.parser.dbf.DBFParser
\ No newline at end of file
+org.apache.tika.parser.dbf.DBFParser

http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java
new file mode 100644
index 0000000..e6d2db3
--- /dev/null
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.List;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.junit.Test;
+
+public class EMFParserTest extends TikaTest {
+
+    @Test
+    public void testTextExtractionWindows() throws Exception {
+        List<Metadata> metadataList = 
getRecursiveMetadata("testEXCEL_embeddedPDF_windows.xls");
+        Metadata emfMetadata = metadataList.get(1);
+        assertEquals("image/emf", emfMetadata.get(Metadata.CONTENT_TYPE));
+        assertContains("<p>testPDF.pdf</p>", 
emfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+
+        //this is just the usual embedded pdf
+        Metadata pdfMetadata = metadataList.get(2);
+        assertEquals("application/pdf", 
pdfMetadata.get(Metadata.CONTENT_TYPE));
+        assertContains("is a toolkit for detecting", 
pdfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+
+    }
+
+    @Test
+    public void testTextExtractionMac() throws Exception {
+        List<Metadata> metadataList = 
getRecursiveMetadata("testEXCEL_embeddedPDF_mac.xls");
+        Metadata emfMetadata = metadataList.get(2);
+        assertEquals("image/emf", emfMetadata.get(Metadata.CONTENT_TYPE));
+        assertContains("is a toolkit for detecting", 
emfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        //test that a space was inserted before url
+        assertContains("Tika http://incubator.apache.org/tika/", 
emfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+    }
+
+    @Test
+    public void testPDFExtraction() throws Exception {
+        List<Metadata> metadataList = 
getRecursiveMetadata("testEXCEL_embeddedPDF_mac.xls");
+        //this pdf has to be extracted from within the EMF
+        //it does not exist as a standalone pdf file inside the _mac.xls file.
+        Metadata pdfMetadata = metadataList.get(1);
+        assertEquals("application/pdf", 
pdfMetadata.get(Metadata.CONTENT_TYPE));
+        assertContains("is a toolkit for detecting", 
pdfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+    }
+
+
+}
+

http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java
new file mode 100644
index 0000000..42fb220
--- /dev/null
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.List;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.junit.Test;
+
+public class WMFParserTest extends TikaTest {
+
+    @Test
+    public void testTextExtractionWindows() throws Exception {
+        List<Metadata> metadataList = 
getRecursiveMetadata("testXLSX_Thumbnail.xlsx");
+        Metadata wmfMetadata = metadataList.get(1);
+        assertEquals("image/wmf", wmfMetadata.get(Metadata.CONTENT_TYPE));
+        assertContains("This file contains an embedded thumbnail",
+                wmfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+    }
+
+    //TODO fix wmf text extraction in "testRTFEmbeddedFiles.rtf"
+    //Chinese is garbled.
+}
+

http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java 
b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
index 68388b5..b957b8c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
@@ -412,29 +412,29 @@ public class RTFParserTest extends TikaTest {
     public void testEmbeddedMonster() throws Exception {
 
         Map<Integer, Pair> expected = new HashMap<>();
-        expected.put(2, new Pair("Hw.txt","text/plain; charset=ISO-8859-1"));
-        expected.put(3, new Pair("file_0.doc", "application/msword"));
-        expected.put(6, new Pair("file_1.xlsx",
+        expected.put(3, new Pair("Hw.txt","text/plain; charset=ISO-8859-1"));
+        expected.put(4, new Pair("file_0.doc", "application/msword"));
+        expected.put(7, new Pair("file_1.xlsx",
                 
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
-        expected.put(9, new Pair("text.html", "text/html; 
charset=windows-1252"));
-        expected.put(10, new Pair("html-within-zip.zip", "application/zip"));
-        expected.put(11, new 
Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip"));
-        expected.put(14, new 
Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html", "text/html; 
charset=UTF-8"));
-        expected.put(17, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", 
"image/jpeg"));
-        expected.put(20, new Pair("file_2.xls", "application/vnd.ms-excel"));
-        expected.put(23, new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg", 
"application/vnd.ms-outlook"));
-        expected.put(26, new Pair("file_3.pdf", "application/pdf"));
-        expected.put(29, new Pair("file_4.ppt", 
"application/vnd.ms-powerpoint"));
-        expected.put(33, new Pair("file_5.pptx", 
"application/vnd.openxmlformats-officedocument.presentationml.presentation"));
-        expected.put(32, new Pair("thumbnail.jpeg", "image/jpeg"));
-        expected.put(36, new Pair("file_6.doc", "application/msword"));
-        expected.put(39, new Pair("file_7.doc", "application/msword"));
-        expected.put(42, new Pair("file_8.docx", 
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
-        expected.put(45, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", 
"image/jpeg"));
+        expected.put(10, new Pair("text.html", "text/html; 
charset=windows-1252"));
+        expected.put(11, new Pair("html-within-zip.zip", "application/zip"));
+        expected.put(12, new 
Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip"));
+        expected.put(15, new 
Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html", "text/html; 
charset=UTF-8"));
+        expected.put(18, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", 
"image/jpeg"));
+        expected.put(21, new Pair("file_2.xls", "application/vnd.ms-excel"));
+        expected.put(24, new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg", 
"application/vnd.ms-outlook"));
+        expected.put(27, new Pair("file_3.pdf", "application/pdf"));
+        expected.put(30, new Pair("file_4.ppt", 
"application/vnd.ms-powerpoint"));
+        expected.put(34, new Pair("file_5.pptx", 
"application/vnd.openxmlformats-officedocument.presentationml.presentation"));
+        expected.put(33, new Pair("thumbnail.jpeg", "image/jpeg"));
+        expected.put(37, new Pair("file_6.doc", "application/msword"));
+        expected.put(40, new Pair("file_7.doc", "application/msword"));
+        expected.put(43, new Pair("file_8.docx", 
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
+        expected.put(46, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", 
"image/jpeg"));
 
 
         List<Metadata> metadataList = 
getRecursiveMetadata("testRTFEmbeddedFiles.rtf");
-        assertEquals(48, metadataList.size());
+        assertEquals(49, metadataList.size());
         for (Map.Entry<Integer, Pair> e : expected.entrySet()) {
             Metadata metadata = metadataList.get(e.getKey());
             Pair p = e.getValue();
@@ -448,7 +448,7 @@ public class RTFParserTest extends TikaTest {
             assertEquals(p.mimeType, metadata.get(Metadata.CONTENT_TYPE));
         }
         
assertEquals("C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_普林斯顿.jpg",
-                
metadataList.get(45).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+                
metadataList.get(46).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
     }
     
     //TIKA-1010 test regular (not "embedded") images/picts

http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_mac.xls
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_mac.xls 
b/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_mac.xls
new file mode 100644
index 0000000..aee4277
Binary files /dev/null and 
b/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_mac.xls 
differ

http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_mac.xlsx
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_mac.xlsx 
b/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_mac.xlsx
new file mode 100644
index 0000000..32c83c1
Binary files /dev/null and 
b/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_mac.xlsx 
differ

http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_windows.xls
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_windows.xls
 
b/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_windows.xls
new file mode 100644
index 0000000..546deef
Binary files /dev/null and 
b/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_windows.xls
 differ

http://git-wip-us.apache.org/repos/asf/tika/blob/b9befb42/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_windows.xlsx
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_windows.xlsx
 
b/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_windows.xlsx
new file mode 100644
index 0000000..903c2f5
Binary files /dev/null and 
b/tika-parsers/src/test/resources/test-documents/testEXCEL_embeddedPDF_windows.xlsx
 differ

tika git commit: TIKA-2247 and TIKA-2246 -- add parsers for EMF/WMF

Reply via email to