Repository: tika Updated Branches: refs/heads/2.x ffaa4deaa -> 60d4e3ff2
TIKA-2008 -- add mime definition and parser for MSOwnerFile Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/60d4e3ff Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/60d4e3ff Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/60d4e3ff Branch: refs/heads/2.x Commit: 60d4e3ff2aca931fd8e36d0f8ca8c2944e788aa4 Parents: ffaa4de Author: tballison <talli...@mitre.org> Authored: Wed Jun 15 09:22:18 2016 -0400 Committer: tballison <talli...@mitre.org> Committed: Wed Jun 15 09:22:18 2016 -0400 ---------------------------------------------------------------------- CHANGES.txt | 2 + .../org/apache/tika/mime/TestMimeTypes.java | 5 ++ .../org/apache/tika/mime/tika-mimetypes.xml | 7 ++ .../org/apache/tika/module/office/BundleIT.java | 2 +- .../parser/microsoft/MSOwnerFileParser.java | 81 +++++++++++++++++++ .../services/org.apache.tika.parser.Parser | 1 + .../parser/microsoft/MSOwnerFileParserTest.java | 31 +++++++ .../resources/test-documents/testMSOwnerFile | Bin 0 -> 162 bytes 8 files changed, 128 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index 1d8f2cc..81243fe 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -10,6 +10,8 @@ Release 2.0 - Future Development Release 1.14 - ??? + * Add mime definition and parser for MS Owner File (TIKA-2008). + * Add mime definition for Windows Media Metafile (TIKA-2004). * Add mime definitions of iCal and vCalendar (TIKA-2006). 
http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java index eed11e8..d27c714 100644 --- a/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ b/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -981,6 +981,11 @@ public class TestMimeTypes extends TikaTest { assertType("application/x-ms-asx", "testWindowsMediaMeta.asx"); } + @Test + public void testMSOwner() throws Exception { + assertType("application/x-ms-owner", "testMSOwnerFile"); + } + private void assertText(byte[] prefix) throws IOException { assertMagic("text/plain", prefix); } http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml ---------------------------------------------------------------------- diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index c513361..9ec8d76 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -3403,6 +3403,13 @@ <mime-type type="application/x-ms-application"> <glob pattern="*.application"/> </mime-type> + <mime-type type="application/x-ms-owner"> + <_comment>Temporary files created by MSOffice applications</_comment> + <_comment>PRONOM fmt-473</_comment> + <magic priority="80"> + <match value="(?s)^([\\x05-\\x0F]).{53}\\1\x00" type="regex" offset="0"/> + </magic> + </mime-type> <mime-type type="application/x-ms-wmd"> <glob pattern="*.wmd"/> </mime-type> 
http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-parser-bundles/tika-parser-office-bundle/src/test/java/org/apache/tika/module/office/BundleIT.java ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-office-bundle/src/test/java/org/apache/tika/module/office/BundleIT.java b/tika-parser-bundles/tika-parser-office-bundle/src/test/java/org/apache/tika/module/office/BundleIT.java index 3f564fe..6336ddf 100644 --- a/tika-parser-bundles/tika-parser-office-bundle/src/test/java/org/apache/tika/module/office/BundleIT.java +++ b/tika-parser-bundles/tika-parser-office-bundle/src/test/java/org/apache/tika/module/office/BundleIT.java @@ -80,6 +80,6 @@ public class BundleIT { @Test public void testServicesCreated() throws Exception { ServiceReference[] services = bc.getAllServiceReferences(Parser.class.getName(), null); - assertEquals("Not all Services have started", 24, services.length); + assertEquals("Not all Services have started", 25, services.length); } } http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java new file mode 100644 index 0000000..02c07a6 --- /dev/null +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import org.apache.commons.io.IOUtils; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.Set; + +/** + * Parser for temporary MSOffice files. + * This currently only extracts the owner's name. 
+ */ +public class MSOwnerFileParser extends AbstractParser { + + private static final int ASCII_CHUNK_LENGTH = 54; + private static final MediaType MEDIA_TYPE = MediaType.application("x-ms-owner"); + /** + * Serial version UID + */ + private static final long serialVersionUID = -752276948656079347L; + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.singleton(MEDIA_TYPE); + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + /** + * Extracts owner from MS temp file + */ + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + byte[] asciiNameBytes = new byte[ASCII_CHUNK_LENGTH]; + IOUtils.readFully(stream, asciiNameBytes); + int asciiNameLength = (int)asciiNameBytes[0];//don't need to convert to unsigned int because it can't be that long + String asciiName = new String(asciiNameBytes, 1, asciiNameLength, StandardCharsets.US_ASCII); + metadata.set(TikaCoreProperties.CREATOR, asciiName); + + int unicodeCharLength = stream.read(); + if (unicodeCharLength > 0) { + stream.read();//zero after the char length + byte[] unicodeBytes = new byte[unicodeCharLength * 2]; + IOUtils.readFully(stream, unicodeBytes); + String unicodeName = new String(unicodeBytes, StandardCharsets.UTF_16LE); + metadata.set(TikaCoreProperties.CREATOR, unicodeName); + } + xhtml.endDocument(); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser 
b/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser index 4d3290e..1c8cee1 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser +++ b/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser @@ -19,6 +19,7 @@ org.apache.tika.parser.microsoft.JackcessParser org.apache.tika.parser.microsoft.OfficeParser org.apache.tika.parser.microsoft.OldExcelParser org.apache.tika.parser.microsoft.TNEFParser +org.apache.tika.parser.microsoft.MSOwnerFileParser org.apache.tika.parser.microsoft.ooxml.OOXMLParser org.apache.tika.parser.microsoft.xml.WordMLParser org.apache.tika.parser.microsoft.xml.SpreadsheetMLParser http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java new file mode 100644 index 0000000..3cef3df --- /dev/null +++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class MSOwnerFileParserTest extends TikaTest { + @Test + public void testBasic() throws Exception { + XMLResult r = getXML("testMSOwnerFile"); + assertEquals("heidi", r.metadata.get(TikaCoreProperties.CREATOR)); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-test-resources/src/test/resources/test-documents/testMSOwnerFile ---------------------------------------------------------------------- diff --git a/tika-test-resources/src/test/resources/test-documents/testMSOwnerFile b/tika-test-resources/src/test/resources/test-documents/testMSOwnerFile new file mode 100644 index 0000000..72a5f57 Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testMSOwnerFile differ