Author: nick Date: Fri Jan 20 15:56:05 2012 New Revision: 1233973 URL: http://svn.apache.org/viewvc?rev=1233973&view=rev Log: TIKA-507 FontBox powered .afm font metrics parser, patch from Fernando Arreola
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/font/ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/font/AdobeFontMetricParserTest.java Modified: tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser tika/trunk/tika-parsers/src/test/resources/test-documents/testAFM.afm Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java?rev=1233973&view=auto ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java (added) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java Fri Jan 20 15:56:05 2012 @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.tika.parser.font;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.apache.fontbox.afm.AFMParser;
+import org.apache.fontbox.afm.FontMetric;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for Adobe Font Metric (AFM) files.
+ * <p>
+ * The file is read with FontBox's {@link AFMParser}. Font level values
+ * (names, version, weight, notice, ...) are stored as metadata, and the
+ * comments found in the file are written out as XHTML paragraphs.
+ */
+public class AdobeFontMetricParser extends AbstractParser {
+    /** Serial version UID */
+    private static final long serialVersionUID = -4820306522217196835L;
+
+    private static final MediaType AFM_TYPE =
+        MediaType.application( "x-font-adobe-metric" );
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(AFM_TYPE);
+
+    /** Recognises a comment that mentions the creation date; compiled once, not per comment. */
+    private static final Pattern CREATION_DATE_COMMENT =
+        Pattern.compile( ".*Creation\\sDate.*" );
+
+    /**
+     * Returns the media types handled by this parser.
+     *
+     * @param context parse context (not used)
+     * @return a singleton set holding the AFM media type
+     */
+    public Set<MediaType> getSupportedTypes( ParseContext context ) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses an AFM stream, filling in the metadata and emitting the file's
+     * comments as XHTML.
+     *
+     * @param stream   the AFM data; it is not closed here
+     * @param handler  receives the XHTML SAX events
+     * @param metadata receives the content type, title, creation date and font values
+     * @param context  parse context (not used)
+     * @throws IOException   if the stream can not be read
+     * @throws SAXException  if the handler rejects an event
+     * @throws TikaException declared by the parser contract
+     */
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context)
+                      throws IOException, SAXException, TikaException {
+        // Have FontBox process the file
+        AFMParser parser = new AFMParser( stream );
+        parser.parse();
+        FontMetric fontMetrics = parser.getResult();
+
+        // Work on a private copy of the comments: the creation date comment is
+        // removed from it below, and that must not alter the list owned by the
+        // FontBox result (which may not support removal at all).
+        List<String> comments = new ArrayList<String>( fontMetrics.getComments() );
+
+        // Get the creation date
+        extractCreationDate( metadata, comments );
+
+        metadata.set( Metadata.CONTENT_TYPE, AFM_TYPE.toString() );
+        metadata.set( Metadata.TITLE, fontMetrics.getFullName() );
+
+        // Add metadata associated with the font type
+        addMetadataByString( metadata, "AvgCharacterWidth", Float.toString( fontMetrics.getAverageCharacterWidth() ) );
+        addMetadataByString( metadata, "DocVersion", Float.toString( fontMetrics.getAFMVersion() ) );
+        addMetadataByString( metadata, "FontName", fontMetrics.getFontName() );
+        addMetadataByString( metadata, "FontFullName", fontMetrics.getFullName() );
+        addMetadataByString( metadata, "FontFamilyName", fontMetrics.getFamilyName() );
+        addMetadataByString( metadata, "FontVersion", fontMetrics.getFontVersion() );
+        addMetadataByString( metadata, "FontWeight", fontMetrics.getWeight() );
+        addMetadataByString( metadata, "FontNotice", fontMetrics.getNotice() );
+        addMetadataByString( metadata, "FontUnderlineThickness", Float.toString( fontMetrics.getUnderlineThickness() ) );
+
+        // Output the remaining comments as text
+        XHTMLContentHandler xhtml = new XHTMLContentHandler( handler, metadata );
+        xhtml.startDocument();
+
+        // Display the comments
+        if (!comments.isEmpty()) {
+            xhtml.element( "h1", "Comments" );
+            xhtml.startElement("div", "class", "comments");
+            for (String comment : comments) {
+                xhtml.element( "p", comment );
+            }
+            xhtml.endElement("div");
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Adds a value under a plain string key; a null value is ignored.
+     *
+     * @param metadata the metadata to add to
+     * @param name     the metadata key
+     * @param value    the value, may be null
+     */
+    private void addMetadataByString( Metadata metadata, String name, String value ) {
+        // Add metadata if an appropriate value is passed
+        if (value != null) {
+            metadata.add( name, value );
+        }
+    }
+
+    /**
+     * Sets a value for a metadata property; a null value is ignored.
+     *
+     * @param metadata the metadata to update
+     * @param property the property to set
+     * @param value    the value, may be null
+     */
+    private void addMetadataByProperty( Metadata metadata, Property property, String value ) {
+        // Add metadata if an appropriate value is passed
+        if (value != null) {
+            metadata.set( property, value );
+        }
+    }
+
+    /**
+     * Looks for the first comment that carries a creation date, stores the
+     * text after its colon as {@link Metadata#CREATION_DATE} and removes that
+     * comment from the list so it is not output a second time as text.
+     * <p>
+     * A matching comment with no colon, or with nothing after the colon, has
+     * no usable date: it is left in the list and the search continues.
+     *
+     * @param metadata the metadata that receives the creation date
+     * @param comments the comments of the file; must support removal
+     */
+    private void extractCreationDate( Metadata metadata, List<String> comments ) {
+        // An explicit iterator allows the matching entry to be removed safely
+        for (Iterator<String> it = comments.iterator(); it.hasNext();) {
+            String comment = it.next();
+            if (!CREATION_DATE_COMMENT.matcher( comment ).matches()) {
+                continue;
+            }
+
+            int colon = comment.indexOf( ':' );
+            if (colon < 0) {
+                continue;
+            }
+
+            // trim() copes with any amount of white space after the colon,
+            // where a fixed offset would run past the end of a short comment
+            String date = comment.substring( colon + 1 ).trim();
+            if (date.length() == 0) {
+                continue;
+            }
+
+            addMetadataByProperty( metadata, Metadata.CREATION_DATE, date );
+            it.remove();
+            return;
+        }
+    }
+}

Modified:
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1233973&r1=1233972&r2=1233973&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original) +++ tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Fri Jan 20 15:56:05 2012 @@ -19,6 +19,7 @@ org.apache.tika.parser.audio.MidiParser org.apache.tika.parser.dwg.DWGParser org.apache.tika.parser.epub.EpubParser org.apache.tika.parser.feed.FeedParser +org.apache.tika.parser.font.AdobeFontMetricParser org.apache.tika.parser.font.TrueTypeParser org.apache.tika.parser.html.HtmlParser org.apache.tika.parser.image.ImageParser Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/font/AdobeFontMetricParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/font/AdobeFontMetricParserTest.java?rev=1233973&view=auto ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/font/AdobeFontMetricParserTest.java (added) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/font/AdobeFontMetricParserTest.java Fri Jan 20 15:56:05 2012 @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.font;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+import org.apache.tika.io.TikaInputStream;
+
+/**
+ * Checks that the sample AFM document is detected and parsed, and that the
+ * expected metadata and comment text come out of it.
+ */
+public class AdobeFontMetricParserTest extends TestCase {
+
+    public void testAdobeFontMetricParsing() throws Exception {
+        // Use the auto-detecting parser, so that type detection is exercised too
+        Parser autoDetect = new AutoDetectParser();
+        ContentHandler body = new BodyContentHandler();
+        Metadata extracted = new Metadata();
+
+        TikaInputStream afm = TikaInputStream.get(
+                AdobeFontMetricParserTest.class.getResource(
+                        "/test-documents/testAFM.afm"));
+        try {
+            autoDetect.parse(afm, body, extracted, new ParseContext());
+        } finally {
+            afm.close();
+        }
+
+        // Document level metadata
+        assertEquals("application/x-font-adobe-metric", extracted.get(Metadata.CONTENT_TYPE));
+        assertEquals("TestFullName", extracted.get(Metadata.TITLE));
+        assertEquals("Fri Jul 15 17:50:51 2011", extracted.get(Metadata.CREATION_DATE));
+
+        // Font naming
+        assertEquals("TestFontName", extracted.get("FontName"));
+        assertEquals("TestFullName", extracted.get("FontFullName"));
+        assertEquals("TestSymbol", extracted.get("FontFamilyName"));
+
+        // Font weight and version
+        assertEquals("Medium", extracted.get("FontWeight"));
+        assertEquals("001.008", extracted.get("FontVersion"));
+
+        // The comments of the file end up in the text content
+        String text = body.toString();
+        assertTrue(text.contains("Comments"));
+        assertTrue(text.contains("This is a comment in a sample file"));
+        assertTrue(text.contains("UniqueID 12345"));
+    }
+}

Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testAFM.afm URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testAFM.afm?rev=1233973&r1=1233972&r2=1233973&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/resources/test-documents/testAFM.afm (original) +++ tika/trunk/tika-parsers/src/test/resources/test-documents/testAFM.afm Fri Jan 20 15:56:05 2012 @@ -38,7 +38,7 @@ StdHW 91 StdVW 86 -StartCharMetrics 190 +StartCharMetrics 2 C 32 ; WX 250 ; N space ; B 0 0 0 0 ;