Author: nick Date: Mon Jun 28 13:59:08 2010 New Revision: 958581 URL: http://svn.apache.org/viewvc?rev=958581&view=rev Log: Use the new TIFF Metadata entries for image width/length/sampling from the TIFF, JPEG and general Image (ImageIO) parsers. This gives a small number of consistent image-related metadata entries across all formats. (TIKA-442)
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java?rev=958581&r1=958580&r2=958581&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java Mon Jun 28 13:59:08 2010 @@ -25,7 +25,7 @@ import java.util.Properties; * A multi-valued metadata container. */ public class Metadata implements CreativeCommons, DublinCore, HttpHeaders, - Message, MSOffice, ClimateForcast, TikaMetadataKeys, TikaMimeKeys { + Message, MSOffice, ClimateForcast, TIFF, TikaMetadataKeys, TikaMimeKeys { /** * A map of all metadata attributes. 
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java?rev=958581&r1=958580&r2=958581&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java Mon Jun 28 13:59:08 2010 @@ -32,6 +32,7 @@ import javax.imageio.metadata.IIOMetadat import org.apache.tika.exception.TikaException; import org.apache.tika.io.CloseShieldInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; @@ -70,6 +71,9 @@ public class ImageParser implements Pars ImageReader reader = iterator.next(); reader.setInput(ImageIO.createImageInputStream( new CloseShieldInputStream(stream))); + + metadata.set(Metadata.IMAGE_WIDTH, Integer.toString(reader.getWidth(0))); + metadata.set(Metadata.IMAGE_LENGTH, Integer.toString(reader.getHeight(0))); metadata.set("height", Integer.toString(reader.getHeight(0))); metadata.set("width", Integer.toString(reader.getWidth(0))); @@ -77,6 +81,12 @@ public class ImageParser implements Pars reader.dispose(); } + + // Translate certain Metadata tags from the ImageIO + // specific namespace into the general Tika one + setIfPresent(metadata, "CommentExtensions CommentExtension", Metadata.COMMENTS); + setIfPresent(metadata, "markerSequence com", Metadata.COMMENTS); + setIfPresent(metadata, "Data BitsPerSample", Metadata.BITS_PER_SAMPLE); } catch (IIOException e) { throw new TikaException(type + " parse error", e); } @@ -95,6 +105,21 @@ public class ImageParser implements Pars throws IOException, SAXException, TikaException { parse(stream, handler, 
metadata, new ParseContext()); } + + private static void setIfPresent(Metadata metadata, String imageIOkey, String tikaKey) { + if(metadata.get(imageIOkey) != null) { + metadata.set(tikaKey, metadata.get(imageIOkey)); + } + } + private static void setIfPresent(Metadata metadata, String imageIOkey, Property tikaProp) { + if(metadata.get(imageIOkey) != null) { + String v = metadata.get(imageIOkey); + if(v.endsWith(" ")) { + v = v.substring(0, v.lastIndexOf(' ')); + } + metadata.set(tikaProp, v); + } + } private static void loadMetadata(IIOMetadata imageMetadata, Metadata metadata) { String[] names = imageMetadata.getMetadataFormatNames(); Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java?rev=958581&r1=958580&r2=958581&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java Mon Jun 28 13:59:08 2010 @@ -19,9 +19,12 @@ package org.apache.tika.parser.image; import java.io.IOException; import java.io.InputStream; import java.util.Iterator; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; import org.xml.sax.SAXException; import com.drew.imaging.tiff.TiffMetadataReader; @@ -30,15 +33,15 @@ import com.drew.metadata.Directory; import com.drew.metadata.MetadataException; import com.drew.metadata.Tag; -class TiffExtractor { +public class TiffExtractor { private final Metadata metadata; - public TiffExtractor(Metadata metadata) { + protected TiffExtractor(Metadata metadata) { this.metadata = metadata; } - public void parse(InputStream stream) + 
protected void parse(InputStream stream) throws IOException, SAXException, TikaException { try { com.drew.metadata.Metadata tiffMetadata = @@ -52,6 +55,7 @@ class TiffExtractor { while (tags.hasNext()) { Tag tag = (Tag)tags.next(); metadata.set(tag.getTagName(), tag.getDescription()); + handleCommonImageTags(metadata, tag); } } } catch (TiffProcessingException e) { @@ -61,4 +65,55 @@ class TiffExtractor { } } + + /** + * Maps common TIFF and EXIF tags onto the Tika + * TIFF image metadata namespace. + */ + public static void handleCommonImageTags(Metadata metadata, Tag tag) throws MetadataException { + // Core tags + if(tag.getTagName().equals("Date/Time") || + tag.getTagType() == 306) { + // Ensure it's in the right format + String date = tag.getDescription(); + int splitAt = date.indexOf(' '); + if(splitAt > -1) { + date = date.substring(0, splitAt).replace(':', '/') + + date.substring(splitAt); + } + metadata.set(Metadata.DATE, date); + return; + } + if(tag.getTagName().equals("Keywords") || + tag.getTagType() == 537) { + metadata.set(Metadata.KEYWORDS, tag.getDescription()); + } + + // EXIF / TIFF Tags + Property key = null; + if(tag.getTagName().equals("Image Width") || + tag.getTagType() == 256) { + key = Metadata.IMAGE_WIDTH; + } + if(tag.getTagName().equals("Image Height") || + tag.getTagType() == 257) { + key = Metadata.IMAGE_LENGTH; + } + if(tag.getTagName().equals("Data Precision") || + tag.getTagName().equals("Bits Per Sample") || + tag.getTagType() == 258) { + key = Metadata.BITS_PER_SAMPLE; + } + if(tag.getTagType() == 277) { + key = Metadata.SAMPLES_PER_PIXEL; + } + + if(key != null) { + Matcher m = LEADING_NUMBERS.matcher(tag.getDescription()); + if(m.matches()) { + metadata.set(key, m.group(1)); + } + } + } + private static final Pattern LEADING_NUMBERS = Pattern.compile("(\\d+)\\s*.*"); } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java?rev=958581&r1=958580&r2=958581&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java Mon Jun 28 13:59:08 2010 @@ -22,6 +22,7 @@ import java.util.Iterator; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.image.TiffExtractor; import org.xml.sax.SAXException; import com.drew.imaging.jpeg.JpegMetadataReader; @@ -52,6 +53,7 @@ class JpegExtractor { while (tags.hasNext()) { Tag tag = (Tag)tags.next(); metadata.set(tag.getTagName(), tag.getDescription()); + TiffExtractor.handleCommonImageTags(metadata, tag); } } } catch (JpegProcessingException e) { @@ -60,5 +62,4 @@ class JpegExtractor { throw new TikaException("Can't read JPEG metadata", e); } } - } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java?rev=958581&r1=958580&r2=958581&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java Mon Jun 28 13:59:08 2010 @@ -43,6 +43,10 @@ public class ImageParserTest extends Tes assertEquals("0", metadata.get("Dimension HorizontalPhysicalPixelSpacing")); assertEquals("BI_RGB", metadata.get("Compression CompressionTypeName")); assertEquals("image/bmp", metadata.get("Content-Type")); + + assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH)); + assertEquals("75", 
metadata.get(Metadata.IMAGE_LENGTH)); + assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE)); } public void testGIF() throws Exception { @@ -69,6 +73,10 @@ public class ImageParserTest extends Tes assertEquals("disposalMethod=none, userInputFlag=false, transparentColorFlag=false, delayTime=0, transparentColorIndex=0", metadata.get("GraphicControlExtension")); assertEquals("0", metadata.get("Dimension VerticalPixelOffset")); assertEquals("image/gif", metadata.get("Content-Type")); + + assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH)); + assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH)); + assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(Metadata.COMMENTS)); } public void testJPEG() throws Exception { @@ -100,6 +108,10 @@ public class ImageParserTest extends Tes assertEquals("keyword=comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("Text TextEntry")); assertEquals("image/jpeg", metadata.get("Content-Type")); assertEquals("process=0, samplePrecision=8, numLines=75, samplesPerLine=100, numFrameComponents=3", metadata.get("markerSequence sof")); + + assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH)); + assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH)); + assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. 
See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(Metadata.COMMENTS)); } public void testPNG() throws Exception { @@ -133,6 +145,10 @@ public class ImageParserTest extends Tes assertEquals("true", metadata.get("Chroma BlackIsZero")); assertEquals("year=2008, month=5, day=6, hour=6, minute=18, second=47", metadata.get("Document ImageModificationTime")); assertEquals("image/png", metadata.get("Content-Type")); + + assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH)); + assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH)); + assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE)); } // TODO: Add TIFF support @@ -145,6 +161,11 @@ public class ImageParserTest extends Tes // // assertEquals("75", metadata.get("height")); // assertEquals("100", metadata.get("width")); +// +// assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH)); +// assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH)); +// assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE)); +// assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(Metadata.COMMENTS)); // } } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java?rev=958581&r1=958580&r2=958581&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java Mon Jun 28 13:59:08 2010 @@ -38,5 +38,14 @@ public class TiffParserTest extends Test "more contributor license agreements. 
See the NOTICE file " + "distributed with this work for additional information regarding " + "copyright ownership.", metadata.get("Image Description")); + + // All EXIF/TIFF tags + assertEquals("Inch", metadata.get("Resolution Unit")); + + // Core EXIF/TIFF tags + assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH)); + assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH)); + assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE)); + assertEquals("3", metadata.get(Metadata.SAMPLES_PER_PIXEL)); } } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java?rev=958581&r1=958580&r2=958581&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java Mon Jun 28 13:59:08 2010 @@ -33,7 +33,18 @@ public class JpegParserTest extends Test getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg"); parser.parse(stream, new DefaultHandler(), metadata); + // All EXIF/TIFF tags assertEquals("Canon EOS 40D", metadata.get("Model")); + + // Core EXIF/TIFF tags + assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH)); + assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH)); + assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE)); + assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL)); + + // Common tags + assertEquals("2009/10/02 23:02:49", metadata.get(Metadata.DATE)); + assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS)); } }