Author: nick
Date: Mon Jun 28 13:59:08 2010
New Revision: 958581

URL: http://svn.apache.org/viewvc?rev=958581&view=rev
Log:
Use the new TIFF Metadata entries for image width/length/sampling from the
TIFF, JPEG and general Image (ImageIO) parsers. This gives a small number of
consistent image-related metadata entries across all formats. (TIKA-442)

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java 
Mon Jun 28 13:59:08 2010
@@ -25,7 +25,7 @@ import java.util.Properties;
  * A multi-valued metadata container.
  */
 public class Metadata implements CreativeCommons, DublinCore, HttpHeaders,
-        Message, MSOffice, ClimateForcast, TikaMetadataKeys, TikaMimeKeys {
+        Message, MSOffice, ClimateForcast, TIFF, TikaMetadataKeys, 
TikaMimeKeys {
 
     /**
      * A map of all metadata attributes.

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
 Mon Jun 28 13:59:08 2010
@@ -32,6 +32,7 @@ import javax.imageio.metadata.IIOMetadat
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -70,6 +71,9 @@ public class ImageParser implements Pars
                     ImageReader reader = iterator.next();
                     reader.setInput(ImageIO.createImageInputStream(
                             new CloseShieldInputStream(stream)));
+                    
+                    metadata.set(Metadata.IMAGE_WIDTH, 
Integer.toString(reader.getWidth(0)));
+                    metadata.set(Metadata.IMAGE_LENGTH, 
Integer.toString(reader.getHeight(0)));
                     metadata.set("height", 
Integer.toString(reader.getHeight(0)));
                     metadata.set("width", 
Integer.toString(reader.getWidth(0)));
 
@@ -77,6 +81,12 @@ public class ImageParser implements Pars
 
                     reader.dispose();
                 }
+                
+                // Translate certain Metadata tags from the ImageIO
+                //  specific namespace into the general Tika one
+                setIfPresent(metadata, "CommentExtensions CommentExtension", 
Metadata.COMMENTS);
+                setIfPresent(metadata, "markerSequence com", 
Metadata.COMMENTS);
+                setIfPresent(metadata, "Data BitsPerSample", 
Metadata.BITS_PER_SAMPLE);
             } catch (IIOException e) {
                 throw new TikaException(type + " parse error", e);
             }
@@ -95,6 +105,21 @@ public class ImageParser implements Pars
             throws IOException, SAXException, TikaException {
         parse(stream, handler, metadata, new ParseContext());
     }
+    
+    private static void setIfPresent(Metadata metadata, String imageIOkey, 
String tikaKey) {
+       if(metadata.get(imageIOkey) != null) {
+           metadata.set(tikaKey, metadata.get(imageIOkey));
+       }
+    }
+    private static void setIfPresent(Metadata metadata, String imageIOkey, 
Property tikaProp) {
+       if(metadata.get(imageIOkey) != null) {
+           String v = metadata.get(imageIOkey);
+           if(v.endsWith(" ")) {
+               v = v.substring(0, v.lastIndexOf(' '));
+           }
+           metadata.set(tikaProp, v);
+       }
+    }
 
     private static void loadMetadata(IIOMetadata imageMetadata, Metadata 
metadata) {
         String[] names = imageMetadata.getMetadataFormatNames();

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
 Mon Jun 28 13:59:08 2010
@@ -19,9 +19,12 @@ package org.apache.tika.parser.image;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Iterator;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
 import org.xml.sax.SAXException;
 
 import com.drew.imaging.tiff.TiffMetadataReader;
@@ -30,15 +33,15 @@ import com.drew.metadata.Directory;
 import com.drew.metadata.MetadataException;
 import com.drew.metadata.Tag;
 
-class TiffExtractor {
+public class TiffExtractor {
 
     private final Metadata metadata;
 
-    public TiffExtractor(Metadata metadata) {
+    protected TiffExtractor(Metadata metadata) {
         this.metadata = metadata;
     }
 
-    public void parse(InputStream stream)
+    protected void parse(InputStream stream)
             throws IOException, SAXException, TikaException {
         try {
             com.drew.metadata.Metadata tiffMetadata =
@@ -52,6 +55,7 @@ class TiffExtractor {
                 while (tags.hasNext()) {
                     Tag tag = (Tag)tags.next();
                     metadata.set(tag.getTagName(), tag.getDescription());
+                    handleCommonImageTags(metadata, tag);
                 }
             }
         } catch (TiffProcessingException e) {
@@ -61,4 +65,55 @@ class TiffExtractor {
         }
     }
 
+
+    /**
+     * Maps common TIFF and EXIF tags onto the Tika
+     *  TIFF image metadata namespace.
+     */
+    public static void handleCommonImageTags(Metadata metadata, Tag tag) 
throws MetadataException {
+       // Core tags
+       if(tag.getTagName().equals("Date/Time") ||
+               tag.getTagType() == 306) {
+           // Ensure it's in the right format
+           String date = tag.getDescription();
+           int splitAt = date.indexOf(' '); 
+           if(splitAt > -1) {
+               date = date.substring(0, splitAt).replace(':', '/') +
+                       date.substring(splitAt);
+           }
+           metadata.set(Metadata.DATE, date);
+           return;
+       }
+       if(tag.getTagName().equals("Keywords") ||
+               tag.getTagType() == 537) {
+           metadata.set(Metadata.KEYWORDS, tag.getDescription());
+       }
+       
+       // EXIF / TIFF Tags
+       Property key = null;
+       if(tag.getTagName().equals("Image Width") ||
+               tag.getTagType() == 256) { 
+           key = Metadata.IMAGE_WIDTH;
+       }
+       if(tag.getTagName().equals("Image Height") ||
+               tag.getTagType() == 257) {
+           key = Metadata.IMAGE_LENGTH;
+       }
+       if(tag.getTagName().equals("Data Precision") ||
+               tag.getTagName().equals("Bits Per Sample") ||
+               tag.getTagType() == 258) {
+           key = Metadata.BITS_PER_SAMPLE;
+       }
+       if(tag.getTagType() == 277) {
+           key = Metadata.SAMPLES_PER_PIXEL;
+       }
+       
+       if(key != null) {
+           Matcher m = LEADING_NUMBERS.matcher(tag.getDescription());
+           if(m.matches()) {
+               metadata.set(key, m.group(1));
+           }
+       }
+    }
+    private static final Pattern LEADING_NUMBERS = 
Pattern.compile("(\\d+)\\s*.*");
 }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java
 Mon Jun 28 13:59:08 2010
@@ -22,6 +22,7 @@ import java.util.Iterator;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.image.TiffExtractor;
 import org.xml.sax.SAXException;
 
 import com.drew.imaging.jpeg.JpegMetadataReader;
@@ -52,6 +53,7 @@ class JpegExtractor {
                 while (tags.hasNext()) {
                     Tag tag = (Tag)tags.next();
                     metadata.set(tag.getTagName(), tag.getDescription());
+                    TiffExtractor.handleCommonImageTags(metadata, tag);
                 }
             }
         } catch (JpegProcessingException e) {
@@ -60,5 +62,4 @@ class JpegExtractor {
             throw new TikaException("Can't read JPEG metadata", e);
         }
     }
-
 }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
 Mon Jun 28 13:59:08 2010
@@ -43,6 +43,10 @@ public class ImageParserTest extends Tes
         assertEquals("0", metadata.get("Dimension 
HorizontalPhysicalPixelSpacing"));
         assertEquals("BI_RGB", metadata.get("Compression 
CompressionTypeName"));
         assertEquals("image/bmp", metadata.get("Content-Type"));
+        
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
     }
 
     public void testGIF() throws Exception {
@@ -69,6 +73,10 @@ public class ImageParserTest extends Tes
         assertEquals("disposalMethod=none, userInputFlag=false, 
transparentColorFlag=false, delayTime=0, transparentColorIndex=0", 
metadata.get("GraphicControlExtension"));
         assertEquals("0", metadata.get("Dimension VerticalPixelOffset"));
         assertEquals("image/gif", metadata.get("Content-Type"));
+        
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("Licensed to the Apache Software Foundation (ASF) under 
one or more contributor license agreements.  See the NOTICE file distributed 
with this work for additional information regarding copyright ownership.", 
metadata.get(Metadata.COMMENTS));
     }
 
     public void testJPEG() throws Exception {
@@ -100,6 +108,10 @@ public class ImageParserTest extends Tes
         assertEquals("keyword=comment, value=Licensed to the Apache Software 
Foundation (ASF) under one or more contributor license agreements.  See the 
NOTICE file distributed with this work for additional information regarding 
copyright ownership.", metadata.get("Text TextEntry"));
         assertEquals("image/jpeg", metadata.get("Content-Type"));
         assertEquals("process=0, samplePrecision=8, numLines=75, 
samplesPerLine=100, numFrameComponents=3", metadata.get("markerSequence sof"));
+        
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("Licensed to the Apache Software Foundation (ASF) under 
one or more contributor license agreements.  See the NOTICE file distributed 
with this work for additional information regarding copyright ownership.", 
metadata.get(Metadata.COMMENTS));
     }
 
     public void testPNG() throws Exception {
@@ -133,6 +145,10 @@ public class ImageParserTest extends Tes
         assertEquals("true", metadata.get("Chroma BlackIsZero"));
         assertEquals("year=2008, month=5, day=6, hour=6, minute=18, 
second=47", metadata.get("Document ImageModificationTime"));
         assertEquals("image/png", metadata.get("Content-Type"));
+        
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
     }
 
 // TODO: Add TIFF support
@@ -145,6 +161,11 @@ public class ImageParserTest extends Tes
 //
 //        assertEquals("75", metadata.get("height"));
 //        assertEquals("100", metadata.get("width"));
+//    
+//        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+//        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+//        assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
+//        assertEquals("Licensed to the Apache Software Foundation (ASF) under 
one or more contributor license agreements.  See the NOTICE file distributed 
with this work for additional information regarding copyright ownership.", 
metadata.get(Metadata.COMMENTS));
 //    }
 
 }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
 Mon Jun 28 13:59:08 2010
@@ -38,5 +38,14 @@ public class TiffParserTest extends Test
                        "more contributor license agreements.  See the NOTICE 
file " +
                        "distributed with this work for additional information 
regarding " +
                        "copyright ownership.", metadata.get("Image 
Description"));
+        
+        // All EXIF/TIFF tags
+        assertEquals("Inch", metadata.get("Resolution Unit"));
+        
+        // Core EXIF/TIFF tags
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+        assertEquals("3", metadata.get(Metadata.SAMPLES_PER_PIXEL));
     }
 }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
 Mon Jun 28 13:59:08 2010
@@ -33,7 +33,18 @@ public class JpegParserTest extends Test
             
getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg");
         parser.parse(stream, new DefaultHandler(), metadata);
 
+        // All EXIF/TIFF tags
         assertEquals("Canon EOS 40D", metadata.get("Model"));
+        
+        // Core EXIF/TIFF tags
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+        assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
+        
+        // Common tags
+        assertEquals("2009/10/02 23:02:49", metadata.get(Metadata.DATE));
+        assertEquals("canon-55-250 moscow-birds serbor", 
metadata.get(Metadata.KEYWORDS));
     }
 
 }


Reply via email to