Author: nick
Date: Sat Jul 31 16:09:33 2010
New Revision: 981072

URL: http://svn.apache.org/viewvc?rev=981072&view=rev
Log:
Apply patch from TIKA-472 - Extract JPEG title, description and author
Also fix a few indents to follow the Tika standard of spaces, not tabs

Added:
    
tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented.jpg
   (with props)
Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java?rev=981072&r1=981071&r2=981072&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
 Sat Jul 31 16:09:33 2010
@@ -76,31 +76,31 @@ public class TiffExtractor {
      *  is spread across several EXIF tags.
      */
     public static void handleGeoImageTags(Metadata metadata) {
-       String lat = metadata.get("GPS Latitude");
-       String latNS = metadata.get("GPS Latitude Ref");
-       if(lat != null) {
-           Double latitude = parseHMS(lat);
-           if(latitude != null) {
-               if(latNS != null && latNS.equalsIgnoreCase("S") &&
-                       latitude > 0) {
-                   latitude *= -1;
-               }
-               metadata.set(Metadata.LATITUDE, 
LAT_LONG_FORMAT.format(latitude)); 
-           }
-       }
-       
-       String lng = metadata.get("GPS Longitude");
-       String lngEW = metadata.get("GPS Longitude Ref");
-       if(lng != null) {
-           Double longitude = parseHMS(lng);
-           if(longitude != null) {
-               if(lngEW != null && lngEW.equalsIgnoreCase("W") &&
-                       longitude > 0) {
-                   longitude *= -1;
-               }
-               metadata.set(Metadata.LONGITUDE, 
LAT_LONG_FORMAT.format(longitude));
-           }
-       }
+        String lat = metadata.get("GPS Latitude");
+        String latNS = metadata.get("GPS Latitude Ref");
+        if(lat != null) {
+            Double latitude = parseHMS(lat);
+            if(latitude != null) {
+                if(latNS != null && latNS.equalsIgnoreCase("S") &&
+                        latitude > 0) {
+                    latitude *= -1;
+                }
+                metadata.set(Metadata.LATITUDE, 
LAT_LONG_FORMAT.format(latitude)); 
+            }
+        }
+
+        String lng = metadata.get("GPS Longitude");
+        String lngEW = metadata.get("GPS Longitude Ref");
+        if(lng != null) {
+            Double longitude = parseHMS(lng);
+            if(longitude != null) {
+                if(lngEW != null && lngEW.equalsIgnoreCase("W") &&
+                        longitude > 0) {
+                    longitude *= -1;
+                }
+                metadata.set(Metadata.LONGITUDE, 
LAT_LONG_FORMAT.format(longitude));
+            }
+        }
     }
     private static Double parseHMS(String hms) {
        Matcher m = HOURS_MINUTES_SECONDS.matcher(hms);
@@ -129,54 +129,73 @@ public class TiffExtractor {
      *  TIFF image metadata namespace.
      */
     public static void handleCommonImageTags(Metadata metadata, Tag tag) 
throws MetadataException {
-       // Core tags
-       if(tag.getTagName().equals("Date/Time") ||
-               tag.getTagType() == 306) {
-           // Ensure it's in the right format
-           String date = tag.getDescription();
-           int splitAt = date.indexOf(' '); 
-           if(splitAt > -1) {
-               date = date.substring(0, splitAt).replace(':', '/') +
-                       date.substring(splitAt);
-           }
-           metadata.set(Metadata.DATE, date);
-           return;
-       }
-       if(tag.getTagName().equals("Keywords") ||
-               tag.getTagType() == 537) {
-           metadata.set(Metadata.KEYWORDS, tag.getDescription());
-           return;
-       }
-       if(tag.getTagName().equals("Jpeg Comment")) {
-           metadata.set(Metadata.COMMENTS, tag.getDescription());
-           return;
-       }
-       
-       // EXIF / TIFF Tags
-       Property key = null;
-       if(tag.getTagName().equals("Image Width") ||
-               tag.getTagType() == 256) { 
-           key = Metadata.IMAGE_WIDTH;
-       }
-       if(tag.getTagName().equals("Image Height") ||
-               tag.getTagType() == 257) {
-           key = Metadata.IMAGE_LENGTH;
-       }
-       if(tag.getTagName().equals("Data Precision") ||
-               tag.getTagName().equals("Bits Per Sample") ||
-               tag.getTagType() == 258) {
-           key = Metadata.BITS_PER_SAMPLE;
-       }
-       if(tag.getTagType() == 277) {
-           key = Metadata.SAMPLES_PER_PIXEL;
-       }
-       
-       if(key != null) {
-           Matcher m = LEADING_NUMBERS.matcher(tag.getDescription());
-           if(m.matches()) {
-               metadata.set(key, m.group(1));
-           }
-       }
+        // Core tags
+        if(tag.getTagName().equals("Date/Time") ||
+                tag.getTagType() == 306) {
+            // Ensure it's in the right format
+            String date = tag.getDescription();
+            int splitAt = date.indexOf(' '); 
+            if(splitAt > -1) {
+                date = date.substring(0, splitAt).replace(':', '/') +
+                date.substring(splitAt);
+            }
+            metadata.set(Metadata.DATE, date);
+            return;
+        }
+        if(tag.getTagName().equals("Keywords") ||
+                tag.getTagType() == 537) {
+            metadata.set(Metadata.KEYWORDS, tag.getDescription());
+            return;
+        }
+        if(tag.getTagName().equals("Jpeg Comment")) {
+            metadata.set(Metadata.COMMENTS, tag.getDescription());
+            return;
+        }
+
+        // File info
+        // Metadata Extractor does not read XMP so we need to use the values 
from Iptc or EXIF
+        if("Iptc".equals(tag.getDirectoryName())) {
+            if("Object Name".equals(tag.getTagName())) {
+                metadata.set(Metadata.TITLE, tag.getDescription());
+                return;
+            }
+            if("By-line".equals(tag.getTagName())) {
+                metadata.set(Metadata.AUTHOR, tag.getDescription());
+                return;
+            }          
+            if("Caption/Abstract".equals(tag.getTagName())) {
+                // Looks like metadata extractor returns IPTC newlines as a 
single carriage return,
+                // but the exiv2 command does not so we change to line feed 
here because that is less surprising to users
+                metadata.set(Metadata.DESCRIPTION, 
tag.getDescription().replaceAll("\r\n?", "\n"));
+                return;
+            }
+        }
+
+        // EXIF / TIFF Tags
+        Property key = null;
+        if(tag.getTagName().equals("Image Width") ||
+                tag.getTagType() == 256) { 
+            key = Metadata.IMAGE_WIDTH;
+        }
+        if(tag.getTagName().equals("Image Height") ||
+                tag.getTagType() == 257) {
+            key = Metadata.IMAGE_LENGTH;
+        }
+        if(tag.getTagName().equals("Data Precision") ||
+                tag.getTagName().equals("Bits Per Sample") ||
+                tag.getTagType() == 258) {
+            key = Metadata.BITS_PER_SAMPLE;
+        }
+        if(tag.getTagType() == 277) {
+            key = Metadata.SAMPLES_PER_PIXEL;
+        }
+
+        if(key != null) {
+            Matcher m = LEADING_NUMBERS.matcher(tag.getDescription());
+            if(m.matches()) {
+                metadata.set(key, m.group(1));
+            }
+        }
     }
     private static final Pattern LEADING_NUMBERS = 
Pattern.compile("(\\d+)\\s*.*");
 }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java?rev=981072&r1=981071&r2=981072&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
 Sat Jul 31 16:09:33 2010
@@ -73,4 +73,24 @@ public class JpegParserTest extends Test
         assertEquals("2009/10/02 23:02:49", metadata.get(Metadata.DATE));
         assertEquals("canon-55-250 moscow-birds serbor", 
metadata.get(Metadata.KEYWORDS));
     }
+    
+    public void testJPEGTitleAndDescription() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+        InputStream stream =
+            
getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg");
+        parser.parse(stream, new DefaultHandler(), metadata, new 
ParseContext());
+          
+        // embedded comments with non-ascii characters
+        //assertEquals("Tosteberga \u00C4ngar", metadata.get(Metadata.TITLE));
+        assertEquals("Tosteberga " + new String(new byte[]{-61, -124}) + 
"ngar", metadata.get(Metadata.TITLE));
+        //assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new 
line)", metadata.get(Metadata.DESCRIPTION));
+        assertEquals("Bird site in north eastern Sk" + new String(new 
byte[]{-61, -91}) + 
+                       "ne, Sweden.\n(new line)", 
metadata.get(Metadata.DESCRIPTION));
+        assertEquals("Some Tourist", metadata.get(Metadata.AUTHOR));
+        // xmp handles spaces in keywords, returns "bird watching, nature 
reserve, coast, grazelands"
+        //assertEquals("bird watching nature reserve coast grazelands", 
metadata.get(Metadata.KEYWORDS));
+        // ordering is odd when returned from parser as one string
+        assertEquals("grazelands nature reserve bird watching coast", 
metadata.get(Metadata.KEYWORDS));
+    }
 }

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented.jpg
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented.jpg?rev=981072&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented.jpg
------------------------------------------------------------------------------
    svn:mime-type = image/jpeg


Reply via email to