Author: nick
Date: Tue Jun 29 12:06:19 2010
New Revision: 958942

URL: http://svn.apache.org/viewvc?rev=958942&view=rev
Log:
Enable extraction of longitude and latitude from JPEG/Tiff files (via the EXIF 
tags), and HTML (via the ICBM meta tag), to the new geographic metadata 
namespace

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_GEO.jpg  
 (with props)
Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
    tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=958942&r1=958941&r2=958942&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
 Tue Jun 29 12:06:19 2010
@@ -18,6 +18,8 @@ package org.apache.tika.parser.html;
 
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.sax.TextContentHandler;
@@ -93,9 +95,21 @@ class HtmlHandler extends TextContentHan
                     xhtml.startElement(uri, local, "meta", atts);
                 }
                 if (atts.getValue("name") != null) {
+                    // Record the meta tag in the metadata
                     metadata.set(
                             atts.getValue("name"),
                             atts.getValue("content"));
+                    // Normalise if possible
+                    if(atts.getValue("name").equalsIgnoreCase("ICBM")) {
+                        Matcher m = Pattern.compile(
+                              "\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*"
+                        ).matcher(atts.getValue("content"));
+                        if(m.matches()) {
+                            metadata.set(Metadata.LATITUDE, m.group(1));
+                            metadata.set(Metadata.LONGITUDE, m.group(2));
+                        }
+                    }
+                    // Allow downstream processing
                     xhtml.startElement(uri, local, "meta", atts);
                 }
             } else if ("BASE".equals(name) && atts.getValue("href") != null) {

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java?rev=958942&r1=958941&r2=958942&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
 Tue Jun 29 12:06:19 2010
@@ -18,6 +18,7 @@ package org.apache.tika.parser.image;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.text.DecimalFormat;
 import java.util.Iterator;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -57,6 +58,7 @@ public class TiffExtractor {
                     metadata.set(tag.getTagName(), tag.getDescription());
                     handleCommonImageTags(metadata, tag);
                 }
+                handleGeoImageTags(metadata);
             }
         } catch (TiffProcessingException e) {
             throw new TikaException("Can't read TIFF metadata", e);
@@ -64,6 +66,52 @@ public class TiffExtractor {
             throw new TikaException("Can't read TIFF metadata", e);
         }
     }
+    
+    /**
+     * Maps EXIF Geo Tags onto the Tika Geo metadata namespace.
+     * Needs to be run at the end, because the GPS information
+     *  is spread across several EXIF tags.
+     */
+    public static void handleGeoImageTags(Metadata metadata) {
+       String lat = metadata.get("GPS Latitude");
+       String latNS = metadata.get("GPS Latitude Ref");
+       if(lat != null) {
+           Double latitude = parseHMS(lat);
+           if(latitude != null) {
+               if(latNS != null && latNS.equalsIgnoreCase("S") &&
+                       latitude > 0) {
+                   latitude *= -1;
+               }
+               metadata.set(Metadata.LATITUDE, LAT_LONG_FORMAT.format(latitude));
+           }
+       }
+       
+       String lng = metadata.get("GPS Longitude");
+       String lngEW = metadata.get("GPS Longitude Ref");
+       if(lng != null) {
+           Double longitude = parseHMS(lng);
+           if(longitude != null) {
+               if(lngEW != null && lngEW.equalsIgnoreCase("W") &&
+                       longitude > 0) {
+                   longitude *= -1;
+               }
+               metadata.set(Metadata.LONGITUDE, LAT_LONG_FORMAT.format(longitude));
+           }
+       }
+    }
+    private static Double parseHMS(String hms) {
+       Matcher m = HOURS_MINUTES_SECONDS.matcher(hms);
+       if(m.matches()) {
+          double value = 
+            Integer.parseInt(m.group(1)) +
+            (Integer.parseInt(m.group(2))/60.0) +
+            (Double.parseDouble(m.group(3))/60.0/60.0);
+          return value;
+       }
+       return null;
+    }
+    private static final Pattern HOURS_MINUTES_SECONDS = Pattern.compile("(-?\\d+)\"(\\d+)'(\\d+\\.?\\d*)");
+    private static final DecimalFormat LAT_LONG_FORMAT = new DecimalFormat("##0.0####");
 
 
     /**

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java?rev=958942&r1=958941&r2=958942&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java
 Tue Jun 29 12:06:19 2010
@@ -55,6 +55,7 @@ class JpegExtractor {
                     metadata.set(tag.getTagName(), tag.getDescription());
                     TiffExtractor.handleCommonImageTags(metadata, tag);
                 }
+                TiffExtractor.handleGeoImageTags(metadata);
             }
         } catch (JpegProcessingException e) {
             throw new TikaException("Can't read JPEG metadata", e);

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=958942&r1=958941&r2=958942&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 Tue Jun 29 12:06:19 2010
@@ -71,6 +71,9 @@ public class HtmlParserTest extends Test
                 "Title : Test Indexation Html", metadata.get(Metadata.TITLE));
         assertEquals("Tika Developers", metadata.get("Author"));
         assertEquals("5", metadata.get("refresh"));
+        
+        assertEquals("51.2312", metadata.get(Metadata.LATITUDE));
+        assertEquals("-5.1987", metadata.get(Metadata.LONGITUDE));
 
         assertEquals("http://www.apache.org/", href.toString());
         assertEquals("test-anchor", name.toString());

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java?rev=958942&r1=958941&r2=958942&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
 Tue Jun 29 12:06:19 2010
@@ -17,6 +17,8 @@
 package org.apache.tika.parser.jpeg;
 
 import junit.framework.TestCase;
+
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.metadata.Metadata;
 import org.xml.sax.helpers.DefaultHandler;
@@ -31,7 +33,7 @@ public class JpegParserTest extends Test
         metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
         InputStream stream =
             getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg");
-        parser.parse(stream, new DefaultHandler(), metadata);
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
 
         // All EXIF/TIFF tags
         assertEquals("Canon EOS 40D", metadata.get("Model"));
@@ -47,4 +49,28 @@ public class JpegParserTest extends Test
         assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS));
     }
 
+    public void testJPEGGeo() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+        InputStream stream =
+            getClass().getResourceAsStream("/test-documents/testJPEG_GEO.jpg");
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+        
+        // Geo tags
+        assertEquals("12.54321", metadata.get(Metadata.LATITUDE));
+        assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE));
+
+        // All EXIF/TIFF tags
+        assertEquals("Canon EOS 40D", metadata.get("Model"));
+        
+        // Core EXIF/TIFF tags
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+        assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
+        
+        // Common tags
+        assertEquals("2009/10/02 23:02:49", metadata.get(Metadata.DATE));
+        assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS));
+    }
 }

Modified: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html?rev=958942&r1=958941&r2=958942&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html 
(original)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html Tue 
Jun 29 12:06:19 2010
@@ -18,10 +18,11 @@
        <head>
         <title>Title : Test Indexation Html</title>
         <meta name="Author" content="Tika Developers">
+        <meta name="ICBM" content="51.2312, -5.1987">
         <meta http-equiv="refresh" content="5">
     </head>
        <body>
                <h1><a name="test-anchor"></a>Test Indexation Html</h1>
                <p><a href="http://www.apache.org/">Indexation</a> du fichier</p>
        </body>
-</html>
\ No newline at end of file
+</html>

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_GEO.jpg
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_GEO.jpg?rev=958942&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_GEO.jpg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream


Reply via email to