Author: nick Date: Tue Jun 29 12:06:19 2010 New Revision: 958942 URL: http://svn.apache.org/viewvc?rev=958942&view=rev Log: Enable extraction of longitude and latitude from JPEG/Tiff files (via the EXIF tags), and HTML (via the ICBM meta tag), to the new geographic metadata namespace
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_GEO.jpg (with props) Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=958942&r1=958941&r2=958942&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Tue Jun 29 12:06:19 2010 @@ -18,6 +18,8 @@ package org.apache.tika.parser.html; import java.net.MalformedURLException; import java.net.URL; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.tika.metadata.Metadata; import org.apache.tika.sax.TextContentHandler; @@ -93,9 +95,21 @@ class HtmlHandler extends TextContentHan xhtml.startElement(uri, local, "meta", atts); } if (atts.getValue("name") != null) { + // Record the meta tag in the metadata metadata.set( atts.getValue("name"), atts.getValue("content")); + // Normalise if possible + if(atts.getValue("name").equalsIgnoreCase("ICBM")) { + Matcher m = Pattern.compile( + "\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*" + ).matcher(atts.getValue("content")); + if(m.matches()) { + metadata.set(Metadata.LATITUDE, m.group(1)); + metadata.set(Metadata.LONGITUDE, m.group(2)); + } + } + // Allow downstream processing xhtml.startElement(uri, local, "meta", atts); } } else if ("BASE".equals(name) && atts.getValue("href") != null) { Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java?rev=958942&r1=958941&r2=958942&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java Tue Jun 29 12:06:19 2010 @@ -18,6 +18,7 @@ package org.apache.tika.parser.image; import java.io.IOException; import java.io.InputStream; +import java.text.DecimalFormat; import java.util.Iterator; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -57,6 +58,7 @@ public class TiffExtractor { metadata.set(tag.getTagName(), tag.getDescription()); handleCommonImageTags(metadata, tag); } + handleGeoImageTags(metadata); } } catch (TiffProcessingException e) { throw new TikaException("Can't read TIFF metadata", e); @@ -64,6 +66,52 @@ public class TiffExtractor { throw new TikaException("Can't read TIFF metadata", e); } } + + /** + * Maps EXIF Geo Tags onto the Tika Geo metadata namespace. + * Needs to be run at the end, because the GPS information + * is spread across several EXIF tags. + */ + public static void handleGeoImageTags(Metadata metadata) { + String lat = metadata.get("GPS Latitude"); + String latNS = metadata.get("GPS Latitude Ref"); + if(lat != null) { + Double latitude = parseHMS(lat); + if(latitude != null) { + if(latNS != null && latNS.equalsIgnoreCase("S") && + latitude > 0) { + latitude *= -1; + } + metadata.set(Metadata.LATITUDE, LAT_LONG_FORMAT.format(latitude)); + } + } + + String lng = metadata.get("GPS Longitude"); + String lngEW = metadata.get("GPS Longitude Ref"); + if(lng != null) { + Double longitude = parseHMS(lng); + if(longitude != null) { + if(lngEW != null && lngEW.equalsIgnoreCase("W") && + longitude > 0) { + longitude *= -1; + } + metadata.set(Metadata.LONGITUDE, LAT_LONG_FORMAT.format(longitude)); + } + } + } + private static Double parseHMS(String hms) { + Matcher m = HOURS_MINUTES_SECONDS.matcher(hms); + if(m.matches()) { + double value = + Integer.parseInt(m.group(1)) + + (Integer.parseInt(m.group(2))/60.0) + + (Double.parseDouble(m.group(3))/60.0/60.0); + return value; + } + return null; + } + private static final Pattern HOURS_MINUTES_SECONDS = Pattern.compile("(-?\\d+)\"(\\d+)'(\\d+\\.?\\d*)"); + private static final DecimalFormat LAT_LONG_FORMAT = new DecimalFormat("##0.0####"); /** Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java?rev=958942&r1=958941&r2=958942&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java Tue Jun 29 12:06:19 2010 @@ -55,6 +55,7 @@ class JpegExtractor { metadata.set(tag.getTagName(), tag.getDescription()); TiffExtractor.handleCommonImageTags(metadata, tag); } + TiffExtractor.handleGeoImageTags(metadata); } } catch (JpegProcessingException e) { throw new TikaException("Can't read JPEG metadata", e); Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=958942&r1=958941&r2=958942&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Tue Jun 29 12:06:19 2010 @@ -71,6 +71,9 @@ public class HtmlParserTest extends Test "Title : Test Indexation Html", metadata.get(Metadata.TITLE)); assertEquals("Tika Developers", metadata.get("Author")); assertEquals("5", metadata.get("refresh")); + + assertEquals("51.2312", metadata.get(Metadata.LATITUDE)); + assertEquals("-5.1987", metadata.get(Metadata.LONGITUDE)); assertEquals("http://www.apache.org/", href.toString()); assertEquals("test-anchor", name.toString()); Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java?rev=958942&r1=958941&r2=958942&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java Tue Jun 29 12:06:19 2010 @@ -17,6 +17,8 @@ package org.apache.tika.parser.jpeg; import junit.framework.TestCase; + +import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.metadata.Metadata; import org.xml.sax.helpers.DefaultHandler; @@ -31,7 +33,7 @@ public class JpegParserTest extends Test metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg"); - parser.parse(stream, new DefaultHandler(), metadata); + parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); // All EXIF/TIFF tags assertEquals("Canon EOS 40D", metadata.get("Model")); @@ -47,4 +49,28 @@ public class JpegParserTest extends Test assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS)); } + public void testJPEGGeo() throws Exception { + Metadata metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); + InputStream stream = + getClass().getResourceAsStream("/test-documents/testJPEG_GEO.jpg"); + parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); + + // Geo tags + assertEquals("12.54321", metadata.get(Metadata.LATITUDE)); + assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE)); + + // All EXIF/TIFF tags + assertEquals("Canon EOS 40D", metadata.get("Model")); + + // Core EXIF/TIFF tags + assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH)); + assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH)); + assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE)); + assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL)); + + // Common tags + assertEquals("2009/10/02 23:02:49", metadata.get(Metadata.DATE)); + assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS)); + } } Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html?rev=958942&r1=958941&r2=958942&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html (original) +++ tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html Tue Jun 29 12:06:19 2010 @@ -18,10 +18,11 @@ <head> <title>Title : Test Indexation Html</title> <meta name="Author" content="Tika Developers"> + <meta name="ICBM" content="51.2312, -5.1987"> <meta http-equiv="refresh" content="5"> </head> <body> <h1><a name="test-anchor"></a>Test Indexation Html</h1> <p><a href="http://www.apache.org/">Indexation</a> du fichier</p> </body> -</html> \ No newline at end of file +</html> Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_GEO.jpg URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_GEO.jpg?rev=958942&view=auto ============================================================================== Binary file - no diff available. Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_GEO.jpg ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream