Author: nick
Date: Sat Jul 31 16:09:33 2010
New Revision: 981072
URL: http://svn.apache.org/viewvc?rev=981072&view=rev
Log:
Apply patch from TIKA-472 - Extract JPEG title, description and author
Also fix a few indents to follow the Tika standard of spaces, not tabs
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented.jpg
(with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java?rev=981072&r1=981071&r2=981072&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
Sat Jul 31 16:09:33 2010
@@ -76,31 +76,31 @@ public class TiffExtractor {
* is spread across several EXIF tags.
*/
public static void handleGeoImageTags(Metadata metadata) {
- String lat = metadata.get("GPS Latitude");
- String latNS = metadata.get("GPS Latitude Ref");
- if(lat != null) {
- Double latitude = parseHMS(lat);
- if(latitude != null) {
- if(latNS != null && latNS.equalsIgnoreCase("S") &&
- latitude > 0) {
- latitude *= -1;
- }
- metadata.set(Metadata.LATITUDE,
LAT_LONG_FORMAT.format(latitude));
- }
- }
-
- String lng = metadata.get("GPS Longitude");
- String lngEW = metadata.get("GPS Longitude Ref");
- if(lng != null) {
- Double longitude = parseHMS(lng);
- if(longitude != null) {
- if(lngEW != null && lngEW.equalsIgnoreCase("W") &&
- longitude > 0) {
- longitude *= -1;
- }
- metadata.set(Metadata.LONGITUDE,
LAT_LONG_FORMAT.format(longitude));
- }
- }
+ String lat = metadata.get("GPS Latitude");
+ String latNS = metadata.get("GPS Latitude Ref");
+ if(lat != null) {
+ Double latitude = parseHMS(lat);
+ if(latitude != null) {
+ if(latNS != null && latNS.equalsIgnoreCase("S") &&
+ latitude > 0) {
+ latitude *= -1;
+ }
+ metadata.set(Metadata.LATITUDE,
LAT_LONG_FORMAT.format(latitude));
+ }
+ }
+
+ String lng = metadata.get("GPS Longitude");
+ String lngEW = metadata.get("GPS Longitude Ref");
+ if(lng != null) {
+ Double longitude = parseHMS(lng);
+ if(longitude != null) {
+ if(lngEW != null && lngEW.equalsIgnoreCase("W") &&
+ longitude > 0) {
+ longitude *= -1;
+ }
+ metadata.set(Metadata.LONGITUDE,
LAT_LONG_FORMAT.format(longitude));
+ }
+ }
}
private static Double parseHMS(String hms) {
Matcher m = HOURS_MINUTES_SECONDS.matcher(hms);
@@ -129,54 +129,73 @@ public class TiffExtractor {
* TIFF image metadata namespace.
*/
public static void handleCommonImageTags(Metadata metadata, Tag tag)
throws MetadataException {
- // Core tags
- if(tag.getTagName().equals("Date/Time") ||
- tag.getTagType() == 306) {
- // Ensure it's in the right format
- String date = tag.getDescription();
- int splitAt = date.indexOf(' ');
- if(splitAt > -1) {
- date = date.substring(0, splitAt).replace(':', '/') +
- date.substring(splitAt);
- }
- metadata.set(Metadata.DATE, date);
- return;
- }
- if(tag.getTagName().equals("Keywords") ||
- tag.getTagType() == 537) {
- metadata.set(Metadata.KEYWORDS, tag.getDescription());
- return;
- }
- if(tag.getTagName().equals("Jpeg Comment")) {
- metadata.set(Metadata.COMMENTS, tag.getDescription());
- return;
- }
-
- // EXIF / TIFF Tags
- Property key = null;
- if(tag.getTagName().equals("Image Width") ||
- tag.getTagType() == 256) {
- key = Metadata.IMAGE_WIDTH;
- }
- if(tag.getTagName().equals("Image Height") ||
- tag.getTagType() == 257) {
- key = Metadata.IMAGE_LENGTH;
- }
- if(tag.getTagName().equals("Data Precision") ||
- tag.getTagName().equals("Bits Per Sample") ||
- tag.getTagType() == 258) {
- key = Metadata.BITS_PER_SAMPLE;
- }
- if(tag.getTagType() == 277) {
- key = Metadata.SAMPLES_PER_PIXEL;
- }
-
- if(key != null) {
- Matcher m = LEADING_NUMBERS.matcher(tag.getDescription());
- if(m.matches()) {
- metadata.set(key, m.group(1));
- }
- }
+ // Core tags
+ if(tag.getTagName().equals("Date/Time") ||
+ tag.getTagType() == 306) {
+ // Ensure it's in the right format
+ String date = tag.getDescription();
+ int splitAt = date.indexOf(' ');
+ if(splitAt > -1) {
+ date = date.substring(0, splitAt).replace(':', '/') +
+ date.substring(splitAt);
+ }
+ metadata.set(Metadata.DATE, date);
+ return;
+ }
+ if(tag.getTagName().equals("Keywords") ||
+ tag.getTagType() == 537) {
+ metadata.set(Metadata.KEYWORDS, tag.getDescription());
+ return;
+ }
+ if(tag.getTagName().equals("Jpeg Comment")) {
+ metadata.set(Metadata.COMMENTS, tag.getDescription());
+ return;
+ }
+
+ // File info
+ // Metadata Extractor does not read XMP so we need to use the values
from Iptc or EXIF
+ if("Iptc".equals(tag.getDirectoryName())) {
+ if("Object Name".equals(tag.getTagName())) {
+ metadata.set(Metadata.TITLE, tag.getDescription());
+ return;
+ }
+ if("By-line".equals(tag.getTagName())) {
+ metadata.set(Metadata.AUTHOR, tag.getDescription());
+ return;
+ }
+ if("Caption/Abstract".equals(tag.getTagName())) {
+ // Looks like metadata extractor returns IPTC newlines as a
single carriage return,
+ // but the exiv2 command does not so we change to line feed
here because that is less surprising to users
+ metadata.set(Metadata.DESCRIPTION,
tag.getDescription().replaceAll("\r\n?", "\n"));
+ return;
+ }
+ }
+
+ // EXIF / TIFF Tags
+ Property key = null;
+ if(tag.getTagName().equals("Image Width") ||
+ tag.getTagType() == 256) {
+ key = Metadata.IMAGE_WIDTH;
+ }
+ if(tag.getTagName().equals("Image Height") ||
+ tag.getTagType() == 257) {
+ key = Metadata.IMAGE_LENGTH;
+ }
+ if(tag.getTagName().equals("Data Precision") ||
+ tag.getTagName().equals("Bits Per Sample") ||
+ tag.getTagType() == 258) {
+ key = Metadata.BITS_PER_SAMPLE;
+ }
+ if(tag.getTagType() == 277) {
+ key = Metadata.SAMPLES_PER_PIXEL;
+ }
+
+ if(key != null) {
+ Matcher m = LEADING_NUMBERS.matcher(tag.getDescription());
+ if(m.matches()) {
+ metadata.set(key, m.group(1));
+ }
+ }
}
private static final Pattern LEADING_NUMBERS =
Pattern.compile("(\\d+)\\s*.*");
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java?rev=981072&r1=981071&r2=981072&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
Sat Jul 31 16:09:33 2010
@@ -73,4 +73,24 @@ public class JpegParserTest extends Test
assertEquals("2009/10/02 23:02:49", metadata.get(Metadata.DATE));
assertEquals("canon-55-250 moscow-birds serbor",
metadata.get(Metadata.KEYWORDS));
}
+
+ public void testJPEGTitleAndDescription() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ InputStream stream =
+
getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg");
+ parser.parse(stream, new DefaultHandler(), metadata, new
ParseContext());
+
+ // embedded comments with non-ascii characters
+ //assertEquals("Tosteberga \u00C4ngar", metadata.get(Metadata.TITLE));
+ assertEquals("Tosteberga " + new String(new byte[]{-61, -124}) +
"ngar", metadata.get(Metadata.TITLE));
+ //assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new
line)", metadata.get(Metadata.DESCRIPTION));
+ assertEquals("Bird site in north eastern Sk" + new String(new
byte[]{-61, -91}) +
+ "ne, Sweden.\n(new line)",
metadata.get(Metadata.DESCRIPTION));
+ assertEquals("Some Tourist", metadata.get(Metadata.AUTHOR));
+ // xmp handles spaces in keywords, returns "bird watching, nature
reserve, coast, grazelands"
+ //assertEquals("bird watching nature reserve coast grazelands",
metadata.get(Metadata.KEYWORDS));
+ // ordering is odd when returned from parser as one string
+ assertEquals("grazelands nature reserve bird watching coast",
metadata.get(Metadata.KEYWORDS));
+ }
}
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented.jpg
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented.jpg?rev=981072&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented.jpg
------------------------------------------------------------------------------
svn:mime-type = image/jpeg