Author: jukka
Date: Wed Apr 14 09:34:54 2010
New Revision: 933893
URL: http://svn.apache.org/viewvc?rev=933893&view=rev
Log:
TIKA-92: Image metadata extraction
Patch by Dmitry Kuzmenko.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java?rev=933893&r1=933892&r2=933893&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
Wed Apr 14 09:34:54 2010
@@ -16,7 +16,7 @@
*/
package org.apache.tika.parser.image;
- import java.io.IOException;
+import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
@@ -27,6 +27,7 @@ import java.util.Set;
import javax.imageio.IIOException;
import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
+import javax.imageio.metadata.IIOMetadata;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
@@ -35,6 +36,8 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -70,6 +73,9 @@ public class ImageParser implements Pars
new CloseShieldInputStream(stream)));
metadata.set("height",
Integer.toString(reader.getHeight(0)));
metadata.set("width",
Integer.toString(reader.getWidth(0)));
+
+ loadMetadata(reader.getImageMetadata(0), metadata);
+
reader.dispose();
}
} catch (IIOException e) {
@@ -91,4 +97,51 @@ public class ImageParser implements Pars
parse(stream, handler, metadata, new ParseContext());
}
+    /**
+     * Copies the image metadata exposed by an ImageIO reader into the given
+     * Tika metadata object.
+     * <p>
+     * Each metadata format name reported by the image metadata is rendered
+     * as a DOM tree and handed to {@code loadNode}, starting with an empty
+     * key prefix and without including the name of the tree's root node.
+     *
+     * @param imageMetadata metadata of the image as returned by the reader;
+     *                      may be {@code null}, in which case nothing is added
+     * @param metadata      Tika metadata object that receives the entries
+     */
+    private static void loadMetadata(IIOMetadata imageMetadata, Metadata metadata) {
+        // ImageReader.getImageMetadata() is documented to return null when
+        // the reader does not support metadata or none is available.
+        if (imageMetadata == null) {
+            return;
+        }
+        String[] names = imageMetadata.getMetadataFormatNames();
+        if (names == null) {
+            return;
+        }
+        for (String name : names) {
+            loadNode(metadata, imageMetadata.getAsTree(name), "", false);
+        }
+    }
+
+    /**
+     * Recursively adds the attributes of a metadata tree node, and of all of
+     * its descendants, to the given Tika metadata object.
+     * <p>
+     * The metadata key is the space-separated list of node names collected
+     * on the way down the tree. A node with exactly one attribute contributes
+     * just that attribute's value; a node with several attributes contributes
+     * a single {@code name=value, name=value, ...} string. Nodes without
+     * attributes contribute nothing themselves, but their children are still
+     * visited.
+     *
+     * @param metadata        Tika metadata object that receives the entries
+     * @param node            node to process
+     * @param parents         space-separated names of the ancestor nodes that
+     *                        make up the key prefix; may be empty
+     * @param addThisNodeName whether the name of {@code node} is appended to
+     *                        the key prefix
+     */
+    private static void loadNode(
+            Metadata metadata, Node node, String parents,
+            boolean addThisNodeName) {
+        String key = parents;
+        if (addThisNodeName) {
+            if (key.length() > 0) {
+                key += " ";
+            }
+            key += node.getNodeName();
+        }
+
+        NamedNodeMap attributes = node.getAttributes();
+        if (attributes != null) {
+            int length = attributes.getLength();
+            if (length == 1) {
+                // A lone attribute is stored by value only, without its name
+                metadata.add(key, attributes.item(0).getNodeValue());
+            } else if (length > 1) {
+                // Local, single-threaded accumulator: no need for the
+                // synchronized StringBuffer
+                StringBuilder value = new StringBuilder();
+                for (int i = 0; i < length; i++) {
+                    if (i > 0) {
+                        value.append(", ");
+                    }
+                    Node attr = attributes.item(i);
+                    value.append(attr.getNodeName())
+                            .append("=")
+                            .append(attr.getNodeValue());
+                }
+                metadata.add(key, value.toString());
+            }
+        }
+
+        // Process the children recursively, using this node's key as prefix
+        Node child = node.getFirstChild();
+        while (child != null) {
+            loadNode(metadata, child, key, true);
+            child = child.getNextSibling();
+        }
+    }
+
}
Modified:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java?rev=933893&r1=933892&r2=933893&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
Wed Apr 14 09:34:54 2010
@@ -37,6 +37,12 @@ public class ImageParserTest extends Tes
assertEquals("75", metadata.get("height"));
assertEquals("100", metadata.get("width"));
+ assertEquals("8 8 8 ", metadata.get("Data BitsPerSample"));
+ assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
+ assertEquals("0", metadata.get("Dimension
VerticalPhysicalPixelSpacing"));
+ assertEquals("0", metadata.get("Dimension
HorizontalPhysicalPixelSpacing"));
+ assertEquals("BI_RGB", metadata.get("Compression
CompressionTypeName"));
+ assertEquals("image/bmp", metadata.get("Content-Type"));
}
public void testGIF() throws Exception {
@@ -48,6 +54,21 @@ public class ImageParserTest extends Tes
assertEquals("75", metadata.get("height"));
assertEquals("100", metadata.get("width"));
+ assertEquals("TRUE", metadata.get("Compression Lossless"));
+ assertEquals("Normal", metadata.get("Dimension ImageOrientation"));
+ assertEquals("lzw", metadata.get("Compression CompressionTypeName"));
+ assertEquals("0", metadata.get("Dimension HorizontalPixelOffset"));
+ assertEquals("imageLeftPosition=0, imageTopPosition=0, imageWidth=100,
imageHeight=75, interlaceFlag=false", metadata.get("ImageDescriptor"));
+ assertEquals("Index", metadata.get("Data SampleFormat"));
+ assertEquals("3", metadata.get("Chroma NumChannels"));
+ assertEquals("1", metadata.get("Compression NumProgressiveScans"));
+ assertEquals("RGB", metadata.get("Chroma ColorSpaceType"));
+ assertEquals("Licensed to the Apache Software Foundation (ASF) under
one or more contributor license agreements. See the NOTICE file distributed
with this work for additional information regarding copyright ownership.",
metadata.get("CommentExtensions CommentExtension"));
+ assertEquals("value=Licensed to the Apache Software Foundation (ASF)
under one or more contributor license agreements. See the NOTICE file
distributed with this work for additional information regarding copyright
ownership., encoding=ISO-8859-1, compression=none", metadata.get("Text
TextEntry"));
+ assertEquals("TRUE", metadata.get("Chroma BlackIsZero"));
+ assertEquals("disposalMethod=none, userInputFlag=false,
transparentColorFlag=false, delayTime=0, transparentColorIndex=0",
metadata.get("GraphicControlExtension"));
+ assertEquals("0", metadata.get("Dimension VerticalPixelOffset"));
+ assertEquals("image/gif", metadata.get("Content-Type"));
}
public void testJPEG() throws Exception {
@@ -59,6 +80,26 @@ public class ImageParserTest extends Tes
assertEquals("75", metadata.get("height"));
assertEquals("100", metadata.get("width"));
+ assertEquals("0.35277778", metadata.get("Dimension
VerticalPixelSize"));
+ assertEquals("false", metadata.get("Compression Lossless"));
+ assertEquals("class=0, htableId=0", metadata.get("markerSequence dht
dhtable"));
+ assertEquals("majorVersion=1, minorVersion=1, resUnits=1, Xdensity=72,
Ydensity=72, thumbWidth=0, thumbHeight=0", metadata.get("JPEGvariety
app0JFIF"));
+ assertEquals("225", metadata.get("markerSequence unknown"));
+ assertEquals("componentSelector=1, dcHuffTable=0, acHuffTable=0",
metadata.get("markerSequence sos scanComponentSpec"));
+ assertEquals("normal", metadata.get("Dimension ImageOrientation"));
+ assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
+ assertEquals("elementPrecision=0, qtableId=0",
metadata.get("markerSequence dqt dqtable"));
+ assertEquals("numScanComponents=3, startSpectralSelection=0,
endSpectralSelection=63, approxHigh=0, approxLow=0",
metadata.get("markerSequence sos"));
+ assertEquals("componentId=1, HsamplingFactor=1, VsamplingFactor=1,
QtableSelector=0", metadata.get("markerSequence sof componentSpec"));
+ assertEquals("JPEG", metadata.get("Compression CompressionTypeName"));
+ assertEquals("0.35277778", metadata.get("Dimension
HorizontalPixelSize"));
+ assertEquals("Licensed to the Apache Software Foundation (ASF) under
one or more contributor license agreements. See the NOTICE file distributed
with this work for additional information regarding copyright ownership.",
metadata.get("markerSequence com"));
+ assertEquals("3", metadata.get("Chroma NumChannels"));
+ assertEquals("1", metadata.get("Compression NumProgressiveScans"));
+ assertEquals("YCbCr", metadata.get("Chroma ColorSpaceType"));
+ assertEquals("keyword=comment, value=Licensed to the Apache Software
Foundation (ASF) under one or more contributor license agreements. See the
NOTICE file distributed with this work for additional information regarding
copyright ownership.", metadata.get("Text TextEntry"));
+ assertEquals("image/jpeg", metadata.get("Content-Type"));
+ assertEquals("process=0, samplePrecision=8, numLines=75,
samplesPerLine=100, numFrameComponents=3", metadata.get("markerSequence sof"));
}
public void testPNG() throws Exception {
@@ -70,6 +111,28 @@ public class ImageParserTest extends Tes
assertEquals("75", metadata.get("height"));
assertEquals("100", metadata.get("width"));
+ assertEquals("0.35273367", metadata.get("Dimension
VerticalPixelSize"));
+ assertEquals("8 8 8", metadata.get("Data BitsPerSample"));
+ assertEquals("Perceptual", metadata.get("sRGB"));
+ assertEquals("true", metadata.get("Compression Lossless"));
+ assertEquals("year=2008, month=5, day=6, hour=6, minute=18,
second=47", metadata.get("tIME"));
+ assertEquals("Normal", metadata.get("Dimension ImageOrientation"));
+ assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
+ assertEquals("keyword=Comment, value=Licensed to the Apache Software
Foundation (ASF) under one or more contributor license agreements. See the
NOTICE file distributed with this work for additional information regarding
copyright ownership.", metadata.get("tEXt tEXtEntry"));
+ assertEquals("deflate", metadata.get("Compression
CompressionTypeName"));
+ assertEquals("UnsignedIntegral", metadata.get("Data SampleFormat"));
+ assertEquals("0.35273367", metadata.get("Dimension
HorizontalPixelSize"));
+ assertEquals("none", metadata.get("Transparency Alpha"));
+ assertEquals("pixelsPerUnitXAxis=2835, pixelsPerUnitYAxis=2835,
unitSpecifier=meter", metadata.get("pHYs"));
+ assertEquals("3", metadata.get("Chroma NumChannels"));
+ assertEquals("1", metadata.get("Compression NumProgressiveScans"));
+ assertEquals("RGB", metadata.get("Chroma ColorSpaceType"));
+ assertEquals("keyword=Comment, value=Licensed to the Apache Software
Foundation (ASF) under one or more contributor license agreements. See the
NOTICE file distributed with this work for additional information regarding
copyright ownership., encoding=ISO-8859-1, compression=none",
metadata.get("Text TextEntry"));
+ assertEquals("PixelInterleaved", metadata.get("Data
PlanarConfiguration"));
+ assertEquals("width=100, height=75, bitDepth=8, colorType=RGB,
compressionMethod=deflate, filterMethod=adaptive, interlaceMethod=none",
metadata.get("IHDR"));
+ assertEquals("true", metadata.get("Chroma BlackIsZero"));
+ assertEquals("year=2008, month=5, day=6, hour=6, minute=18,
second=47", metadata.get("Document ImageModificationTime"));
+ assertEquals("image/png", metadata.get("Content-Type"));
}
// TODO: Add TIFF support