Author: bob
Date: Mon Dec 28 23:22:46 2015
New Revision: 1722029
URL: http://svn.apache.org/viewvc?rev=1722029&view=rev
Log:
TIKA-1812 - Moving multimedia sources to module.
Added:
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/audio/
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/BPGParser.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/MetadataFields.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/PSDParser.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/WebPParser.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MpegStream.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/video/
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/BPGParserTest.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/PSDParserTest.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/WebPParserTest.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/xmp/
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/mp3/
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/mp3/MpegStreamTest.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/mp4/
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/video/
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java
Removed:
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/module/
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/module/
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/resources/hadoop.jpg
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/resources/testFLV.flv
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/resources/testMID.mid
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/resources/testMP3i18n.mp3
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/resources/tika.png
tika/branches/2.x/tika-parsers/src/main/java/org/apache/tika/parser/audio/
tika/branches/2.x/tika-parsers/src/main/java/org/apache/tika/parser/image/
tika/branches/2.x/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/
tika/branches/2.x/tika-parsers/src/main/java/org/apache/tika/parser/mp3/
tika/branches/2.x/tika-parsers/src/main/java/org/apache/tika/parser/mp4/
tika/branches/2.x/tika-parsers/src/main/java/org/apache/tika/parser/video/
tika/branches/2.x/tika-parsers/src/test/java/org/apache/tika/parser/audio/
tika/branches/2.x/tika-parsers/src/test/java/org/apache/tika/parser/image/
tika/branches/2.x/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/
tika/branches/2.x/tika-parsers/src/test/java/org/apache/tika/parser/mp3/
tika/branches/2.x/tika-parsers/src/test/java/org/apache/tika/parser/mp4/
tika/branches/2.x/tika-parsers/src/test/java/org/apache/tika/parser/video/
Modified:
tika/branches/2.x/tika-parser-modules/pom.xml
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/pom.xml
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
Modified: tika/branches/2.x/tika-parser-modules/pom.xml
URL:
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/pom.xml?rev=1722029&r1=1722028&r2=1722029&view=diff
==============================================================================
--- tika/branches/2.x/tika-parser-modules/pom.xml (original)
+++ tika/branches/2.x/tika-parser-modules/pom.xml Mon Dec 28 23:22:46 2015
@@ -33,59 +33,28 @@
<name>Apache Tika Parser Modules</name>
<url>http://tika.apache.org/</url>
- <modules>
+ <properties>
+ <poi.version>3.13</poi.version>
+ <!-- NOTE: sync codec version with POI -->
+ <codec.version>1.9</codec.version>
+ <pdfbox.version>1.8.10</pdfbox.version>
+ </properties>
+
+ <!-- <modules>
<module>tika-multimedia-module</module>
</modules>
-
+ -->
<dependencies>
- <!-- Optional OSGi dependencies, used only when running within OSGi -->
- <dependency>
- <groupId>org.osgi</groupId>
- <artifactId>org.osgi.core</artifactId>
- <scope>provided</scope>
- <optional>true</optional>
- </dependency>
- <dependency>
- <groupId>org.osgi</groupId>
- <artifactId>org.osgi.compendium</artifactId>
- <scope>provided</scope>
- <optional>true</optional>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parsers</artifactId>
- <version>${project.version}</version>
- <scope>provided</scope>
- </dependency>
<!-- Test dependencies -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
- <dependency>
- <groupId>org.ops4j.pax.exam</groupId>
- <artifactId>pax-exam-junit4</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.ops4j.pax.exam</groupId>
- <artifactId>pax-exam-container-native</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.felix</groupId>
- <artifactId>org.apache.felix.framework</artifactId>
- <scope>test</scope>
- </dependency>
<dependency>
- <groupId>org.ops4j.pax.exam</groupId>
- <artifactId>pax-exam-link-assembly</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.ops4j.pax.url</groupId>
- <artifactId>pax-url-aether</artifactId>
+ <groupId>org.mockito</groupId>
+ <artifactId>mockito-core</artifactId>
+ <version>1.7</version>
<scope>test</scope>
</dependency>
<dependency>
@@ -93,10 +62,37 @@
<artifactId>slf4j-simple</artifactId>
<scope>test</scope>
</dependency>
- <dependency>
- <groupId>javax.inject</groupId>
- <artifactId>javax.inject</artifactId>
- <scope>test</scope>
- </dependency>
</dependencies>
+ <build>
+ <pluginManagement>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ <version>2.10</version>
+ <executions>
+ <execution>
+ <id>unpack</id>
+ <phase>compile</phase>
+ <goals>
+ <goal>unpack</goal>
+ </goals>
+ <configuration>
+ <artifactItems>
+ <artifactItem>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-test</artifactId>
+ <version>${project.version}</version>
+ <type>jar</type>
+ <overWrite>true</overWrite>
+
<outputDirectory>${project.build.testOutputDirectory}</outputDirectory>
+ </artifactItem>
+ </artifactItems>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </pluginManagement>
+ </build>
</project>
\ No newline at end of file
Modified: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/pom.xml
URL:
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/pom.xml?rev=1722029&r1=1722028&r2=1722029&view=diff
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/pom.xml
(original)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/pom.xml Mon
Dec 28 23:22:46 2015
@@ -20,74 +20,91 @@
</parent>
<artifactId>tika-multimedia-module</artifactId>
- <packaging>bundle</packaging>
<name>Apache Tika Multimedia Module</name>
<url>http://tika.apache.org/</url>
+
+ <properties>
+ <metadata.extractor.version>2.8.0</metadata.extractor.version>
+ <isoparser.version>1.0.2</isoparser.version>
+ <commons.logging.version>1.1.3</commons.logging.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.drewnoakes</groupId>
+ <artifactId>metadata-extractor</artifactId>
+ <version>${metadata.extractor.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>${codec.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.poi</groupId>
+ <artifactId>poi</artifactId>
+ <version>${poi.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.poi</groupId>
+ <artifactId>poi-scratchpad</artifactId>
+ <version>${poi.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.poi</groupId>
+ <artifactId>poi-ooxml</artifactId>
+ <version>${poi.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>stax</groupId>
+ <artifactId>stax-api</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>xml-apis</groupId>
+ <artifactId>xml-apis</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>com.googlecode.mp4parser</groupId>
+ <artifactId>isoparser</artifactId>
+ <version>${isoparser.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>jempbox</artifactId>
+ <version>${pdfbox.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ <version>${commons.logging.version}</version>
+ </dependency>
+ </dependencies>
+
<build>
<plugins>
<plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <extensions>true</extensions>
- <configuration>
- <instructions>
-
<Bundle-Activator>org.apache.tika.module.multimedia.internal.Activator</Bundle-Activator>
- <_runsystempackages>com.sun.xml.bind.marshaller,
- com.sun.xml.internal.bind.marshaller</_runsystempackages>
- <Embed-Dependency>
- metadata-extractor,
- xmpcore,
- commons-codec,
- commons-io,
- jempbox,
- poi,
- isoparser,
- aspectjrt,
-
tika-parsers;inline=org/apache/tika/parser/image/**|org/apache/tika/parser/jpeg/**|org/apache/tika/parser/ocr/**|org/apache/tika/parser/audio/**|org/apache/tika/parser/video/**|org/apache/tika/parser/mp3/**|org/apache/tika/parser/mp4/**
- </Embed-Dependency>
- <Embed-Transitive>true</Embed-Transitive>
- <Export-Package>
- org.apache.tika.parser.image.*,
- org.apache.tika.parser.jpeg.*,
- org.apache.tika.parser.ocr.*,
- org.apache.tika.parser.audio.*,
- org.apache.tika.parser.video.*,
- org.apache.tika.parser.mp3.*,
- org.apache.tika.parser.mp4.*
- </Export-Package>
- <Import-Package>
- *,
- com.adobe.xmp;resolution:=optional,
- com.adobe.xmp.properties;resolution:=optional,
- android.util;resolution:=optional
- </Import-Package>
- </instructions>
- </configuration>
- </plugin>
- <plugin>
- <artifactId>maven-failsafe-plugin</artifactId>
- <executions>
- <execution>
- <goals>
- <goal>integration-test</goal>
- <goal>verify</goal>
- </goals>
- </execution>
- </executions>
- <configuration>
- <systemPropertyVariables>
- <org.ops4j.pax.logging.DefaultServiceLog.level>
- WARN
- </org.ops4j.pax.logging.DefaultServiceLog.level>
- </systemPropertyVariables>
- <systemProperties>
- <property>
- <name>project.bundle.file</name>
- <value>target/${project.build.finalName}.jar</value>
- </property>
- </systemProperties>
- </configuration>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
</plugin>
</plugins>
</build>
Added:
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java
URL:
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java?rev=1722029&view=auto
==============================================================================
---
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java
(added)
+++
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java
Mon Dec 28 23:22:46 2015
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import javax.sound.sampled.AudioFileFormat;
+import javax.sound.sampled.AudioFileFormat.Type;
+import javax.sound.sampled.AudioFormat;
+import javax.sound.sampled.AudioSystem;
+import javax.sound.sampled.UnsupportedAudioFileException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.XMPDM;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class AudioParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -6015684081240882695L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.audio("basic"),
+ MediaType.audio("x-wav"),
+ MediaType.audio("x-aiff"))));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ // AudioSystem expects the stream to support the mark feature
+ if (!stream.markSupported()) {
+ stream = new BufferedInputStream(stream);
+ }
+ try {
+ AudioFileFormat fileFormat =
AudioSystem.getAudioFileFormat(stream);
+ Type type = fileFormat.getType();
+ if (type == Type.AIFC || type == Type.AIFF) {
+ metadata.set(Metadata.CONTENT_TYPE, "audio/x-aiff");
+ } else if (type == Type.AU || type == Type.SND) {
+ metadata.set(Metadata.CONTENT_TYPE, "audio/basic");
+ } else if (type == Type.WAVE) {
+ metadata.set(Metadata.CONTENT_TYPE, "audio/x-wav");
+ }
+
+ AudioFormat audioFormat = fileFormat.getFormat();
+ int channels = audioFormat.getChannels();
+ if (channels != AudioSystem.NOT_SPECIFIED) {
+ metadata.set("channels", String.valueOf(channels));
+ // TODO: Use XMPDM.TRACKS? (see also frame rate in AudioFormat)
+ }
+ float rate = audioFormat.getSampleRate();
+ if (rate != AudioSystem.NOT_SPECIFIED) {
+ metadata.set("samplerate", String.valueOf(rate));
+ metadata.set(
+ XMPDM.AUDIO_SAMPLE_RATE,
+ Integer.toString((int) rate));
+ }
+ int bits = audioFormat.getSampleSizeInBits();
+ if (bits != AudioSystem.NOT_SPECIFIED) {
+ metadata.set("bits", String.valueOf(bits));
+ if (bits == 8) {
+ metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "8Int");
+ } else if (bits == 16) {
+ metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "16Int");
+ } else if (bits == 32) {
+ metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "32Int");
+ }
+ }
+ metadata.set("encoding", audioFormat.getEncoding().toString());
+
+ // Javadoc suggests that some of the following properties might
+ // be available, but I had no success in finding any:
+
+ // "duration" Long playback duration of the file in microseconds
+ // "author" String name of the author of this file
+ // "title" String title of this file
+ // "copyright" String copyright message
+ // "date" Date date of the recording or release
+ // "comment" String an arbitrary text
+
+ addMetadata(metadata, fileFormat.properties());
+ addMetadata(metadata, audioFormat.properties());
+ } catch (UnsupportedAudioFileException e) {
+ // There is no way to know whether this exception was
+ // caused by the document being corrupted or by the format
+ // just being unsupported. So we do nothing.
+ }
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.endDocument();
+ }
+
+ private void addMetadata(Metadata metadata, Map<String, Object>
properties) {
+ if (properties != null) {
+ for (Entry<String, Object> entry : properties.entrySet()) {
+ Object value = entry.getValue();
+ if (value != null) {
+ metadata.set(entry.getKey(), value.toString());
+ }
+ }
+ }
+ }
+
+}
Added:
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java
URL:
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java?rev=1722029&view=auto
==============================================================================
---
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java
(added)
+++
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java
Mon Dec 28 23:22:46 2015
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import javax.sound.midi.InvalidMidiDataException;
+import javax.sound.midi.MetaMessage;
+import javax.sound.midi.MidiMessage;
+import javax.sound.midi.MidiSystem;
+import javax.sound.midi.Patch;
+import javax.sound.midi.Sequence;
+import javax.sound.midi.Track;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
+/**
+ * Parser for MIDI files. The sequence is loaded through
+ * {@code javax.sound.midi}; track and patch counts and the division type
+ * are recorded as metadata, and the text meta events of each track are
+ * written into one paragraph per track.
+ */
+public class MidiParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 6343278584336189432L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("x-midi"),
+                    MediaType.audio("midi"))));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Sets the content type to {@code audio/midi}, then tries to read a
+     * MIDI sequence from the stream. Data that the MIDI system rejects as
+     * invalid produces a document with no paragraphs and no further
+     * metadata rather than an error.
+     *
+     * @throws IOException if reading the stream fails
+     * @throws SAXException if the content handler reports a failure
+     * @throws TikaException declared by the parser contract
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        metadata.set(Metadata.CONTENT_TYPE, "audio/midi");
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // MidiSystem expects the stream to support the mark feature
+        InputStream buffered = new BufferedInputStream(stream);
+        try {
+            Sequence sequence = MidiSystem.getSequence(buffered);
+
+            Track[] tracks = sequence.getTracks();
+            metadata.set("tracks", String.valueOf(tracks.length));
+            // TODO: Use XMPDM.TRACKS?
+
+            Patch[] patches = sequence.getPatchList();
+            metadata.set("patches", String.valueOf(patches.length));
+
+            float type = sequence.getDivisionType();
+            if (type == Sequence.PPQ) {
+                metadata.set("divisionType", "PPQ");
+            } else if (type == Sequence.SMPTE_24) {
+                metadata.set("divisionType", "SMPTE_24");
+            } else if (type == Sequence.SMPTE_25) {
+                metadata.set("divisionType", "SMPTE_25");
+            } else if (type == Sequence.SMPTE_30) {
+                metadata.set("divisionType", "SMPTE_30");
+            } else if (type == Sequence.SMPTE_30DROP) {
+                metadata.set("divisionType", "SMPTE_30DROP");
+            } else {
+                // Unknown division type: record the raw numeric value.
+                // (This branch used to re-test SMPTE_24 and so could
+                // never be reached.)
+                metadata.set("divisionType", String.valueOf(type));
+            }
+
+            for (Track track : tracks) {
+                xhtml.startElement("p");
+                for (int i = 0; i < track.size(); i++) {
+                    MidiMessage message = track.get(i).getMessage();
+                    if (message instanceof MetaMessage) {
+                        MetaMessage meta = (MetaMessage) message;
+                        // Types 1-15 are reserved for text events
+                        if (meta.getType() >= 1 && meta.getType() <= 15) {
+                            // FIXME: What's the encoding?
+                            xhtml.characters(
+                                    new String(meta.getData(), ISO_8859_1));
+                        }
+                    }
+                }
+                xhtml.endElement("p");
+            }
+        } catch (InvalidMidiDataException ignore) {
+            // There is no way to know whether this exception was
+            // caused by the document being corrupted or by the format
+            // just being unsupported. So we do nothing.
+        }
+
+        xhtml.endDocument();
+    }
+
+}
Added:
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/BPGParser.java
URL:
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/BPGParser.java?rev=1722029&view=auto
==============================================================================
---
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/BPGParser.java
(added)
+++
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/BPGParser.java
Mon Dec 28 23:22:46 2015
@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Photoshop;
+import org.apache.tika.metadata.TIFF;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for the Better Portable Graphics (BPG) File Format.
+ * <p>
+ * Documentation on the file format is available from
+ * http://bellard.org/bpg/bpg_spec.txt
+ */
+public class BPGParser extends AbstractParser {
+    protected static final int EXTENSION_TAG_EXIF = 1;
+    protected static final int EXTENSION_TAG_ICC_PROFILE = 2;
+    protected static final int EXTENSION_TAG_XMP = 3;
+    protected static final int EXTENSION_TAG_THUMBNAIL = 4;
+    private static final long serialVersionUID = -161736541253892772L;
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.image("x-bpg"), MediaType.image("bpg"))));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Reads the BPG header and records bit depth, colour mode, width and
+     * height as metadata, then walks the extension data (if flagged) and
+     * hands EXIF and XMP extensions to an {@link ImageMetadataExtractor}.
+     * No text content is written to the XHTML document.
+     *
+     * @throws IOException if the stream fails or ends while extension
+     *         data is being skipped
+     * @throws SAXException if the content handler reports a failure
+     * @throws TikaException if the magic signature is wrong, the header is
+     *         truncated, or an extension declares an impossible length
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Check for the magic header signature
+        byte[] signature = new byte[4];
+        IOUtils.readFully(stream, signature);
+        boolean signatureFound =
+                signature[0] == (byte) 'B' && signature[1] == (byte) 'P' &&
+                signature[2] == (byte) 'G' && signature[3] == (byte) 0xfb;
+        if (!signatureFound) {
+            throw new TikaException("BPG magic signature invalid");
+        }
+
+        // Grab and decode the first byte
+        // NOTE(review): the format specification describes header fields as
+        // most-significant-bit first; the masks below treat the first-listed
+        // field as the low bits. Confirm the bit order against the spec.
+        int pdf = readHeaderByte(stream);
+
+        // Pixel format: Greyscale / 4:2:0 / 4:2:2 / 4:4:4 is (pdf & 0x7)
+        // TODO Identify a suitable metadata key for this
+
+        // Is there an alpha plane as well as a colour plane?
+        boolean hasAlphaPlane1 = (pdf & 0x8) == 0x8;
+        // TODO Identify a suitable metadata key for this+hasAlphaPlane2
+
+        // Bit depth minus 8
+        int bitDepth = (pdf >> 4) + 8;
+        metadata.set(TIFF.BITS_PER_SAMPLE, Integer.toString(bitDepth));
+
+        // Grab and decode the second byte
+        int cer = readHeaderByte(stream);
+
+        // Colour Space: YCbCr / RGB / YCgCo / YCbCrK / CMYK
+        // NOTE(review): 0x15 is binary 10101, so it overlaps the extensions
+        // flag (16) tested below and can never produce 2 or 3. It looks like
+        // decimal 15 (0x0F) was intended. Left unchanged here because it is
+        // tied to the bit-order question above - confirm both together.
+        int colourSpace = cer & 0x15;
+        switch (colourSpace) {
+            case 0:
+                metadata.set(Photoshop.COLOR_MODE, "YCbCr Colour");
+                break;
+            case 1:
+                metadata.set(Photoshop.COLOR_MODE, "RGB Colour");
+                break;
+            case 2:
+                metadata.set(Photoshop.COLOR_MODE, "YCgCo Colour");
+                break;
+            case 3:
+                metadata.set(Photoshop.COLOR_MODE, "YCbCrK Colour");
+                break;
+            case 4:
+                metadata.set(Photoshop.COLOR_MODE, "CMYK Colour");
+                break;
+            default:
+                // Unrecognised value: no colour mode is recorded
+                break;
+        }
+
+        // Are there extensions or not?
+        boolean hasExtensions = (cer & 16) == 16;
+
+        // Is the Alpha Plane 2 flag set?
+        boolean hasAlphaPlane2 = (cer & 32) == 32;
+
+        // cer then holds 2 more booleans - limited range, reserved
+
+        // Width and height next
+        int width = (int) EndianUtils.readUE7(stream);
+        int height = (int) EndianUtils.readUE7(stream);
+        metadata.set(TIFF.IMAGE_LENGTH, height);
+        metadata.set(TIFF.IMAGE_WIDTH, width);
+
+        // Picture Data length (value not needed, but must be consumed)
+        EndianUtils.readUE7(stream);
+
+        // Extension Data Length, if extensions present
+        long extensionDataLength = 0;
+        if (hasExtensions) {
+            extensionDataLength = EndianUtils.readUE7(stream);
+        }
+
+        // Alpha Data Length, if alpha used (value not needed, but must be
+        // consumed so the stream is positioned at the extension data)
+        if (hasAlphaPlane1 || hasAlphaPlane2) {
+            EndianUtils.readUE7(stream);
+        }
+
+        // Extension Data
+        if (hasExtensions) {
+            long extensionsDataSeen = 0;
+            ImageMetadataExtractor metadataExtractor =
+                    new ImageMetadataExtractor(metadata);
+
+            // NOTE(review): only payload bytes are counted towards
+            // extensionDataLength, not the tag/length prefixes read below;
+            // confirm against the spec whether the declared length includes
+            // them. Also confirm readUE7 fails at end of stream, otherwise a
+            // truncated file could keep yielding zero-length extensions.
+            while (extensionsDataSeen < extensionDataLength) {
+                int extensionType = (int) EndianUtils.readUE7(stream);
+                long declaredLength = EndianUtils.readUE7(stream);
+
+                // A single extension cannot be negative, exceed what is left
+                // of the declared extension data, or overflow an int; reject
+                // it before it is used as an array size or skip count.
+                long remaining = extensionDataLength - extensionsDataSeen;
+                if (declaredLength < 0 || declaredLength > remaining
+                        || declaredLength > Integer.MAX_VALUE) {
+                    throw new TikaException(
+                            "BPG extension length invalid: " + declaredLength);
+                }
+                int extensionLength = (int) declaredLength;
+
+                switch (extensionType) {
+                    case EXTENSION_TAG_EXIF:
+                        metadataExtractor.parseRawExif(
+                                stream, extensionLength, true);
+                        break;
+                    case EXTENSION_TAG_XMP:
+                        handleXMP(stream, extensionLength, metadataExtractor);
+                        break;
+                    default:
+                        skipFully(stream, extensionLength);
+                        break;
+                }
+                extensionsDataSeen += extensionLength;
+            }
+        }
+
+        // HEVC Header + Data
+        // Alpha HEVC Header + Data
+        // We can't do anything with these parts
+
+        // We don't have any helpful text, sorry...
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+    /**
+     * Reads {@code xmpLength} bytes from the stream and passes them to the
+     * extractor as a raw XMP packet.
+     *
+     * @throws TikaException if {@code xmpLength} is negative
+     */
+    protected void handleXMP(InputStream stream, int xmpLength,
+            ImageMetadataExtractor extractor)
+            throws IOException, TikaException, SAXException {
+        if (xmpLength < 0) {
+            throw new TikaException("BPG XMP length invalid: " + xmpLength);
+        }
+        byte[] xmp = new byte[xmpLength];
+        IOUtils.readFully(stream, xmp);
+        extractor.parseRawXMP(xmp);
+    }
+
+    /** Reads one header byte, failing if the stream has already ended. */
+    private static int readHeaderByte(InputStream stream)
+            throws IOException, TikaException {
+        int value = stream.read();
+        if (value < 0) {
+            throw new TikaException("BPG header truncated");
+        }
+        return value;
+    }
+
+    /**
+     * Skips exactly {@code count} bytes. {@link InputStream#skip(long)} may
+     * skip fewer bytes than asked (even zero) without being at end of
+     * stream, so a single-byte read is used to tell the two cases apart.
+     */
+    private static void skipFully(InputStream stream, long count)
+            throws IOException {
+        long remaining = count;
+        while (remaining > 0) {
+            long skipped = stream.skip(remaining);
+            if (skipped > 0) {
+                remaining -= skipped;
+            } else if (stream.read() >= 0) {
+                remaining--;
+            } else {
+                throw new IOException(
+                        "Unexpected end of BPG stream while skipping "
+                        + "extension data");
+            }
+        }
+    }
+}
Added:
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
URL:
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java?rev=1722029&view=auto
==============================================================================
---
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
(added)
+++
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
Mon Dec 28 23:22:46 2015
@@ -0,0 +1,548 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.DecimalFormat;
+import java.text.DecimalFormatSymbols;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.drew.imaging.jpeg.JpegMetadataReader;
+import com.drew.imaging.jpeg.JpegProcessingException;
+import com.drew.imaging.riff.RiffProcessingException;
+import com.drew.imaging.tiff.TiffMetadataReader;
+import com.drew.imaging.tiff.TiffProcessingException;
+import com.drew.imaging.webp.WebpMetadataReader;
+import com.drew.lang.ByteArrayReader;
+import com.drew.lang.GeoLocation;
+import com.drew.lang.Rational;
+import com.drew.metadata.Directory;
+import com.drew.metadata.MetadataException;
+import com.drew.metadata.Tag;
+import com.drew.metadata.exif.ExifIFD0Directory;
+import com.drew.metadata.exif.ExifReader;
+import com.drew.metadata.exif.ExifSubIFDDirectory;
+import com.drew.metadata.exif.ExifThumbnailDirectory;
+import com.drew.metadata.exif.GpsDirectory;
+import com.drew.metadata.iptc.IptcDirectory;
+import com.drew.metadata.jpeg.JpegCommentDirectory;
+import com.drew.metadata.jpeg.JpegDirectory;
+import com.drew.metadata.xmp.XmpReader;
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.IPTC;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.xml.sax.SAXException;
+
+/**
+ * Uses the <a href="http://www.drewnoakes.com/code/exif/">Metadata
Extractor</a> library
+ * to read EXIF and IPTC image metadata and map to Tika fields.
+ * <p/>
+ * As of 2.4.0 the library supports jpeg and tiff.
+ * As of 2.8.0 the library supports webp.
+ */
+public class ImageMetadataExtractor {
+
+ private static final String GEO_DECIMAL_FORMAT_STRING = "#.######"; // 6
dp seems to be reasonable
+ private final Metadata metadata;
+ private DirectoryHandler[] handlers;
+
+ /**
+ * Creates an extractor that uses the default set of directory handlers.
+ *
+ * @param metadata to extract to, using default directory handlers
+ */
+ public ImageMetadataExtractor(Metadata metadata) {
+ // Handlers are applied to each directory in the order given here
+ this(metadata,
+ new CopyUnknownFieldsHandler(),
+ new JpegCommentHandler(),
+ new ExifHandler(),
+ new DimensionsHandler(),
+ new GeotagHandler(),
+ new IptcHandler()
+ );
+ }
+
+ /**
+ * Creates an extractor that uses the given directory handlers.
+ *
+ * @param metadata to extract to
+ * @param handlers handlers in order, note that handlers may override
+ * values from earlier handlers
+ */
+ public ImageMetadataExtractor(Metadata metadata, DirectoryHandler...
handlers) {
+ this.metadata = metadata;
+ // The varargs array is stored as passed in, without copying
+ this.handlers = handlers;
+ }
+
+ /**
+  * Cuts a value such as "100 pixels" down to "100".
+  *
+  * @param s value to trim, may be null
+  * @return the text before the last " pixels", the unchanged input if it
+  *         does not contain " pixels", or null for a null input
+  */
+ private static String trimPixels(String s) {
+     if (s == null) {
+         return null;
+     }
+     int i = s.lastIndexOf(" pixels");
+     // lastIndexOf returns -1 when the suffix is absent; substring(0, -1)
+     // would throw StringIndexOutOfBoundsException, so only cut on a match
+     return (i >= 0) ? s.substring(0, i) : s;
+ }
+
+ /**
+  * Reads the metadata of a JPEG file and copies it to the Tika metadata
+  * using the registered handlers.
+  *
+  * @param file JPEG file to read
+  * @throws TikaException if Metadata Extractor cannot read or process the file
+  */
+ public void parseJpeg(File file)
+         throws IOException, SAXException, TikaException {
+     try {
+         handle(JpegMetadataReader.readMetadata(file));
+     } catch (JpegProcessingException | MetadataException e) {
+         throw new TikaException("Can't read JPEG metadata", e);
+     }
+ }
+
+ /**
+  * Reads the metadata of a TIFF file and copies it to the Tika metadata
+  * using the registered handlers.
+  *
+  * @param file TIFF file to read
+  * @throws TikaException if Metadata Extractor cannot read or process the file
+  */
+ public void parseTiff(File file)
+         throws IOException, SAXException, TikaException {
+     try {
+         handle(TiffMetadataReader.readMetadata(file));
+     } catch (MetadataException | TiffProcessingException e) {
+         throw new TikaException("Can't read TIFF metadata", e);
+     }
+ }
+
+ /**
+  * Reads the metadata of a WebP file and copies it to the Tika metadata
+  * using the registered handlers.
+  *
+  * @param file WebP file to read
+  * @throws IOException   if the file cannot be read
+  * @throws TikaException if Metadata Extractor cannot process the file
+  */
+ public void parseWebP(File file) throws IOException, TikaException {
+     try {
+         // No placeholder Metadata instance is needed: the reader creates
+         // its own. IOException is declared on this method, so it simply
+         // propagates without a catch-and-rethrow.
+         com.drew.metadata.Metadata webPMetadata = WebpMetadataReader.readMetadata(file);
+         handle(webPMetadata);
+     } catch (RiffProcessingException | MetadataException e) {
+         throw new TikaException("Can't process Riff data", e);
+     }
+ }
+
+ public void parseRawExif(InputStream stream, int length, boolean
needsExifHeader)
+ throws IOException, SAXException, TikaException {
+ byte[] exif;
+ if (needsExifHeader) {
+ exif = new byte[length + 6];
+ exif[0] = (byte) 'E';
+ exif[1] = (byte) 'x';
+ exif[2] = (byte) 'i';
+ exif[3] = (byte) 'f';
+ IOUtils.readFully(stream, exif, 6, length);
+ } else {
+ exif = new byte[length];
+ IOUtils.readFully(stream, exif, 0, length);
+ }
+ parseRawExif(exif);
+ }
+
+ /**
+  * Processes raw EXIF data held in memory. Reading starts after the first
+  * {@code ExifReader.JPEG_SEGMENT_PREAMBLE.length()} bytes of the array.
+  *
+  * @param exifData EXIF bytes
+  * @throws TikaException if the extracted directories cannot be handled
+  */
+ public void parseRawExif(byte[] exifData)
+         throws IOException, SAXException, TikaException {
+     final int preambleLength = ExifReader.JPEG_SEGMENT_PREAMBLE.length();
+     final com.drew.metadata.Metadata exifMetadata = new com.drew.metadata.Metadata();
+     new ExifReader().extract(new ByteArrayReader(exifData), exifMetadata, preambleLength);
+
+     try {
+         handle(exifMetadata);
+     } catch (MetadataException e) {
+         throw new TikaException("Can't process the EXIF Data", e);
+     }
+ }
+
+ /**
+  * Processes raw XMP data held in memory.
+  *
+  * @param xmpData XMP bytes
+  * @throws TikaException if the extracted directories cannot be handled
+  */
+ public void parseRawXMP(byte[] xmpData)
+         throws IOException, SAXException, TikaException {
+     final com.drew.metadata.Metadata xmpMetadata = new com.drew.metadata.Metadata();
+     new XmpReader().extract(xmpData, xmpMetadata);
+
+     try {
+         handle(xmpMetadata);
+     } catch (MetadataException e) {
+         throw new TikaException("Can't process the XMP Data", e);
+     }
+ }
+
+ /**
+  * Passes every directory of the given Metadata Extractor result to the
+  * registered handlers, which copy the tags to the tika metadata.
+  *
+  * @param metadataExtractor Tag directories from a Metadata Extractor "reader"
+  * @throws MetadataException This method does not handle exceptions from
+  *                           Metadata Extractor
+  */
+ protected void handle(com.drew.metadata.Metadata metadataExtractor)
+         throws MetadataException {
+     Iterator<Directory> allDirectories = metadataExtractor.getDirectories().iterator();
+     handle(allDirectories);
+ }
+
+ /**
+ * Copies extracted tags to tika metadata using registered handlers.
+ *
+ * @param directories Metadata Extractor {@link
com.drew.metadata.Directory} instances.
+ * @throws MetadataException This method does not handle exceptions from
Metadata Extractor
+ */
+ protected void handle(Iterator<Directory> directories) throws
MetadataException {
+ while (directories.hasNext()) {
+ Directory directory = directories.next();
+ for (DirectoryHandler handler : handlers) {
+ if (handler.supports(directory.getClass())) {
+ handler.handle(directory, metadata);
+ }
+ }
+ }
+ }
+
+ /**
+ * Reads one or more type of Metadata Extractor fields.
+ * A handler is only passed the directories whose class it reports as
+ * supported.
+ */
+ static interface DirectoryHandler {
+ /**
+ * @param directoryType A Metadata Extractor directory class
+ * @return true if the directory type is supported by this handler
+ */
+ boolean supports(Class<? extends Directory> directoryType);
+
+ /**
+ * @param directory extracted tags
+ * @param metadata current tika metadata
+ * @throws MetadataException typically field extraction error, aborts
+ * all further extraction
+ */
+ void handle(Directory directory, Metadata metadata)
+ throws MetadataException;
+ }
+
+ /**
+  * Mimics the behavior from TIKA-314 of copying all extracted tags
+  * to tika metadata using field names from Metadata Extractor.
+  */
+ static class CopyAllFieldsHandler implements DirectoryHandler {
+     /** Accepts every directory type. */
+     @Override
+     public boolean supports(Class<? extends Directory> directoryType) {
+         return true;
+     }
+
+     /** Sets each tag's description under the tag's own name. */
+     @Override
+     public void handle(Directory directory, Metadata metadata)
+             throws MetadataException {
+         if (directory.getTags() == null) {
+             return;
+         }
+         for (Tag current : directory.getTags()) {
+             String fieldName = current.getTagName();
+             String fieldValue = current.getDescription();
+             metadata.set(fieldName, fieldValue);
+         }
+     }
+ }
+
+ /**
+  * Copies all fields regardless of directory, if the tag name
+  * is not identical to a known Metadata field name.
+  * This leads to more predictable behavior than {@link CopyAllFieldsHandler}.
+  */
+ static class CopyUnknownFieldsHandler implements DirectoryHandler {
+     /** Accepts every directory type. */
+     @Override
+     public boolean supports(Class<? extends Directory> directoryType) {
+         return true;
+     }
+
+     /**
+      * Copies the trimmed description of every tag whose name is not a known
+      * Metadata field; "true"/"false" in any letter case are written in
+      * lower case.
+      */
+     @Override
+     public void handle(Directory directory, Metadata metadata)
+             throws MetadataException {
+         if (directory.getTags() == null) {
+             return;
+         }
+         final String trueText = Boolean.TRUE.toString();
+         final String falseText = Boolean.FALSE.toString();
+         for (Tag current : directory.getTags()) {
+             String name = current.getTagName();
+             if (MetadataFields.isMetadataField(name)) {
+                 continue;
+             }
+             String description = current.getDescription();
+             if (description == null) {
+                 continue;
+             }
+             String value = description.trim();
+             if (trueText.equalsIgnoreCase(value)) {
+                 value = trueText;
+             } else if (falseText.equalsIgnoreCase(value)) {
+                 value = falseText;
+             }
+             metadata.set(name, value);
+         }
+     }
+ }
+
+ /**
+  * Basic image properties for TIFF and JPEG, at least.
+  */
+ static class DimensionsHandler implements DirectoryHandler {
+     // Matches values that start with digits, e.g. "100 pixels" -> "100".
+     // Compiled once for the class rather than once per handler instance.
+     private static final Pattern LEADING_NUMBERS = Pattern.compile("(\\d+)\\s*.*");
+
+     public boolean supports(Class<? extends Directory> directoryType) {
+         return directoryType == JpegDirectory.class ||
+                 directoryType == ExifSubIFDDirectory.class ||
+                 directoryType == ExifThumbnailDirectory.class ||
+                 directoryType == ExifIFD0Directory.class;
+     }
+
+     public void handle(Directory directory, Metadata metadata) throws MetadataException {
+         // The test TIFF has width and height stored as follows according to exiv2
+         //Exif.Image.ImageWidth Short 1 100
+         //Exif.Image.ImageLength Short 1 75
+         // and the values are found in "Thumbnail Image Width" (and Height) from Metadata Extractor
+         set(directory, metadata, JpegDirectory.TAG_IMAGE_WIDTH, Metadata.IMAGE_WIDTH);
+         set(directory, metadata, JpegDirectory.TAG_IMAGE_HEIGHT, Metadata.IMAGE_LENGTH);
+         // Bits per sample, two methods of extracting, exif overrides jpeg
+         set(directory, metadata, JpegDirectory.TAG_DATA_PRECISION, Metadata.BITS_PER_SAMPLE);
+         set(directory, metadata, ExifSubIFDDirectory.TAG_BITS_PER_SAMPLE, Metadata.BITS_PER_SAMPLE);
+         // Straightforward
+         set(directory, metadata, ExifSubIFDDirectory.TAG_SAMPLES_PER_PIXEL, Metadata.SAMPLES_PER_PIXEL);
+     }
+
+     /**
+      * Copies the leading digits of a tag's string value to a metadata
+      * field. Nothing is set if the tag is absent, has no string value or
+      * does not start with a digit.
+      */
+     private void set(Directory directory, Metadata metadata, int extractTag, Property metadataField) {
+         if (!directory.containsTag(extractTag)) {
+             return;
+         }
+         String raw = directory.getString(extractTag);
+         if (raw == null) {
+             // Pattern.matcher(null) would throw NullPointerException
+             return;
+         }
+         Matcher m = LEADING_NUMBERS.matcher(raw);
+         if (m.matches()) {
+             metadata.set(metadataField, m.group(1));
+         }
+     }
+ }
+
+ static class JpegCommentHandler implements DirectoryHandler {
+ public boolean supports(Class<? extends Directory> directoryType) {
+ return directoryType == JpegCommentDirectory.class;
+ }
+
+ public void handle(Directory directory, Metadata metadata) throws
MetadataException {
+ if (directory.containsTag(JpegCommentDirectory.TAG_COMMENT)) {
+ metadata.add(TikaCoreProperties.COMMENTS,
directory.getString(JpegCommentDirectory.TAG_COMMENT));
+ }
+ }
+ }
+
+ static class ExifHandler implements DirectoryHandler {
+ // There's a new ExifHandler for each file processed, so this is
thread safe
+ private static final ThreadLocal<SimpleDateFormat> DATE_UNSPECIFIED_TZ
= new ThreadLocal<SimpleDateFormat>() {
+ @Override
+ protected SimpleDateFormat initialValue() {
+ return new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss",
Locale.US);
+ }
+ };
+
+ public boolean supports(Class<? extends Directory> directoryType) {
+ return directoryType == ExifIFD0Directory.class ||
+ directoryType == ExifSubIFDDirectory.class;
+ }
+
+ public void handle(Directory directory, Metadata metadata) {
+ try {
+ handleDateTags(directory, metadata);
+ handlePhotoTags(directory, metadata);
+ handleCommentTags(directory, metadata);
+ } catch (MetadataException e) {
+ // ignore date parse errors and proceed with other tags
+ }
+ }
+
+ /**
+ * EXIF may contain image description, although with undefined
encoding.
+ * Use IPTC for other annotation fields, and XMP for unicode support.
+ */
+ public void handleCommentTags(Directory directory, Metadata metadata) {
+ if (metadata.get(TikaCoreProperties.DESCRIPTION) == null &&
+
directory.containsTag(ExifIFD0Directory.TAG_IMAGE_DESCRIPTION)) {
+ metadata.set(TikaCoreProperties.DESCRIPTION,
+
directory.getString(ExifIFD0Directory.TAG_IMAGE_DESCRIPTION));
+ }
+ }
+
+ /**
+ * Maps common TIFF and EXIF tags onto the Tika
+ * TIFF image metadata namespace.
+ */
+ public void handlePhotoTags(Directory directory, Metadata metadata) {
+ if (directory.containsTag(ExifSubIFDDirectory.TAG_EXPOSURE_TIME)) {
+ Object exposure =
directory.getObject(ExifSubIFDDirectory.TAG_EXPOSURE_TIME);
+ if (exposure instanceof Rational) {
+ metadata.set(Metadata.EXPOSURE_TIME, ((Rational)
exposure).doubleValue());
+ } else {
+ metadata.set(Metadata.EXPOSURE_TIME,
directory.getString(ExifSubIFDDirectory.TAG_EXPOSURE_TIME));
+ }
+ }
+
+ if (directory.containsTag(ExifSubIFDDirectory.TAG_FLASH)) {
+ String flash =
directory.getDescription(ExifSubIFDDirectory.TAG_FLASH);
+ if (flash.contains("Flash fired")) {
+ metadata.set(Metadata.FLASH_FIRED,
Boolean.TRUE.toString());
+ } else if (flash.contains("Flash did not fire")) {
+ metadata.set(Metadata.FLASH_FIRED,
Boolean.FALSE.toString());
+ } else {
+ metadata.set(Metadata.FLASH_FIRED, flash);
+ }
+ }
+
+ if (directory.containsTag(ExifSubIFDDirectory.TAG_FNUMBER)) {
+ Object fnumber =
directory.getObject(ExifSubIFDDirectory.TAG_FNUMBER);
+ if (fnumber instanceof Rational) {
+ metadata.set(Metadata.F_NUMBER, ((Rational)
fnumber).doubleValue());
+ } else {
+ metadata.set(Metadata.F_NUMBER,
directory.getString(ExifSubIFDDirectory.TAG_FNUMBER));
+ }
+ }
+
+ if (directory.containsTag(ExifSubIFDDirectory.TAG_FOCAL_LENGTH)) {
+ Object length =
directory.getObject(ExifSubIFDDirectory.TAG_FOCAL_LENGTH);
+ if (length instanceof Rational) {
+ metadata.set(Metadata.FOCAL_LENGTH, ((Rational)
length).doubleValue());
+ } else {
+ metadata.set(Metadata.FOCAL_LENGTH,
directory.getString(ExifSubIFDDirectory.TAG_FOCAL_LENGTH));
+ }
+ }
+
+ if (directory.containsTag(ExifSubIFDDirectory.TAG_ISO_EQUIVALENT))
{
+ metadata.set(Metadata.ISO_SPEED_RATINGS,
directory.getString(ExifSubIFDDirectory.TAG_ISO_EQUIVALENT));
+ }
+
+ if (directory.containsTag(ExifIFD0Directory.TAG_MAKE)) {
+ metadata.set(Metadata.EQUIPMENT_MAKE,
directory.getString(ExifIFD0Directory.TAG_MAKE));
+ }
+ if (directory.containsTag(ExifIFD0Directory.TAG_MODEL)) {
+ metadata.set(Metadata.EQUIPMENT_MODEL,
directory.getString(ExifIFD0Directory.TAG_MODEL));
+ }
+
+ if (directory.containsTag(ExifIFD0Directory.TAG_ORIENTATION)) {
+ Object length =
directory.getObject(ExifIFD0Directory.TAG_ORIENTATION);
+ if (length instanceof Integer) {
+ metadata.set(Metadata.ORIENTATION,
Integer.toString((Integer) length));
+ } else {
+ metadata.set(Metadata.ORIENTATION,
directory.getString(ExifIFD0Directory.TAG_ORIENTATION));
+ }
+ }
+
+ if (directory.containsTag(ExifIFD0Directory.TAG_SOFTWARE)) {
+ metadata.set(Metadata.SOFTWARE,
directory.getString(ExifIFD0Directory.TAG_SOFTWARE));
+ }
+
+ if (directory.containsTag(ExifIFD0Directory.TAG_X_RESOLUTION)) {
+ Object resolution =
directory.getObject(ExifIFD0Directory.TAG_X_RESOLUTION);
+ if (resolution instanceof Rational) {
+ metadata.set(Metadata.RESOLUTION_HORIZONTAL, ((Rational)
resolution).doubleValue());
+ } else {
+ metadata.set(Metadata.RESOLUTION_HORIZONTAL,
directory.getString(ExifIFD0Directory.TAG_X_RESOLUTION));
+ }
+ }
+ if (directory.containsTag(ExifIFD0Directory.TAG_Y_RESOLUTION)) {
+ Object resolution =
directory.getObject(ExifIFD0Directory.TAG_Y_RESOLUTION);
+ if (resolution instanceof Rational) {
+ metadata.set(Metadata.RESOLUTION_VERTICAL, ((Rational)
resolution).doubleValue());
+ } else {
+ metadata.set(Metadata.RESOLUTION_VERTICAL,
directory.getString(ExifIFD0Directory.TAG_Y_RESOLUTION));
+ }
+ }
+ if (directory.containsTag(ExifIFD0Directory.TAG_RESOLUTION_UNIT)) {
+ metadata.set(Metadata.RESOLUTION_UNIT,
directory.getDescription(ExifIFD0Directory.TAG_RESOLUTION_UNIT));
+ }
+ if (directory.containsTag(ExifThumbnailDirectory.TAG_IMAGE_WIDTH))
{
+ metadata.set(Metadata.IMAGE_WIDTH,
+
trimPixels(directory.getDescription(ExifThumbnailDirectory.TAG_IMAGE_WIDTH)));
+ }
+ if
(directory.containsTag(ExifThumbnailDirectory.TAG_IMAGE_HEIGHT)) {
+ metadata.set(Metadata.IMAGE_LENGTH,
+
trimPixels(directory.getDescription(ExifThumbnailDirectory.TAG_IMAGE_HEIGHT)));
+ }
+ }
+
+ /**
+ * Maps exif dates to metadata fields.
+ */
+ public void handleDateTags(Directory directory, Metadata metadata)
+ throws MetadataException {
+ // Date/Time Original overrides value from
ExifDirectory.TAG_DATETIME
+ Date original = null;
+ if
(directory.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)) {
+ original =
directory.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL);
+ // Unless we have GPS time we don't know the time zone so date
must be set
+ // as ISO 8601 datetime without timezone suffix (no Z or +/-)
+ if (original != null) {
+ String datetimeNoTimeZone =
DATE_UNSPECIFIED_TZ.get().format(original); // Same time zone as Metadata
Extractor uses
+ metadata.set(TikaCoreProperties.CREATED,
datetimeNoTimeZone);
+ metadata.set(Metadata.ORIGINAL_DATE, datetimeNoTimeZone);
+ }
+ }
+ if (directory.containsTag(ExifIFD0Directory.TAG_DATETIME)) {
+ Date datetime =
directory.getDate(ExifIFD0Directory.TAG_DATETIME);
+ if (datetime != null) {
+ String datetimeNoTimeZone =
DATE_UNSPECIFIED_TZ.get().format(datetime);
+ metadata.set(TikaCoreProperties.MODIFIED,
datetimeNoTimeZone);
+ // If Date/Time Original does not exist this might be
creation date
+ if (metadata.get(TikaCoreProperties.CREATED) == null) {
+ metadata.set(TikaCoreProperties.CREATED,
datetimeNoTimeZone);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+  * Reads image comments, originally TIKA-472.
+  * Maps the IPTC keywords, headline or object name, by-line and caption
+  * onto the Tika keywords, title, creator and description fields.
+  */
+ static class IptcHandler implements DirectoryHandler {
+     public boolean supports(Class<? extends Directory> directoryType) {
+         return directoryType == IptcDirectory.class;
+     }
+
+     public void handle(Directory directory, Metadata metadata)
+             throws MetadataException {
+         if (directory.containsTag(IptcDirectory.TAG_KEYWORDS)) {
+             String[] keywords = directory.getStringArray(IptcDirectory.TAG_KEYWORDS);
+             // Guard against a tag whose value is not available as a string
+             // array; iterating over null would throw NullPointerException
+             if (keywords != null) {
+                 for (String k : keywords) {
+                     metadata.add(TikaCoreProperties.KEYWORDS, k);
+                 }
+             }
+         }
+         // The headline takes precedence over the object name as the title
+         if (directory.containsTag(IptcDirectory.TAG_HEADLINE)) {
+             metadata.set(TikaCoreProperties.TITLE,
+                     directory.getString(IptcDirectory.TAG_HEADLINE));
+         } else if (directory.containsTag(IptcDirectory.TAG_OBJECT_NAME)) {
+             metadata.set(TikaCoreProperties.TITLE,
+                     directory.getString(IptcDirectory.TAG_OBJECT_NAME));
+         }
+         if (directory.containsTag(IptcDirectory.TAG_BY_LINE)) {
+             metadata.set(TikaCoreProperties.CREATOR,
+                     directory.getString(IptcDirectory.TAG_BY_LINE));
+             metadata.set(IPTC.CREATOR,
+                     directory.getString(IptcDirectory.TAG_BY_LINE));
+         }
+         if (directory.containsTag(IptcDirectory.TAG_CAPTION)) {
+             String caption = directory.getString(IptcDirectory.TAG_CAPTION);
+             // Same null guard as for the keywords: replaceAll on null would throw
+             if (caption != null) {
+                 // Looks like metadata extractor returns IPTC newlines as a single carriage return,
+                 // but the exiv2 command does not so we change to line feed here because that is
+                 // less surprising to users
+                 metadata.set(TikaCoreProperties.DESCRIPTION,
+                         caption.replaceAll("\r\n?", "\n"));
+             }
+         }
+     }
+ }
+
+ /**
+  * Maps EXIF Geo Tags onto the Tika Geo metadata namespace.
+  */
+ static class GeotagHandler implements DirectoryHandler {
+     @Override
+     public boolean supports(Class<? extends Directory> directoryType) {
+         return GpsDirectory.class == directoryType;
+     }
+
+     /**
+      * Writes latitude and longitude, formatted with English decimal
+      * symbols, when the GPS directory yields a location.
+      */
+     @Override
+     public void handle(Directory directory, Metadata metadata) throws MetadataException {
+         GpsDirectory gps = (GpsDirectory) directory;
+         GeoLocation position = gps.getGeoLocation();
+         if (position == null) {
+             return;
+         }
+         DecimalFormatSymbols englishSymbols = new DecimalFormatSymbols(Locale.ENGLISH);
+         DecimalFormat coordinateFormat =
+                 new DecimalFormat(GEO_DECIMAL_FORMAT_STRING, englishSymbols);
+         metadata.set(TikaCoreProperties.LATITUDE,
+                 coordinateFormat.format(position.getLatitude()));
+         metadata.set(TikaCoreProperties.LONGITUDE,
+                 coordinateFormat.format(position.getLongitude()));
+     }
+ }
+
+}
Added:
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
URL:
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java?rev=1722029&view=auto
==============================================================================
---
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
(added)
+++
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
Mon Dec 28 23:22:46 2015
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import javax.imageio.IIOException;
+import javax.imageio.ImageIO;
+import javax.imageio.ImageReader;
+import javax.imageio.metadata.IIOMetadata;
+import javax.imageio.stream.ImageInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class ImageParser extends AbstractParser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = 7852529269245520335L;
+
+ private static final MediaType CANONICAL_BMP_TYPE =
MediaType.image("x-ms-bmp");
+ private static final MediaType JAVA_BMP_TYPE = MediaType.image("bmp");
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ CANONICAL_BMP_TYPE,
+ JAVA_BMP_TYPE,
+ MediaType.image("gif"),
+ MediaType.image("png"),
+ MediaType.image("vnd.wap.wbmp"),
+ MediaType.image("x-icon"),
+ MediaType.image("x-xcf"))));
+
+ /**
+  * Copies the value stored under an ImageIO key to a Tika key, if the
+  * ImageIO key has a value.
+  */
+ private static void setIfPresent(Metadata metadata, String imageIOkey, String tikaKey) {
+     String found = metadata.get(imageIOkey);
+     if (found == null) {
+         return;
+     }
+     metadata.set(tikaKey, found);
+ }
+
+ /**
+  * Copies the value stored under an ImageIO key to a Tika property, if the
+  * ImageIO key has a value. A single trailing space is removed first.
+  */
+ private static void setIfPresent(Metadata metadata, String imageIOkey, Property tikaProp) {
+     String found = metadata.get(imageIOkey);
+     if (found == null) {
+         return;
+     }
+     if (found.endsWith(" ")) {
+         // The last character is the trailing space: drop just that one
+         found = found.substring(0, found.length() - 1);
+     }
+     metadata.set(tikaProp, found);
+ }
+
+ /**
+  * Loads every metadata format tree of the given ImageIO metadata into the
+  * Tika metadata.
+  *
+  * @param imageMetadata ImageIO metadata, may be null
+  * @param metadata      Tika metadata to add the values to
+  */
+ private static void loadMetadata(IIOMetadata imageMetadata, Metadata metadata) {
+     // ImageReader.getImageMetadata is documented to possibly return null;
+     // treat that as "no metadata" instead of throwing NullPointerException
+     if (imageMetadata == null) {
+         return;
+     }
+     String[] names = imageMetadata.getMetadataFormatNames();
+     if (names == null) {
+         return;
+     }
+     for (String name : names) {
+         loadNode(metadata, imageMetadata.getAsTree(name), "", false);
+     }
+ }
+
+ /**
+  * Adds the attributes of a node, and recursively of all its descendants,
+  * to the metadata. The key is the space separated path of node names; the
+  * value is the single attribute's value, or a "name=value, ..." list when
+  * the node has several attributes.
+  *
+  * @param metadata        metadata to add to
+  * @param node            node to load
+  * @param parents         path of the ancestors' names
+  * @param addThisNodeName whether this node's name is appended to the path
+  */
+ private static void loadNode(
+         Metadata metadata, Node node, String parents,
+         boolean addThisNodeName) {
+     String path = parents;
+     if (addThisNodeName) {
+         String prefix = (path.length() > 0) ? path + " " : path;
+         path = prefix + node.getNodeName();
+     }
+
+     NamedNodeMap attributes = node.getAttributes();
+     int attributeCount = (attributes == null) ? 0 : attributes.getLength();
+     if (attributeCount == 1) {
+         metadata.add(path, normalize(attributes.item(0).getNodeValue()));
+     } else if (attributeCount > 1) {
+         StringBuilder joined = new StringBuilder();
+         for (int index = 0; index < attributeCount; index++) {
+             if (index > 0) {
+                 joined.append(", ");
+             }
+             Node attribute = attributes.item(index);
+             joined.append(attribute.getNodeName())
+                     .append("=")
+                     .append(normalize(attribute.getNodeValue()));
+         }
+         metadata.add(path, joined.toString());
+     }
+
+     // Descend into the children; each child appends its own name to the path
+     for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
+         loadNode(metadata, child, path, true);
+     }
+ }
+
+ private static String normalize(String value) {
+ if (value != null) {
+ value = value.trim();
+ } else {
+ value = "";
+ }
+ if (Boolean.TRUE.toString().equalsIgnoreCase(value)) {
+ return Boolean.TRUE.toString();
+ } else if (Boolean.FALSE.toString().equalsIgnoreCase(value)) {
+ return Boolean.FALSE.toString();
+ }
+ return value;
+ }
+
+ /**
+ * Returns the fixed, unmodifiable set of image media types declared in
+ * SUPPORTED_TYPES; the parse context is not consulted.
+ */
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ String type = metadata.get(Metadata.CONTENT_TYPE);
+ if (type != null) {
+ // Java has a different idea of the BMP mime type to
+ // what the canonical one is, fix this up.
+ if (CANONICAL_BMP_TYPE.toString().equals(type)) {
+ type = JAVA_BMP_TYPE.toString();
+ }
+
+ try {
+ Iterator<ImageReader> iterator =
+ ImageIO.getImageReadersByMIMEType(type);
+ if (iterator.hasNext()) {
+ ImageReader reader = iterator.next();
+ try {
+ try (ImageInputStream imageStream =
ImageIO.createImageInputStream(
+ new CloseShieldInputStream(stream))) {
+ reader.setInput(imageStream);
+
+ metadata.set(Metadata.IMAGE_WIDTH,
Integer.toString(reader.getWidth(0)));
+ metadata.set(Metadata.IMAGE_LENGTH,
Integer.toString(reader.getHeight(0)));
+ metadata.set("height",
Integer.toString(reader.getHeight(0)));
+ metadata.set("width",
Integer.toString(reader.getWidth(0)));
+
+ loadMetadata(reader.getImageMetadata(0), metadata);
+ }
+ } finally {
+ reader.dispose();
+ }
+ }
+
+ // Translate certain Metadata tags from the ImageIO
+ // specific namespace into the general Tika one
+ setIfPresent(metadata, "CommentExtensions CommentExtension",
TikaCoreProperties.COMMENTS);
+ setIfPresent(metadata, "markerSequence com",
TikaCoreProperties.COMMENTS);
+ setIfPresent(metadata, "Data BitsPerSample",
Metadata.BITS_PER_SAMPLE);
+ } catch (IIOException e) {
+ // TIKA-619: There is a known bug in the Sun API when dealing
with GIF images
+ // which Tika will just ignore.
+ if (!(e.getMessage() != null &&
+ e.getMessage().equals("Unexpected block type 0!") &&
+ type.equals("image/gif"))) {
+ throw new TikaException(type + " parse error", e);
+ }
+ }
+ }
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.endDocument();
+ }
+
+}
Added:
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/MetadataFields.java
URL:
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/MetadataFields.java?rev=1722029&view=auto
==============================================================================
---
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/MetadataFields.java
(added)
+++
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/MetadataFields.java
Mon Dec 28 23:22:46 2015
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import java.util.HashSet;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+/**
+ * Knowns about all declared {@link Metadata} fields.
+ * Didn't find this functionality anywhere so it was added for
+ * ImageMetadataExtractor, but it can be generalized.
+ */
+public abstract class MetadataFields {
+
+ private static HashSet<String> known;
+
+ static {
+ known = new HashSet<String>();
+ setKnownForClass(TikaCoreProperties.class);
+ setKnownForClass(Metadata.class);
+ }
+
+ private static void setKnownForClass(Class<?> clazz) {
+ Field[] fields = clazz.getFields();
+ for (Field f : fields) {
+ int mod = f.getModifiers();
+ if (Modifier.isPublic(mod) && Modifier.isStatic(mod) &&
Modifier.isFinal(mod)) {
+ Class<?> c = f.getType();
+ if (String.class.equals(c)) {
+ try {
+ String p = (String) f.get(null);
+ if (p != null) {
+ known.add(p);
+ }
+ } catch (IllegalArgumentException e) {
+ e.printStackTrace();
+ } catch (IllegalAccessException e) {
+ e.printStackTrace();
+ }
+ }
+ if (Property.class.isAssignableFrom(c)) {
+ try {
+ Property p = (Property) f.get(null);
+ if (p != null) {
+ known.add(p.getName());
+ }
+ } catch (IllegalArgumentException e) {
+ e.printStackTrace();
+ } catch (IllegalAccessException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+ }
+
+ public static boolean isMetadataField(String name) {
+ return known.contains(name);
+ }
+
+ public static boolean isMetadataField(Property property) {
+ return known.contains(property.getName());
+ }
+
+}
Added:
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/PSDParser.java
URL:
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/PSDParser.java?rev=1722029&view=auto
==============================================================================
---
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/PSDParser.java
(added)
+++
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/PSDParser.java
Mon Dec 28 23:22:46 2015
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Photoshop;
+import org.apache.tika.metadata.TIFF;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+
+/**
+ * Parser for the Adobe Photoshop PSD File Format.
+ * <p/>
+ * Only the file header and the Image Resources section are inspected; no
+ * text content is produced.
+ * <p/>
+ * Documentation on the file format is available from
+ * http://www.adobe.com/devnet-apps/photoshop/fileformatashtml/PhotoshopFileFormats.htm
+ */
+public class PSDParser extends AbstractParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = 883387734607994914L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.image("vnd.adobe.photoshop"))));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Reads the PSD/PSB header and image resource blocks from the stream,
+     * records image dimensions, bit depth, colour mode and caption in the
+     * metadata, and emits an empty XHTML document.
+     *
+     * @param stream   the PSD/PSB data, positioned at the start of the file
+     * @param handler  receives the (empty) XHTML document
+     * @param metadata receives the extracted properties
+     * @param context  parse context (not used here)
+     * @throws IOException   if the stream cannot be read or ends early
+     * @throws SAXException  if the handler rejects the XHTML events
+     * @throws TikaException if the signature, version or a resource block
+     *                       is invalid
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Check for the magic header signature
+        byte[] signature = new byte[4];
+        IOUtils.readFully(stream, signature);
+        boolean signatureOk =
+                signature[0] == (byte) '8' && signature[1] == (byte) 'B' &&
+                signature[2] == (byte) 'P' && signature[3] == (byte) 'S';
+        if (!signatureOk) {
+            throw new TikaException("PSD/PSB magic signature invalid");
+        }
+
+        // Check the version, we support these two only
+        int version = EndianUtils.readUShortBE(stream);
+        if (version != 1 && version != 2) {
+            throw new TikaException("Invalid PSD/PSB version " + version);
+        }
+
+        // Skip the reserved block
+        IOUtils.readFully(stream, new byte[6]);
+
+        // Number of channels in the image. The value is not used yet, but
+        // it must still be consumed to keep the stream position correct.
+        // TODO Identify a suitable metadata key for this
+        EndianUtils.readUShortBE(stream);
+
+        // Width and Height
+        int height = EndianUtils.readIntBE(stream);
+        int width = EndianUtils.readIntBE(stream);
+        metadata.set(TIFF.IMAGE_LENGTH, height);
+        metadata.set(TIFF.IMAGE_WIDTH, width);
+
+        // Depth (bits per channel)
+        int depth = EndianUtils.readUShortBE(stream);
+        metadata.set(TIFF.BITS_PER_SAMPLE, Integer.toString(depth));
+
+        // Colour mode, eg Bitmap or RGB. Only report it when the value
+        // maps to a known choice; an unknown value in a damaged file must
+        // not abort the parse with an array index error.
+        int colorMode = EndianUtils.readUShortBE(stream);
+        if (colorMode < Photoshop._COLOR_MODE_CHOICES_INDEXED.length) {
+            metadata.set(Photoshop.COLOR_MODE,
+                    Photoshop._COLOR_MODE_CHOICES_INDEXED[colorMode]);
+        }
+
+        // Next is the Color Mode section
+        // We don't care about this bit
+        long colorModeSectionSize = EndianUtils.readIntBE(stream);
+        skipFully(stream, colorModeSectionSize);
+
+        // Next is the Image Resources section
+        // Check for certain interesting keys here
+        long imageResourcesSectionSize = EndianUtils.readIntBE(stream);
+        long read = 0;
+        while (read < imageResourcesSectionSize) {
+            ResourceBlock rb = new ResourceBlock(stream);
+            read += rb.totalLength;
+
+            // Is it one we can do something useful with?
+            if (rb.id == ResourceBlock.ID_CAPTION) {
+                metadata.add(TikaCoreProperties.DESCRIPTION,
+                        rb.getDataAsString());
+            } else if (rb.id == ResourceBlock.ID_EXIF_1) {
+                // TODO Parse the EXIF info via ImageMetadataExtractor
+            } else if (rb.id == ResourceBlock.ID_EXIF_3) {
+                // TODO Parse the EXIF info via ImageMetadataExtractor
+            } else if (rb.id == ResourceBlock.ID_XMP) {
+                // TODO Parse the XMP info via ImageMetadataExtractor
+            }
+        }
+
+        // Next is the Layer and Mask Info
+        // Finally we have Image Data
+        // We can't do anything with these parts
+
+        // We don't have any helpful text, sorry...
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+    /**
+     * Skips exactly {@code count} bytes. A single
+     * {@link InputStream#skip(long)} call is allowed to skip fewer bytes
+     * than requested, so the call is repeated until done. A non-positive
+     * count skips nothing.
+     *
+     * @param stream the stream to advance
+     * @param count  the number of bytes to skip
+     * @throws IOException if the stream ends before {@code count} bytes
+     *                     were skipped
+     */
+    private static void skipFully(InputStream stream, long count)
+            throws IOException {
+        long remaining = count;
+        while (remaining > 0) {
+            long skipped = stream.skip(remaining);
+            if (skipped > 0) {
+                remaining -= skipped;
+            } else if (stream.read() >= 0) {
+                // skip() made no progress, but a byte could still be read,
+                // so we are not at the end of the stream yet
+                remaining--;
+            } else {
+                throw new IOException(
+                        "Unexpected end of stream while skipping " + count + " bytes");
+            }
+        }
+    }
+
+    /**
+     * One block of the Image Resources section: signature, id, name and
+     * data, read eagerly from the stream by the constructor.
+     */
+    private static class ResourceBlock {
+        private static final long SIGNATURE = 0x3842494d; // 8BIM
+        private static final int ID_CAPTION = 0x03F0;
+        private static final int ID_URL = 0x040B;
+        private static final int ID_EXIF_1 = 0x0422;
+        private static final int ID_EXIF_3 = 0x0423;
+        private static final int ID_XMP = 0x0424;
+
+        private int id;
+        private String name;
+        private byte[] data;
+        /** Number of bytes this block occupied in the stream, padding included. */
+        private long totalLength;
+
+        /**
+         * Reads one resource block from the current stream position.
+         *
+         * @param stream the stream, positioned at the block signature
+         * @throws IOException   if the stream ends inside the block name
+         *                       or cannot be read
+         * @throws TikaException if the signature or the data length is
+         *                       invalid
+         */
+        private ResourceBlock(InputStream stream) throws IOException,
+                TikaException {
+            // Verify the signature
+            long sig = EndianUtils.readIntBE(stream);
+            if (sig != SIGNATURE) {
+                throw new TikaException("Invalid Image Resource Block Signature Found, got " +
+                        sig + " 0x" + Long.toHexString(sig) + " but the spec defines " + SIGNATURE);
+            }
+
+            // Read the block
+            id = EndianUtils.readUShortBE(stream);
+
+            // The name is consumed byte by byte up to the first zero byte.
+            // NOTE(review): the format documentation describes the name as
+            // a Pascal string; confirm that non-empty names are handled as
+            // intended by this zero-terminated read.
+            StringBuilder nameB = new StringBuilder();
+            int nameLen = 0;
+            while (true) {
+                int v = stream.read();
+                if (v < 0) {
+                    // Without this check a truncated file would loop forever
+                    throw new IOException(
+                            "Unexpected end of stream in Image Resource Block name");
+                }
+                nameLen++;
+
+                if (v == 0) {
+                    // The name length is padded to be even
+                    if (nameLen % 2 == 1) {
+                        stream.read();
+                        nameLen++;
+                    }
+                    break;
+                }
+                nameB.append((char) v);
+            }
+            name = nameB.toString();
+
+            int declaredLen = EndianUtils.readIntBE(stream);
+            if (declaredLen < 0 || declaredLen == Integer.MAX_VALUE) {
+                // Negative cannot be allocated; MAX_VALUE would overflow
+                // when padded to an even length below
+                throw new TikaException(
+                        "Invalid Image Resource Block data length " + declaredLen);
+            }
+            // Data Length is even padded
+            int dataLen = declaredLen + (declaredLen % 2);
+            totalLength = 4L + 2 + nameLen + 4 + dataLen;
+
+            data = new byte[dataLen];
+            IOUtils.readFully(stream, data);
+        }
+
+        /**
+         * Decodes the block data as US-ASCII, dropping the final byte.
+         *
+         * @return the decoded text, or an empty string for an empty block
+         */
+        private String getDataAsString() {
+            if (data.length == 0) {
+                return "";
+            }
+            // Will be null padded
+            return new String(data, 0, data.length - 1, US_ASCII);
+        }
+    }
+}