Author: tallison
Date: Tue Jul 7 14:21:50 2015
New Revision: 1689690
URL: http://svn.apache.org/r1689690
Log:
TIKA-1674: initial commit to add example of how to extract embedded files
Added:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ExtractEmbeddedFilesTest.java
Modified:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java
Added:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java?rev=1689690&view=auto
==============================================================================
---
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
(added)
+++
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
Tue Jul 7 14:21:50 2015
@@ -0,0 +1,105 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.example;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.FilenameUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class ExtractEmbeddedFiles {
+
+ private Parser parser = new AutoDetectParser();
+ private Detector detector = ((AutoDetectParser)parser).getDetector();
+ private TikaConfig config = TikaConfig.getDefaultConfig();
+
+ public void extract(InputStream is, Path outputDir) throws SAXException,
TikaException, IOException {
+ Metadata m = new Metadata();
+ ParseContext c = new ParseContext();
+ ContentHandler h = new BodyContentHandler(-1);
+
+ c.set(Parser.class, parser);
+ EmbeddedDocumentExtractor ex = new
MyEmbeddedDocumentExtractor(outputDir, c);
+ c.set(EmbeddedDocumentExtractor.class, ex);
+
+ parser.parse(is, h, m, c);
+ }
+
+ private class MyEmbeddedDocumentExtractor extends
ParsingEmbeddedDocumentExtractor {
+ private final Path outputDir;
+ private int fileCount = 0;
+
+ private MyEmbeddedDocumentExtractor(Path outputDir, ParseContext
context) {
+ super(context);
+ this.outputDir = outputDir;
+ }
+
+ @Override
+ public boolean shouldParseEmbedded(Metadata metadata) {
+ return true;
+ }
+
+ @Override
+ public void parseEmbedded(InputStream stream, ContentHandler handler,
Metadata metadata, boolean outputHtml)
+ throws SAXException, IOException {
+
+ //try to get the name of the embedded file from the metadata
+ String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+
+ if (name == null) {
+ name = "file_" + fileCount++;
+ } else {
+ //make sure to select only the file name (not any directory
paths
+ //that might be included in the name) and make sure
+ //to normalize the name
+ name = FilenameUtils.normalize(FilenameUtils.getName(name));
+ }
+
+ //now try to figure out the right extension for the embedded file
+ MediaType contentType = detector.detect(stream, metadata);
+
+ if (name.indexOf('.')==-1 && contentType!=null) {
+ try {
+ name += config.getMimeRepository().forName(
+ contentType.toString()).getExtension();
+ } catch (MimeTypeException e) {
+ e.printStackTrace();
+ }
+ }
+ //should add check to make sure that you aren't overwriting a file
+ Path outputFile = outputDir.resolve(name);
+ //do a better job than this of checking
+ Files.createDirectories(outputFile.getParent());
+ Files.copy(stream, outputFile);
+ }
+ }
+}
Modified:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java?rev=1689690&r1=1689689&r2=1689690&view=diff
==============================================================================
---
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java
(original)
+++
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java
Tue Jul 7 14:21:50 2015
@@ -19,6 +19,10 @@ package org.apache.tika.example;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
import java.util.List;
import org.apache.tika.Tika;
@@ -204,4 +208,27 @@ public class ParsingExample {
JsonMetadataList.toJson(metadataList, writer);
return writer.toString();
}
+
+
+    /**
+     * Extracts the documents embedded in the bundled
+     * {@code test_recursive_embedded.docx} resource into {@code outputPath}
+     * and then lists that directory.
+     *
+     * @param outputPath -- output directory to place files
+     * @return every entry found in {@code outputPath} after the extraction;
+     *         entries that were already present are included as well
+     * @throws IOException if extraction or listing the directory fails
+     * @throws SAXException propagated from the extraction
+     * @throws TikaException propagated from the extraction
+     */
+    public List<Path> extractEmbeddedDocumentsExample(Path outputPath)
+            throws IOException, SAXException, TikaException {
+        // try-with-resources closes the resource stream even if extraction throws
+        try (InputStream stream =
+                ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
+            ExtractEmbeddedFiles ex = new ExtractEmbeddedFiles();
+            ex.extract(stream, outputPath);
+        }
+        List<Path> ret = new ArrayList<Path>();
+        try (DirectoryStream<Path> dirStream = Files.newDirectoryStream(outputPath)) {
+            for (Path entry : dirStream) {
+                ret.add(entry);
+            }
+        }
+        return ret;
+    }
}
Added:
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ExtractEmbeddedFilesTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/ExtractEmbeddedFilesTest.java?rev=1689690&view=auto
==============================================================================
---
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ExtractEmbeddedFilesTest.java
(added)
+++
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ExtractEmbeddedFilesTest.java
Tue Jul 7 14:21:50 2015
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.example;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Runs the embedded-file extraction example against a fresh temporary
+ * directory and checks how many files end up in it.
+ */
+public class ExtractEmbeddedFilesTest {
+
+    ParsingExample parsingExample;
+    Path outputPath;
+
+    /** Creates the example instance and an empty temporary output directory. */
+    @Before
+    public void setUp() throws IOException {
+        outputPath = Files.createTempDirectory("tika-ext-emb-example-");
+        parsingExample = new ParsingExample();
+    }
+
+    /**
+     * Removes the files in the output directory, then the directory itself.
+     * Only one directory level is handled; nothing is deleted recursively.
+     */
+    @After
+    public void tearDown() throws IOException {
+        try (DirectoryStream<Path> children = Files.newDirectoryStream(outputPath)) {
+            for (Path child : children) {
+                Files.delete(child);
+            }
+        }
+        Files.delete(outputPath);
+    }
+
+    /** The example is expected to leave exactly two entries in the output directory. */
+    @Test
+    public void testExtractEmbeddedFiles() throws Exception {
+        //this number should be bigger!!!
+        assertEquals(2, parsingExample.extractEmbeddedDocumentsExample(outputPath).size());
+    }
+
+}