Author: tallison
Date: Tue Jul  7 14:21:50 2015
New Revision: 1689690

URL: http://svn.apache.org/r1689690
Log:
TIKA-1674: initial commit to add example of how to extract embedded files

Added:
    
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
    
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ExtractEmbeddedFilesTest.java
Modified:
    
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java

Added: 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java?rev=1689690&view=auto
==============================================================================
--- 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
 (added)
+++ 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
 Tue Jul  7 14:21:50 2015
@@ -0,0 +1,105 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.example;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.FilenameUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Example of how to save the files embedded in a container document
+ * to a directory.
+ * <p>
+ * A custom {@link EmbeddedDocumentExtractor} is registered in the
+ * {@link ParseContext}; it copies each embedded stream it is handed to
+ * disk instead of parsing it.
+ */
+public class ExtractEmbeddedFiles {
+
+    private final AutoDetectParser parser = new AutoDetectParser();
+    private final Detector detector = parser.getDetector();
+    private final TikaConfig config = TikaConfig.getDefaultConfig();
+
+    /**
+     * Parses {@code is} and writes every embedded document that the
+     * parser reports into {@code outputDir}.
+     *
+     * @param is        container document to read; this method does not
+     *                  close it
+     * @param outputDir directory that receives the extracted files
+     * @throws SAXException  propagated from the parser
+     * @throws TikaException propagated from the parser
+     * @throws IOException   if reading the input or writing a file fails
+     */
+    public void extract(InputStream is, Path outputDir)
+            throws SAXException, TikaException, IOException {
+        Metadata m = new Metadata();
+        ParseContext c = new ParseContext();
+        //the text collected by this handler is not used by the example;
+        //NOTE(review): -1 presumably disables the write limit -- confirm
+        ContentHandler h = new BodyContentHandler(-1);
+
+        c.set(Parser.class, parser);
+        EmbeddedDocumentExtractor ex =
+                new MyEmbeddedDocumentExtractor(outputDir, c);
+        c.set(EmbeddedDocumentExtractor.class, ex);
+
+        parser.parse(is, h, m, c);
+    }
+
+    /**
+     * Extractor that writes each embedded document to {@code outputDir}.
+     * It never calls {@code super.parseEmbedded}, so the embedded
+     * documents themselves are not parsed by this class.
+     */
+    private class MyEmbeddedDocumentExtractor
+            extends ParsingEmbeddedDocumentExtractor {
+        private final Path outputDir;
+        //counter used to name embedded files that carry no usable name
+        private int fileCount = 0;
+
+        private MyEmbeddedDocumentExtractor(Path outputDir,
+                                            ParseContext context) {
+            super(context);
+            this.outputDir = outputDir;
+        }
+
+        /** Accepts every embedded document. */
+        @Override
+        public boolean shouldParseEmbedded(Metadata metadata) {
+            return true;
+        }
+
+        /**
+         * Copies one embedded document into the output directory.
+         * <p>
+         * The file name is taken from the resource-name metadata when
+         * present, otherwise generated. An extension derived from the
+         * detected media type is appended when the name has no dot.
+         * An existing file is never overwritten: a numeric prefix is
+         * added until the name is free.
+         *
+         * @param stream     content of the embedded document
+         * @param handler    not used by this implementation
+         * @param metadata   metadata of the embedded document
+         * @param outputHtml not used by this implementation
+         * @throws SAXException declared by the overridden method; not
+         *                      thrown by the code in this method
+         * @throws IOException  if detection or writing the file fails
+         */
+        @Override
+        public void parseEmbedded(InputStream stream,
+                                  ContentHandler handler,
+                                  Metadata metadata, boolean outputHtml)
+                throws SAXException, IOException {
+
+            //try to get the name of the embedded file from the metadata
+            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+            if (name != null) {
+                //select only the file name (not any directory paths
+                //that might be included in the name) and normalize it
+                name = FilenameUtils.normalize(
+                        FilenameUtils.getName(name));
+            }
+            if (name == null || name.isEmpty()) {
+                //no usable name: either none was supplied or nothing was
+                //left after stripping the path; resolving an empty name
+                //would point at the output directory itself
+                name = "file_" + fileCount++;
+            }
+
+            //now try to figure out the right extension for the file
+            MediaType contentType = detector.detect(stream, metadata);
+
+            if (name.indexOf('.') == -1 && contentType != null) {
+                try {
+                    name += config.getMimeRepository().forName(
+                            contentType.toString()).getExtension();
+                } catch (MimeTypeException e) {
+                    //best effort: keep the name without an extension;
+                    //real code should log this instead
+                    e.printStackTrace();
+                }
+            }
+
+            //never overwrite (or fail on) a file that already exists:
+            //prefix a counter until the name is free
+            Path outputFile = outputDir.resolve(name);
+            for (int i = 1; Files.exists(outputFile); i++) {
+                outputFile = outputDir.resolve(i + "-" + name);
+            }
+
+            //getParent() is null when outputDir is an empty path
+            Path parent = outputFile.getParent();
+            if (parent != null) {
+                Files.createDirectories(parent);
+            }
+            Files.copy(stream, outputFile);
+        }
+    }
+}

Modified: 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java?rev=1689690&r1=1689689&r2=1689690&view=diff
==============================================================================
--- 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java
 (original)
+++ 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java
 Tue Jul  7 14:21:50 2015
@@ -19,6 +19,10 @@ package org.apache.tika.example;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringWriter;
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
 import java.util.List;
 
 import org.apache.tika.Tika;
@@ -204,4 +208,27 @@ public class ParsingExample {
         JsonMetadataList.toJson(metadataList, writer);
         return writer.toString();
     }
+
+
+    /**
+     * Extracts the documents embedded in the bundled
+     * {@code test_recursive_embedded.docx} resource into a directory.
+     *
+     * @param outputPath output directory to place the extracted files in
+     * @return every entry found directly in {@code outputPath} after the
+     *         extraction (entries that were already there are included)
+     * @throws IOException if the resource cannot be found, or if reading
+     *                     it, writing a file or listing the directory fails
+     * @throws SAXException propagated from the extraction
+     * @throws TikaException propagated from the extraction
+     */
+    public List<Path> extractEmbeddedDocumentsExample(Path outputPath)
+            throws IOException, SAXException, TikaException {
+        //close the resource stream even when the extraction fails
+        try (InputStream stream = ParsingExample.class.getResourceAsStream(
+                "test_recursive_embedded.docx")) {
+            if (stream == null) {
+                //getResourceAsStream signals a missing resource with null
+                throw new IOException(
+                        "Resource not found: test_recursive_embedded.docx");
+            }
+            ExtractEmbeddedFiles ex = new ExtractEmbeddedFiles();
+            ex.extract(stream, outputPath);
+        }
+
+        List<Path> ret = new ArrayList<>();
+        try (DirectoryStream<Path> dirStream =
+                     Files.newDirectoryStream(outputPath)) {
+            for (Path entry : dirStream) {
+                ret.add(entry);
+            }
+        }
+        return ret;
+    }
 }

Added: 
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ExtractEmbeddedFilesTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/ExtractEmbeddedFilesTest.java?rev=1689690&view=auto
==============================================================================
--- 
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ExtractEmbeddedFilesTest.java
 (added)
+++ 
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ExtractEmbeddedFilesTest.java
 Tue Jul  7 14:21:50 2015
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.example;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Runs the embedded-file extraction example against a fresh temporary
+ * directory and checks how many files end up in it.
+ */
+public class ExtractEmbeddedFilesTest {
+
+    ParsingExample parsingExample;
+    Path outputPath;
+
+    /** Creates the example under test and an empty scratch directory. */
+    @Before
+    public void setUp() throws IOException {
+        parsingExample = new ParsingExample();
+        outputPath = Files.createTempDirectory("tika-ext-emb-example-");
+    }
+
+    /**
+     * Removes the scratch directory. Only direct children are deleted,
+     * so this assumes the extraction created no sub-directories.
+     */
+    @After
+    public void tearDown() throws IOException {
+        try (DirectoryStream<Path> children =
+                     Files.newDirectoryStream(outputPath)) {
+            for (Path child : children) {
+                Files.delete(child);
+            }
+        }
+        Files.delete(outputPath);
+    }
+
+    /** Checks the number of entries produced by the extraction. */
+    @Test
+    public void testExtractEmbeddedFiles() throws Exception {
+        List<Path> extracted =
+                parsingExample.extractEmbeddedDocumentsExample(outputPath);
+        int extractedCount = extracted.size();
+        //this number should be bigger!!!
+        assertEquals(2, extractedCount);
+    }
+
+}


Reply via email to