Xiaohong Yang created TIKA-4212:
-----------------------------------

             Summary: Tika fails to get file extension of file type 
image/x-rtf-raw-bitmap
                 Key: TIKA-4212
                 URL: https://issues.apache.org/jira/browse/TIKA-4212
             Project: Tika
          Issue Type: Bug
            Reporter: Xiaohong Yang


We use org.apache.tika.extractor.EmbeddedDocumentExtractor to get embedded 
objects from Word documents.  Two embedded objects are extracted from the 
sample doc file. Their file type is image/x-rtf-raw-bitmap. But Tika fails to 
get the file extension with the following method call:

              tikaExtension = 
config.getMimeRepository().forName(contentType.toString()).getExtension();

We wonder if you can fix the problem in the Tika library.  We also wonder if 
you can tell us the file extension for the file type image/x-rtf-raw-bitmap.

Following is the sample code and attached is the tika-config.xml and the sample 
Word file.

The operating system is Ubuntu 20.04. Java version is 17.  Tika version is 
2.9.1 and POI version is 5.2.3.  

 

import org.apache.pdfbox.io.IOUtils;

import org.apache.poi.poifs.filesystem.DirectoryEntry;

import org.apache.poi.poifs.filesystem.DocumentEntry;

import org.apache.poi.poifs.filesystem.DocumentInputStream;

import org.apache.poi.poifs.filesystem.POIFSFileSystem;

import org.apache.tika.config.TikaConfig;

import org.apache.tika.detect.Detector;

import org.apache.tika.extractor.EmbeddedDocumentExtractor;

import org.apache.tika.io.FilenameUtils;

import org.apache.tika.io.TikaInputStream;

import org.apache.tika.metadata.Metadata;

import org.apache.tika.metadata.TikaCoreProperties;

import org.apache.tika.mime.MediaType;

import org.apache.tika.parser.AutoDetectParser;

import org.apache.tika.parser.ParseContext;

import org.apache.tika.parser.Parser;

import org.xml.sax.ContentHandler;

import org.xml.sax.SAXException;

import org.xml.sax.helpers.DefaultHandler;

 

import java.io.*;

import java.net.URL;

import java.nio.file.Path;

 

public class ExtractBitMapFromWord {

    private final Path docFile = new 
File("/home/ubuntu/testdirs/testdir_doc/sample.DOC").toPath();

    private final Path outputDir = new 
File("/home/ubuntu/testdirs/testdir_doc/tika_output/").toPath();

 

    private Parser parser;

    private ParseContext context;

 

 

    public static void main(String args[]) {

        try {

            new ExtractBitMapFromWord().process();

        }

        catch(Exception ex) {

            ex.printStackTrace();

        }

    }

 

    public ExtractBitMapFromWord() {

    }

 

    public void process() throws Exception {

        TikaConfig config = new 
TikaConfig("/home/ubuntu/testdirs/testdir_doc/tika-config.xml");

        ExtractBitMapFromWord.FileEmbeddedDocumentExtractor 
fileEmbeddedDocumentExtractor = new 
ExtractBitMapFromWord.FileEmbeddedDocumentExtractor();

 

        parser = new AutoDetectParser(config);

        context = new ParseContext();

        context.set(Parser.class, parser);

        context.set(TikaConfig.class, config);

        context.set(EmbeddedDocumentExtractor.class, 
fileEmbeddedDocumentExtractor);

 

        URL url = docFile.toUri().toURL();

        Metadata metadata = new Metadata();

        try (InputStream input = TikaInputStream.get(url, metadata)) {

            ContentHandler handler = new DefaultHandler();

            parser.parse(input, handler, metadata, context);

        }

    }

 

    private class FileEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtractor {

        private int count = 0;

 

        public boolean shouldParseEmbedded(Metadata metadata) {

            return true;

        }

 

        public void parseEmbedded(InputStream inputStream, ContentHandler 
contentHandler, Metadata metadata,

                                  boolean outputHtml) throws SAXException, 
IOException {

            String fullFileName = 
metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);

            if (fullFileName == null) {

                fullFileName = "file" + count++;

            }

 

            TikaConfig config = null;

            try {

                config = new 
TikaConfig("/home/ubuntu/testdirs/testdir_doc/tika-config.xml");

            } catch (Exception ex) {

                ex.printStackTrace();

            }

            if (config == null) {

                return;

            }

 

            Detector detector = config.getDetector();;

            MediaType contentType = detector.detect(inputStream, metadata);

            String tikaExtension = null;

            if(fullFileName.indexOf('.') == -1 && contentType != null){

                try {

                    tikaExtension = 
config.getMimeRepository().forName(contentType.toString()).getExtension();

                } catch (Exception ex) {

                    ex.printStackTrace();

                }

 

                if (tikaExtension != null && !tikaExtension.isEmpty() ) {

                    fullFileName += tikaExtension;

                }

            }

 

 

            String[] fileNameSplit = fullFileName.split("/");

            String fileName = fileNameSplit[fileNameSplit.length - 1];

            File outputFile = new File(outputDir.toFile(), 
FilenameUtils.normalize(fileName));

            System.out.println("Extracting '" + fileName + " to " + outputFile);

            FileOutputStream os = null;

            try {

                os = new FileOutputStream(outputFile);

                if (inputStream instanceof TikaInputStream tin) {

                    if (tin.getOpenContainer() instanceof DirectoryEntry) {

                        try(POIFSFileSystem fs = new POIFSFileSystem()){

                            copy((DirectoryEntry) tin.getOpenContainer(), 
fs.getRoot());

                            fs.writeFilesystem(os);

                        }

                    } else {

                        IOUtils.copy(inputStream, os);

                    }

                } else {

                    IOUtils.copy(inputStream, os);

                }

            } catch (Exception ex) {

                ex.printStackTrace();

            } finally {

                if (os != null) {

                    os.flush();

                    os.close();

                }

            }

        }

 

        protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir) 
throws IOException {

            for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) {

                if (entry instanceof DirectoryEntry) {

                    // Need to recurse

                    DirectoryEntry newDir = 
destDir.createDirectory(entry.getName());

                    copy((DirectoryEntry) entry, newDir);

                } else {

                    // Copy entry

                    try (InputStream contents = new 
DocumentInputStream((DocumentEntry) entry)) {

                        destDir.createDocument(entry.getName(), contents);

                    }

                }

            }

        }

    }

 

}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to