[jira] [Updated] (TIKA-4212) Tika fails to get file extension of file type image/x-rtf-raw-bitmap

Xiaohong Yang (Jira) Thu, 14 Mar 2024 12:00:05 -0700


     [ 
https://issues.apache.org/jira/browse/TIKA-4212?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]


Xiaohong Yang updated TIKA-4212:
--------------------------------
    Attachment: tika-config-and-sample-file.zip

> Tika fails to get file extension of file type image/x-rtf-raw-bitmap
> --------------------------------------------------------------------
>
>                 Key: TIKA-4212
>                 URL: https://issues.apache.org/jira/browse/TIKA-4212
>             Project: Tika
>          Issue Type: Bug
>            Reporter: Xiaohong Yang
>            Priority: Major
>         Attachments: tika-config-and-sample-file.zip
>
>
> We use org.apache.tika.extractor.EmbeddedDocumentExtractor to get embedded 
> objects from Word documents.  Two embedded objects are extracted from the 
> sample .doc file. Their file type is image/x-rtf-raw-bitmap, but Tika fails to 
> get the file extension with the following method call:
>               tikaExtension = 
> config.getMimeRepository().forName(contentType.toString()).getExtension();
> We wonder if you can fix the problem in the Tika library.  We also wonder if 
> you can tell us what the file extension of file type image/x-rtf-raw-bitmap is.
> The sample code follows; the tika-config.xml and the sample Word file are 
> attached.
> The operating system is Ubuntu 20.04. Java version is 17.  Tika version is 
> 2.9.1 and POI version is 5.2.3.  
>  
> import org.apache.pdfbox.io.IOUtils;
> import org.apache.poi.poifs.filesystem.DirectoryEntry;
> import org.apache.poi.poifs.filesystem.DocumentEntry;
> import org.apache.poi.poifs.filesystem.DocumentInputStream;
> import org.apache.poi.poifs.filesystem.POIFSFileSystem;
> import org.apache.tika.config.TikaConfig;
> import org.apache.tika.detect.Detector;
> import org.apache.tika.extractor.EmbeddedDocumentExtractor;
> import org.apache.tika.io.FilenameUtils;
> import org.apache.tika.io.TikaInputStream;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.metadata.TikaCoreProperties;
> import org.apache.tika.mime.MediaType;
> import org.apache.tika.parser.AutoDetectParser;
> import org.apache.tika.parser.ParseContext;
> import org.apache.tika.parser.Parser;
> import org.xml.sax.ContentHandler;
> import org.xml.sax.SAXException;
> import org.xml.sax.helpers.DefaultHandler;
>  
> import java.io.*;
> import java.net.URL;
> import java.nio.file.Path;
>  
> public class ExtractBitMapFromWord {
>     private final Path docFile = new 
> File("/home/ubuntu/testdirs/testdir_doc/sample.DOC").toPath();
>     private final Path outputDir = new 
> File("/home/ubuntu/testdirs/testdir_doc/tika_output/").toPath();
>  
>     private Parser parser;
>     private ParseContext context;
>  
>  
>     public static void main(String args[]) {
>         try {
>             new ExtractBitMapFromWord().process();
>         }
>         catch(Exception ex) {
>             ex.printStackTrace();
>         }
>     }
>  
>     public ExtractBitMapFromWord() {
>     }
>  
>     public void process() throws Exception {
>         TikaConfig config = new 
> TikaConfig("/home/ubuntu/testdirs/testdir_doc/tika-config.xml");
>         ExtractBitMapFromWord.FileEmbeddedDocumentExtractor 
> fileEmbeddedDocumentExtractor = new 
> ExtractBitMapFromWord.FileEmbeddedDocumentExtractor();
>  
>         parser = new AutoDetectParser(config);
>         context = new ParseContext();
>         context.set(Parser.class, parser);
>         context.set(TikaConfig.class, config);
>         context.set(EmbeddedDocumentExtractor.class, 
> fileEmbeddedDocumentExtractor);
>  
>         URL url = docFile.toUri().toURL();
>         Metadata metadata = new Metadata();
>         try (InputStream input = TikaInputStream.get(url, metadata)) {
>             ContentHandler handler = new DefaultHandler();
>             parser.parse(input, handler, metadata, context);
>         }
>     }
>  
>     private class FileEmbeddedDocumentExtractor implements 
> EmbeddedDocumentExtractor {
>         private int count = 0;
>  
>         public boolean shouldParseEmbedded(Metadata metadata) {
>             return true;
>         }
>  
>         public void parseEmbedded(InputStream inputStream, ContentHandler 
> contentHandler, Metadata metadata,
>                                   boolean outputHtml) throws SAXException, 
> IOException {
>             String fullFileName = 
> metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
>             if (fullFileName == null) {
>                 fullFileName = "file" + count++;
>             }
>  
>             TikaConfig config = null;
>             try {
>                 config = new 
> TikaConfig("/home/ubuntu/testdirs/testdir_doc/tika-config.xml");
>             } catch (Exception ex) {
>                 ex.printStackTrace();
>             }
>             if (config == null) {
>                 return;
>             }
>  
>             Detector detector = config.getDetector();;
>             MediaType contentType = detector.detect(inputStream, metadata);
>             String tikaExtension = null;
>             if(fullFileName.indexOf('.') == -1 && contentType != null){
>                 try {
>                     tikaExtension = 
> config.getMimeRepository().forName(contentType.toString()).getExtension();
>                 } catch (Exception ex) {
>                     ex.printStackTrace();
>                 }
>  
>                 if (tikaExtension != null && !tikaExtension.isEmpty() ) {
>                     fullFileName += tikaExtension;
>                 }
>             }
>  
>  
>             String[] fileNameSplit = fullFileName.split("/");
>             String fileName = fileNameSplit[fileNameSplit.length - 1];
>             File outputFile = new File(outputDir.toFile(), 
> FilenameUtils.normalize(fileName));
>             System.out.println("Extracting '" + fileName + " to " + 
> outputFile);
>             FileOutputStream os = null;
>             try {
>                 os = new FileOutputStream(outputFile);
>                 if (inputStream instanceof TikaInputStream tin) {
>                     if (tin.getOpenContainer() instanceof DirectoryEntry) {
>                         try(POIFSFileSystem fs = new POIFSFileSystem()){
>                             copy((DirectoryEntry) tin.getOpenContainer(), 
> fs.getRoot());
>                             fs.writeFilesystem(os);
>                         }
>                     } else {
>                         IOUtils.copy(inputStream, os);
>                     }
>                 } else {
>                     IOUtils.copy(inputStream, os);
>                 }
>             } catch (Exception ex) {
>                 ex.printStackTrace();
>             } finally {
>                 if (os != null) {
>                     os.flush();
>                     os.close();
>                 }
>             }
>         }
>  
>         protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir) 
> throws IOException {
>             for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) {
>                 if (entry instanceof DirectoryEntry) {
>                     // Need to recurse
>                     DirectoryEntry newDir = 
> destDir.createDirectory(entry.getName());
>                     copy((DirectoryEntry) entry, newDir);
>                 } else {
>                     // Copy entry
>                     try (InputStream contents = new 
> DocumentInputStream((DocumentEntry) entry)) {
>                         destDir.createDocument(entry.getName(), contents);
>                     }
>                 }
>             }
>         }
>     }
>  
> }



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

[jira] [Updated] (TIKA-4212) Tika fails to get file extension of file type image/x-rtf-raw-bitmap

Reply via email to