[ https://issues.apache.org/jira/browse/TIKA-4212?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Xiaohong Yang updated TIKA-4212: -------------------------------- Attachment: tika-config-and-sample-file.zip > Tika fails to get file extension of file type image/x-rtf-raw-bitmap > -------------------------------------------------------------------- > > Key: TIKA-4212 > URL: https://issues.apache.org/jira/browse/TIKA-4212 > Project: Tika > Issue Type: Bug > Reporter: Xiaohong Yang > Priority: Major > Attachments: tika-config-and-sample-file.zip > > > We use org.apache.tika.extractor.EmbeddedDocumentExtractor to get embedded > objects from Word documents. Two embedded objects are extracted from the > sample doc file. Their file type is image/x-rtf-raw-bitmap, but Tika fails to > get the file extension with the following method call: > tikaExtension = > config.getMimeRepository().forName(contentType.toString()).getExtension(); > We wonder if you can fix the problem in the Tika library. We also wonder if you > can tell us the file extension for the file type image/x-rtf-raw-bitmap. > The sample code follows below; the tika-config.xml and the > sample Word file are attached. > The operating system is Ubuntu 20.04. Java version is 17. Tika version is > 2.9.1 and POI version is 5.2.3. 
> > import org.apache.pdfbox.io.IOUtils; > import org.apache.poi.poifs.filesystem.DirectoryEntry; > import org.apache.poi.poifs.filesystem.DocumentEntry; > import org.apache.poi.poifs.filesystem.DocumentInputStream; > import org.apache.poi.poifs.filesystem.POIFSFileSystem; > import org.apache.tika.config.TikaConfig; > import org.apache.tika.detect.Detector; > import org.apache.tika.extractor.EmbeddedDocumentExtractor; > import org.apache.tika.io.FilenameUtils; > import org.apache.tika.io.TikaInputStream; > import org.apache.tika.metadata.Metadata; > import org.apache.tika.metadata.TikaCoreProperties; > import org.apache.tika.mime.MediaType; > import org.apache.tika.parser.AutoDetectParser; > import org.apache.tika.parser.ParseContext; > import org.apache.tika.parser.Parser; > import org.xml.sax.ContentHandler; > import org.xml.sax.SAXException; > import org.xml.sax.helpers.DefaultHandler; > > import java.io.*; > import java.net.URL; > import java.nio.file.Path; > > public class ExtractBitMapFromWord { > private final Path docFile = new > File("/home/ubuntu/testdirs/testdir_doc/sample.DOC").toPath(); > private final Path outputDir = new > File("/home/ubuntu/testdirs/testdir_doc/tika_output/").toPath(); > > private Parser parser; > private ParseContext context; > > > public static void main(String args[]) { > try { > new ExtractBitMapFromWord().process(); > } > catch(Exception ex) { > ex.printStackTrace(); > } > } > > public ExtractBitMapFromWord() { > } > > public void process() throws Exception { > TikaConfig config = new > TikaConfig("/home/ubuntu/testdirs/testdir_doc/tika-config.xml"); > ExtractBitMapFromWord.FileEmbeddedDocumentExtractor > fileEmbeddedDocumentExtractor = new > ExtractBitMapFromWord.FileEmbeddedDocumentExtractor(); > > parser = new AutoDetectParser(config); > context = new ParseContext(); > context.set(Parser.class, parser); > context.set(TikaConfig.class, config); > context.set(EmbeddedDocumentExtractor.class, > fileEmbeddedDocumentExtractor); > 
> URL url = docFile.toUri().toURL(); > Metadata metadata = new Metadata(); > try (InputStream input = TikaInputStream.get(url, metadata)) { > ContentHandler handler = new DefaultHandler(); > parser.parse(input, handler, metadata, context); > } > } > > private class FileEmbeddedDocumentExtractor implements > EmbeddedDocumentExtractor { > private int count = 0; > > public boolean shouldParseEmbedded(Metadata metadata) { > return true; > } > > public void parseEmbedded(InputStream inputStream, ContentHandler > contentHandler, Metadata metadata, > boolean outputHtml) throws SAXException, > IOException { > String fullFileName = > metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); > if (fullFileName == null) { > fullFileName = "file" + count++; > } > > TikaConfig config = null; > try { > config = new > TikaConfig("/home/ubuntu/testdirs/testdir_doc/tika-config.xml"); > } catch (Exception ex) { > ex.printStackTrace(); > } > if (config == null) { > return; > } > > Detector detector = config.getDetector();; > MediaType contentType = detector.detect(inputStream, metadata); > String tikaExtension = null; > if(fullFileName.indexOf('.') == -1 && contentType != null){ > try { > tikaExtension = > config.getMimeRepository().forName(contentType.toString()).getExtension(); > } catch (Exception ex) { > ex.printStackTrace(); > } > > if (tikaExtension != null && !tikaExtension.isEmpty() ) { > fullFileName += tikaExtension; > } > } > > > String[] fileNameSplit = fullFileName.split("/"); > String fileName = fileNameSplit[fileNameSplit.length - 1]; > File outputFile = new File(outputDir.toFile(), > FilenameUtils.normalize(fileName)); > System.out.println("Extracting '" + fileName + " to " + > outputFile); > FileOutputStream os = null; > try { > os = new FileOutputStream(outputFile); > if (inputStream instanceof TikaInputStream tin) { > if (tin.getOpenContainer() instanceof DirectoryEntry) { > try(POIFSFileSystem fs = new POIFSFileSystem()){ > copy((DirectoryEntry) tin.getOpenContainer(), 
> fs.getRoot()); > fs.writeFilesystem(os); > } > } else { > IOUtils.copy(inputStream, os); > } > } else { > IOUtils.copy(inputStream, os); > } > } catch (Exception ex) { > ex.printStackTrace(); > } finally { > if (os != null) { > os.flush(); > os.close(); > } > } > } > > protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir) > throws IOException { > for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) { > if (entry instanceof DirectoryEntry) { > // Need to recurse > DirectoryEntry newDir = > destDir.createDirectory(entry.getName()); > copy((DirectoryEntry) entry, newDir); > } else { > // Copy entry > try (InputStream contents = new > DocumentInputStream((DocumentEntry) entry)) { > destDir.createDocument(entry.getName(), contents); > } > } > } > } > } > > } -- This message was sent by Atlassian Jira (v8.20.10#820010)