Xiaohong Yang created TIKA-4212:
-----------------------------------
Summary: Tika fails to get file extension of file type
image/x-rtf-raw-bitmap
Key: TIKA-4212
URL: https://issues.apache.org/jira/browse/TIKA-4212
Project: Tika
Issue Type: Bug
Reporter: Xiaohong Yang
We use org.apache.tika.extractor.EmbeddedDocumentExtractor to get embedded
objects from Word documents. Two embedded objects are extracted from the
sample doc file. Their file type is image/x-rtf-raw-bitmap. But Tika fails to
get the file extension with the following method call
tikaExtension =
config.getMimeRepository().forName(contentType.toString()).getExtension();
We wonder if you can fix the problem in the Tika library. We also wonder if you
can tell us what the file extension for file type image/x-rtf-raw-bitmap is.
The sample code follows; the tika-config.xml and the sample Word file are
attached.
The operating system is Ubuntu 20.04. Java version is 17. Tika version is
2.9.1 and POI version is 5.2.3.
import org.apache.pdfbox.io.IOUtils;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import java.io.*;
import java.net.URL;
import java.nio.file.Path;
public class ExtractBitMapFromWord {
private final Path docFile = new
File("/home/ubuntu/testdirs/testdir_doc/sample.DOC").toPath();
private final Path outputDir = new
File("/home/ubuntu/testdirs/testdir_doc/tika_output/").toPath();
private Parser parser;
private ParseContext context;
public static void main(String args[]) {
try {
new ExtractBitMapFromWord().process();
}
catch(Exception ex) {
ex.printStackTrace();
}
}
public ExtractBitMapFromWord() {
}
public void process() throws Exception {
TikaConfig config = new
TikaConfig("/home/ubuntu/testdirs/testdir_doc/tika-config.xml");
ExtractBitMapFromWord.FileEmbeddedDocumentExtractor
fileEmbeddedDocumentExtractor = new
ExtractBitMapFromWord.FileEmbeddedDocumentExtractor();
parser = new AutoDetectParser(config);
context = new ParseContext();
context.set(Parser.class, parser);
context.set(TikaConfig.class, config);
context.set(EmbeddedDocumentExtractor.class,
fileEmbeddedDocumentExtractor);
URL url = docFile.toUri().toURL();
Metadata metadata = new Metadata();
try (InputStream input = TikaInputStream.get(url, metadata)) {
ContentHandler handler = new DefaultHandler();
parser.parse(input, handler, metadata, context);
}
}
private class FileEmbeddedDocumentExtractor implements
EmbeddedDocumentExtractor {
private int count = 0;
public boolean shouldParseEmbedded(Metadata metadata) {
return true;
}
public void parseEmbedded(InputStream inputStream, ContentHandler
contentHandler, Metadata metadata,
boolean outputHtml) throws SAXException,
IOException {
String fullFileName =
metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
if (fullFileName == null) {
fullFileName = "file" + count++;
}
TikaConfig config = null;
try {
config = new
TikaConfig("/home/ubuntu/testdirs/testdir_doc/tika-config.xml");
} catch (Exception ex) {
ex.printStackTrace();
}
if (config == null) {
return;
}
Detector detector = config.getDetector();;
MediaType contentType = detector.detect(inputStream, metadata);
String tikaExtension = null;
if(fullFileName.indexOf('.') == -1 && contentType != null){
try {
tikaExtension =
config.getMimeRepository().forName(contentType.toString()).getExtension();
} catch (Exception ex) {
ex.printStackTrace();
}
if (tikaExtension != null && !tikaExtension.isEmpty() ) {
fullFileName += tikaExtension;
}
}
String[] fileNameSplit = fullFileName.split("/");
String fileName = fileNameSplit[fileNameSplit.length - 1];
File outputFile = new File(outputDir.toFile(),
FilenameUtils.normalize(fileName));
System.out.println("Extracting '" + fileName + " to " + outputFile);
FileOutputStream os = null;
try {
os = new FileOutputStream(outputFile);
if (inputStream instanceof TikaInputStream tin) {
if (tin.getOpenContainer() instanceof DirectoryEntry) {
try(POIFSFileSystem fs = new POIFSFileSystem()){
copy((DirectoryEntry) tin.getOpenContainer(),
fs.getRoot());
fs.writeFilesystem(os);
}
} else {
IOUtils.copy(inputStream, os);
}
} else {
IOUtils.copy(inputStream, os);
}
} catch (Exception ex) {
ex.printStackTrace();
} finally {
if (os != null) {
os.flush();
os.close();
}
}
}
protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
throws IOException {
for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) {
if (entry instanceof DirectoryEntry) {
// Need to recurse
DirectoryEntry newDir =
destDir.createDirectory(entry.getName());
copy((DirectoryEntry) entry, newDir);
} else {
// Copy entry
try (InputStream contents = new
DocumentInputStream((DocumentEntry) entry)) {
destDir.createDocument(entry.getName(), contents);
}
}
}
}
}
}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)