[
https://issues.apache.org/jira/browse/TIKA-4211?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Tim Allison resolved TIKA-4211.
-------------------------------
Fix Version/s: 2.9.2, 3.0.0
Resolution: Fixed
> Tika extractor fails to extract embedded excel from pptx
> --------------------------------------------------------
>
> Key: TIKA-4211
> URL: https://issues.apache.org/jira/browse/TIKA-4211
> Project: Tika
> Issue Type: Bug
> Reporter: Xiaohong Yang
> Priority: Major
> Fix For: 2.9.2, 3.0.0
>
> Attachments: config_and_sample_file.zip
>
>
> We use org.apache.tika.extractor.EmbeddedDocumentExtractor to get embedded
> excel from PowerPoint presentation. It works with most pptx files. But it
> fails to detect the embedded excel with some pptx files.
> Following is the sample code and attached is the tika-config.xml and a pptx
> file that works.
> We cannot provide the pptx file that does not work because it is client data.
> We noticed a difference between the pptx files that work and the pptx file
> that does not work:
> "{*}Worksheet Object{*}" *is in the popup menu when the embedded Excel object
> is right-clicked in the pptx files that work.*
> "{*}Edit Data{*}" *is in the popup menu when the embedded Excel object is
> right-clicked in the pptx file that does not work. This file might be created
> with an old version of PowerPoint.*
>
> The operating system is Ubuntu 20.04. Java version is 17. Tika version is
> 2.9.1 and POI version is 5.2.3.
>
> import org.apache.pdfbox.io.IOUtils;
> import org.apache.poi.poifs.filesystem.DirectoryEntry;
> import org.apache.poi.poifs.filesystem.DocumentEntry;
> import org.apache.poi.poifs.filesystem.DocumentInputStream;
> import org.apache.poi.poifs.filesystem.POIFSFileSystem;
> import org.apache.tika.config.TikaConfig;
> import org.apache.tika.extractor.EmbeddedDocumentExtractor;
> import org.apache.tika.io.FilenameUtils;
> import org.apache.tika.io.TikaInputStream;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.metadata.TikaCoreProperties;
> import org.apache.tika.parser.AutoDetectParser;
> import org.apache.tika.parser.ParseContext;
> import org.apache.tika.parser.Parser;
> import org.xml.sax.ContentHandler;
> import org.xml.sax.SAXException;
> import org.xml.sax.helpers.DefaultHandler;
>
> import java.io.*;
> import java.net.URL;
> import java.nio.file.Path;
>
> public class ExtractExcelFromPowerPoint {
> private final Path pptxFile = new
> File("/home/ubuntu/testdirs/testdir_pptx/sample.pptx").toPath();
> private final Path outputDir = new
> File("/home/ubuntu/testdirs/testdir_pptx/tika_output/").toPath();
>
> private Parser parser;
> private ParseContext context;
>
>
> public static void main(String args[]) {
> try {
> new ExtractExcelFromPowerPoint().process();
> }
> catch(Exception ex) {
> ex.printStackTrace();
> }
> }
>
> public ExtractExcelFromPowerPoint() {
> }
>
> public void process() throws Exception {
> TikaConfig config = new
> TikaConfig("/home/ubuntu/testdirs/testdir_pptx/tika-config.xml");
> FileEmbeddedDocumentExtractor fileEmbeddedDocumentExtractor = new
> FileEmbeddedDocumentExtractor();
>
> parser = new AutoDetectParser(config);
> context = new ParseContext();
> context.set(Parser.class, parser);
> context.set(TikaConfig.class, config);
> context.set(EmbeddedDocumentExtractor.class,
> fileEmbeddedDocumentExtractor);
>
> URL url = pptxFile.toUri().toURL();
> Metadata metadata = new Metadata();
> try (InputStream input = TikaInputStream.get(url, metadata)) {
> ContentHandler handler = new DefaultHandler();
> parser.parse(input, handler, metadata, context);
> }
> }
>
> private class FileEmbeddedDocumentExtractor implements
> EmbeddedDocumentExtractor {
> private int count = 0;
>
> public boolean shouldParseEmbedded(Metadata metadata) {
> return true;
> }
>
> public void parseEmbedded(InputStream inputStream, ContentHandler
> contentHandler, Metadata metadata,
> boolean outputHtml) throws SAXException,
> IOException {
> String fullFileName =
> metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
> if (fullFileName == null) {
> fullFileName = "file" + count++;
> }
>
> String[] fileNameSplit = fullFileName.split("/");
> String fileName = fileNameSplit[fileNameSplit.length - 1];
> File outputFile = new File(outputDir.toFile(),
> FilenameUtils.normalize(fileName));
> System.out.println("Extracting '" + fileName + " to " +
> outputFile);
> FileOutputStream os = null;
> try {
> os = new FileOutputStream(outputFile);
> if (inputStream instanceof TikaInputStream tin) {
> if (tin.getOpenContainer() instanceof DirectoryEntry) {
> try(POIFSFileSystem fs = new POIFSFileSystem()){
> copy((DirectoryEntry) tin.getOpenContainer(),
> fs.getRoot());
> fs.writeFilesystem(os);
> }
> } else {
> IOUtils.copy(inputStream, os);
> }
> } else {
> IOUtils.copy(inputStream, os);
> }
> } catch (Exception ex) {
> ex.printStackTrace();
> } finally {
> if (os != null) {
> os.flush();
> os.close();
> }
> }
> }
>
> protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
> throws IOException {
> for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) {
> if (entry instanceof DirectoryEntry) {
> // Need to recurse
> DirectoryEntry newDir =
> destDir.createDirectory(entry.getName());
> copy((DirectoryEntry) entry, newDir);
> } else {
> // Copy entry
> try (InputStream contents = new
> DocumentInputStream((DocumentEntry) entry)) {
> destDir.createDocument(entry.getName(), contents);
> }
> }
> }
> }
> }
> }
> [^config_and_sample_file.zip]
--
This message was sent by Atlassian Jira
(v8.20.10#820010)