Xiaohong Yang created TIKA-4211:
-----------------------------------

             Summary: Tika extractor fails to extract embedded excel from pptx
                 Key: TIKA-4211
                 URL: https://issues.apache.org/jira/browse/TIKA-4211
             Project: Tika
          Issue Type: Bug
            Reporter: Xiaohong Yang
         Attachments: config_and_sample_file.zip

We use org.apache.tika.extractor.EmbeddedDocumentExtractor to extract embedded 
Excel workbooks from PowerPoint presentations.  It works with most pptx files, 
but it fails to detect the embedded Excel workbook in some pptx files.

The sample code is shown below. The tika-config.xml and a pptx file that works 
are attached.

We cannot provide the pptx file that does not work because it is client data.

We noticed a difference between the pptx files that work and the pptx file that 
does not work:  

"{*}Worksheet Object{*}" *is in the popup menu when the embedded Excel object 
is right-clicked in the pptx files that work.*

"{*}Edit Data{*}" *is in the popup menu when the embedded Excel object is 
right-clicked in the pptx file that does not work. This file might be created 
with an old version of PowerPoint.*

 

The operating system is Ubuntu 20.04. Java version is 17.  Tika version is 
2.9.1 and POI version is 5.2.3. 

 

import org.apache.pdfbox.io.IOUtils;

import org.apache.poi.poifs.filesystem.DirectoryEntry;

import org.apache.poi.poifs.filesystem.DocumentEntry;

import org.apache.poi.poifs.filesystem.DocumentInputStream;

import org.apache.poi.poifs.filesystem.POIFSFileSystem;

import org.apache.tika.config.TikaConfig;

import org.apache.tika.extractor.EmbeddedDocumentExtractor;

import org.apache.tika.io.FilenameUtils;

import org.apache.tika.io.TikaInputStream;

import org.apache.tika.metadata.Metadata;

import org.apache.tika.metadata.TikaCoreProperties;

import org.apache.tika.parser.AutoDetectParser;

import org.apache.tika.parser.ParseContext;

import org.apache.tika.parser.Parser;

import org.xml.sax.ContentHandler;

import org.xml.sax.SAXException;

import org.xml.sax.helpers.DefaultHandler;

 

import java.io.*;

import java.net.URL;

import java.nio.file.Path;

 

public class ExtractExcelFromPowerPoint {

    private final Path pptxFile = new 
File("/home/ubuntu/testdirs/testdir_pptx/sample.pptx").toPath();

    private final Path outputDir = new 
File("/home/ubuntu/testdirs/testdir_pptx/tika_output/").toPath();

 

    private Parser parser;

    private ParseContext context;

 

 

    public static void main(String args[]) {

        try {

            new ExtractExcelFromPowerPoint().process();

        }

        catch(Exception ex) {

            ex.printStackTrace();

        }

    }

 

    public ExtractExcelFromPowerPoint() {

    }

 

    public void process() throws Exception {

        TikaConfig config = new 
TikaConfig("/home/ubuntu/testdirs/testdir_pptx/tika-config.xml");

        FileEmbeddedDocumentExtractor fileEmbeddedDocumentExtractor = new 
FileEmbeddedDocumentExtractor();

 

        parser = new AutoDetectParser(config);

        context = new ParseContext();

        context.set(Parser.class, parser);

        context.set(TikaConfig.class, config);

        context.set(EmbeddedDocumentExtractor.class, 
fileEmbeddedDocumentExtractor);

 

        URL url = pptxFile.toUri().toURL();

        Metadata metadata = new Metadata();

        try (InputStream input = TikaInputStream.get(url, metadata)) {

            ContentHandler handler = new DefaultHandler();

            parser.parse(input, handler, metadata, context);

        }

    }

 

    private class FileEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtractor {

        private int count = 0;

 

        public boolean shouldParseEmbedded(Metadata metadata) {

            return true;

        }

 

        public void parseEmbedded(InputStream inputStream, ContentHandler 
contentHandler, Metadata metadata,

                                  boolean outputHtml) throws SAXException, 
IOException {

            String fullFileName = 
metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);

            if (fullFileName == null) {

                fullFileName = "file" + count++;

            }

 

            String[] fileNameSplit = fullFileName.split("/");

            String fileName = fileNameSplit[fileNameSplit.length - 1];

            File outputFile = new File(outputDir.toFile(), 
FilenameUtils.normalize(fileName));

            System.out.println("Extracting '" + fileName + " to " + outputFile);

            FileOutputStream os = null;

            try {

                os = new FileOutputStream(outputFile);

                if (inputStream instanceof TikaInputStream tin) {

                    if (tin.getOpenContainer() instanceof DirectoryEntry) {

                        try(POIFSFileSystem fs = new POIFSFileSystem()){

                            copy((DirectoryEntry) tin.getOpenContainer(), 
fs.getRoot());

                            fs.writeFilesystem(os);

                        }

                    } else {

                        IOUtils.copy(inputStream, os);

                    }

                } else {

                    IOUtils.copy(inputStream, os);

                }

            } catch (Exception ex) {

                ex.printStackTrace();

            } finally {

                if (os != null) {

                    os.flush();

                    os.close();

                }

            }

        }

 

        protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir) 
throws IOException {

            for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) {

                if (entry instanceof DirectoryEntry) {

                    // Need to recurse

                    DirectoryEntry newDir = 
destDir.createDirectory(entry.getName());

                    copy((DirectoryEntry) entry, newDir);

                } else {

                    // Copy entry

                    try (InputStream contents = new 
DocumentInputStream((DocumentEntry) entry)) {

                        destDir.createDocument(entry.getName(), contents);

                    }

                }

            }

        }

    }

}

[^config_and_sample_file.zip]



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to