[
https://issues.apache.org/jira/browse/TIKA-4211?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17827241#comment-17827241
]
Tim Allison edited comment on TIKA-4211 at 3/14/24 8:20 PM:
------------------------------------------------------------
Step 3: Is there something like this in /ppt/slides/slide2.xml that references
rId2? Is the structure exactly the same, i.e. graphic->graphicData->...->p:oleObj?
{code:java}
<a:graphic>
<a:graphicData
uri="http://schemas.openxmlformats.org/presentationml/2006/ole">
<mc:AlternateContent
xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006">
<mc:Choice Requires="v"
xmlns:v="urn:schemas-microsoft-com:vml">
<p:oleObj imgH="3057525" imgW="6715015"
name="Worksheet" progId="Excel.Sheet.12" r:id="rId2">
<p:embed/>
</p:oleObj>
</mc:Choice>
<mc:Fallback>
<p:oleObj imgH="3057525" imgW="6715015"
name="Worksheet" progId="Excel.Sheet.12" r:id="rId2">
<p:embed/>
<p:pic>
<p:nvPicPr>
<p:cNvPr id="0" name=""/>
<p:cNvPicPr/>
<p:nvPr/>
</p:nvPicPr>
<p:blipFill>
<a:blip r:embed="rId3"/>
<a:stretch>
<a:fillRect/>
</a:stretch>
</p:blipFill>
<p:spPr>
<a:xfrm>
<a:off x="2317994" y="2528644"/>
<a:ext cx="6715125"
cy="3057525"/>
</a:xfrm>
<a:prstGeom prst="rect">
<a:avLst/>
</a:prstGeom>
</p:spPr>
</p:pic>
</p:oleObj>
</mc:Fallback>
</mc:AlternateContent>
</a:graphicData>
</a:graphic>
{code}
was (Author: [email protected]):
Step 3: Is there something like this in /ppt/slides/slide2.xml:
{code:java}
<a:graphic>
<a:graphicData
uri="http://schemas.openxmlformats.org/presentationml/2006/ole">
<mc:AlternateContent
xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006">
<mc:Choice Requires="v"
xmlns:v="urn:schemas-microsoft-com:vml">
<p:oleObj imgH="3057525" imgW="6715015"
name="Worksheet" progId="Excel.Sheet.12" r:id="rId2">
<p:embed/>
</p:oleObj>
</mc:Choice>
<mc:Fallback>
<p:oleObj imgH="3057525" imgW="6715015"
name="Worksheet" progId="Excel.Sheet.12" r:id="rId2">
<p:embed/>
<p:pic>
<p:nvPicPr>
<p:cNvPr id="0" name=""/>
<p:cNvPicPr/>
<p:nvPr/>
</p:nvPicPr>
<p:blipFill>
<a:blip r:embed="rId3"/>
<a:stretch>
<a:fillRect/>
</a:stretch>
</p:blipFill>
<p:spPr>
<a:xfrm>
<a:off x="2317994" y="2528644"/>
<a:ext cx="6715125"
cy="3057525"/>
</a:xfrm>
<a:prstGeom prst="rect">
<a:avLst/>
</a:prstGeom>
</p:spPr>
</p:pic>
</p:oleObj>
</mc:Fallback>
</mc:AlternateContent>
</a:graphicData>
</a:graphic>
{code}
> Tika extractor fails to extract embedded excel from pptx
> --------------------------------------------------------
>
> Key: TIKA-4211
> URL: https://issues.apache.org/jira/browse/TIKA-4211
> Project: Tika
> Issue Type: Bug
> Reporter: Xiaohong Yang
> Priority: Major
> Attachments: config_and_sample_file.zip
>
>
> We use org.apache.tika.extractor.EmbeddedDocumentExtractor to get embedded
> Excel files from PowerPoint presentations. It works with most pptx files, but
> it fails to detect the embedded Excel file in some pptx files.
> The sample code follows; attached are the tika-config.xml and a pptx
> file that works.
> We cannot provide the pptx file that does not work because it is client data.
> We noticed a difference between the pptx files that work and the pptx file
> that does not work:
> "{*}Worksheet Object{*}" *is in the popup menu when the embedded Excel object
> is right-clicked in the pptx files that work.*
> "{*}Edit Data{*}" *is in the popup menu when the embedded Excel object is
> right-clicked in the pptx file that does not work. This file might be created
> with an old version of PowerPoint.*
>
> The operating system is Ubuntu 20.04. Java version is 17. Tika version is
> 2.9.1 and POI version is 5.2.3.
>
> import org.apache.pdfbox.io.IOUtils;
> import org.apache.poi.poifs.filesystem.DirectoryEntry;
> import org.apache.poi.poifs.filesystem.DocumentEntry;
> import org.apache.poi.poifs.filesystem.DocumentInputStream;
> import org.apache.poi.poifs.filesystem.POIFSFileSystem;
> import org.apache.tika.config.TikaConfig;
> import org.apache.tika.extractor.EmbeddedDocumentExtractor;
> import org.apache.tika.io.FilenameUtils;
> import org.apache.tika.io.TikaInputStream;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.metadata.TikaCoreProperties;
> import org.apache.tika.parser.AutoDetectParser;
> import org.apache.tika.parser.ParseContext;
> import org.apache.tika.parser.Parser;
> import org.xml.sax.ContentHandler;
> import org.xml.sax.SAXException;
> import org.xml.sax.helpers.DefaultHandler;
>
> import java.io.*;
> import java.net.URL;
> import java.nio.file.Path;
>
> public class ExtractExcelFromPowerPoint {
> private final Path pptxFile = new
> File("/home/ubuntu/testdirs/testdir_pptx/sample.pptx").toPath();
> private final Path outputDir = new
> File("/home/ubuntu/testdirs/testdir_pptx/tika_output/").toPath();
>
> private Parser parser;
> private ParseContext context;
>
>
> public static void main(String args[]) {
> try {
> new ExtractExcelFromPowerPoint().process();
> }
> catch(Exception ex) {
> ex.printStackTrace();
> }
> }
>
> public ExtractExcelFromPowerPoint() {
> }
>
> public void process() throws Exception {
> TikaConfig config = new
> TikaConfig("/home/ubuntu/testdirs/testdir_pptx/tika-config.xml");
> FileEmbeddedDocumentExtractor fileEmbeddedDocumentExtractor = new
> FileEmbeddedDocumentExtractor();
>
> parser = new AutoDetectParser(config);
> context = new ParseContext();
> context.set(Parser.class, parser);
> context.set(TikaConfig.class, config);
> context.set(EmbeddedDocumentExtractor.class,
> fileEmbeddedDocumentExtractor);
>
> URL url = pptxFile.toUri().toURL();
> Metadata metadata = new Metadata();
> try (InputStream input = TikaInputStream.get(url, metadata)) {
> ContentHandler handler = new DefaultHandler();
> parser.parse(input, handler, metadata, context);
> }
> }
>
> private class FileEmbeddedDocumentExtractor implements
> EmbeddedDocumentExtractor {
> private int count = 0;
>
> public boolean shouldParseEmbedded(Metadata metadata) {
> return true;
> }
>
> public void parseEmbedded(InputStream inputStream, ContentHandler
> contentHandler, Metadata metadata,
> boolean outputHtml) throws SAXException,
> IOException {
> String fullFileName =
> metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
> if (fullFileName == null) {
> fullFileName = "file" + count++;
> }
>
> String[] fileNameSplit = fullFileName.split("/");
> String fileName = fileNameSplit[fileNameSplit.length - 1];
> File outputFile = new File(outputDir.toFile(),
> FilenameUtils.normalize(fileName));
> System.out.println("Extracting '" + fileName + " to " +
> outputFile);
> FileOutputStream os = null;
> try {
> os = new FileOutputStream(outputFile);
> if (inputStream instanceof TikaInputStream tin) {
> if (tin.getOpenContainer() instanceof DirectoryEntry) {
> try(POIFSFileSystem fs = new POIFSFileSystem()){
> copy((DirectoryEntry) tin.getOpenContainer(),
> fs.getRoot());
> fs.writeFilesystem(os);
> }
> } else {
> IOUtils.copy(inputStream, os);
> }
> } else {
> IOUtils.copy(inputStream, os);
> }
> } catch (Exception ex) {
> ex.printStackTrace();
> } finally {
> if (os != null) {
> os.flush();
> os.close();
> }
> }
> }
>
> protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
> throws IOException {
> for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) {
> if (entry instanceof DirectoryEntry) {
> // Need to recurse
> DirectoryEntry newDir =
> destDir.createDirectory(entry.getName());
> copy((DirectoryEntry) entry, newDir);
> } else {
> // Copy entry
> try (InputStream contents = new
> DocumentInputStream((DocumentEntry) entry)) {
> destDir.createDocument(entry.getName(), contents);
> }
> }
> }
> }
> }
> }
> [^config_and_sample_file.zip]
--
This message was sent by Atlassian Jira
(v8.20.10#820010)