[
https://issues.apache.org/jira/browse/TIKA-4211?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17829597#comment-17829597
]
Xiaohong Yang commented on TIKA-4211:
-------------------------------------
Hi Tim,
I ran the following command, and the xlsx is in the resulting JSON:
java -jar tika-app-3.0.0-20240321.135818-429.jar -J -t
2020_Capacity_Ramp_Plan.pptx
Here is the relevant part of the JSON:
[
{
"cp:revision": "8",
"extended-properties:AppVersion": "16.0000",
"meta:paragraph-count": "278",
"meta:word-count": "465",
"extended-properties:PresentationFormat": "Widescreen",
"extended-properties:Application": "Microsoft Office PowerPoint",
"meta:last-author": "Kenneth Nip",
"X-TIKA:Parsed-By-Full-Set": [
"org.apache.tika.parser.DefaultParser",
"org.apache.tika.parser.microsoft.ooxml.OOXMLParser",
"org.apache.tika.parser.image.JpegParser",
"org.apache.tika.parser.ocr.TesseractOCRParser"
],
"X-TIKA:content_handler": "ToTextContentHandler",
"dc:creator": "Kenneth Nip",
"meta:slide-count": "3",
"xmpTPg:NPages": "3",
"resourceName": "2020_Capacity_Ramp_Plan.pptx",
"dcterms:created": "2020-01-04T05:19:17Z",
"dcterms:modified": "2020-01-06T07:58:18Z",
"X-TIKA:Parsed-By": [
"org.apache.tika.parser.DefaultParser",
"org.apache.tika.parser.microsoft.ooxml.OOXMLParser"
],
"dc:title": "PowerPoint Presentation",
"extended-properties:DocSecurityString": "None",
"extended-properties:TotalTime": "342",
"X-TIKA:parse_time_millis": "1223",
"X-TIKA:embedded_depth": "0",
"X-TIKA:content": "…… /
Peter\t\t\t\n\n\n\nMicrosoft_Excel_Worksheet.xlsx\n\n\n",
"Content-Length": "144945",
"Content-Type":
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
},
{
"extended-properties:AppVersion": "16.0300",
"extended-properties:Application": "Microsoft Excel",
"meta:last-author": "Kenneth Nip",
"X-TIKA:embedded_id_path": "/1",
"X-TIKA:content_handler": "ToTextContentHandler",
"dc:creator": "Kenneth Nip",
"extended-properties:Company": "",
"meta:print-date": "2019-11-06T23:43:22Z",
"resourceName": "Microsoft_Excel_Worksheet.xlsx",
"dcterms:created": "2019-10-30T16:50:00Z",
"dcterms:modified": "2020-01-06T07:29:13Z",
"X-TIKA:origResourceName": "C:\\Users\\kenrw\\Downloads\\",
"embeddedRelationshipId": "rId3",
"protected": "false",
"embeddedResourceType": "ATTACHMENT",
"X-TIKA:Parsed-By": [
"org.apache.tika.parser.DefaultParser",
"org.apache.tika.parser.microsoft.ooxml.OOXMLParser"
],
"extended-properties:DocSecurityString": "None",
"X-TIKA:embedded_depth": "1",
"X-TIKA:parse_time_millis": "376",
"X-TIKA:content": "…………..",
"X-TIKA:embedded_resource_path":
"/Microsoft_Excel_Worksheet.xlsx",
"X-TIKA:embedded_id": "1",
"Content-Type":
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"dc:publisher": ""
},
…
]
> Tika extractor fails to extract embedded excel from pptx
> --------------------------------------------------------
>
> Key: TIKA-4211
> URL: https://issues.apache.org/jira/browse/TIKA-4211
> Project: Tika
> Issue Type: Bug
> Reporter: Xiaohong Yang
> Priority: Major
> Attachments: config_and_sample_file.zip
>
>
> We use org.apache.tika.extractor.EmbeddedDocumentExtractor to extract embedded
> Excel files from PowerPoint presentations. It works with most pptx files, but
> it fails to detect the embedded Excel file in some pptx files.
> The sample code follows; the tika-config.xml and a pptx file that works are
> attached.
> We cannot provide the pptx file that does not work because it is client data.
> We noticed a difference between the pptx files that work and the pptx file
> that does not work:
> "{*}Worksheet Object{*}" *is in the popup menu when the embedded Excel object
> is right-clicked in the pptx files that work.*
> "{*}Edit Data{*}" *is in the popup menu when the embedded Excel object is
> right-clicked in the pptx file that does not work. This file might be created
> with an old version of PowerPoint.*
>
> The operating system is Ubuntu 20.04. Java version is 17. Tika version is
> 2.9.1 and POI version is 5.2.3.
>
> import org.apache.pdfbox.io.IOUtils;
> import org.apache.poi.poifs.filesystem.DirectoryEntry;
> import org.apache.poi.poifs.filesystem.DocumentEntry;
> import org.apache.poi.poifs.filesystem.DocumentInputStream;
> import org.apache.poi.poifs.filesystem.POIFSFileSystem;
> import org.apache.tika.config.TikaConfig;
> import org.apache.tika.extractor.EmbeddedDocumentExtractor;
> import org.apache.tika.io.FilenameUtils;
> import org.apache.tika.io.TikaInputStream;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.metadata.TikaCoreProperties;
> import org.apache.tika.parser.AutoDetectParser;
> import org.apache.tika.parser.ParseContext;
> import org.apache.tika.parser.Parser;
> import org.xml.sax.ContentHandler;
> import org.xml.sax.SAXException;
> import org.xml.sax.helpers.DefaultHandler;
>
> import java.io.*;
> import java.net.URL;
> import java.nio.file.Path;
>
> public class ExtractExcelFromPowerPoint {
> private final Path pptxFile = new
> File("/home/ubuntu/testdirs/testdir_pptx/sample.pptx").toPath();
> private final Path outputDir = new
> File("/home/ubuntu/testdirs/testdir_pptx/tika_output/").toPath();
>
> private Parser parser;
> private ParseContext context;
>
>
> public static void main(String args[]) {
> try {
> new ExtractExcelFromPowerPoint().process();
> }
> catch(Exception ex) {
> ex.printStackTrace();
> }
> }
>
> public ExtractExcelFromPowerPoint() {
> }
>
> public void process() throws Exception {
> TikaConfig config = new
> TikaConfig("/home/ubuntu/testdirs/testdir_pptx/tika-config.xml");
> FileEmbeddedDocumentExtractor fileEmbeddedDocumentExtractor = new
> FileEmbeddedDocumentExtractor();
>
> parser = new AutoDetectParser(config);
> context = new ParseContext();
> context.set(Parser.class, parser);
> context.set(TikaConfig.class, config);
> context.set(EmbeddedDocumentExtractor.class,
> fileEmbeddedDocumentExtractor);
>
> URL url = pptxFile.toUri().toURL();
> Metadata metadata = new Metadata();
> try (InputStream input = TikaInputStream.get(url, metadata)) {
> ContentHandler handler = new DefaultHandler();
> parser.parse(input, handler, metadata, context);
> }
> }
>
> private class FileEmbeddedDocumentExtractor implements
> EmbeddedDocumentExtractor {
> private int count = 0;
>
> public boolean shouldParseEmbedded(Metadata metadata) {
> return true;
> }
>
> public void parseEmbedded(InputStream inputStream, ContentHandler
> contentHandler, Metadata metadata,
> boolean outputHtml) throws SAXException,
> IOException {
> String fullFileName =
> metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
> if (fullFileName == null) {
> fullFileName = "file" + count++;
> }
>
> String[] fileNameSplit = fullFileName.split("/");
> String fileName = fileNameSplit[fileNameSplit.length - 1];
> File outputFile = new File(outputDir.toFile(),
> FilenameUtils.normalize(fileName));
> System.out.println("Extracting '" + fileName + " to " +
> outputFile);
> FileOutputStream os = null;
> try {
> os = new FileOutputStream(outputFile);
> if (inputStream instanceof TikaInputStream tin) {
> if (tin.getOpenContainer() instanceof DirectoryEntry) {
> try(POIFSFileSystem fs = new POIFSFileSystem()){
> copy((DirectoryEntry) tin.getOpenContainer(),
> fs.getRoot());
> fs.writeFilesystem(os);
> }
> } else {
> IOUtils.copy(inputStream, os);
> }
> } else {
> IOUtils.copy(inputStream, os);
> }
> } catch (Exception ex) {
> ex.printStackTrace();
> } finally {
> if (os != null) {
> os.flush();
> os.close();
> }
> }
> }
>
> protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
> throws IOException {
> for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) {
> if (entry instanceof DirectoryEntry) {
> // Need to recurse
> DirectoryEntry newDir =
> destDir.createDirectory(entry.getName());
> copy((DirectoryEntry) entry, newDir);
> } else {
> // Copy entry
> try (InputStream contents = new
> DocumentInputStream((DocumentEntry) entry)) {
> destDir.createDocument(entry.getName(), contents);
> }
> }
> }
> }
> }
> }
> [^config_and_sample_file.zip]
--
This message was sent by Atlassian Jira
(v8.20.10#820010)