[jira] [Comment Edited] (TIKA-4228) Tika parser crashes JVM when it gets metadata and embedded objects from pdf

Tim Allison (Jira) Wed, 27 Mar 2024 12:59:24 -0700


    [ 
https://issues.apache.org/jira/browse/TIKA-4228?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17831518#comment-17831518
 ]


Tim Allison edited comment on TIKA-4228 at 3/27/24 7:57 PM:
------------------------------------------------------------

As I think about it, this code wouldn't extract all of the embedded images in 
the PDF...so that's not a concern...you'd have to turn on extractInlineImages.

I can run getMetadata() with -Xmx256m with no problems with the current 
branch_2x.


was (Author: [email protected]):
As I think about it, this code wouldn't extract all of the embedded images in 
the PDF...so that's not a concern...you'd have to turn on extractInlineImages.

I can run getMetadata() with -Xmx256m with no problems.

> Tika parser crashes JVM when it gets metadata and embedded objects from pdf
> ---------------------------------------------------------------------------
>
>                 Key: TIKA-4228
>                 URL: https://issues.apache.org/jira/browse/TIKA-4228
>             Project: Tika
>          Issue Type: Bug
>            Reporter: Xiaohong Yang
>            Priority: Major
>         Attachments: tika-config-and-sample-file.zip
>
>
> [^tika-config-and-sample-file.zip]
>  
> We use org.apache.tika.parser.AutoDetectParser to get metadata and embedded 
> objects from PDF documents.  We found that it crashes the program (or the 
> JVM) when it gets metadata and embedded files from the sample PDF file.
>  
> The sample code follows, and the tika-config.xml and the sample PDF file 
> are attached. Note that the sample file crashes the JVM in 1 out of 10 
> runs in our production environment.  Sometimes the crash happens while 
> getting metadata and sometimes while extracting embedded files (the 
> chances are about 50/50).
>  
> The operating system is Ubuntu 20.04, the Java version is 21, the Tika 
> version is 2.9.0, and the POI version is 5.2.3.
>  
>  
> import org.apache.pdfbox.io.IOUtils;
> import org.apache.poi.poifs.filesystem.DirectoryEntry;
> import org.apache.poi.poifs.filesystem.DocumentEntry;
> import org.apache.poi.poifs.filesystem.DocumentInputStream;
> import org.apache.poi.poifs.filesystem.POIFSFileSystem;
> import org.apache.tika.config.TikaConfig;
> import org.apache.tika.detect.Detector;
> import org.apache.tika.extractor.EmbeddedDocumentExtractor;
> import org.apache.tika.io.FilenameUtils;
> import org.apache.tika.io.TikaInputStream;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.metadata.TikaCoreProperties;
> import org.apache.tika.mime.MediaType;
> import org.apache.tika.parser.AutoDetectParser;
> import org.apache.tika.parser.ParseContext;
> import org.apache.tika.parser.Parser;
> import org.apache.tika.sax.BodyContentHandler;
> import org.xml.sax.ContentHandler;
> import org.xml.sax.SAXException;
> import org.xml.sax.helpers.DefaultHandler;
>  
> import java.io.*;
> import java.net.URL;
> import java.nio.file.Files;
> import java.nio.file.Path;
> import java.nio.file.Paths;
>  
> public class ProcessPdf {
>     private final Path inputFile = new 
> File("/home/ubuntu/testdirs/testdir_pdf/sample.pdf").toPath();
>     private final Path outputDir = new 
> File("/home/ubuntu/testdirs/testdir_pdf/tika_output/").toPath();
>  
>     private Parser parser;
>     private ParseContext context;
>  
>  
>     public static void main(String args[]) {
>         try
> {             System.out.println("Start");             ProcessPdf processPdf 
> = new ProcessPdf();             System.out.println("Get metadata");           
>   processPdf.getMataData();             System.out.println("Extract embedded 
> files");             processPdf.extract();             
> System.out.println("End");         }
>         catch(Exception ex)
> {             ex.printStackTrace();         }
>     }
>  
>     public ProcessPdf()
> {     }
>  
>     public void getMataData() throws Exception {
>         BodyContentHandler handler = new BodyContentHandler(-1);
>  
>         Metadata metadata = new Metadata();
>         try (FileInputStream inputData = new 
> FileInputStream(inputFile.toString()))
> {             TikaConfig config = new 
> TikaConfig("/home/ubuntu/testdirs/testdir_pdf/tika-config.xml");             
> Parser autoDetectParser = new AutoDetectParser(config);             
> ParseContext context = new ParseContext();             
> context.set(TikaConfig.class, config);             
> autoDetectParser.parse(inputData, handler, metadata, context);         }
>  
>         String content = handler.toString();
>     }
>  
>     public void extract() throws Exception {
>         TikaConfig config = new 
> TikaConfig("/home/ubuntu/testdirs/testdir_pdf/tika-config.xml");
>         ProcessPdf.FileEmbeddedDocumentExtractor 
> fileEmbeddedDocumentExtractor = new 
> ProcessPdf.FileEmbeddedDocumentExtractor();
>  
>         parser = new AutoDetectParser(config);
>         context = new ParseContext();
>         context.set(Parser.class, parser);
>         context.set(TikaConfig.class, config);
>         context.set(EmbeddedDocumentExtractor.class, 
> fileEmbeddedDocumentExtractor);
>  
>         URL url = inputFile.toUri().toURL();
>         Metadata metadata = new Metadata();
>         try (InputStream input = TikaInputStream.get(url, metadata))
> {             ContentHandler handler = new DefaultHandler();             
> parser.parse(input, handler, metadata, context);         }
>     }
>  
>     private class FileEmbeddedDocumentExtractor implements 
> EmbeddedDocumentExtractor {
>         private int count = 0;
>  
>         public boolean shouldParseEmbedded(Metadata metadata)
> {             return true;         }
>  
>         public void parseEmbedded(InputStream inputStream, ContentHandler 
> contentHandler, Metadata metadata,
>                                   boolean outputHtml) throws SAXException, 
> IOException {
>             String fullFileName = 
> metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
>             if (fullFileName == null)
> {                 fullFileName = "file" + count++;             }
>  
>             TikaConfig config = null;
>             try
> {                 config = new 
> TikaConfig("/home/ubuntu/testdirs/testdir_pdf/tika-config.xml");             }
> catch (Exception ex)
> {                 ex.printStackTrace();             }
>             if (config == null) \{                 return;             }
>  
>             Detector detector = config.getDetector();;
>             MediaType contentType = detector.detect(inputStream, metadata);
>             String tikaExtension = null;
>             if(fullFileName.indexOf('.') == -1 && contentType != null){
>                 try \{                     tikaExtension = 
> config.getMimeRepository().forName(contentType.toString()).getExtension();    
>              } catch (Exception ex) \{                     
> ex.printStackTrace();                 }
>  
>                 if (tikaExtension != null && !tikaExtension.isEmpty() ) \{    
>                  fullFileName += tikaExtension;                 }
>             }
>  
>             String[] fileNameSplit = fullFileName.split("/");
>             String fileName = fileNameSplit[fileNameSplit.length - 1];
>             File outputFile = new File(outputDir.toFile(), 
> FilenameUtils.normalize(fileName));
>             System.out.println("Extracting '" + fileName + " to " + 
> outputFile);
>             FileOutputStream os = null;
>             try {
>                 os = new FileOutputStream(outputFile);
>                 if (inputStream instanceof TikaInputStream tin) {
>                     if (tin.getOpenContainer() instanceof DirectoryEntry) {
>                         try(POIFSFileSystem fs = new POIFSFileSystem())\{     
>                         copy((DirectoryEntry) tin.getOpenContainer(), 
> fs.getRoot());                             fs.writeFilesystem(os);            
>              }
>                     } else \{                         
> IOUtils.copy(inputStream, os);                     }
>                 } else \{                     IOUtils.copy(inputStream, os);  
>                }
>             } catch (Exception ex) \{                 ex.printStackTrace();   
>           }
> finally {
>                 if (os != null)
> {                     os.flush();                     os.close();             
>     }
>             }
>         }
>  
>         protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir) 
> throws IOException {
>             for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) {
>                 if (entry instanceof DirectoryEntry)
> {                     // Need to recurse                     DirectoryEntry 
> newDir = destDir.createDirectory(entry.getName());                     
> copy((DirectoryEntry) entry, newDir);                 }
> else {
>                     // Copy entry
>                     try (InputStream contents = new 
> DocumentInputStream((DocumentEntry) entry))
> {                         destDir.createDocument(entry.getName(), contents);  
>                    }
>                 }
>             }
>         }
>     }
> }
>  ^^ 



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

[jira] [Comment Edited] (TIKA-4228) Tika parser crashes JVM when it gets metadata and embedded objects from pdf

Reply via email to