[
https://issues.apache.org/jira/browse/TIKA-4228?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17831524#comment-17831524
]
Tim Allison edited comment on TIKA-4228 at 3/27/24 8:29 PM:
------------------------------------------------------------
What's the exit code? -Are you on a system with an OOM killer or other process
killer-, and if so, do the logs suggest that the OS killed the process?
Sorry, Ubuntu, right. Anything in the logs?
https://www.baeldung.com/linux/what-killed-a-process
was (Author: [email protected]):
What's the exit code? Are you on a system with an oom killer or other process
killer, and if so, what do its logs say?
> Tika parser crashes JVM when it gets metadata and embedded objects from pdf
> ---------------------------------------------------------------------------
>
> Key: TIKA-4228
> URL: https://issues.apache.org/jira/browse/TIKA-4228
> Project: Tika
> Issue Type: Bug
> Reporter: Xiaohong Yang
> Priority: Major
> Attachments: tika-config-and-sample-file.zip
>
>
> [^tika-config-and-sample-file.zip]
>
> We use org.apache.tika.parser.AutoDetectParser to get metadata and embedded
> objects from PDF documents. We found that it crashes the program (or the
> JVM) when it gets metadata and embedded files from the sample PDF file.
>
> The sample code follows; the tika-config.xml and the sample PDF file are
> attached. Note that the sample file crashes the JVM in 1 out of 10 runs in
> our production environment. Sometimes the crash happens while getting
> metadata and sometimes while extracting embedded files (the chances are
> about 50/50).
>
> The operating system is Ubuntu 20.04. Java version is 21. Tika version is
> 2.9.0 and POI version is 5.2.3.
>
>
> import org.apache.pdfbox.io.IOUtils;
> import org.apache.poi.poifs.filesystem.DirectoryEntry;
> import org.apache.poi.poifs.filesystem.DocumentEntry;
> import org.apache.poi.poifs.filesystem.DocumentInputStream;
> import org.apache.poi.poifs.filesystem.POIFSFileSystem;
> import org.apache.tika.config.TikaConfig;
> import org.apache.tika.detect.Detector;
> import org.apache.tika.extractor.EmbeddedDocumentExtractor;
> import org.apache.tika.io.FilenameUtils;
> import org.apache.tika.io.TikaInputStream;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.metadata.TikaCoreProperties;
> import org.apache.tika.mime.MediaType;
> import org.apache.tika.parser.AutoDetectParser;
> import org.apache.tika.parser.ParseContext;
> import org.apache.tika.parser.Parser;
> import org.apache.tika.sax.BodyContentHandler;
> import org.xml.sax.ContentHandler;
> import org.xml.sax.SAXException;
> import org.xml.sax.helpers.DefaultHandler;
>
> import java.io.*;
> import java.net.URL;
> import java.nio.file.Files;
> import java.nio.file.Path;
> import java.nio.file.Paths;
>
> public class ProcessPdf {
> private final Path inputFile = new
> File("/home/ubuntu/testdirs/testdir_pdf/sample.pdf").toPath();
> private final Path outputDir = new
> File("/home/ubuntu/testdirs/testdir_pdf/tika_output/").toPath();
>
> private Parser parser;
> private ParseContext context;
>
>
> public static void main(String args[]) {
> try
> { System.out.println("Start"); ProcessPdf processPdf
> = new ProcessPdf(); System.out.println("Get metadata");
> processPdf.getMataData(); System.out.println("Extract embedded
> files"); processPdf.extract();
> System.out.println("End"); }
> catch(Exception ex)
> { ex.printStackTrace(); }
> }
>
> public ProcessPdf()
> { }
>
> public void getMataData() throws Exception {
> BodyContentHandler handler = new BodyContentHandler(-1);
>
> Metadata metadata = new Metadata();
> try (FileInputStream inputData = new
> FileInputStream(inputFile.toString()))
> { TikaConfig config = new
> TikaConfig("/home/ubuntu/testdirs/testdir_pdf/tika-config.xml");
> Parser autoDetectParser = new AutoDetectParser(config);
> ParseContext context = new ParseContext();
> context.set(TikaConfig.class, config);
> autoDetectParser.parse(inputData, handler, metadata, context); }
>
> String content = handler.toString();
> }
>
> public void extract() throws Exception {
> TikaConfig config = new
> TikaConfig("/home/ubuntu/testdirs/testdir_pdf/tika-config.xml");
> ProcessPdf.FileEmbeddedDocumentExtractor
> fileEmbeddedDocumentExtractor = new
> ProcessPdf.FileEmbeddedDocumentExtractor();
>
> parser = new AutoDetectParser(config);
> context = new ParseContext();
> context.set(Parser.class, parser);
> context.set(TikaConfig.class, config);
> context.set(EmbeddedDocumentExtractor.class,
> fileEmbeddedDocumentExtractor);
>
> URL url = inputFile.toUri().toURL();
> Metadata metadata = new Metadata();
> try (InputStream input = TikaInputStream.get(url, metadata))
> { ContentHandler handler = new DefaultHandler();
> parser.parse(input, handler, metadata, context); }
> }
>
> private class FileEmbeddedDocumentExtractor implements
> EmbeddedDocumentExtractor {
> private int count = 0;
>
> public boolean shouldParseEmbedded(Metadata metadata)
> { return true; }
>
> public void parseEmbedded(InputStream inputStream, ContentHandler
> contentHandler, Metadata metadata,
> boolean outputHtml) throws SAXException,
> IOException {
> String fullFileName =
> metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
> if (fullFileName == null)
> { fullFileName = "file" + count++; }
>
> TikaConfig config = null;
> try
> { config = new
> TikaConfig("/home/ubuntu/testdirs/testdir_pdf/tika-config.xml"); }
> catch (Exception ex)
> { ex.printStackTrace(); }
> if (config == null) \{ return; }
>
> Detector detector = config.getDetector();;
> MediaType contentType = detector.detect(inputStream, metadata);
> String tikaExtension = null;
> if(fullFileName.indexOf('.') == -1 && contentType != null){
> try \{ tikaExtension =
> config.getMimeRepository().forName(contentType.toString()).getExtension();
> } catch (Exception ex) \{
> ex.printStackTrace(); }
>
> if (tikaExtension != null && !tikaExtension.isEmpty() ) \{
> fullFileName += tikaExtension; }
> }
>
> String[] fileNameSplit = fullFileName.split("/");
> String fileName = fileNameSplit[fileNameSplit.length - 1];
> File outputFile = new File(outputDir.toFile(),
> FilenameUtils.normalize(fileName));
> System.out.println("Extracting '" + fileName + " to " +
> outputFile);
> FileOutputStream os = null;
> try {
> os = new FileOutputStream(outputFile);
> if (inputStream instanceof TikaInputStream tin) {
> if (tin.getOpenContainer() instanceof DirectoryEntry) {
> try(POIFSFileSystem fs = new POIFSFileSystem())\{
> copy((DirectoryEntry) tin.getOpenContainer(),
> fs.getRoot()); fs.writeFilesystem(os);
> }
> } else \{
> IOUtils.copy(inputStream, os); }
> } else \{ IOUtils.copy(inputStream, os);
> }
> } catch (Exception ex) \{ ex.printStackTrace();
> }
> finally {
> if (os != null)
> { os.flush(); os.close();
> }
> }
> }
>
> protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
> throws IOException {
> for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) {
> if (entry instanceof DirectoryEntry)
> { // Need to recurse DirectoryEntry
> newDir = destDir.createDirectory(entry.getName());
> copy((DirectoryEntry) entry, newDir); }
> else {
> // Copy entry
> try (InputStream contents = new
> DocumentInputStream((DocumentEntry) entry))
> { destDir.createDocument(entry.getName(), contents);
> }
> }
> }
> }
> }
> }
> ^^
--
This message was sent by Atlassian Jira
(v8.20.10#820010)