This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit 2a84826e0dfd068a60f4de906e02b23f9e310d4d Author: tballison <[email protected]> AuthorDate: Thu Feb 4 13:16:50 2021 -0500 simplify timeout on process call and the destroyforcibly call on the tesseract process --- .../apache/tika/parser/ocr/TesseractOCRParser.java | 47 +++++++++++----------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index 5a8b0e9..54d9388 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -322,44 +322,46 @@ public class TesseractOCRParser extends AbstractParser { ProcessBuilder pb = new ProcessBuilder(cmd); setEnv(config, pb); - final Process process = pb.start(); + Process process = null; + try { + process = pb.start(); + runOCRProcess(process, config.getTimeout()); + } finally { + if (process != null) { + process.destroyForcibly(); + } + } + } + + private void runOCRProcess(Process process, int timeout) throws IOException, TikaException { process.getOutputStream().close(); InputStream out = process.getInputStream(); InputStream err = process.getErrorStream(); StringBuilder outBuilder = new StringBuilder(); StringBuilder errBuilder = new StringBuilder(); - logStream("OCR MSG", out, input, outBuilder); - logStream("OCR ERROR", err, input, errBuilder); - - FutureTask<Integer> waitTask = new FutureTask<>(new Callable<Integer>() { - public Integer call() throws Exception { - return process.waitFor(); - } - }); - - Thread waitThread = new Thread(waitTask); - waitThread.start(); + logStream(out, outBuilder); + logStream(err, errBuilder); int exitValue = Integer.MIN_VALUE; try { - exitValue = waitTask.get(config.getTimeout(), TimeUnit.SECONDS); + boolean finished = process.waitFor(timeout, TimeUnit.SECONDS); + if (! finished) { + throw new TikaException("TesseractOCRParser timeout"); + } + exitValue = process.exitValue(); } catch (InterruptedException e) { - waitThread.interrupt(); - process.destroy(); Thread.currentThread().interrupt(); throw new TikaException("TesseractOCRParser interrupted", e); - } catch (ExecutionException e) { - // should not be thrown - } catch (TimeoutException e) { - waitThread.interrupt(); - process.destroy(); - throw new TikaException("TesseractOCRParser timeout", e); + } catch (IllegalThreadStateException e) { + //this _should_ never be thrown + throw new TikaException("TesseractOCRParser timeout"); } if (exitValue > 0) { throw new TikaException("TesseractOCRParser bad exit value " + exitValue + " err msg: "+errBuilder.toString()); } + } /** @@ -411,8 +413,7 @@ public class TesseractOCRParser extends AbstractParser { * stream of the given process to not block the process. The stream is closed * once fully processed. */ - private void logStream(final String logType, final InputStream stream, - final File file, final StringBuilder out) { + private void logStream(final InputStream stream, final StringBuilder out) { new Thread() { public void run() { Reader reader = new InputStreamReader(stream, UTF_8);
