This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit faaaca7a6334ae53cf512c3b043cfd701793b00e
Author: tallison <[email protected]>
AuthorDate: Tue Mar 9 08:03:18 2021 -0500

    add timeout threshold for fileprofiler
---
 .../java/org/apache/tika/eval/FileProfiler.java    | 23 ++++++++++++++--------
 .../resources/tika-eval-file-profiler-config.xml   |  1 +
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
index 65908b7..12c3ef4 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
@@ -75,7 +75,7 @@ public class FileProfiler extends AbstractProfiler {
                 .addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
                 .addOption("drop", false, "drop tables if they exist")
                 .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
-
+                .addOption("timeoutThresholdMillis", true, "timeout per file in milliseconds")
         ;
 
     }
@@ -135,11 +135,20 @@ public class FileProfiler extends AbstractProfiler {
         try (InputStream is = fileResource.openInputStream()) {
             try (TikaInputStream tis = TikaInputStream.get(is)) {
                 Path path = tis.getPath();
+                long length = -1;
                 Map<Cols, String> data = new HashMap<>();
+                try {
+                    length = Files.size(path);
+                } catch (IOException e) {
+                    LOG.warn("problem getting size: "+relPath, e);
+                }
+                long start = System.currentTimeMillis();
                 int tikaMimeId = writer.getMimeId(detectTika(tis));
+                long elapsed = System.currentTimeMillis()-start;
+                LOG.debug("took "+elapsed+ " ms for tika detect on length "+length);
                 String fileName = "";
                 String extension = "";
-                long length = -1;
+
                 try {
                     fileName = FilenameUtils.getName(relPath);
                 } catch (IllegalArgumentException e) {
@@ -152,12 +161,6 @@ public class FileProfiler extends AbstractProfiler {
                     LOG.warn("bad extension: "+relPath, e);
                 }
 
-                try {
-                    length = Files.size(path);
-                } catch (IOException e) {
-                    LOG.warn("problem getting size: "+relPath, e);
-                }
-
                 data.put(Cols.FILE_PATH, relPath);
                 data.put(Cols.FILE_NAME, fileName);
                 data.put(Cols.FILE_EXTENSION, extension);
@@ -165,7 +168,11 @@ public class FileProfiler extends AbstractProfiler {
                 data.put(Cols.TIKA_MIME_ID, Integer.toString(tikaMimeId));
                 data.put(Cols.SHA256, DigestUtils.sha256Hex(tis));
                 if (HAS_FILE) {
+                    start = System.currentTimeMillis();
                     int fileMimeId = writer.getMimeId(detectFile(tis));
+                    elapsed = System.currentTimeMillis()-start;
+                    LOG.debug("took "+elapsed+ " ms for file detect on length "+length);
+
                     data.put(Cols.FILE_MIME_ID, Integer.toString(fileMimeId));
                 }
                 writer.writeRow(FILE_PROFILES, data);
diff --git a/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
index a22523f..c253cbe 100644
--- a/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
@@ -45,6 +45,7 @@
                 description="EXPERT: prefix for table names"/>
         <option opt="drop" hasArg="false" description="drop tables if they exist"/>
         <option opt="maxFilesToAdd" hasArg="true" description="maximum number of files to add to the crawler"/>
+        <option opt="timeoutThresholdMillis" hasArg="true" description="timeout per file in milliseconds"/>
 
     </commandline>
