This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 4f41baf87a495e0fc175c54067a1c39a908722ba Author: tallison <talli...@apache.org> AuthorDate: Tue Nov 17 13:21:17 2020 -0500 TIKA-3228 -- add file name/extension to FileProfiler and make mime table name distinct. --- .../org/apache/tika/batch/FileResourceCrawler.java | 18 +++++----- .../apache/tika/detect/FileCommandDetector.java | 14 +++++++- .../java/org/apache/tika/eval/FileProfiler.java | 39 +++++++++++++++++++++- .../java/org/apache/tika/eval/TikaEvalCLI.java | 3 +- .../tika/eval/batch/EvalConsumerBuilder.java | 5 ++- .../tika/eval/batch/ExtractComparerBuilder.java | 5 +++ .../tika/eval/batch/ExtractProfilerBuilder.java | 5 +++ .../tika/eval/batch/FileProfilerBuilder.java | 10 ++++-- .../java/org/apache/tika/eval/db/MimeBuffer.java | 4 +-- .../resources/tika-eval-file-profiler-config.xml | 2 +- .../main/resources/tika-eval-profiler-config.xml | 2 +- 11 files changed, 88 insertions(+), 19 deletions(-) diff --git a/tika-batch/src/main/java/org/apache/tika/batch/FileResourceCrawler.java b/tika-batch/src/main/java/org/apache/tika/batch/FileResourceCrawler.java index 7f6057b..3456fc3 100644 --- a/tika-batch/src/main/java/org/apache/tika/batch/FileResourceCrawler.java +++ b/tika-batch/src/main/java/org/apache/tika/batch/FileResourceCrawler.java @@ -113,16 +113,18 @@ public abstract class FileResourceCrawler implements Callable<IFileProcessorFutu boolean isAdded = false; if (select(fileResource.getMetadata())) { - long totalConsecutiveWait = 0; - while (queue.offer(fileResource, 1L, TimeUnit.SECONDS) == false) { + long start = System.currentTimeMillis(); + while (queue.offer(fileResource, PAUSE_INCREMENT_MILLIS, TimeUnit.MILLISECONDS) == false) { + long elapsed = System.currentTimeMillis() - start; + LOG.info("FileResourceCrawler is pausing. Queue is full: {} after {} ms", + queue.size(), elapsed); - LOG.info("FileResourceCrawler is pausing. 
Queue is full: {}", queue.size()); - Thread.sleep(PAUSE_INCREMENT_MILLIS); - totalConsecutiveWait += PAUSE_INCREMENT_MILLIS; - if (maxConsecWaitInMillis > -1 && totalConsecutiveWait > maxConsecWaitInMillis) { + if (maxConsecWaitInMillis > -1 && elapsed > maxConsecWaitInMillis) { timedOut = true; - LOG.error("Crawler had to wait longer than max consecutive wait time."); - throw new InterruptedException("FileResourceCrawler had to wait longer than max consecutive wait time."); + String msg = "FileResourceCrawler had to wait longer (" + + elapsed + " ms) than allowed ("+maxConsecWaitInMillis+" ms)"; + LOG.error(msg); + throw new InterruptedException(msg); } if (Thread.currentThread().isInterrupted()) { LOG.info("FileResourceCrawler shutting down because of interrupted thread."); diff --git a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java index 6544e43..1e242fe 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java @@ -78,6 +78,13 @@ public class FileCommandDetector implements Detector { return ExternalParser.check(commandline); } + /** + * + * @param input document input stream, or <code>null</code> + * @param metadata input metadata for the document + * @return mime as identified by the file command or application/octet-stream otherwise + * @throws IOException + */ @Override public MediaType detect(InputStream input, Metadata metadata) throws IOException { if (hasFileCommand == null) { @@ -144,7 +151,12 @@ public class FileCommandDetector implements Detector { outThread.join(); } catch (InterruptedException e) { } - return MediaType.parse(outGobbler.toString().trim()); + MediaType mt = MediaType.parse(outGobbler.toString().trim()); + if (mt == null) { + return MediaType.OCTET_STREAM; + } else { + return mt; + } } @Field diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java index dcd1751..65908b7 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java @@ -20,6 +20,7 @@ import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.io.FilenameUtils; import org.apache.tika.Tika; import org.apache.tika.batch.FileResource; import org.apache.tika.batch.fs.FSProperties; @@ -30,6 +31,8 @@ import org.apache.tika.eval.db.TableInfo; import org.apache.tika.eval.io.IDBWriter; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.InputStream; @@ -53,6 +56,8 @@ public class FileProfiler extends AbstractProfiler { //TODO: we should allow users to select digest type/encoding and file detector(s). private static final boolean HAS_FILE = FileCommandDetector.checkHasFile(); + private static final Logger LOG = LoggerFactory.getLogger(FileProfiler.class); + static Options OPTIONS; static { @@ -90,6 +95,8 @@ public class FileProfiler extends AbstractProfiler { public static TableInfo FILE_PROFILES = HAS_FILE ? 
new TableInfo("file_profiles", new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY KEY"), + new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 2048), + new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 24), new ColInfo(Cols.LENGTH, Types.BIGINT), new ColInfo(Cols.SHA256, Types.VARCHAR, 64), new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER), @@ -97,11 +104,18 @@ public class FileProfiler extends AbstractProfiler { : new TableInfo("file_profiles", new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY KEY"), + new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 2048), + new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 24), new ColInfo(Cols.LENGTH, Types.BIGINT), new ColInfo(Cols.SHA256, Types.VARCHAR, 64), new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER)); + public static TableInfo FILE_MIME_TABLE = new TableInfo("file_mimes", + new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"), + new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256), + new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12) + ); public static final String DETECT_EXCEPTION = "detect-exception"; private static final Tika TIKA = new Tika(); @@ -123,9 +137,32 @@ public class FileProfiler extends AbstractProfiler { Path path = tis.getPath(); Map<Cols, String> data = new HashMap<>(); int tikaMimeId = writer.getMimeId(detectTika(tis)); + String fileName = ""; + String extension = ""; + long length = -1; + try { + fileName = FilenameUtils.getName(relPath); + } catch (IllegalArgumentException e) { + LOG.warn("bad file name: "+relPath, e); + } + + try { + extension = FilenameUtils.getExtension(relPath); + } catch (IllegalArgumentException e) { + LOG.warn("bad extension: "+relPath, e); + } + + try { + length = Files.size(path); + } catch (IOException e) { + LOG.warn("problem getting size: "+relPath, e); + } + data.put(Cols.FILE_PATH, relPath); + data.put(Cols.FILE_NAME, fileName); + data.put(Cols.FILE_EXTENSION, extension); + data.put(Cols.LENGTH, Long.toString(length)); data.put(Cols.TIKA_MIME_ID, Integer.toString(tikaMimeId)); 
- data.put(Cols.LENGTH, Long.toString(Files.size(path))); data.put(Cols.SHA256, DigestUtils.sha256Hex(tis)); if (HAS_FILE) { int fileMimeId = writer.getMimeId(detectFile(tis)); diff --git a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java index 6b709b8..fd4206e 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java @@ -33,7 +33,8 @@ import org.apache.tika.eval.reports.ResultsReporter; import org.h2.tools.Console; public class TikaEvalCLI { - static final String[] tools = {"Profile", "Compare", "Report", "StartDB"}; + static final String[] tools = {"Profile", "FileProfile", + "Compare", "Report", "StartDB"}; private static String specifyTools() { StringBuilder sb = new StringBuilder(); diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java index 6f407f6..9953973 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java @@ -76,7 +76,8 @@ public abstract class EvalConsumerBuilder { dbUtil.createTables(getRefTableInfos(), createRefTable); //step 3. create mime buffer - this.mimeBuffer = new MimeBuffer(dbUtil.getConnection(), TikaConfig.getDefaultConfig()); + this.mimeBuffer = new MimeBuffer(dbUtil.getConnection(), getMimeTable(), + TikaConfig.getDefaultConfig()); //step 4. 
populate the reference tables populateRefTables(); @@ -100,6 +101,8 @@ public abstract class EvalConsumerBuilder { */ protected abstract List<TableInfo> getNonRefTableInfos(); + protected abstract TableInfo getMimeTable(); + protected abstract void addErrorLogTablePairs(DBConsumersManager manager); public void populateRefTables() throws IOException, SQLException { diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java index 909032c..0ae893f 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java @@ -143,6 +143,11 @@ public class ExtractComparerBuilder extends EvalConsumerBuilder { } @Override + protected TableInfo getMimeTable() { + return AbstractProfiler.MIME_TABLE; + } + + @Override protected void addErrorLogTablePairs(DBConsumersManager manager) { Path errorLogA = PropsUtil.getPath(localAttrs.get("errorLogFileA"), null); if (errorLogA == null) { diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java index 729460b..20efbf7 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java @@ -107,6 +107,11 @@ public class ExtractProfilerBuilder extends EvalConsumerBuilder { } @Override + protected TableInfo getMimeTable() { + return AbstractProfiler.MIME_TABLE; + } + + @Override protected void addErrorLogTablePairs(DBConsumersManager manager) { Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null); if (errorLog == null) { diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java 
index 0ba7bea..a54dd55 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java @@ -17,14 +17,12 @@ package org.apache.tika.eval.batch; import org.apache.tika.batch.FileResourceConsumer; -import org.apache.tika.eval.AbstractProfiler; import org.apache.tika.eval.ExtractProfiler; import org.apache.tika.eval.FileProfiler; import org.apache.tika.eval.db.TableInfo; import org.apache.tika.util.PropsUtil; import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Path; import java.sql.SQLException; import java.util.ArrayList; @@ -38,9 +36,10 @@ public class FileProfilerBuilder extends EvalConsumerBuilder { public final static String TABLE_PREFIX_KEY = "tablePrefix"; private final List<TableInfo> tableInfos; + public FileProfilerBuilder() { List<TableInfo> tableInfos = new ArrayList(); - tableInfos.add(AbstractProfiler.MIME_TABLE); + tableInfos.add(FileProfiler.FILE_MIME_TABLE); tableInfos.add(FileProfiler.FILE_PROFILES); this.tableInfos = Collections.unmodifiableList(tableInfos); @@ -83,6 +82,11 @@ public class FileProfilerBuilder extends EvalConsumerBuilder { } @Override + protected TableInfo getMimeTable() { + return FileProfiler.FILE_MIME_TABLE; + } + + @Override protected void addErrorLogTablePairs(DBConsumersManager manager) { Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null); if (errorLog == null) { diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java index 9f6b136..eba731b 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java @@ -36,8 +36,8 @@ public class MimeBuffer extends AbstractDBBuffer { private final Connection connection; - public MimeBuffer(Connection connection, TikaConfig config) throws SQLException { - st = 
connection.prepareStatement("insert into " + AbstractProfiler.MIME_TABLE.getName() + "( " + + public MimeBuffer(Connection connection, TableInfo mimeTable, TikaConfig config) throws SQLException { + st = connection.prepareStatement("insert into " + mimeTable.getName() + "( " + Cols.MIME_ID.name() + ", " + Cols.MIME_STRING.name() + ", " + Cols.FILE_EXTENSION.name() + ") values (?,?,?)"); diff --git a/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml index 6a7867a..a22523f 100644 --- a/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml +++ b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml @@ -56,7 +56,7 @@ <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder" crawlOrder="sorted" - maxConsecWaitMillis="5000" + maxConsecWaitMillis="300000" maxFilesToAdd="-1" maxFilesToConsider="-1" includeFilePat="" diff --git a/tika-eval/src/main/resources/tika-eval-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-profiler-config.xml index e5f090f..030bd3f 100644 --- a/tika-eval/src/main/resources/tika-eval-profiler-config.xml +++ b/tika-eval/src/main/resources/tika-eval-profiler-config.xml @@ -75,7 +75,7 @@ <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder" crawlOrder="sorted" - maxConsecWaitMillis="5000" + maxConsecWaitMillis="300000" maxFilesToAdd="-1" maxFilesToConsider="-1" includeFilePat=""