TIKA-1332 -- add English/Spanish common tokens, fix logging
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/dc2dcd4c Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/dc2dcd4c Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/dc2dcd4c Branch: refs/heads/master Commit: dc2dcd4ccc7bca640bb362f72729d0b6ba22a890 Parents: a2d214c Author: tballison <[email protected]> Authored: Thu Feb 16 20:13:07 2017 -0500 Committer: tballison <[email protected]> Committed: Thu Feb 16 20:13:07 2017 -0500 ---------------------------------------------------------------------- .../org/apache/tika/eval/AbstractProfiler.java | 17 +- .../org/apache/tika/eval/ExtractProfiler.java | 24 +- .../java/org/apache/tika/eval/TikaEvalCLI.java | 22 +- .../tika/eval/batch/EvalConsumersBuilder.java | 2 +- .../eval/batch/SingleFileConsumerBuilder.java | 18 +- .../eval/tokens/CommonTokenCountManager.java | 75 +- tika-eval/src/main/resources/common_tokens/en | 20000 +++++++++++++++++ tika-eval/src/main/resources/common_tokens/es | 19997 ++++++++++++++++ tika-eval/src/main/resources/log4j.properties | 11 + .../resources/tika-eval-comparison-config.xml | 6 +- .../resources/tika-eval-profiler-config.xml | 7 +- .../apache/tika/eval/SimpleComparerTest.java | 2 +- .../org/apache/tika/eval/TikaEvalCLITest.java | 2 +- tika-eval/src/test/resources/common_tokens/en | 8 + tika-eval/src/test/resources/common_tokens/es | 10 + .../src/test/resources/common_tokens/zh-cn | 8 + .../src/test/resources/common_tokens/zh-tw | 8 + tika-eval/src/test/resources/commontokens/en | 8 - tika-eval/src/test/resources/commontokens/es | 10 - tika-eval/src/test/resources/commontokens/zh-cn | 8 - tika-eval/src/test/resources/commontokens/zh-tw | 8 - tika-eval/src/test/resources/log4j.properties | 11 - .../src/test/resources/log4j_process.properties | 11 - ...ingle-file-profiler-crawl-extract-config.xml | 4 +- .../single-file-profiler-crawl-input-config.xml | 4 +- 25 files changed, 40143 insertions(+), 138 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java index 24f7358..daa964a 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java @@ -158,6 +158,11 @@ public abstract class AbstractProfiler extends FileResourceConsumer { final LanguageIDWrapper langIder; protected IDBWriter writer; + /** + * + * @param p path to the common_tokens directory. If this is null, try to load from classPath + * @throws IOException + */ public static void loadCommonTokens(Path p) throws IOException { commonTokenCountManager = new CommonTokenCountManager(p); } @@ -536,29 +541,29 @@ public abstract class AbstractProfiler extends FileResourceConsumer { /** * * @param metadata - * @param extractDir + * @param extracts * @return evalfilepaths for files if crawling an extract directory */ protected EvalFilePaths getPathsFromExtractCrawl(Metadata metadata, - Path extractDir) { + Path extracts) { String relExtractFilePath = metadata.get(FSProperties.FS_REL_PATH); Matcher m = FILE_NAME_CLEANER.matcher(relExtractFilePath); Path relativeSourceFilePath = Paths.get(m.replaceAll("")); //just try slapping the relextractfilepath on the extractdir - Path extractFile = extractDir.resolve(relExtractFilePath); + Path extractFile = extracts.resolve(relExtractFilePath); if (! Files.isRegularFile(extractFile)) { //if that doesn't work, try to find the right extract file. //This is necessary if crawling extractsA and trying to find a file in //extractsB that is not in the same format: json vs txt or compressed - extractFile = findFile(extractDir, relativeSourceFilePath); + extractFile = findFile(extracts, relativeSourceFilePath); } return new EvalFilePaths(relativeSourceFilePath, extractFile); } //call this if the crawler is crawling through the src directory protected EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir, - Path extractDir) { + Path extracts) { Path relativeSourceFilePath = Paths.get(metadata.get(FSProperties.FS_REL_PATH)); - Path extractFile = findFile(extractDir, relativeSourceFilePath); + Path extractFile = findFile(extracts, relativeSourceFilePath); Path inputFile = srcDir.resolve(relativeSourceFilePath); long srcLen = -1l; //try to get the length of the source file in case there was an error http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java index 6840926..f3b10ca 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java @@ -44,18 +44,18 @@ public class ExtractProfiler extends AbstractProfiler { Option db = new Option("db", true, "db file to which to write results"); db.setRequired(true); - //By the time this commandline is parsed, there should be both an extractDir and an inputDir - Option extractDir = new Option("extractDir", true, "directory for extract files"); - extractDir.setRequired(true); + //By the time this commandline is parsed, there should be both an extracts and an inputDir + Option extracts = new Option("extracts", true, "directory for extract files"); + extracts.setRequired(true); Option inputDir = new Option("inputDir", true, "optional: directory for original binary input documents."+ - " If not specified, -extractDir is crawled as is."); + " If not specified, -extracts is crawled as is."); inputDir.setRequired(true); OPTIONS = new Options() .addOption(db) - .addOption(extractDir) + .addOption(extracts) .addOption(inputDir) .addOption("bc", "optional: tika-batch config file") .addOption("numConsumers", true, "optional: number of consumer threads") @@ -71,7 +71,7 @@ public class ExtractProfiler extends AbstractProfiler { HelpFormatter helpFormatter = new HelpFormatter(); helpFormatter.printHelp( 80, - "java -jar tika-eval-x.y.jar Profile -extractDir extracts -db mydb [-inputDir input]", + "java -jar tika-eval-x.y.jar Profile -extracts extracts -db mydb [-inputDir input]", "Tool: Profile", ExtractProfiler.OPTIONS, "Note: for h2 db, do not include the .mv.db at the end of the db name."); @@ -143,16 +143,16 @@ public class ExtractProfiler extends AbstractProfiler { ); private final Path inputDir; - private final Path extractDir; + private final Path extracts; private final ExtractReader.ALTER_METADATA_LIST alterExtractList; private final ExtractReader extractReader = new ExtractReader(); public ExtractProfiler(ArrayBlockingQueue<FileResource> queue, - Path inputDir, Path extractDir, + Path inputDir, Path extracts, IDBWriter dbWriter, ExtractReader.ALTER_METADATA_LIST alterExtractList) { super(queue, dbWriter); this.inputDir = inputDir; - this.extractDir = extractDir; + this.extracts = extracts; this.alterExtractList = alterExtractList; } @@ -161,11 +161,11 @@ public class ExtractProfiler extends AbstractProfiler { Metadata metadata = fileResource.getMetadata(); EvalFilePaths fps = null; - if (inputDir != null && inputDir.equals(extractDir)) { + if (inputDir != null && inputDir.equals(extracts)) { //crawling an extract dir - fps = getPathsFromExtractCrawl(metadata, extractDir); + fps = getPathsFromExtractCrawl(metadata, extracts); } else { - fps = getPathsFromSrcCrawl(metadata, inputDir, extractDir); + fps = getPathsFromSrcCrawl(metadata, inputDir, extracts); } List<Metadata> metadataList = extractReader.loadExtract(fps.getExtractFile(), alterExtractList); http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java index 5860327..2fd7f72 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java @@ -78,7 +78,7 @@ public class TikaEvalCLI { boolean containsBC = false; String inputDir = null; - String extractDir = null; + String extracts = null; String alterExtract = null; //confirm there's a batch-config file for (int i = 0; i < argList.size(); i++) { @@ -93,13 +93,13 @@ public class TikaEvalCLI { } inputDir = argList.get(i+1); i++; - } else if (arg.equals("-extractDir")) { + } else if (arg.equals("-extracts")) { if (i+1 >= argList.size()) { - System.err.println("Must specify directory after -extractDir"); + System.err.println("Must specify directory after -extracts"); ExtractProfiler.USAGE(); return; } - extractDir = argList.get(i+1); + extracts = argList.get(i+1); i++; } else if (arg.equals("-alterExtract")) { if (i+1 >= argList.size()) { @@ -122,15 +122,15 @@ public class TikaEvalCLI { } //need to specify each in this commandline - //if only extractDir is passed to tika-batch, + //if only extracts is passed to tika-batch, //the crawler will see no inputDir and start crawling "input". - //this allows the user to specify either extractDir or inputDir - if (extractDir == null && inputDir != null) { - argList.add("-extractDir"); + //this allows the user to specify either extracts or inputDir + if (extracts == null && inputDir != null) { + argList.add("-extracts"); argList.add(inputDir); - } else if (inputDir == null && extractDir != null) { + } else if (inputDir == null && extracts != null) { argList.add("-inputDir"); - argList.add(extractDir); + argList.add(extracts); } Path tmpBCConfig = null; @@ -210,7 +210,7 @@ public class TikaEvalCLI { } //need to specify each in the commandline that goes into tika-batch - //if only extractDir is passed to tika-batch, + //if only extracts is passed to tika-batch, //the crawler will see no inputDir and start crawling "input". //if the user doesn't specify inputDir, crawl extractsA if (inputDir == null && extractsA != null) { http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java index 00f4ad7..6caad63 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java @@ -65,7 +65,7 @@ public class EvalConsumersBuilder extends AbstractConsumersBuilder { throw new RuntimeException(e); } - Path commonTokens = getNonNullPath(localAttrs, "commonTokens"); + Path commonTokens = getPath(localAttrs, "commonTokens"); try { AbstractProfiler.loadCommonTokens(commonTokens); } catch (IOException e) { http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/java/org/apache/tika/eval/batch/SingleFileConsumerBuilder.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/SingleFileConsumerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/SingleFileConsumerBuilder.java index de8be64..ec45f32 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/batch/SingleFileConsumerBuilder.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/SingleFileConsumerBuilder.java @@ -38,13 +38,13 @@ public class SingleFileConsumerBuilder extends EvalConsumerBuilder { @Override public FileResourceConsumer build() throws IOException { - Path extractDir = PropsUtil.getPath(localAttrs.get("extractDir"), null); - if (extractDir == null) { - throw new RuntimeException("Must specify \"extractDir\" -- directory to crawl"); + Path extracts = PropsUtil.getPath(localAttrs.get("extracts"), null); + if (extracts == null) { + throw new RuntimeException("Must specify \"extracts\" -- directory to crawl"); } - if (!Files.isDirectory(extractDir)) { + if (!Files.isDirectory(extracts)) { throw new RuntimeException("ROOT DIRECTORY DOES NOT EXIST: " + - extractDir.toAbsolutePath()); + extracts.toAbsolutePath()); } Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null); @@ -64,16 +64,16 @@ public class SingleFileConsumerBuilder extends EvalConsumerBuilder { } catch (SQLException e) { throw new RuntimeException("Can't populate ref tables", e); } - //we _could_ set this to extractDir (if not null) + //we _could_ set this to extracts (if not null) //here, but the Crawler defaults to "input" if nothing is passed //so this won't work if (inputDir == null) { throw new RuntimeException("Must specify -inputDir"); } - if (extractDir == null && inputDir != null) { - extractDir = inputDir; + if (extracts == null && inputDir != null) { + extracts = inputDir; } - return new ExtractProfiler(queue, inputDir, extractDir, writer, alterExtractList); + return new ExtractProfiler(queue, inputDir, extracts, writer, alterExtractList); } @Override http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java index b74daa1..b072812 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java @@ -19,6 +19,8 @@ package org.apache.tika.eval.tokens; import java.io.BufferedReader; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.nio.file.Files; @@ -29,6 +31,7 @@ import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import org.apache.commons.lang3.mutable.MutableInt; +import org.apache.tika.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -102,39 +105,57 @@ public class CommonTokenCountManager { if (commonTokenMap.get(langCode) != null) { return; } - Path p = commonTokensDir.resolve(langCode); - if (!Files.isRegularFile(p)) { - LOGGER.warn("Couldn't find common tokens file for: '"+langCode+"': "+ - p.toAbsolutePath()); - alreadyTriedToLoad.add(langCode); - return; + InputStream is = null; + Path p = null; + if (commonTokensDir != null) { + p = commonTokensDir.resolve(langCode); } - Set<String> set = commonTokenMap.get(langCode); - if (set == null) { - set = new HashSet<>(); - commonTokenMap.put(langCode, set); - } - try (BufferedReader reader = Files.newBufferedReader(p, COMMON_TOKENS_CHARSET)) { - alreadyTriedToLoad.add(langCode); - String line = reader.readLine(); - while (line != null) { - line = line.trim(); - if (line.startsWith("#")) { + try { + if (p == null || !Files.isRegularFile(p)) { + is = this.getClass().getResourceAsStream("/common_tokens/" + langCode); + } else { + is = Files.newInputStream(p); + } + + + if (is == null) { + LOGGER.warn("Couldn't find common tokens file for: '" + langCode + "': " + + p.toAbsolutePath()); + alreadyTriedToLoad.add(langCode); + return; + } + + + Set<String> set = commonTokenMap.get(langCode); + if (set == null) { + set = new HashSet<>(); + commonTokenMap.put(langCode, set); + } + try (BufferedReader reader = new BufferedReader( + new InputStreamReader(is, COMMON_TOKENS_CHARSET))) { + alreadyTriedToLoad.add(langCode); + String line = reader.readLine(); + while (line != null) { + line = line.trim(); + if (line.startsWith("#")) { + line = reader.readLine(); + continue; + } + //allow language models with, e.g. tab-delimited counts after the term + String[] cols = line.split("\t"); + String t = cols[0].trim(); + if (t.length() > 0) { + set.add(t); + } + line = reader.readLine(); - continue; - } - //allow language models with, e.g. tab-delimited counts after the term - String[] cols = line.split("\t"); - String t = cols[0].trim(); - if (t.length() > 0) { - set.add(t); } - - line = reader.readLine(); } } catch (IOException e) { - LOGGER.warn("IOException trying to read: '"+langCode+"'"); + LOGGER.warn("IOException trying to read: '" + langCode + "'"); + } finally { + IOUtils.closeQuietly(is); } }
