TIKA-1332 -- add English/Spanish common tokens, fix logging

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/dc2dcd4c
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/dc2dcd4c
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/dc2dcd4c

Branch: refs/heads/master
Commit: dc2dcd4ccc7bca640bb362f72729d0b6ba22a890
Parents: a2d214c
Author: tballison <[email protected]>
Authored: Thu Feb 16 20:13:07 2017 -0500
Committer: tballison <[email protected]>
Committed: Thu Feb 16 20:13:07 2017 -0500

----------------------------------------------------------------------
 .../org/apache/tika/eval/AbstractProfiler.java  |    17 +-
 .../org/apache/tika/eval/ExtractProfiler.java   |    24 +-
 .../java/org/apache/tika/eval/TikaEvalCLI.java  |    22 +-
 .../tika/eval/batch/EvalConsumersBuilder.java   |     2 +-
 .../eval/batch/SingleFileConsumerBuilder.java   |    18 +-
 .../eval/tokens/CommonTokenCountManager.java    |    75 +-
 tika-eval/src/main/resources/common_tokens/en   | 20000 +++++++++++++++++
 tika-eval/src/main/resources/common_tokens/es   | 19997 ++++++++++++++++
 tika-eval/src/main/resources/log4j.properties   |    11 +
 .../resources/tika-eval-comparison-config.xml   |     6 +-
 .../resources/tika-eval-profiler-config.xml     |     7 +-
 .../apache/tika/eval/SimpleComparerTest.java    |     2 +-
 .../org/apache/tika/eval/TikaEvalCLITest.java   |     2 +-
 tika-eval/src/test/resources/common_tokens/en   |     8 +
 tika-eval/src/test/resources/common_tokens/es   |    10 +
 .../src/test/resources/common_tokens/zh-cn      |     8 +
 .../src/test/resources/common_tokens/zh-tw      |     8 +
 tika-eval/src/test/resources/commontokens/en    |     8 -
 tika-eval/src/test/resources/commontokens/es    |    10 -
 tika-eval/src/test/resources/commontokens/zh-cn |     8 -
 tika-eval/src/test/resources/commontokens/zh-tw |     8 -
 tika-eval/src/test/resources/log4j.properties   |    11 -
 .../src/test/resources/log4j_process.properties |    11 -
 ...ingle-file-profiler-crawl-extract-config.xml |     4 +-
 .../single-file-profiler-crawl-input-config.xml |     4 +-
 25 files changed, 40143 insertions(+), 138 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index 24f7358..daa964a 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -158,6 +158,11 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
     final LanguageIDWrapper langIder;
     protected IDBWriter writer;
 
+    /**
+     *
+     * @param p path to the common_tokens directory.  If this is null, try to load from classPath
+     * @throws IOException
+     */
     public static void loadCommonTokens(Path p) throws IOException {
         commonTokenCountManager = new CommonTokenCountManager(p);
     }
@@ -536,29 +541,29 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
     /**
      *
      * @param metadata
-     * @param extractDir
+     * @param extracts
      * @return evalfilepaths for files if crawling an extract directory
      */
     protected EvalFilePaths getPathsFromExtractCrawl(Metadata metadata,
-                                                     Path extractDir) {
+                                                     Path extracts) {
         String relExtractFilePath = metadata.get(FSProperties.FS_REL_PATH);
         Matcher m = FILE_NAME_CLEANER.matcher(relExtractFilePath);
         Path relativeSourceFilePath = Paths.get(m.replaceAll(""));
         //just try slapping the relextractfilepath on the extractdir
-        Path extractFile = extractDir.resolve(relExtractFilePath);
+        Path extractFile = extracts.resolve(relExtractFilePath);
         if (! Files.isRegularFile(extractFile)) {
             //if that doesn't work, try to find the right extract file.
             //This is necessary if crawling extractsA and trying to find a file in
             //extractsB that is not in the same format: json vs txt or compressed
-            extractFile = findFile(extractDir, relativeSourceFilePath);
+            extractFile = findFile(extracts, relativeSourceFilePath);
         }
         return new EvalFilePaths(relativeSourceFilePath, extractFile);
     }
     //call this if the crawler is crawling through the src directory
     protected EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir,
-                                                 Path extractDir) {
+                                                 Path extracts) {
         Path relativeSourceFilePath = Paths.get(metadata.get(FSProperties.FS_REL_PATH));
-        Path extractFile = findFile(extractDir, relativeSourceFilePath);
+        Path extractFile = findFile(extracts, relativeSourceFilePath);
         Path inputFile = srcDir.resolve(relativeSourceFilePath);
         long srcLen = -1l;
         //try to get the length of the source file in case there was an error

http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java 
b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
index 6840926..f3b10ca 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
@@ -44,18 +44,18 @@ public class ExtractProfiler extends AbstractProfiler {
         Option db = new Option("db", true, "db file to which to write 
results");
         db.setRequired(true);
 
-        //By the time this commandline is parsed, there should be both an 
extractDir and an inputDir
-        Option extractDir = new Option("extractDir", true, "directory for 
extract files");
-        extractDir.setRequired(true);
+        //By the time this commandline is parsed, there should be both an extracts and an inputDir
+        Option extracts = new Option("extracts", true, "directory for extract files");
+        extracts.setRequired(true);
 
         Option inputDir = new Option("inputDir", true,
                 "optional: directory for original binary input documents."+
-        " If not specified, -extractDir is crawled as is.");
+        " If not specified, -extracts is crawled as is.");
         inputDir.setRequired(true);
 
         OPTIONS = new Options()
                 .addOption(db)
-                .addOption(extractDir)
+                .addOption(extracts)
                 .addOption(inputDir)
                 .addOption("bc", "optional: tika-batch config file")
                 .addOption("numConsumers", true, "optional: number of consumer 
threads")
@@ -71,7 +71,7 @@ public class ExtractProfiler extends AbstractProfiler {
         HelpFormatter helpFormatter = new HelpFormatter();
         helpFormatter.printHelp(
                 80,
-                "java -jar tika-eval-x.y.jar Profile -extractDir extracts -db 
mydb [-inputDir input]",
+                "java -jar tika-eval-x.y.jar Profile -extracts extracts -db mydb [-inputDir input]",
                 "Tool: Profile",
                 ExtractProfiler.OPTIONS,
                 "Note: for h2 db, do not include the .mv.db at the end of the 
db name.");
@@ -143,16 +143,16 @@ public class ExtractProfiler extends AbstractProfiler {
     );
 
     private final Path inputDir;
-    private final Path extractDir;
+    private final Path extracts;
     private final ExtractReader.ALTER_METADATA_LIST alterExtractList;
     private final ExtractReader extractReader = new ExtractReader();
 
     public ExtractProfiler(ArrayBlockingQueue<FileResource> queue,
-                           Path inputDir, Path extractDir,
+                           Path inputDir, Path extracts,
                            IDBWriter dbWriter, 
ExtractReader.ALTER_METADATA_LIST alterExtractList) {
         super(queue, dbWriter);
         this.inputDir = inputDir;
-        this.extractDir = extractDir;
+        this.extracts = extracts;
         this.alterExtractList = alterExtractList;
     }
 
@@ -161,11 +161,11 @@ public class ExtractProfiler extends AbstractProfiler {
         Metadata metadata = fileResource.getMetadata();
         EvalFilePaths fps = null;
 
-        if (inputDir != null && inputDir.equals(extractDir)) {
+        if (inputDir != null && inputDir.equals(extracts)) {
             //crawling an extract dir
-            fps = getPathsFromExtractCrawl(metadata, extractDir);
+            fps = getPathsFromExtractCrawl(metadata, extracts);
         } else {
-            fps = getPathsFromSrcCrawl(metadata, inputDir, extractDir);
+            fps = getPathsFromSrcCrawl(metadata, inputDir, extracts);
         }
         List<Metadata> metadataList = 
extractReader.loadExtract(fps.getExtractFile(), alterExtractList);
 

http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java 
b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
index 5860327..2fd7f72 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
@@ -78,7 +78,7 @@ public class TikaEvalCLI {
 
         boolean containsBC = false;
         String inputDir = null;
-        String extractDir = null;
+        String extracts = null;
         String alterExtract = null;
         //confirm there's a batch-config file
         for (int i = 0; i < argList.size(); i++) {
@@ -93,13 +93,13 @@ public class TikaEvalCLI {
                 }
                 inputDir = argList.get(i+1);
                 i++;
-            } else if (arg.equals("-extractDir")) {
+            } else if (arg.equals("-extracts")) {
                 if (i+1 >= argList.size()) {
-                    System.err.println("Must specify directory after 
-extractDir");
+                    System.err.println("Must specify directory after -extracts");
                     ExtractProfiler.USAGE();
                     return;
                 }
-                extractDir = argList.get(i+1);
+                extracts = argList.get(i+1);
                 i++;
             } else if (arg.equals("-alterExtract")) {
                 if (i+1 >= argList.size()) {
@@ -122,15 +122,15 @@ public class TikaEvalCLI {
         }
 
         //need to specify each in this commandline
-        //if only extractDir is passed to tika-batch,
+        //if only extracts is passed to tika-batch,
         //the crawler will see no inputDir and start crawling "input".
-        //this allows the user to specify either extractDir or inputDir
-        if (extractDir == null && inputDir != null) {
-            argList.add("-extractDir");
+        //this allows the user to specify either extracts or inputDir
+        if (extracts == null && inputDir != null) {
+            argList.add("-extracts");
             argList.add(inputDir);
-        } else if (inputDir == null && extractDir != null) {
+        } else if (inputDir == null && extracts != null) {
             argList.add("-inputDir");
-            argList.add(extractDir);
+            argList.add(extracts);
         }
 
         Path tmpBCConfig = null;
@@ -210,7 +210,7 @@ public class TikaEvalCLI {
         }
 
         //need to specify each in the commandline that goes into tika-batch
-        //if only extractDir is passed to tika-batch,
+        //if only extracts is passed to tika-batch,
         //the crawler will see no inputDir and start crawling "input".
         //if the user doesn't specify inputDir, crawl extractsA
         if (inputDir == null && extractsA != null) {

http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java 
b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java
index 00f4ad7..6caad63 100644
--- 
a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java
+++ 
b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java
@@ -65,7 +65,7 @@ public class EvalConsumersBuilder extends 
AbstractConsumersBuilder {
             throw new RuntimeException(e);
         }
 
-        Path commonTokens = getNonNullPath(localAttrs, "commonTokens");
+        Path commonTokens = getPath(localAttrs, "commonTokens");
         try {
             AbstractProfiler.loadCommonTokens(commonTokens);
         } catch (IOException e) {

http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/java/org/apache/tika/eval/batch/SingleFileConsumerBuilder.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/batch/SingleFileConsumerBuilder.java
 
b/tika-eval/src/main/java/org/apache/tika/eval/batch/SingleFileConsumerBuilder.java
index de8be64..ec45f32 100644
--- 
a/tika-eval/src/main/java/org/apache/tika/eval/batch/SingleFileConsumerBuilder.java
+++ 
b/tika-eval/src/main/java/org/apache/tika/eval/batch/SingleFileConsumerBuilder.java
@@ -38,13 +38,13 @@ public class SingleFileConsumerBuilder extends 
EvalConsumerBuilder {
 
     @Override
     public FileResourceConsumer build() throws IOException {
-        Path extractDir = PropsUtil.getPath(localAttrs.get("extractDir"), 
null);
-        if (extractDir == null) {
-            throw new RuntimeException("Must specify \"extractDir\" -- 
directory to crawl");
+        Path extracts = PropsUtil.getPath(localAttrs.get("extracts"), null);
+        if (extracts == null) {
+            throw new RuntimeException("Must specify \"extracts\" -- directory to crawl");
         }
-        if (!Files.isDirectory(extractDir)) {
+        if (!Files.isDirectory(extracts)) {
             throw new RuntimeException("ROOT DIRECTORY DOES NOT EXIST: " +
-                    extractDir.toAbsolutePath());
+                    extracts.toAbsolutePath());
         }
 
         Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
@@ -64,16 +64,16 @@ public class SingleFileConsumerBuilder extends 
EvalConsumerBuilder {
         } catch (SQLException e) {
             throw new RuntimeException("Can't populate ref tables", e);
         }
-        //we _could_ set this to extractDir (if not null)
+        //we _could_ set this to extracts (if not null)
         //here, but the Crawler defaults to "input" if nothing is passed
         //so this won't work
         if (inputDir == null) {
             throw new RuntimeException("Must specify -inputDir");
         }
-        if (extractDir == null && inputDir != null) {
-            extractDir = inputDir;
+        if (extracts == null && inputDir != null) {
+            extracts = inputDir;
         }
-        return new ExtractProfiler(queue, inputDir, extractDir, writer, 
alterExtractList);
+        return new ExtractProfiler(queue, inputDir, extracts, writer, alterExtractList);
     }
 
     @Override

http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
index b74daa1..b072812 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
@@ -19,6 +19,8 @@ package org.apache.tika.eval.tokens;
 
 import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
@@ -29,6 +31,7 @@ import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.commons.lang3.mutable.MutableInt;
+import org.apache.tika.io.IOUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -102,39 +105,57 @@ public class CommonTokenCountManager {
         if (commonTokenMap.get(langCode) != null) {
             return;
         }
-        Path p = commonTokensDir.resolve(langCode);
-        if (!Files.isRegularFile(p)) {
-            LOGGER.warn("Couldn't find common tokens file for: '"+langCode+"': 
"+
-            p.toAbsolutePath());
-            alreadyTriedToLoad.add(langCode);
-            return;
+        InputStream is = null;
+        Path p = null;
+        if (commonTokensDir != null) {
+            p = commonTokensDir.resolve(langCode);
         }
 
-        Set<String> set = commonTokenMap.get(langCode);
-        if (set == null) {
-            set = new HashSet<>();
-            commonTokenMap.put(langCode, set);
-        }
-        try (BufferedReader reader = Files.newBufferedReader(p, 
COMMON_TOKENS_CHARSET)) {
-            alreadyTriedToLoad.add(langCode);
-            String line = reader.readLine();
-            while (line != null) {
-                line = line.trim();
-                if (line.startsWith("#")) {
+        try {
+            if (p == null || !Files.isRegularFile(p)) {
+                is = this.getClass().getResourceAsStream("/common_tokens/" + langCode);
+            } else {
+                is = Files.newInputStream(p);
+            }
+
+
+            if (is == null) {
+                LOGGER.warn("Couldn't find common tokens file for: '" + langCode + "': " +
+                        p.toAbsolutePath());
+                alreadyTriedToLoad.add(langCode);
+                return;
+            }
+
+
+            Set<String> set = commonTokenMap.get(langCode);
+            if (set == null) {
+                set = new HashSet<>();
+                commonTokenMap.put(langCode, set);
+            }
+            try (BufferedReader reader = new BufferedReader(
+                    new InputStreamReader(is, COMMON_TOKENS_CHARSET))) {
+                alreadyTriedToLoad.add(langCode);
+                String line = reader.readLine();
+                while (line != null) {
+                    line = line.trim();
+                    if (line.startsWith("#")) {
+                        line = reader.readLine();
+                        continue;
+                    }
+                    //allow language models with, e.g. tab-delimited counts after the term
+                    String[] cols = line.split("\t");
+                    String t = cols[0].trim();
+                    if (t.length() > 0) {
+                        set.add(t);
+                    }
+
                     line = reader.readLine();
-                    continue;
-                }
-                //allow language models with, e.g. tab-delimited counts after 
the term
-                String[] cols = line.split("\t");
-                String t = cols[0].trim();
-                if (t.length() > 0) {
-                    set.add(t);
                 }
-
-                line = reader.readLine();
             }
         } catch (IOException e) {
-            LOGGER.warn("IOException trying to read: '"+langCode+"'");
+            LOGGER.warn("IOException trying to read: '" + langCode + "'");
+        } finally {
+            IOUtils.closeQuietly(is);
         }
     }
 

Reply via email to