(tika) 02/03: WIP -- don't merge

tallison Tue, 08 Jul 2025 12:28:02 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4342
in repository https://gitbox.apache.org/repos/asf/tika.git


commit a5972ca7e53b25b001417e9a13b635dcae7c452b
Author: tallison <[email protected]>
AuthorDate: Wed May 28 16:37:11 2025 -0400

    WIP -- don't merge
---
 .../tika/eval/app/ExctractProfileRunner.java       | 204 ++++++
 .../org/apache/tika/eval/app/ExtractProfiler.java  |  15 +-
 .../apache/tika/eval/app/FileProfileRunner.java    |  54 --
 .../org/apache/tika/eval/app/ProfilerBase.java     | 816 +++++++++++++++++++++
 .../java/org/apache/tika/eval/app/TikaEvalCLI.java |   2 +-
 .../apache/tika/eval/app/batch/FileResource.java   |   2 +-
 .../apache/tika/eval/app/batch/PathResource.java   |  33 +
 7 files changed, 1059 insertions(+), 67 deletions(-)

diff --git 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExctractProfileRunner.java
 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExctractProfileRunner.java
new file mode 100644
index 000000000..20cb33c1c
--- /dev/null
+++ 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExctractProfileRunner.java
@@ -0,0 +1,204 @@
+package org.apache.tika.eval.app;
+
+import java.io.IOException;
+import java.nio.file.FileVisitResult;
+import java.nio.file.FileVisitor;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.util.List;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.eval.app.batch.FileResource;
+import org.apache.tika.eval.app.batch.PathResource;
+import org.apache.tika.eval.app.db.JDBCUtil;
+import org.apache.tika.eval.app.db.MimeBuffer;
+import org.apache.tika.eval.app.io.ExtractReader;
+import org.apache.tika.eval.app.io.IDBWriter;
+
+public class ExctractProfileRunner {
+
+    private static final Logger LOG = 
LoggerFactory.getLogger(ExctractProfileRunner.class);
+    private static final PathResource SEMAPHORE = new 
PathResource(Paths.get("/"), "STOP");
+
+    // Command-line options understood by main(); populated once in the static initializer below.
+    static Options OPTIONS;
+
+    static {
+        OPTIONS = new Options()
+                .addOption(Option.builder("e").longOpt("extracts").hasArg().desc("required: directory of extracts").build())
+                .addOption(Option.builder("i").longOpt("inputDir").hasArg().desc("optional: directory for original binary input documents."
+                        + " If not specified, -extracts is crawled as is.").build())
+                // main() fails with a usage error when -d is absent, so the help text must say "required"
+                .addOption(Option.builder("d").longOpt("db").hasArg().desc("required: db path").build())
+                .addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json config file").build());
+    }
+    /**
+     * Command-line entry point: parses the arguments against {@code OPTIONS} and starts the profiling run.
+     * <p>
+     * {@code -e} (extracts directory) and {@code -d} (db) are mandatory; {@code -i} falls back to the
+     * extracts directory and {@code -c} falls back to a default {@code EvalConfig}.
+     *
+     * @param args raw command-line arguments
+     * @throws IllegalArgumentException if {@code -e} or {@code -d} is missing (usage is printed first)
+     * @throws Exception if the arguments cannot be parsed or the config file cannot be loaded
+     */
+    public static void main(String[] args) throws Exception {
+        CommandLine commandLine = new DefaultParser().parse(OPTIONS, args);
+        EvalConfig evalConfig = commandLine.hasOption('c') ? EvalConfig.load(Paths.get(commandLine.getOptionValue('c'))) : new EvalConfig();
+        if (!commandLine.hasOption('e')) {
+            //the extracts directory is passed with -e (not -i, which is the optional input dir)
+            USAGE_FAIL("Must specify extracts dir: -e");
+        }
+        Path extractsDir = Paths.get(commandLine.getOptionValue('e'));
+        Path inputDir = commandLine.hasOption('i') ? Paths.get(commandLine.getOptionValue('i')) : extractsDir;
+        if (!commandLine.hasOption('d')) {
+            USAGE_FAIL("Must specify the db name: -d");
+        }
+        String dbPath = commandLine.getOptionValue('d');
+        execute(inputDir, extractsDir, dbPath, evalConfig);
+    }
+
+    /**
+     * Runs one directory-walking producer and {@code evalConfig.numThreads} profiling workers over a
+     * shared bounded queue and blocks until all of them have completed.
+     *
+     * @param inputDir    directory that is crawled for files to profile
+     * @param extractsDir directory holding the extracts handed to each {@link ExtractProfiler}
+     * @param dbPath      database location requested on the command line
+     * @param evalConfig  supplies the thread count and the min/max extract lengths
+     * @throws RuntimeException wrapping the {@link ExecutionException} if any task fails
+     */
+    private static void execute(Path inputDir, Path extractsDir, String dbPath, EvalConfig evalConfig) {
+        //NOTE(review): buildDBWriter is declared with (connectionString, driverClass) and dbPath is
+        //never used here -- this call still has to be wired up before the class compiles.
+        //The writer is built before any thread is started so that a failure cannot strand pool threads.
+        IDBWriter dbWriter = buildDBWriter();
+
+        ArrayBlockingQueue<FileResource> queue = new ArrayBlockingQueue<>(1000);
+        //one extra thread for the directory walker
+        int numTasks = evalConfig.numThreads + 1;
+        ExecutorService executorService = Executors.newFixedThreadPool(numTasks);
+        ExecutorCompletionService<Integer> executorCompletionService = new ExecutorCompletionService<>(executorService);
+        try {
+            executorCompletionService.submit(new DirectoryWalker(inputDir, queue));
+            for (int i = 0; i < evalConfig.numThreads; i++) {
+                ExtractReader extractReader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS,
+                        evalConfig.minExtractLength, evalConfig.maxExtractLength);
+                ExtractProfiler extractProfiler = new ExtractProfiler(inputDir, extractsDir, extractReader, dbWriter);
+                executorCompletionService.submit(new ProfileWorker(queue, extractProfiler));
+            }
+
+            int finished = 0;
+            while (finished < numTasks) {
+                //blocking
+                Future<Integer> future = executorCompletionService.take();
+                Integer result = future.get();
+                if (result != null) {
+                    finished++;
+                }
+            }
+        } catch (InterruptedException e) {
+            LOG.info("interrupted", e);
+            //keep the interrupt visible to whoever called us
+            Thread.currentThread().interrupt();
+        } catch (ExecutionException e) {
+            throw new RuntimeException(e);
+        } finally {
+            //always release the pool threads, including when task construction above fails
+            executorService.shutdownNow();
+        }
+    }
+
+    /**
+     * Work in progress: creates the database tables through a {@link JDBCUtil}, builds a
+     * {@link MimeBuffer} and populates the reference tables.
+     * <p>
+     * NOTE(review): this method cannot compile as written and needs to be finished:
+     * <ul>
+     *   <li>it is {@code static} but assigns {@code this.mimeBuffer};</li>
+     *   <li>the local {@code mimeBuffer} stays {@code null} and is what gets returned, although the
+     *       declared return type is {@link IDBWriter};</li>
+     *   <li>{@code localAttrs}, {@code forceDrop}, {@code updateTableInfosWithPrefixes},
+     *       {@code getNonRefTableInfos}, {@code getRefTableInfos}, {@code getMimeTable} and
+     *       {@code populateRefTables} are not declared in this class as shown in this patch;</li>
+     *   <li>{@code createRegularTable} and {@code createRefTable} are computed but never used -- both
+     *       {@code createTables} calls pass {@code THROW_EX_IF_EXISTS} directly.</li>
+     * </ul>
+     *
+     * @param connectionString passed to the {@link JDBCUtil} constructor
+     * @param driverClass      passed to the {@link JDBCUtil} constructor
+     * @return currently the (null) local {@code mimeBuffer}; presumably meant to be the db writer -- TODO confirm
+     */
+    private static IDBWriter buildDBWriter(String connectionString, String driverClass) {
+        MimeBuffer mimeBuffer = null;
+        JDBCUtil dbUtil = new JDBCUtil(connectionString, driverClass);
+        //Step 1. Used to be update table infos with prefixes
+        updateTableInfosWithPrefixes(localAttrs);
+
+        JDBCUtil.CREATE_TABLE createRegularTable = (forceDrop) ? JDBCUtil.CREATE_TABLE.DROP_IF_EXISTS : JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS;
+
+        JDBCUtil.CREATE_TABLE createRefTable = (forceDrop) ? JDBCUtil.CREATE_TABLE.DROP_IF_EXISTS : JDBCUtil.CREATE_TABLE.SKIP_IF_EXISTS;
+
+        //step 2. create the tables
+        dbUtil.createTables(getNonRefTableInfos(), JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS);
+        dbUtil.createTables(getRefTableInfos(), JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS);
+
+        //step 3. create mime buffer
+        this.mimeBuffer = new MimeBuffer(dbUtil.getConnection(), getMimeTable(), TikaConfig.getDefaultConfig());
+
+        //step 4. populate the reference tables
+        populateRefTables();
+
+        return mimeBuffer;
+
+
+    }
+
+    /**
+     * Prints the command-line help for this tool, 80 columns wide, listing every entry of {@code OPTIONS}.
+     */
+    private static void USAGE() {
+        String cmdLineSyntax = "java -jar tika-eval-app-x.y.z.jar FileProfiler -e docs -d mydb [-i inputDir, -c config.json]";
+        new HelpFormatter().printHelp(80, cmdLineSyntax, "Tool: Profile", OPTIONS, "");
+    }
+
+    /**
+     * Prints the usage text and then aborts by throwing.
+     * <p>
+     * The {@code String} return type only lets callers use this method inside an expression;
+     * it never returns normally.
+     *
+     * @param msg message for the thrown exception
+     * @return never returns
+     * @throws IllegalArgumentException always, carrying {@code msg}
+     */
+    private static String USAGE_FAIL(String msg) {
+        USAGE();
+        throw new IllegalArgumentException(msg);
+    }
+
+    /**
+     * Queue consumer: hands every {@link FileResource} it takes from the queue to its
+     * {@link ExtractProfiler} until it sees the {@code SEMAPHORE} marker.
+     */
+    private static class ProfileWorker implements Callable<Integer> {
+
+        private final ArrayBlockingQueue<FileResource> queue;
+        private final ExtractProfiler extractProfiler;
+
+        ProfileWorker(ArrayBlockingQueue<FileResource> queue, ExtractProfiler extractProfiler) {
+            this.queue = queue;
+            this.extractProfiler = extractProfiler;
+        }
+
+        /**
+         * Polls the queue with a one second timeout until the stop marker arrives.
+         *
+         * @return 1 once the stop marker has been seen
+         * @throws Exception if the wait is interrupted or the profiler throws
+         */
+        @Override
+        public Integer call() throws Exception {
+            for (;;) {
+                FileResource next = queue.poll(1, TimeUnit.SECONDS);
+                if (next == SEMAPHORE) {
+                    LOG.debug("worker hit semaphore and is stopping");
+                    //put the marker back so that the other workers see it too.
+                    //NOTE(review): the original author flagged this spot with "hangs"; put() blocks
+                    //while the queue is full.
+                    queue.put(next);
+                    return 1;
+                }
+                if (next != null) {
+                    extractProfiler.processFileResource(next);
+                } else {
+                    //poll timed out
+                    LOG.info("ExtractProfileWorker waiting on queue");
+                }
+            }
+        }
+    }
+
+    /**
+     * Queue producer: walks {@code startDir} recursively and enqueues one {@link PathResource}
+     * per file, followed by the {@code SEMAPHORE} marker that tells the workers to stop.
+     */
+    private static class DirectoryWalker implements Callable<Integer> {
+        private final Path startDir;
+        private final ArrayBlockingQueue<FileResource> queue;
+
+        public DirectoryWalker(Path startDir, ArrayBlockingQueue<FileResource> queue) {
+            this.startDir = startDir;
+            this.queue = queue;
+        }
+
+        /**
+         * @return 0 once the whole tree has been enqueued
+         * @throws Exception if the walk fails or the thread is interrupted while enqueuing
+         */
+        @Override
+        public Integer call() throws Exception {
+            Files.walkFileTree(startDir, new FileVisitor<Path>() {
+                @Override
+                public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
+                    return FileVisitResult.CONTINUE;
+                }
+
+                @Override
+                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
+                    //blocking
+                    try {
+                        queue.put(new PathResource(file, startDir.relativize(file).toString()));
+                    } catch (InterruptedException e) {
+                        //restore the flag so that the put() after the walk fails fast as well
+                        Thread.currentThread().interrupt();
+                        return FileVisitResult.TERMINATE;
+                    }
+                    return FileVisitResult.CONTINUE;
+                }
+
+                @Override
+                public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
+                    //keep crawling, but leave a trace of what was skipped
+                    LOG.warn("couldn't visit {}", file, exc);
+                    return FileVisitResult.CONTINUE;
+                }
+
+                @Override
+                public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
+                    if (exc != null) {
+                        LOG.warn("problem iterating {}", dir, exc);
+                    }
+                    return FileVisitResult.CONTINUE;
+                }
+            });
+            //the workers only stop when they see this marker; without it they poll forever
+            //and execute() never returns
+            queue.put(SEMAPHORE);
+            return 0;
+        }
+    }
+}
diff --git 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
index 844eac125..a2a4986f3 100644
--- 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
+++ 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
@@ -16,7 +16,6 @@
  */
 package org.apache.tika.eval.app;
 
-import java.io.File;
 import java.io.IOException;
 import java.nio.file.Path;
 import java.sql.Types;
@@ -25,11 +24,9 @@ import java.util.List;
 import java.util.Map;
 import java.util.concurrent.ArrayBlockingQueue;
 
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
 import org.apache.commons.cli.Options;
 
-import org.apache.tika.batch.FileResource;
+import org.apache.tika.eval.app.batch.FileResource;
 import org.apache.tika.eval.app.db.ColInfo;
 import org.apache.tika.eval.app.db.Cols;
 import org.apache.tika.eval.app.db.TableInfo;
@@ -40,7 +37,7 @@ import org.apache.tika.eval.core.util.ContentTags;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 
-public class ExtractProfiler extends AbstractProfiler {
+public class ExtractProfiler extends ProfilerBase {
 
     private final static String FIELD = "f";
     public static TableInfo EXTRACT_EXCEPTION_TABLE =
@@ -81,19 +78,15 @@ public class ExtractProfiler extends AbstractProfiler {
     private final Path extracts;
     private final ExtractReader extractReader;
 
-    public static ExtractProfiler build(ArrayBlockingQueue<FileResource> 
queue, Path inputDir, Path extracts, ExtractReader extractReader, IDBWriter 
dbWriter) {
-        return new ExtractProfiler(queue, inputDir, extracts, extractReader, 
dbWriter);
-    }
 
-    ExtractProfiler(ArrayBlockingQueue<FileResource> queue, Path inputDir, 
Path extracts, ExtractReader extractReader, IDBWriter dbWriter) {
-        super(queue, dbWriter);
+    ExtractProfiler(Path inputDir, Path extracts, ExtractReader extractReader, 
IDBWriter dbWriter) {
+        super(dbWriter);
         this.inputDir = inputDir;
         this.extracts = extracts;
         this.extractReader = extractReader;
     }
 
 
-    @Override
     public boolean processFileResource(FileResource fileResource) {
         Metadata metadata = fileResource.getMetadata();
         EvalFilePaths fps = null;
diff --git 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfileRunner.java
 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfileRunner.java
deleted file mode 100644
index 1e57b35ad..000000000
--- 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfileRunner.java
+++ /dev/null
@@ -1,54 +0,0 @@
-package org.apache.tika.eval.app;
-
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.concurrent.ArrayBlockingQueue;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.DefaultParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
-
-import org.apache.tika.batch.FileResource;
-
-public class FileProfileRunner {
-    static Options OPTIONS;
-
-    static {
-
-        OPTIONS = new Options()
-                
.addOption(Option.builder("e").longOpt("extracts").hasArg().desc("required: 
directory of extracts").build())
-                
.addOption(Option.builder("i").longOpt("inputDir").hasArg().desc("optional: 
directory for original binary input documents."
-                        + " If not specified, -extracts is crawled as 
is.").build())
-                
.addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db 
path").build())
-                
.addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json 
config file").build())
-                ;
-    }
-    public static void main(String[] args) throws Exception {
-        DefaultParser defaultCLIParser = new DefaultParser();
-        CommandLine commandLine = defaultCLIParser.parse(OPTIONS, args);
-        EvalConfig evalConfig = commandLine.hasOption('c') ? 
EvalConfig.load(Paths.get(commandLine.getOptionValue('c'))) : new EvalConfig();
-        Path extractsDir = commandLine.hasOption('e') ? 
Paths.get(commandLine.getOptionValue('e')) : Paths.get(USAGE_FAIL("Must specify 
extracts dir: -i"));
-        Path inputDir = commandLine.hasOption('i') ? 
Paths.get(commandLine.getOptionValue('i')) : extractsDir;
-        String dbPath = commandLine.hasOption('d') ? 
commandLine.getOptionValue('d') : USAGE_FAIL("Must specify the db name: -d");
-        execute(inputDir, extractsDir, dbPath, evalConfig);
-    }
-
-    private static void execute(Path inputDir, Path extractsDir, String 
dbPath, EvalConfig evalConfig) {
-
-        ArrayBlockingQueue<FileResource> queue = new 
ArrayBlockingQueue<>(1000);
-        FileWalker fileWalker = new FileWalker(queue)
-    }
-
-    private static void USAGE() {
-        HelpFormatter helpFormatter = new HelpFormatter();
-        helpFormatter.printHelp(80, "java -jar tika-eval-app-x.y.z.jar 
FileProfiler -e docs -d mydb [-i inputDir, -c config.json]",
-                "Tool: Profile", OPTIONS, "");
-    }
-
-    private static String USAGE_FAIL(String msg) {
-        USAGE();
-        throw new IllegalArgumentException(msg);
-    }
-}
diff --git 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
new file mode 100644
index 000000000..f5d5f1a15
--- /dev/null
+++ 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
@@ -0,0 +1,816 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.app;
+
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.sql.Types;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.lang3.mutable.MutableInt;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.eval.app.db.ColInfo;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.ExtractReaderException;
+import org.apache.tika.eval.app.io.IDBWriter;
+import org.apache.tika.eval.core.langid.LanguageIDWrapper;
+import org.apache.tika.eval.core.textstats.BasicTokenCountStatsCalculator;
+import org.apache.tika.eval.core.textstats.CommonTokens;
+import org.apache.tika.eval.core.textstats.CompositeTextStatsCalculator;
+import org.apache.tika.eval.core.textstats.ContentLengthCalculator;
+import org.apache.tika.eval.core.textstats.TextStatsCalculator;
+import org.apache.tika.eval.core.textstats.TokenEntropy;
+import org.apache.tika.eval.core.textstats.TokenLengths;
+import org.apache.tika.eval.core.textstats.TopNTokens;
+import org.apache.tika.eval.core.textstats.UnicodeBlockCounter;
+import org.apache.tika.eval.core.tokens.AnalyzerManager;
+import org.apache.tika.eval.core.tokens.CommonTokenCountManager;
+import org.apache.tika.eval.core.tokens.CommonTokenResult;
+import org.apache.tika.eval.core.tokens.TokenCounts;
+import org.apache.tika.eval.core.tokens.TokenIntPair;
+import org.apache.tika.eval.core.util.ContentTagParser;
+import org.apache.tika.eval.core.util.ContentTags;
+import org.apache.tika.eval.core.util.EvalExceptionUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.language.detect.LanguageResult;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PDF;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.apache.tika.utils.StringUtils;
+
+public abstract class ProfilerBase {
+
+    public static final String TRUE = Boolean.toString(true);
+    public static final String FALSE = Boolean.toString(false);
+    protected static final AtomicInteger ID = new AtomicInteger();
+    static final long NON_EXISTENT_FILE_LENGTH = -1l;
+    final static int FILE_PATH_MAX_LEN = 1024;//max len for varchar for 
file_path
+    //Container exception key from the 1.x branch
+    private static final Property CONTAINER_EXCEPTION_1X = 
Property.externalText("X-TIKA" + ":EXCEPTION:runtime");
+    private static final Logger LOG = 
LoggerFactory.getLogger(ProfilerBase.class);
+    private static final String[] EXTRACT_EXTENSIONS = {".json", ".txt", ""};
+    private static final String[] COMPRESSION_EXTENSIONS = {"", ".bz2", 
".gzip", ".zip",};
+    private static final String ZERO = "0";
+    private static final String UNKNOWN_EXTENSION = "unk";
+    //make this configurable
+    private static final String DIGEST_KEY = "X-TIKA:digest:MD5";
+    private static final Map<String, Cols> UC_TAGS_OF_INTEREST = initTags();
+    //Fully qualified class names to look for; every '.' is escaped so that it only matches a literal dot.
+    private final static Pattern ACCESS_PERMISSION_EXCEPTION = Pattern.compile("org\\.apache\\.tika\\.exception\\.AccessPermissionException");
+    private final static Pattern ENCRYPTION_EXCEPTION = Pattern.compile("org\\.apache\\.tika\\.exception\\.EncryptedDocumentException");
+    public static TableInfo REF_EXTRACT_EXCEPTION_TYPES = new 
TableInfo("ref_extract_exception_types", new ColInfo(Cols.EXTRACT_EXCEPTION_ID, 
Types.INTEGER),
+            new ColInfo(Cols.EXTRACT_EXCEPTION_DESCRIPTION, Types.VARCHAR, 
128));
+    public static TableInfo REF_PARSE_ERROR_TYPES =
+            new TableInfo("ref_parse_error_types", new 
ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER), new 
ColInfo(Cols.PARSE_ERROR_DESCRIPTION, Types.VARCHAR, 128));
+    public static TableInfo REF_PARSE_EXCEPTION_TYPES =
+            new TableInfo("ref_parse_exception_types", new 
ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER), new 
ColInfo(Cols.PARSE_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128));
+    public static TableInfo MIME_TABLE = new TableInfo("mimes", new 
ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"), new 
ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
+            new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12));
+    private static CommonTokenCountManager COMMON_TOKEN_COUNT_MANAGER;
+    private static Pattern FILE_NAME_CLEANER = 
Pattern.compile("\\.(json|txt)(\\.(bz2|gz|zip))?$");
+    private static LanguageIDWrapper LANG_ID = new LanguageIDWrapper();
+    protected IDBWriter writer;
+    AnalyzerManager analyzerManager;
+    int maxContentLength = 10000000;
+    int maxContentLengthForLangId = 50000;
+    int maxTokens = 200000;
+
+    CompositeTextStatsCalculator compositeTextStatsCalculator;
+    private String lastExtractExtension = null;
+
+    /**
+     * Creates a profiler that writes its rows through {@code writer}.
+     * <p>
+     * Also pushes the current {@code maxContentLengthForLangId} to the static
+     * {@link LanguageIDWrapper} setting and builds the composite text stats calculator from the
+     * current {@code maxTokens} and the shared {@code LANG_ID}.
+     * <p>
+     * NOTE(review): {@code COMMON_TOKEN_COUNT_MANAGER} is only assigned in
+     * {@link #loadCommonTokens(Path, String)}; presumably that has to be called before the first
+     * profiler is constructed -- confirm.
+     *
+     * @param writer destination for the rows produced by this profiler
+     */
+    public ProfilerBase(IDBWriter writer) {
+        this.writer = writer;
+        LanguageIDWrapper.setMaxTextLength(maxContentLengthForLangId);
+        this.compositeTextStatsCalculator = initAnalyzersAndTokenCounter(maxTokens, LANG_ID);
+    }
+
+    /**
+     * Builds the read-only lookup from upper-cased tag name to the column that stores its count.
+     *
+     * @return unmodifiable map of tag name to column
+     */
+    private static Map<String, Cols> initTags() {
+        //parallel arrays: tagNames[i] is counted in columns[i]
+        String[] tagNames = {"A", "B", "DIV", "I", "IMG", "LI", "OL", "P", "TABLE", "TD", "TITLE", "TR", "U", "UL"};
+        Cols[] columns = {Cols.TAGS_A, Cols.TAGS_B, Cols.TAGS_DIV, Cols.TAGS_I, Cols.TAGS_IMG, Cols.TAGS_LI, Cols.TAGS_OL,
+                Cols.TAGS_P, Cols.TAGS_TABLE, Cols.TAGS_TD, Cols.TAGS_TITLE, Cols.TAGS_TR, Cols.TAGS_U, Cols.TAGS_UL};
+        Map<String, Cols> tagToColumn = new HashMap<>();
+        for (int i = 0; i < tagNames.length; i++) {
+            tagToColumn.put(tagNames[i], columns[i]);
+        }
+        return Collections.unmodifiableMap(tagToColumn);
+    }
+
+    /**
+     * Replaces the static, class-wide common tokens manager with a new one built from the arguments.
+     *
+     * @param p               path to the common_tokens directory.  If this is null, try to load from classPath
+     * @param defaultLangCode this is the language code to use if a common_words list doesn't exist for the
+     *                        detected language; can be <code>null</code>
+     * @throws IOException declared by the {@link CommonTokenCountManager} constructor
+     */
+    public static void loadCommonTokens(Path p, String defaultLangCode) throws IOException {
+        COMMON_TOKEN_COUNT_MANAGER = new CommonTokenCountManager(p, defaultLangCode);
+    }
+
+    /**
+     * Extracts the file name portion of {@code path} for recording purposes.
+     *
+     * @param path path as reported in the metadata; may be null
+     * @return the name, or the empty string if {@code path} is null or no name can be extracted
+     */
+    private static String getFileName(String path) {
+        if (path == null) {
+            return "";
+        }
+        //FilenameUtils rejects paths containing a null byte with an IllegalArgumentException.
+        //The name is only recorded, never used to touch the file system, so first try the
+        //path as is and then retry with the null bytes blanked out.
+        try {
+            return FilenameUtils.getName(path);
+        } catch (IllegalArgumentException e) {
+            LOG.warn("{} in {}", e.getMessage(), path);
+        }
+        String withoutNulls = path.replace('\u0000', ' ');
+        try {
+            return FilenameUtils.getName(withoutNulls);
+        } catch (IllegalArgumentException e) {
+            LOG.warn("Again: {} in {}", e.getMessage(), withoutNulls);
+        }
+        //give up
+        return "";
+    }
+
+    /**
+     * Returns the content of {@code contentTags}, cut to at most {@code maxLength} characters, and
+     * records under {@link Cols#CONTENT_TRUNCATED_AT_MAX_LEN} whether anything was cut off.
+     *
+     * @param contentTags source of the content; may be null
+     * @param maxLength   maximum number of characters to keep; values below 0 disable truncation
+     * @param data        row under construction; receives "TRUE" or "FALSE" for the truncation column
+     * @return the possibly shortened content, or the empty string if {@code contentTags} is null
+     */
+    protected static String truncateContent(ContentTags contentTags, int maxLength, Map<Cols, String> data) {
+        //assume "not truncated" until shown otherwise
+        data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "FALSE");
+        if (contentTags == null) {
+            return "";
+        }
+        String content = contentTags.getContent();
+        boolean overLimit = maxLength > -1 && content.length() > maxLength;
+        if (!overLimit) {
+            return content;
+        }
+        data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "TRUE");
+        return content.substring(0, maxLength);
+    }
+
+    /**
+     * Parses the content and tags stored in {@code metadata}.
+     *
+     * @param evalFilePaths paths of the file being profiled
+     * @param metadata      metadata holding the extracted content; may be null
+     * @return the parsed content, or {@link ContentTags#EMPTY_CONTENT_TAGS} if {@code metadata} is null
+     */
+    protected static ContentTags getContent(EvalFilePaths evalFilePaths, Metadata metadata) {
+        return metadata == null ? ContentTags.EMPTY_CONTENT_TAGS : parseContentAndTags(evalFilePaths, metadata);
+    }
+
+    /**
+     * Counts attachments per document: index 0 holds the number of embedded documents in the whole
+     * list, index i (i &gt; 0) holds how many times the embedded resource path of entry i occurs as
+     * an ancestor path of other entries.
+     *
+     * @param list metadata list, container document first
+     * @return empty list if input list is empty or null
+     */
+    static List<Integer> countAttachments(List<Metadata> list) {
+        List<Integer> attachmentCounts = new ArrayList<>();
+        if (list == null || list.isEmpty()) {
+            return attachmentCounts;
+        }
+        //container document attachment count = list.size()-1
+        attachmentCounts.add(list.size() - 1);
+
+        List<Metadata> embeddedDocs = list.subList(1, list.size());
+
+        //first pass: credit every ancestor path of every embedded document
+        Map<String, Integer> descendantsPerPath = new HashMap<>();
+        StringBuilder ancestor = new StringBuilder();
+        for (Metadata embedded : embeddedDocs) {
+            String path = embedded.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
+            if (path == null) {
+                //shouldn't ever happen
+                continue;
+            }
+            String[] parts = path.split("/");
+            for (int end = 1; end < parts.length - 1; end++) {
+                ancestor.setLength(0);
+                join("/", ancestor, parts, 1, end);
+                descendantsPerPath.merge(ancestor.toString(), 1, Integer::sum);
+            }
+        }
+
+        //second pass: look up the tally for each embedded document's own path
+        for (Metadata embedded : embeddedDocs) {
+            String ownPath = embedded.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
+            attachmentCounts.add(descendantsPerPath.getOrDefault(ownPath, 0));
+        }
+        return attachmentCounts;
+    }
+
+    /**
+     * Appends {@code parts[start..end]} (both inclusive) to {@code sb}, writing {@code delimiter}
+     * before every part, including the first one.
+     */
+    private static void join(String delimiter, StringBuilder sb, String[] parts, int start, int end) {
+        int idx = start;
+        while (idx <= end) {
+            sb.append(delimiter).append(parts[idx]);
+            idx++;
+        }
+    }
+
+    /**
+     * Turns the content string stored in {@code metadata} into {@link ContentTags}.
+     * <p>
+     * Extract files ending in ".html" go through the html parser; files ending in ".xhtml", or
+     * content whose recorded handler is {@link ToXMLContentHandler}, go through the xml parser
+     * with the html parser as a fallback; everything else is treated as plain text.  When a parser
+     * fails, the problem is logged and the result is flagged as having a parse exception.
+     *
+     * @param evalFilePaths supplies the extract file whose name selects the parser
+     * @param metadata      holds the content and the name of the content handler
+     * @return the parsed content; {@link ContentTags#EMPTY_CONTENT_TAGS} if there is no content
+     */
+    private static ContentTags parseContentAndTags(EvalFilePaths evalFilePaths, Metadata metadata) {
+        String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
+        if (content == null || content.isEmpty()) {
+            return ContentTags.EMPTY_CONTENT_TAGS;
+        }
+
+        String handlerClass = metadata.get(TikaCoreProperties.TIKA_CONTENT_HANDLER);
+        Path extractFile = evalFilePaths.getExtractFile();
+        String lowerCaseName = extractFile.getFileName().toString().toLowerCase(Locale.ENGLISH);
+
+        if (lowerCaseName.endsWith(".html")) {
+            try {
+                return ContentTagParser.parseHTML(content, UC_TAGS_OF_INTEREST.keySet());
+            } catch (IOException | SAXException e) {
+                LOG.warn("Problem parsing html in {}; backing off to treat string as text", extractFile.toAbsolutePath().toString(), e);
+                return new ContentTags(content, true);
+            }
+        }
+
+        boolean writtenByXmlHandler = ToXMLContentHandler.class.getSimpleName().equals(handlerClass);
+        if (!lowerCaseName.endsWith(".xhtml") && !writtenByXmlHandler) {
+            //plain text
+            return new ContentTags(content);
+        }
+
+        try {
+            return ContentTagParser.parseXML(content, UC_TAGS_OF_INTEREST.keySet());
+        } catch (TikaException | IOException | SAXException e) {
+            LOG.warn("Problem parsing xhtml in {}; backing off to html parser", extractFile.toAbsolutePath().toString(), e);
+        }
+        //the xml parser failed: retry as html and remember that there was a problem
+        try {
+            ContentTags contentTags = ContentTagParser.parseHTML(content, UC_TAGS_OF_INTEREST.keySet());
+            contentTags.setParseException(true);
+            return contentTags;
+        } catch (IOException | SAXException e2) {
+            LOG.warn("Problem parsing html in {}; backing off to treat string as text", extractFile.toAbsolutePath().toString(), e2);
+        }
+        return new ContentTags(content, true);
+    }
+
+    /**
+     * Replaces {@code analyzerManager} with one limited to {@code maxTokens} and assembles the
+     * composite calculator over its general analyzer.
+     *
+     * @param maxTokens token limit handed to {@link AnalyzerManager#newInstance}
+     * @param langIder  language identifier handed to the composite calculator
+     * @return a new composite calculator; the caller decides where to store it
+     */
+    private CompositeTextStatsCalculator initAnalyzersAndTokenCounter(int maxTokens, LanguageIDWrapper langIder) {
+        this.analyzerManager = AnalyzerManager.newInstance(maxTokens);
+
+        List<TextStatsCalculator> statsCalculators = new ArrayList<>();
+        statsCalculators.add(new CommonTokens(COMMON_TOKEN_COUNT_MANAGER));
+        statsCalculators.add(new TokenEntropy());
+        statsCalculators.add(new TokenLengths());
+        statsCalculators.add(new TopNTokens(10));
+        statsCalculators.add(new BasicTokenCountStatsCalculator());
+        statsCalculators.add(new ContentLengthCalculator());
+        statsCalculators.add(new UnicodeBlockCounter(maxContentLengthForLangId));
+
+        return new CompositeTextStatsCalculator(statsCalculators, this.analyzerManager.getGeneralAnalyzer(), langIder);
+    }
+
+    /**
+     * Sets the maximum content length; content longer than this is meant to be truncated to
+     * this length.  This setter only stores the value.
+     *
+     * @param maxContentLength maximum number of characters of content to keep
+     */
+    public void setMaxContentLength(int maxContentLength) {
+        this.maxContentLength = maxContentLength;
+    }
+
+    /**
+     * Sets the maximum content length used for language identification and forwards it to the
+     * static {@link LanguageIDWrapper} setting.
+     * <p>
+     * NOTE(review): the {@link UnicodeBlockCounter} created in
+     * {@code initAnalyzersAndTokenCounter} received the previous value and is not rebuilt here --
+     * confirm whether that is intended.
+     *
+     * @param maxContentLengthForLangId maximum number of characters of content to use for lang id
+     */
+    public void setMaxContentLengthForLangId(int maxContentLengthForLangId) {
+        this.maxContentLengthForLangId = maxContentLengthForLangId;
+        LanguageIDWrapper.setMaxTextLength(maxContentLengthForLangId);
+    }
+
+    /**
+     * Sets the maximum number of tokens to analyze and rebuilds the analyzers and the composite
+     * text stats calculator so that the new limit takes effect.
+     * <p>
+     * Add a LimitTokenCountFilterFactory if &gt; -1
+     *
+     * @param maxTokens maximum number of tokens
+     */
+    public void setMaxTokens(int maxTokens) {
+        this.maxTokens = maxTokens;
+        //initAnalyzersAndTokenCounter only returns the new calculator; it has to be stored here,
+        //otherwise the calculator built in the constructor stays in use with the old limit
+        this.compositeTextStatsCalculator = initAnalyzersAndTokenCounter(maxTokens, new LanguageIDWrapper());
+    }
+
+    /**
+     * Writes one row describing a problem with reading an extract.
+     *
+     * @param extractExceptionTable table to write to
+     * @param containerId           id of the container document
+     * @param filePath              path of the file whose extract could not be read
+     * @param type                  kind of problem; stored as its enum ordinal
+     * @throws IOException if the writer fails
+     */
+    protected void writeExtractException(TableInfo extractExceptionTable, String containerId, String filePath,
+                                         ExtractReaderException.TYPE type) throws IOException {
+        String exceptionId = Integer.toString(type.ordinal());
+        Map<Cols, String> row = new HashMap<>();
+        row.put(Cols.CONTAINER_ID, containerId);
+        row.put(Cols.FILE_PATH, filePath);
+        row.put(Cols.EXTRACT_EXCEPTION_ID, exceptionId);
+        writer.writeRow(extractExceptionTable, row);
+    }
+
+    /**
+     * Builds and writes the profile-table row for one metadata object of an extract.
+     * <p>
+     * The row holds the ids, digest, attachment count, parse time, number of metadata
+     * values, page counts, embedded-file information, file name and extension, source
+     * length, whether there is any non-blank content, and the mime id.
+     *
+     * @param fps            file paths; the relative source file path supplies the
+     *                       file name when {@code i == 0}
+     * @param i              index of this metadata object within the extract; 0 is
+     *                       the outer (container) document, anything else is treated
+     *                       as an embedded document
+     * @param contentTags    extracted content, used only to set HAS_CONTENT
+     * @param m              metadata for this document
+     * @param fileId         value stored in the ID column
+     * @param containerId    value stored in the CONTAINER_ID column
+     * @param numAttachments attachment counts indexed by {@code i}; NUM_ATTACHMENTS is
+     *                       left unset if {@code i} is beyond the end of the list
+     * @param profileTable   table the row is written to
+     * @throws RuntimeException wrapping the IOException if the writer fails
+     */
+    protected void writeProfileData(EvalFilePaths fps, int i, ContentTags contentTags, Metadata m, String fileId,
+                                    String containerId, List<Integer> numAttachments, TableInfo profileTable) {
+
+        Map<Cols, String> data = new HashMap<>();
+        data.put(Cols.ID, fileId);
+        data.put(Cols.CONTAINER_ID, containerId);
+        data.put(Cols.MD5, m.get(DIGEST_KEY));
+
+        if (i < numAttachments.size()) {
+            data.put(Cols.NUM_ATTACHMENTS, Integer.toString(numAttachments.get(i)));
+        }
+        //parse time and metadata value count are each computed and stored exactly once
+        data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m));
+        data.put(Cols.NUM_METADATA_VALUES, Integer.toString(countMetadataValues(m)));
+
+        Integer nPages = m.getInt(PagedText.N_PAGES);
+        if (nPages != null) {
+            data.put(Cols.NUM_PAGES, Integer.toString(nPages));
+        }
+        Integer nOCRPages = m.getInt(PDF.OCR_PAGE_COUNT);
+        if (nOCRPages != null) {
+            data.put(Cols.NUM_OCR_PAGES, Integer.toString(nOCRPages));
+        }
+
+        //if the outer wrapper document
+        if (i == 0) {
+            data.put(Cols.IS_EMBEDDED, FALSE);
+            data.put(Cols.FILE_NAME, fps
+                    .getRelativeSourceFilePath()
+                    .getFileName()
+                    .toString());
+            data.put(Cols.EMBEDDED_DEPTH, "0");
+        } else {
+            data.put(Cols.IS_EMBEDDED, TRUE);
+            String embeddedFilePath = m.get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH);
+            if (!StringUtils.isBlank(embeddedFilePath)) {
+                data.put(Cols.FILE_NAME, getFileName(embeddedFilePath));
+                data.put(Cols.EMBEDDED_FILE_PATH, embeddedFilePath);
+            }
+            String embeddedDepth = m.get(TikaCoreProperties.EMBEDDED_DEPTH);
+            if (!StringUtils.isBlank(embeddedDepth)) {
+                data.put(Cols.EMBEDDED_DEPTH, embeddedDepth);
+            }
+            String attachmentType = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+            if (!StringUtils.isBlank(attachmentType)) {
+                data.put(Cols.ATTACHMENT_TYPE, attachmentType);
+            }
+        }
+        //FILE_NAME may be unset for an embedded document without a resource path;
+        //the extension is then stored as the empty string
+        String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME));
+        ext = (ext == null) ? "" : ext.toLowerCase(Locale.US);
+        data.put(Cols.FILE_EXTENSION, ext);
+
+        long srcFileLen = getSourceFileLength(m);
+        if (srcFileLen > NON_EXISTENT_FILE_LENGTH) {
+            data.put(Cols.LENGTH, Long.toString(srcFileLen));
+        } else {
+            data.put(Cols.LENGTH, "");
+        }
+
+        String content = contentTags.getContent();
+        if (content == null || content.isBlank()) {
+            data.put(Cols.HAS_CONTENT, FALSE);
+        } else {
+            data.put(Cols.HAS_CONTENT, TRUE);
+        }
+        getFileTypes(m, data);
+        try {
+            writer.writeRow(profileTable, data);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Writes a row to the exception table if the metadata carries an exception.
+     * Nothing is written when {@code getExceptionStrings} adds no columns.
+     *
+     * @param fileId         value stored in the ID column
+     * @param m              metadata that may hold a stack trace
+     * @param exceptionTable table the row is written to
+     * @throws RuntimeException wrapping the IOException if the writer fails
+     */
+    protected void writeExceptionData(String fileId, Metadata m, TableInfo exceptionTable) {
+        Map<Cols, String> row = new HashMap<>();
+        getExceptionStrings(m, row);
+        if (row.isEmpty()) {
+            return;
+        }
+        row.put(Cols.ID, fileId);
+        try {
+            writer.writeRow(exceptionTable, row);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Calculates the text statistics for the given content.
+     * <p>
+     * The content is first passed through {@code truncateContent} with
+     * {@code maxContentLength}; null or blank content is replaced by the empty
+     * string before it is handed to the composite calculator.
+     *
+     * @param contentTags content to analyze
+     * @return the result of {@code compositeTextStatsCalculator.calculate}
+     */
+    protected Map<Class, Object> calcTextStats(ContentTags contentTags) {
+        //the map passed to truncateContent is local and is not used afterwards
+        Map<Cols, String> scratch = new HashMap<>();
+        String text = truncateContent(contentTags, maxContentLength, scratch);
+        boolean hasText = text != null && !text.isBlank();
+        return compositeTextStatsCalculator.calculate(hasText ? text : "");
+    }
+
+    /**
+     * Writes the content-statistics row for a file.
+     * <p>
+     * If the stats contain a content length of 0, nothing is written and the fileId
+     * is not entered into the contents table. Otherwise the row is filled with the
+     * content length, language id, top tokens, common-token figures, token counts,
+     * token entropy, token length statistics and unicode blocks, as far as each is
+     * present in {@code textStats}.
+     *
+     * @param fileId        value stored in the ID column
+     * @param textStats     statistics keyed by calculator class, as returned by
+     *                      {@code calcTextStats}
+     * @param contentsTable table the row is written to
+     * @throws IOException declared for callers; a failure of the writer itself is
+     *                     rethrown wrapped in a RuntimeException
+     */
+    protected void writeContentData(String fileId, Map<Class, Object> textStats, TableInfo contentsTable) throws IOException {
+        Map<Cols, String> row = new HashMap<>();
+        row.put(Cols.ID, fileId);
+        if (textStats.containsKey(ContentLengthCalculator.class)) {
+            int contentLength = (int) textStats.get(ContentLengthCalculator.class);
+            if (contentLength == 0) {
+                //no content: skip the row entirely
+                return;
+            }
+            row.put(Cols.CONTENT_LENGTH, Integer.toString(contentLength));
+        }
+        langid(textStats, row);
+        writeTokenCounts(textStats, row);
+
+        CommonTokenResult common = (CommonTokenResult) textStats.get(CommonTokens.class);
+        if (common != null) {
+            row.put(Cols.COMMON_TOKENS_LANG, common.getLangCode());
+            row.put(Cols.NUM_UNIQUE_COMMON_TOKENS, Integer.toString(common.getUniqueCommonTokens()));
+            row.put(Cols.NUM_COMMON_TOKENS, Integer.toString(common.getCommonTokens()));
+            row.put(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS, Integer.toString(common.getUniqueAlphabeticTokens()));
+            row.put(Cols.NUM_ALPHABETIC_TOKENS, Integer.toString(common.getAlphabeticTokens()));
+            //out-of-vocabulary is reported as -1.0 when there are no alphabetic tokens
+            double oov;
+            if (common.getAlphabeticTokens() > 0) {
+                oov = common.getOOV();
+            } else {
+                oov = -1.0;
+            }
+            row.put(Cols.OOV, Double.toString(oov));
+        }
+
+        TokenCounts counts = (TokenCounts) textStats.get(BasicTokenCountStatsCalculator.class);
+        if (counts != null) {
+            row.put(Cols.NUM_UNIQUE_TOKENS, Integer.toString(counts.getTotalUniqueTokens()));
+            row.put(Cols.NUM_TOKENS, Integer.toString(counts.getTotalTokens()));
+        }
+
+        Object entropy = textStats.get(TokenEntropy.class);
+        if (entropy != null) {
+            row.put(Cols.TOKEN_ENTROPY_RATE, Double.toString((Double) entropy));
+        }
+
+        SummaryStatistics lengthStats = (SummaryStatistics) textStats.get(TokenLengths.class);
+        if (lengthStats != null) {
+            row.put(Cols.TOKEN_LENGTH_SUM, Integer.toString((int) lengthStats.getSum()));
+            row.put(Cols.TOKEN_LENGTH_MEAN, Double.toString(lengthStats.getMean()));
+            row.put(Cols.TOKEN_LENGTH_STD_DEV, Double.toString(lengthStats.getStandardDeviation()));
+        }
+        unicodeBlocks(textStats, row);
+        try {
+            writer.writeRow(contentsTable, row);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Writes the tag-count row for a file. Nothing is written when there are no
+     * tags and the content tags report no parse exception.
+     * <p>
+     * Every tag in {@code UC_TAGS_OF_INTEREST} gets a column; tags that were not
+     * counted are stored as {@code ZERO}.
+     *
+     * @param fileId      value stored in the ID column
+     * @param contentTags source of the tag counts and the parse exception flag
+     * @param tagsTable   table the row is written to
+     * @throws RuntimeException wrapping the IOException if the writer fails
+     */
+    void writeTagData(String fileId, ContentTags contentTags, TableInfo tagsTable) {
+        Map<String, Integer> tagCounts = contentTags.getTags();
+        if (tagCounts.isEmpty() && !contentTags.getParseException()) {
+            return;
+        }
+        Map<Cols, String> row = new HashMap<>();
+        row.put(Cols.ID, fileId);
+
+        for (Map.Entry<String, Cols> tagOfInterest : UC_TAGS_OF_INTEREST.entrySet()) {
+            Integer count = tagCounts.get(tagOfInterest.getKey());
+            row.put(tagOfInterest.getValue(), count == null ? ZERO : Integer.toString(count));
+        }
+
+        row.put(Cols.TAGS_PARSE_EXCEPTION, contentTags.getParseException() ? TRUE : FALSE);
+        try {
+            writer.writeRow(tagsTable, row);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Returns the parse time recorded in the metadata.
+     *
+     * @param m metadata to read {@code TikaCoreProperties.PARSE_TIME_MILLIS} from
+     * @return the recorded value, or "-1" if the metadata has none
+     */
+    String getTime(Metadata m) {
+        String parseTimeMillis = m.get(TikaCoreProperties.PARSE_TIME_MILLIS);
+        return (parseTimeMillis != null) ? parseTimeMillis : "-1";
+    }
+
+    /**
+     * Counts all values over all names in the metadata; a name with several values
+     * contributes each of them.
+     *
+     * @param m metadata, may be null
+     * @return total number of values, 0 if {@code m} is null
+     */
+    int countMetadataValues(Metadata m) {
+        int total = 0;
+        if (m != null) {
+            for (String name : m.names()) {
+                total += m.getValues(name).length;
+            }
+        }
+        return total;
+    }
+
+    /**
+     * Classifies the stack trace stored in the metadata, if any, and adds the
+     * matching columns to {@code data}.
+     * <p>
+     * The trace is looked up under the container exception key, then the 1.x
+     * container exception key, then the embedded exception key. If none is set,
+     * {@code data} is left untouched.
+     *
+     * @param metadata metadata that may hold a stack trace
+     * @param data     row to add PARSE_EXCEPTION_ID and, for runtime exceptions,
+     *                 ORIG_STACK_TRACE and SORT_STACK_TRACE to
+     */
+    void getExceptionStrings(Metadata metadata, Map<Cols, String> data) {
+        String trace = metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION);
+        if (trace == null) {
+            trace = metadata.get(CONTAINER_EXCEPTION_1X);
+        }
+        if (trace == null) {
+            trace = metadata.get(TikaCoreProperties.EMBEDDED_EXCEPTION);
+        }
+        if (trace == null) {
+            return;
+        }
+
+        //"expected" exceptions are ones that can't be fixed;
+        //record only their type, not their trace
+        if (ACCESS_PERMISSION_EXCEPTION
+                .matcher(trace)
+                .find()) {
+            data.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal()));
+            return;
+        }
+        if (ENCRYPTION_EXCEPTION
+                .matcher(trace)
+                .find()) {
+            data.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(EXCEPTION_TYPE.ENCRYPTION.ordinal()));
+            return;
+        }
+
+        data.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(EXCEPTION_TYPE.RUNTIME.ordinal()));
+        data.put(Cols.ORIG_STACK_TRACE, trace);
+        //TikaExceptions can carry object ids, e.g. the "@2b1ea6ee" in:
+        //org.apache.tika.exception.TikaException: TIKA-198: Illegal
+        //IOException from org.apache.tika.parser.microsoft.OfficeParser@2b1ea6ee
+        //The normalized trace is stored as well so that exceptions can be
+        //counted more easily in reports.
+        data.put(Cols.SORT_STACK_TRACE, EvalExceptionUtils.normalize(trace));
+    }
+
+    /**
+     * Stores the most frequent unicode blocks as a single string of the form
+     * {@code "block: count | block: count"}, ordered by descending count and
+     * limited to 20 entries.
+     * <p>
+     * If the stats hold no {@code UnicodeBlockCounter} result, nothing is added;
+     * this matches how the other optional statistics are handled in
+     * {@code writeContentData} and avoids a NullPointerException.
+     *
+     * @param tokenStats statistics keyed by calculator class
+     * @param data       row to add UNICODE_CHAR_BLOCKS to
+     */
+    void unicodeBlocks(Map<Class, Object> tokenStats, Map<Cols, String> data) {
+        @SuppressWarnings("unchecked")
+        Map<String, MutableInt> blocks = (Map<String, MutableInt>) tokenStats.get(UnicodeBlockCounter.class);
+        if (blocks == null) {
+            return;
+        }
+        List<Pair<String, Integer>> pairs = new ArrayList<>(blocks.size());
+        for (Map.Entry<String, MutableInt> e : blocks.entrySet()) {
+            pairs.add(Pair.of(e.getKey(), e
+                    .getValue()
+                    .intValue()));
+        }
+        //largest counts first
+        pairs.sort((o1, o2) -> o2
+                .getValue()
+                .compareTo(o1.getValue()));
+
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < 20 && i < pairs.size(); i++) {
+            if (i > 0) {
+                sb.append(" | ");
+            }
+            Pair<String, Integer> pair = pairs.get(i);
+            sb
+                    .append(pair.getKey())
+                    .append(": ")
+                    .append(pair.getValue());
+        }
+        data.put(Cols.UNICODE_CHAR_BLOCKS, sb.toString());
+    }
+
+    /**
+     * Stores the top two language id results (language and raw score) in the row.
+     * <p>
+     * If the stats hold no {@code LanguageIDWrapper} result, or the result list is
+     * empty, nothing is added; the null check matches how the other optional
+     * statistics are handled in {@code writeContentData} and avoids a
+     * NullPointerException.
+     *
+     * @param stats statistics keyed by calculator class
+     * @param data  row to add LANG_ID_1/LANG_ID_PROB_1 and, if there is a second
+     *              result, LANG_ID_2/LANG_ID_PROB_2 to
+     */
+    void langid(Map<Class, Object> stats, Map<Cols, String> data) {
+        @SuppressWarnings("unchecked")
+        List<LanguageResult> probabilities = (List<LanguageResult>) stats.get(LanguageIDWrapper.class);
+        if (probabilities == null || probabilities.isEmpty()) {
+            return;
+        }
+
+        LanguageResult best = probabilities.get(0);
+        data.put(Cols.LANG_ID_1, best.getLanguage());
+        data.put(Cols.LANG_ID_PROB_1, Double.toString(best.getRawScore()));
+
+        if (probabilities.size() > 1) {
+            LanguageResult second = probabilities.get(1);
+            data.put(Cols.LANG_ID_2, second.getLanguage());
+            data.put(Cols.LANG_ID_PROB_2, Double.toString(second.getRawScore()));
+        }
+    }
+
+    /**
+     * Stores the mime id for the metadata's content type in the row. Nothing is
+     * added if the metadata is null or has no content type.
+     *
+     * @param metadata metadata to read {@code Metadata.CONTENT_TYPE} from, may be null
+     * @param output   row to add MIME_ID to
+     */
+    void getFileTypes(Metadata metadata, Map<Cols, String> output) {
+        String contentType = (metadata == null) ? null : metadata.get(Metadata.CONTENT_TYPE);
+        if (contentType != null) {
+            int mimeId = writer.getMimeId(contentType);
+            output.put(Cols.MIME_ID, Integer.toString(mimeId));
+        }
+    }
+
+    /**
+     * Stores the top-n tokens as a single string of the form
+     * {@code "token: count | token: count"}.
+     * <p>
+     * If the stats hold no {@code TopNTokens} result, nothing is added; this matches
+     * how the other optional statistics are handled in {@code writeContentData} and
+     * avoids a NullPointerException.
+     *
+     * @param textStats statistics keyed by calculator class
+     * @param data      row to add TOP_N_TOKENS to
+     */
+    void writeTokenCounts(Map<Class, Object> textStats, Map<Cols, String> data) {
+        TokenIntPair[] tokenIntPairs = (TokenIntPair[]) textStats.get(TopNTokens.class);
+        if (tokenIntPairs == null) {
+            return;
+        }
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < tokenIntPairs.length; i++) {
+            if (i > 0) {
+                sb.append(" | ");
+            }
+            sb
+                    .append(tokenIntPairs[i].getToken())
+                    .append(": ")
+                    .append(tokenIntPairs[i].getValue());
+        }
+
+        data.put(Cols.TOP_N_TOKENS, sb.toString());
+    }
+
+    /**
+     * Closes the writer that the rows of this profiler are written to.
+     *
+     * @throws IOException if closing the writer fails
+     */
+    public void closeWriter() throws IOException {
+        writer.close();
+    }
+
+    /**
+     * Resolves the file paths when the crawler walks an extract directory.
+     * <p>
+     * The relative source file path is the crawled relative path with everything
+     * matched by {@code FILE_NAME_CLEANER} removed.
+     *
+     * @param metadata metadata holding the crawled relative path under
+     *                 {@code FSProperties.FS_REL_PATH}
+     * @param extracts root directory of the extracts
+     * @return evalfilepaths for files if crawling an extract directory; the extract
+     * file is null if none could be found
+     */
+    protected EvalFilePaths getPathsFromExtractCrawl(Metadata metadata, Path extracts) {
+        String crawledRelPath = metadata.get(FSProperties.FS_REL_PATH);
+        String cleanedRelPath = FILE_NAME_CLEANER
+                .matcher(crawledRelPath)
+                .replaceAll("");
+        Path relativeSourceFilePath = Paths.get(cleanedRelPath);
+
+        //first try the crawled path directly under the extract directory
+        Path extractFile = extracts.resolve(crawledRelPath);
+        if (Files.isRegularFile(extractFile)) {
+            return new EvalFilePaths(relativeSourceFilePath, extractFile);
+        }
+        //otherwise search for the right extract file. This is necessary if
+        //crawling extractsA and trying to find a file in extractsB that is
+        //not in the same format: json vs txt or compressed
+        return new EvalFilePaths(relativeSourceFilePath, findFile(extracts, relativeSourceFilePath));
+    }
+
+    /**
+     * Resolves the file paths when the crawler walks the source directory.
+     * <p>
+     * The length of the source file is read from disk so that it is available even
+     * if there was an error in both extracts. If the length cannot be read, a
+     * warning is logged together with the cause and the length stays -1.
+     *
+     * @param metadata metadata holding the crawled relative path under
+     *                 {@code FSProperties.FS_REL_PATH}
+     * @param srcDir   root directory of the source files
+     * @param extracts root directory of the extracts
+     * @return evalfilepaths with the relative source path, the extract file (null if
+     * none could be found) and the source file length
+     */
+    protected EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir, Path extracts) {
+        Path relativeSourceFilePath = Paths.get(metadata.get(FSProperties.FS_REL_PATH));
+        Path extractFile = findFile(extracts, relativeSourceFilePath);
+        Path inputFile = srcDir.resolve(relativeSourceFilePath);
+        long srcLen = -1L;
+        //try to get the length of the source file in case there was an error
+        //in both extracts
+        try {
+            srcLen = Files.size(inputFile);
+        } catch (IOException e) {
+            //the trailing throwable is logged as the cause, not as a placeholder argument
+            LOG.warn("Couldn't get length for: {}", inputFile.toAbsolutePath(), e);
+        }
+        return new EvalFilePaths(relativeSourceFilePath, extractFile, srcLen);
+    }
+
+    /**
+     * Looks for the extract file that belongs to a source file by appending the
+     * known extract and compression extensions to the relative source path.
+     * <p>
+     * The extension combination that matched last time is tried first; a new match
+     * is remembered in {@code lastExtractExtension}.
+     *
+     * @param extractRootDir         root directory of the extracts
+     * @param relativeSourceFilePath source file path relative to the source root
+     * @return extractFile or null if couldn't find one.
+     */
+    private Path findFile(Path extractRootDir, Path relativeSourceFilePath) {
+        String base = relativeSourceFilePath.toString();
+
+        if (lastExtractExtension != null) {
+            Path remembered = extractRootDir.resolve(base + lastExtractExtension);
+            if (Files.isRegularFile(remembered)) {
+                return remembered;
+            }
+        }
+
+        for (String ext : EXTRACT_EXTENSIONS) {
+            for (String compress : COMPRESSION_EXTENSIONS) {
+                String suffix = ext + compress;
+                Path candidate = extractRootDir.resolve(base + suffix);
+                if (!Files.isRegularFile(candidate)) {
+                    continue;
+                }
+                lastExtractExtension = suffix;
+                return candidate;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Returns the source file length, preferring the length recorded in the file
+     * paths and falling back to the metadata list.
+     *
+     * @param fps          file paths that may carry the source file length
+     * @param metadataList metadata of the extract, used if {@code fps} has no length
+     * @return the source file length
+     */
+    protected long getSourceFileLength(EvalFilePaths fps, List<Metadata> metadataList) {
+        boolean lengthKnown = fps.getSourceFileLength() > NON_EXISTENT_FILE_LENGTH;
+        return lengthKnown ? fps.getSourceFileLength() : getSourceFileLength(metadataList);
+    }
+
+    /**
+     * Returns the source file length recorded in the first metadata object.
+     *
+     * @param metadataList metadata of the extract, may be null or empty
+     * @return the length from the first entry, or {@code NON_EXISTENT_FILE_LENGTH}
+     * if the list is null or empty
+     */
+    long getSourceFileLength(List<Metadata> metadataList) {
+        if (metadataList != null && metadataList.size() >= 1) {
+            return getSourceFileLength(metadataList.get(0));
+        }
+        return NON_EXISTENT_FILE_LENGTH;
+    }
+
+    /**
+     * Returns the length stored under {@code Metadata.CONTENT_LENGTH}.
+     *
+     * @param m metadata to read the length from
+     * @return the parsed length, or {@code NON_EXISTENT_FILE_LENGTH} if the value is
+     * missing or is not a valid long
+     */
+    long getSourceFileLength(Metadata m) {
+        String declaredLength = m.get(Metadata.CONTENT_LENGTH);
+        if (declaredLength != null) {
+            try {
+                return Long.parseLong(declaredLength);
+            } catch (NumberFormatException ignored) {
+                //unparseable length is treated like a missing one
+            }
+        }
+        return NON_EXISTENT_FILE_LENGTH;
+    }
+
+    /**
+     * Returns the size of a file on disk.
+     *
+     * @param p path to the file, may be null
+     * @return the file size, or {@code NON_EXISTENT_FILE_LENGTH} if {@code p} is
+     * null, is not a regular file, or its size cannot be read
+     */
+    protected long getFileLength(Path p) {
+        if (p == null || !Files.isRegularFile(p)) {
+            return NON_EXISTENT_FILE_LENGTH;
+        }
+        try {
+            return Files.size(p);
+        } catch (IOException ignored) {
+            //an unreadable size is reported like a missing file
+            return NON_EXISTENT_FILE_LENGTH;
+        }
+    }
+
+    /**
+     * Category of a parse exception.
+     * <p>
+     * {@code getExceptionStrings} stores the ordinal of RUNTIME, ENCRYPTION and
+     * ACCESS_PERMISSION in the PARSE_EXCEPTION_ID column, so the order of the
+     * constants determines the stored ids; add new constants at the end only.
+     */
+    public enum EXCEPTION_TYPE {
+        RUNTIME, ENCRYPTION, ACCESS_PERMISSION, UNSUPPORTED_VERSION,
+    }
+
+    /**
+     * Type of a parse error, used if information about the parse error
+     * was gathered from the log file.
+     */
+    public enum PARSE_ERROR_TYPE {
+        OOM, TIMEOUT
+    }
+
+
+}
+
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
index 770319708..bae1435e7 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
@@ -154,7 +154,7 @@ public class TikaEvalCLI {
     }
 
     private void handleProfile(String[] subsetArgs) throws Exception {
-        FileProfileRunner.main(subsetArgs);
+        ExctractProfileRunner.main(subsetArgs);
     }
 
     private void handleCompare(String[] subsetArgs) throws Exception {
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileResource.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileResource.java
index dc0a35d7e..e4702cb8e 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileResource.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileResource.java
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.batch;
+package org.apache.tika.eval.app.batch;
 
 
 import java.io.IOException;
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/PathResource.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/PathResource.java
new file mode 100644
index 000000000..9d64317aa
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/PathResource.java
@@ -0,0 +1,33 @@
+package org.apache.tika.eval.app.batch;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * A {@link FileResource} backed by a file on disk, identified by a
+ * caller-supplied resource id.
+ */
+public class PathResource implements FileResource {
+
+    private final Path path;
+    private final String resourceId;
+
+    /**
+     * @param path       file whose bytes {@link #openInputStream()} reads
+     * @param resourceId id returned by {@link #getResourceId()}
+     */
+    public PathResource(Path path, String resourceId) {
+        this.path = path;
+        this.resourceId = resourceId;
+    }
+
+    /**
+     * @return the resource id given to the constructor
+     */
+    @Override
+    public String getResourceId() {
+        return resourceId;
+    }
+
+    /**
+     * @return a new, empty {@link Metadata} object on every call; nothing is
+     * retained between calls
+     */
+    @Override
+    public Metadata getMetadata() {
+        return new Metadata();
+    }
+
+    /**
+     * Opens a new stream on the file. The caller is responsible for closing it.
+     *
+     * @return a new input stream on the path given to the constructor
+     * @throws IOException if the file cannot be opened
+     */
+    @Override
+    public InputStream openInputStream() throws IOException {
+        return Files.newInputStream(path);
+    }
+}

(tika) 02/03: WIP -- don't merge

Reply via email to