This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 4c5200a45 TIKA-4342 -- remove tika-batch from tika-eval's Profile command (#2272)
4c5200a45 is described below

commit 4c5200a453f35a01ba4d50335143204b483a397a
Author: Tim Allison <[email protected]>
AuthorDate: Tue Jul 8 15:55:39 2025 -0400

    TIKA-4342 -- remove tika-batch from tika-eval's Profile command (#2272)
    
    * TIKA-4342 -- remove tika-batch from ExtractProfiler
---
 .../java/org/apache/tika/eval/app/EvalConfig.java  |  88 +++
 .../apache/tika/eval/app/ExtractProfileRunner.java | 374 ++++++++++
 .../org/apache/tika/eval/app/ExtractProfiler.java  |  50 +-
 .../org/apache/tika/eval/app/ProfilerBase.java     | 813 +++++++++++++++++++++
 .../org/apache/tika/eval/app/StatusReporter.java   | 102 +++
 .../java/org/apache/tika/eval/app/TikaEvalCLI.java | 106 +--
 .../eval/app/batch/ExtractProfilerBuilder.java     | 120 ---
 .../apache/tika/eval/app/batch/FileResource.java   |  66 ++
 .../apache/tika/eval/app/batch/PathResource.java   |  52 ++
 .../org/apache/tika/eval/app/EvalConfigTest.java   |  42 ++
 .../apache/tika/eval/app/ProfilerBatchTest.java    | 117 ++-
 .../org/apache/tika/eval/app/TikaEvalCLITest.java  |  11 +-
 .../resources/eval-configs/eval-config-basic.json  |   3 +
 .../test-dirs/raw_input/file10_permahang.txt       |   0
 .../resources/test-dirs/raw_input/file12_es.txt    |   6 +
 .../test-dirs/raw_input/file13_attachANotB.doc     |  12 +
 .../test-dirs/raw_input/file14_diffAttachOrder     |  21 +
 .../test/resources/test-dirs/raw_input/file15_tags |  41 ++
 .../resources/test-dirs/raw_input/file16_badTags   |  41 ++
 .../test-dirs/raw_input/file17_tagsOutOfOrder      |  41 ++
 20 files changed, 1761 insertions(+), 345 deletions(-)

diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
new file mode 100644
index 000000000..5525180ed
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+import java.nio.file.Path;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+public class EvalConfig {
+
+    private long minExtractLength = 0;
+    private long maxExtractLength = 2_000_000;
+    private String jdbcString = null;
+    private String jdbcDriverClass = null;
+    private boolean forceDrop = true;
+    private int maxFilesToAdd = -1;
+    private int maxTokens = 200000;
+
+    private int maxContentLength = 5_000_000;
+    private int numWorkers = 4;
+    private Path errorLogFile = null;
+
+
+    public static EvalConfig load(Path path) throws Exception {
+        return new ObjectMapper().readValue(path.toFile(), EvalConfig.class);
+    }
+
+    public long getMinExtractLength() {
+        return minExtractLength;
+    }
+
+    public long getMaxExtractLength() {
+        return maxExtractLength;
+    }
+
+    public String getJdbcString() {
+        return jdbcString;
+    }
+
+    public String getJdbcDriverClass() {
+        return jdbcDriverClass;
+    }
+
+    public boolean isForceDrop() {
+        return forceDrop;
+    }
+
+    public int getMaxFilesToAdd() {
+        return maxFilesToAdd;
+    }
+
+    public int getMaxTokens() {
+        return maxTokens;
+    }
+
+    public int getMaxContentLength() {
+        return maxContentLength;
+    }
+
+    public int getNumWorkers() {
+        return numWorkers;
+    }
+
+    public Path getErrorLogFile() {
+        return errorLogFile;
+    }
+
+    @Override
+    public String toString() {
+        return "EvalConfig{" + "minExtractLength=" + minExtractLength + ", 
maxExtractLength=" + maxExtractLength + ", jdbcString='" + jdbcString + '\'' + 
", jdbcDriverClass='" +
+                jdbcDriverClass + '\'' + ", forceDrop=" + forceDrop + ", 
maxFilesToAdd=" + maxFilesToAdd + ", maxTokens=" + maxTokens + ", 
maxContentLength=" + maxContentLength +
+                ", numThreads=" + numWorkers + ", errorLogFile=" + 
errorLogFile + '}';
+    }
+}
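
For context: EvalConfig is bound straight from JSON by Jackson's ObjectMapper, so a config file's keys mirror the getters above. A minimal sketch of such a file (illustrative keys and values; this is not the eval-config-basic.json added by this commit):

    {
      "minExtractLength": 100,
      "maxExtractLength": 1000000,
      "numWorkers": 8
    }

Loading it is then a single call, e.g. EvalConfig config = EvalConfig.load(Paths.get("my-eval-config.json")); keys left out of the file keep the defaults shown above.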
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java
new file mode 100644
index 000000000..cd80a3df3
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java
@@ -0,0 +1,374 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+import java.io.IOException;
+import java.nio.file.FileVisitResult;
+import java.nio.file.FileVisitor;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.eval.app.batch.DBConsumersManager;
+import org.apache.tika.eval.app.batch.FileResource;
+import org.apache.tika.eval.app.batch.PathResource;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.JDBCUtil;
+import org.apache.tika.eval.app.db.MimeBuffer;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.DBWriter;
+import org.apache.tika.eval.app.io.ExtractReader;
+import org.apache.tika.eval.app.io.ExtractReaderException;
+import org.apache.tika.eval.app.io.IDBWriter;
+
+public class ExtractProfileRunner {
+
+    private static final Logger LOG = LoggerFactory.getLogger(ExtractProfileRunner.class);
+    private static final PathResource SEMAPHORE = new PathResource(Paths.get("/"), "STOP");
+    private static final int DIR_WALKER_COMPLETED_VALUE = 2;
+    private static final int PROFILE_WORKER_COMPLETED_VALUE = 1;
+
+    static Options OPTIONS;
+
+    static {
+
+        OPTIONS = new Options()
+                .addOption(Option.builder("e").longOpt("extracts").hasArg().desc("required: directory of extracts").build())
+                .addOption(Option.builder("i").longOpt("inputDir").hasArg().desc("optional: directory for original binary input documents."
+                        + " If not specified, -extracts is crawled as is.").build())
+                .addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db path").build())
+                .addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json config file").build())
+                ;
+    }
+    public static void main(String[] args) throws Exception {
+        DefaultParser defaultCLIParser = new DefaultParser();
+        CommandLine commandLine = defaultCLIParser.parse(OPTIONS, args);
+        EvalConfig evalConfig = commandLine.hasOption('c') ? EvalConfig.load(Paths.get(commandLine.getOptionValue('c'))) : new EvalConfig();
+        Path extractsDir = commandLine.hasOption('e') ? Paths.get(commandLine.getOptionValue('e')) : Paths.get(USAGE_FAIL("Must specify extracts dir: -i"));
+        Path inputDir = commandLine.hasOption('i') ? Paths.get(commandLine.getOptionValue('i')) : extractsDir;
+        String dbPath = commandLine.hasOption('d') ? commandLine.getOptionValue('d') : USAGE_FAIL("Must specify the db name: -d");
+        String jdbcString = getJdbcConnectionString(dbPath);
+        execute(inputDir, extractsDir, jdbcString, evalConfig);
+    }
+
+    private static String getJdbcConnectionString(String dbPath) {
+        if (dbPath.startsWith("jdbc:")) {
+            return dbPath;
+        }
+        //default to h2
+        Path p = Paths.get(dbPath);
+        return "jdbc:h2:file:" + p.toAbsolutePath();
+
+    }
+
+    private static void execute(Path inputDir, Path extractsDir, String dbPath, EvalConfig evalConfig) throws SQLException, IOException {
+
+        //parameterize this? if necessary
+        try {
+            ProfilerBase.loadCommonTokens(null, null);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+
+        JDBCUtil jdbcUtil = new JDBCUtil(dbPath, evalConfig.getJdbcDriverClass());
+        ExtractProfilerBuilder builder = new ExtractProfilerBuilder();
+        MimeBuffer mimeBuffer = initTables(jdbcUtil, builder, dbPath, evalConfig);
+        builder.populateRefTables(jdbcUtil, mimeBuffer);
+
+        AtomicInteger enqueued = new AtomicInteger(0);
+        AtomicInteger processed = new AtomicInteger(0);
+        AtomicInteger activeWorkers = new AtomicInteger(evalConfig.getNumWorkers());
+        AtomicBoolean crawlerActive = new AtomicBoolean(true);
+
+
+
+        ArrayBlockingQueue<FileResource> queue = new ArrayBlockingQueue<>(1000);
+        ExecutorService executorService = Executors.newFixedThreadPool(evalConfig.getNumWorkers() + 2);
+        ExecutorCompletionService<Integer> executorCompletionService = new ExecutorCompletionService<>(executorService);
+
+        StatusReporter statusReporter = new StatusReporter(enqueued, processed, activeWorkers, crawlerActive);
+        executorCompletionService.submit(statusReporter);
+
+        DirectoryWalker directoryWalker = new DirectoryWalker(inputDir, queue, enqueued);
+        executorCompletionService.submit(directoryWalker);
+        for (int i = 0; i < evalConfig.getNumWorkers(); i++) {
+            ExtractReader extractReader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS, evalConfig.getMinExtractLength(), evalConfig.getMaxExtractLength());
+            ExtractProfiler extractProfiler = new ExtractProfiler(inputDir, extractsDir, extractReader, builder.getDBWriter(builder.tableInfos, jdbcUtil, mimeBuffer));
+            executorCompletionService.submit(new ProfileWorker(queue, extractProfiler, processed));
+        }
+
+        int finished = 0;
+        try {
+            while (finished < evalConfig.getNumWorkers() + 2) {
+                //blocking
+                Future<Integer> future = executorCompletionService.take();
+                Integer result = future.get();
+                if (result != null) {
+                    //if the dir walker has finished
+                    if (result == DIR_WALKER_COMPLETED_VALUE) {
+                        queue.put(SEMAPHORE);
+                        crawlerActive.set(false);
+                    } else if (result == PROFILE_WORKER_COMPLETED_VALUE) {
+                        activeWorkers.decrementAndGet();
+                    }
+                    finished++;
+                }
+            }
+        } catch (InterruptedException e) {
+            LOG.info("interrupted", e);
+        } catch (ExecutionException e) {
+            throw new RuntimeException(e);
+        } finally {
+            mimeBuffer.close();
+            executorService.shutdownNow();
+        }
+
+    }
+
+    private static MimeBuffer initTables(JDBCUtil jdbcUtil, ExtractProfilerBuilder builder, String connectionString, EvalConfig evalConfig) throws SQLException, IOException {
+
+        //step 1. create the tables
+        jdbcUtil.createTables(builder.getNonRefTableInfos(), JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS);
+        jdbcUtil.createTables(builder.getRefTableInfos(), JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS);
+
+        //step 2. create mime buffer
+        return new MimeBuffer(jdbcUtil.getConnection(), builder.getMimeTable(), TikaConfig.getDefaultConfig());
+    }
+
+    private static void USAGE() {
+        HelpFormatter helpFormatter = new HelpFormatter();
+        helpFormatter.printHelp(80, "java -jar tika-eval-app-x.y.z.jar FileProfiler -e docs -d mydb [-i inputDir, -c config.json]",
+                "Tool: Profile", OPTIONS, "");
+    }
+
+    private static String USAGE_FAIL(String msg) {
+        USAGE();
+        throw new IllegalArgumentException(msg);
+    }
+
+    private static class ProfileWorker implements Callable<Integer> {
+
+        private final ArrayBlockingQueue<FileResource> queue;
+        private final ExtractProfiler extractProfiler;
+        private final AtomicInteger processed;
+
+        ProfileWorker(ArrayBlockingQueue<FileResource> queue, ExtractProfiler extractProfiler, AtomicInteger processed) {
+            this.queue = queue;
+            this.extractProfiler = extractProfiler;
+            this.processed = processed;
+        }
+
+        @Override
+        public Integer call() throws Exception {
+            while (true) {
+                FileResource resource = queue.poll(1, TimeUnit.SECONDS);
+                if (resource == null) {
+                    LOG.info("ExtractProfileWorker waiting on queue");
+                    continue;
+                }
+                if (resource == SEMAPHORE) {
+                    LOG.debug("worker hit semaphore and is stopping");
+                    extractProfiler.closeWriter();
+                    //hangs
+                    queue.put(resource);
+                    return PROFILE_WORKER_COMPLETED_VALUE;
+                }
+                extractProfiler.processFileResource(resource);
+                processed.incrementAndGet();
+            }
+        }
+    }
+
+    private static class DirectoryWalker implements Callable<Integer> {
+        private final Path startDir;
+        private final ArrayBlockingQueue<FileResource> queue;
+        private final AtomicInteger enqueued;
+
+        public DirectoryWalker(Path startDir, ArrayBlockingQueue<FileResource> queue, AtomicInteger enqueued) {
+            this.startDir = startDir;
+            this.queue = queue;
+            this.enqueued = enqueued;
+        }
+
+        @Override
+        public Integer call() throws Exception {
+            Files.walkFileTree(startDir, new FileVisitor<Path>() {
+                @Override
+                public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
+                    return FileVisitResult.CONTINUE;
+                }
+
+                @Override
+                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
+                    if (Files.isDirectory(file)) {
+                        return FileVisitResult.CONTINUE;
+                    }
+                    try {
+                        //blocking
+                        queue.put(new PathResource(file, startDir.relativize(file).toString()));
+                        enqueued.incrementAndGet();
+                    } catch (InterruptedException e) {
+                        return FileVisitResult.TERMINATE;
+                    }
+                    return FileVisitResult.CONTINUE;
+                }
+
+                @Override
+                public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
+                    return FileVisitResult.CONTINUE;
+                }
+
+                @Override
+                public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
+                    return FileVisitResult.CONTINUE;
+                }
+            });
+            return DIR_WALKER_COMPLETED_VALUE;
+        }
+    }
+
+    private static class ExtractProfilerBuilder {
+        private final List<TableInfo> tableInfos;
+        private final List<TableInfo> refTableInfos;
+
+        public ExtractProfilerBuilder() {
+            List<TableInfo> tableInfos = new ArrayList();
+            tableInfos.add(AbstractProfiler.MIME_TABLE);
+            tableInfos.add(ExtractProfiler.CONTAINER_TABLE);
+            tableInfos.add(ExtractProfiler.PROFILE_TABLE);
+            tableInfos.add(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
+            tableInfos.add(ExtractProfiler.EXCEPTION_TABLE);
+            tableInfos.add(ExtractProfiler.CONTENTS_TABLE);
+            tableInfos.add(ExtractProfiler.TAGS_TABLE);
+            tableInfos.add(ExtractProfiler.EMBEDDED_FILE_PATH_TABLE);
+            this.tableInfos = Collections.unmodifiableList(tableInfos);
+
+            List<TableInfo> refTableInfos = new ArrayList<>();
+            refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
+            refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
+            refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES);
+            this.refTableInfos = Collections.unmodifiableList(refTableInfos);
+        }
+
+
+        protected List<TableInfo> getRefTableInfos() {
+            return refTableInfos;
+        }
+
+        protected List<TableInfo> getNonRefTableInfos() {
+            return tableInfos;
+        }
+
+        protected TableInfo getMimeTable() {
+            return AbstractProfiler.MIME_TABLE;
+        }
+
+        public void populateRefTables(JDBCUtil dbUtil, MimeBuffer mimeBuffer) throws IOException, SQLException {
+            boolean refTablesPopulated = true;
+            try {
+                Connection connection = dbUtil.getConnection();
+                for (TableInfo tableInfo : getRefTableInfos()) {
+                    int rows = 0;
+                    try (ResultSet rs = connection
+                            .createStatement()
+                            .executeQuery("select * from " + tableInfo.getName())) {
+                        while (rs.next()) {
+                            rows++;
+                        }
+                    }
+                    if (rows == 0) {
+                        refTablesPopulated = false;
+                        break;
+                    }
+
+                }
+            } catch (SQLException e) {
+                //swallow
+            }
+            if (refTablesPopulated) {
+                LOG.info("ref tables are already populated");
+                return;
+            }
+
+            IDBWriter writer = getDBWriter(getRefTableInfos(), dbUtil, mimeBuffer);
+            Map<Cols, String> m = new HashMap<>();
+            for (AbstractProfiler.PARSE_ERROR_TYPE t : AbstractProfiler.PARSE_ERROR_TYPE.values()) {
+                m.clear();
+                m.put(Cols.PARSE_ERROR_ID, Integer.toString(t.ordinal()));
+                m.put(Cols.PARSE_ERROR_DESCRIPTION, t.name());
+                writer.writeRow(AbstractProfiler.REF_PARSE_ERROR_TYPES, m);
+            }
+
+            for (AbstractProfiler.EXCEPTION_TYPE t : AbstractProfiler.EXCEPTION_TYPE.values()) {
+                m.clear();
+                m.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(t.ordinal()));
+                m.put(Cols.PARSE_EXCEPTION_DESCRIPTION, t.name());
+                writer.writeRow(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES, m);
+            }
+
+            for (ExtractReaderException.TYPE t : ExtractReaderException.TYPE.values()) {
+                m.clear();
+                m.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(t.ordinal()));
+                m.put(Cols.EXTRACT_EXCEPTION_DESCRIPTION, t.name());
+                writer.writeRow(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES, m);
+            }
+            writer.close();
+        }
+
+        protected IDBWriter getDBWriter(List<TableInfo> tableInfos, JDBCUtil dbUtil, MimeBuffer mimeBuffer) throws IOException, SQLException {
+            Connection conn = dbUtil.getConnection();
+            return new DBWriter(conn, tableInfos, dbUtil, mimeBuffer);
+        }
+
+
+        protected void addErrorLogTablePairs(DBConsumersManager manager, EvalConfig evalConfig) {
+            Path errorLog = evalConfig.getErrorLogFile();
+            if (errorLog == null) {
+                return;
+            }
+            manager.addErrorLogTablePair(errorLog, ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
+        }
+    }
+}
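
A note on shutdown: the runner above coordinates termination with a poison-pill. DirectoryWalker enqueues one PathResource per file; once the walker's future completes, the SEMAPHORE resource is put on the queue, and each ProfileWorker that sees it re-enqueues it before returning so sibling workers also drain and stop. A minimal, self-contained sketch of that pattern (simplified names, take() instead of the poll() loop; these are not the Tika classes):

    import java.util.concurrent.ArrayBlockingQueue;

    public class PoisonPillSketch {
        private static final String PILL = "STOP";

        public static void main(String[] args) throws InterruptedException {
            ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<>(10);
            Runnable worker = () -> {
                try {
                    while (true) {
                        String item = queue.take();
                        if (item == PILL) {              // reference check, as with SEMAPHORE above
                            queue.put(item);             // re-enqueue so sibling workers also see it
                            return;
                        }
                        System.out.println("processed " + item);
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            };
            Thread t1 = new Thread(worker);
            Thread t2 = new Thread(worker);
            t1.start();
            t2.start();
            queue.put("a");
            queue.put("b");
            queue.put(PILL); // one pill stops every worker
            t1.join();
            t2.join();
        }
    }
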
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
index 22889d73b..9b0f482f6 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
@@ -22,13 +22,10 @@ import java.sql.Types;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.concurrent.ArrayBlockingQueue;
 
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
 import org.apache.commons.cli.Options;
 
-import org.apache.tika.batch.FileResource;
+import org.apache.tika.eval.app.batch.FileResource;
 import org.apache.tika.eval.app.db.ColInfo;
 import org.apache.tika.eval.app.db.Cols;
 import org.apache.tika.eval.app.db.TableInfo;
@@ -39,7 +36,7 @@ import org.apache.tika.eval.core.util.ContentTags;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 
-public class ExtractProfiler extends AbstractProfiler {
+public class ExtractProfiler extends ProfilerBase {
 
     private final static String FIELD = "f";
     public static TableInfo EXTRACT_EXCEPTION_TABLE =
@@ -76,56 +73,19 @@ public class ExtractProfiler extends AbstractProfiler {
                     new ColInfo(Cols.TAGS_PARSE_EXCEPTION, Types.BOOLEAN));
     static Options OPTIONS;
 
-    static {
-        //By the time this commandline is parsed, there should be both an extracts and an inputDir
-        Option extracts = new Option("extracts", true, "directory for extract files");
-        extracts.setRequired(true);
-
-        Option inputDir = new Option("inputDir", true, "optional: directory for original binary input documents." + " If not specified, -extracts is crawled as is.");
-
-        OPTIONS = new Options()
-                .addOption(extracts)
-                .addOption(inputDir)
-                .addOption("bc", "optional: tika-batch config file")
-                .addOption("numConsumers", true, "optional: number of consumer 
threads")
-                .addOption(new Option("alterExtract", true,
-                        "for json-formatted extract files, " + "process full 
metadata list ('as_is'=default), " + "take just the first/container document 
('first_only'), " +
-                                "concatenate all content into the first 
metadata item ('concatenate_content')"))
-                .addOption("minExtractLength", true, "minimum extract length 
to process (in bytes)")
-                .addOption("maxExtractLength", true, "maximum extract length 
to process (in bytes)")
-                .addOption("db", true, "db file to which to write results")
-                .addOption("jdbc", true, "EXPERT: full jdbc connection string. 
Must specify this or -db <h2db>")
-                .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or 
specify via -Djdbc.driver")
-                .addOption("tablePrefix", true, "EXPERT: optional prefix for 
table names")
-                .addOption("drop", false, "drop tables if they exist")
-                .addOption("maxFilesToAdd", true, "maximum number of files to 
add to the crawler")
-                .addOption("maxTokens", true, "maximum tokens to process, 
default=200000")
-                .addOption("maxContentLength", true, "truncate content beyond 
this length for calculating 'contents' stats, default=1000000")
-                .addOption("maxContentLengthForLangId", true, "truncate 
content beyond this length for language id, default=50000")
-                .addOption("defaultLangCode", true, "which language to use for 
common words if no 'common words' file exists for the langid result")
-
-        ;
-
-    }
-
     private final Path inputDir;
     private final Path extracts;
     private final ExtractReader extractReader;
 
-    public ExtractProfiler(ArrayBlockingQueue<FileResource> queue, Path inputDir, Path extracts, ExtractReader extractReader, IDBWriter dbWriter) {
-        super(queue, dbWriter);
+
+    ExtractProfiler(Path inputDir, Path extracts, ExtractReader extractReader, IDBWriter dbWriter) {
+        super(dbWriter);
         this.inputDir = inputDir;
         this.extracts = extracts;
         this.extractReader = extractReader;
     }
 
-    public static void USAGE() {
-        HelpFormatter helpFormatter = new HelpFormatter();
-        helpFormatter.printHelp(80, "java -jar tika-eval-x.y.jar Profile -extracts extracts -db mydb [-inputDir input]", "Tool: Profile", ExtractProfiler.OPTIONS,
-                "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
-    }
 
-    @Override
     public boolean processFileResource(FileResource fileResource) {
         Metadata metadata = fileResource.getMetadata();
         EvalFilePaths fps = null;
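
With tika-batch gone, ExtractProfiler no longer pulls work from an ArrayBlockingQueue itself; the runner wires it up and feeds it resources directly, roughly as in the runner code above (variable names illustrative):

    ExtractReader reader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS, minExtractLength, maxExtractLength);
    ExtractProfiler profiler = new ExtractProfiler(inputDir, extractsDir, reader, dbWriter);
    // each worker then calls profiler.processFileResource(resource) per queued file
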
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
new file mode 100644
index 000000000..19a7d680f
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
@@ -0,0 +1,813 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.app;
+
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.sql.Types;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.lang3.mutable.MutableInt;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.eval.app.db.ColInfo;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.ExtractReaderException;
+import org.apache.tika.eval.app.io.IDBWriter;
+import org.apache.tika.eval.core.langid.LanguageIDWrapper;
+import org.apache.tika.eval.core.textstats.BasicTokenCountStatsCalculator;
+import org.apache.tika.eval.core.textstats.CommonTokens;
+import org.apache.tika.eval.core.textstats.CompositeTextStatsCalculator;
+import org.apache.tika.eval.core.textstats.ContentLengthCalculator;
+import org.apache.tika.eval.core.textstats.TextStatsCalculator;
+import org.apache.tika.eval.core.textstats.TokenEntropy;
+import org.apache.tika.eval.core.textstats.TokenLengths;
+import org.apache.tika.eval.core.textstats.TopNTokens;
+import org.apache.tika.eval.core.textstats.UnicodeBlockCounter;
+import org.apache.tika.eval.core.tokens.AnalyzerManager;
+import org.apache.tika.eval.core.tokens.CommonTokenCountManager;
+import org.apache.tika.eval.core.tokens.CommonTokenResult;
+import org.apache.tika.eval.core.tokens.TokenCounts;
+import org.apache.tika.eval.core.tokens.TokenIntPair;
+import org.apache.tika.eval.core.util.ContentTagParser;
+import org.apache.tika.eval.core.util.ContentTags;
+import org.apache.tika.eval.core.util.EvalExceptionUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.language.detect.LanguageResult;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PDF;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.apache.tika.utils.StringUtils;
+
+public abstract class ProfilerBase {
+
+    public static final String TRUE = Boolean.toString(true);
+    public static final String FALSE = Boolean.toString(false);
+    protected static final AtomicInteger ID = new AtomicInteger();
+    static final long NON_EXISTENT_FILE_LENGTH = -1l;
+    final static int FILE_PATH_MAX_LEN = 1024;//max len for varchar for file_path
+    //Container exception key from the 1.x branch
+    private static final Property CONTAINER_EXCEPTION_1X = Property.externalText("X-TIKA" + ":EXCEPTION:runtime");
+    private static final Logger LOG = LoggerFactory.getLogger(ProfilerBase.class);
+    private static final String[] EXTRACT_EXTENSIONS = {".json", ".txt", ""};
+    private static final String[] COMPRESSION_EXTENSIONS = {"", ".bz2", ".gzip", ".zip",};
+    private static final String ZERO = "0";
+    private static final String UNKNOWN_EXTENSION = "unk";
+    //make this configurable
+    private static final String DIGEST_KEY = "X-TIKA:digest:MD5";
+    private static final Map<String, Cols> UC_TAGS_OF_INTEREST = initTags();
+    private final static Pattern ACCESS_PERMISSION_EXCEPTION = Pattern.compile("org\\.apache\\.tika\\.exception\\.AccessPermissionException");
+    private final static Pattern ENCRYPTION_EXCEPTION = Pattern.compile("org\\.apache\\.tika.exception\\.EncryptedDocumentException");
+    public static TableInfo REF_EXTRACT_EXCEPTION_TYPES = new TableInfo("ref_extract_exception_types", new ColInfo(Cols.EXTRACT_EXCEPTION_ID, Types.INTEGER),
+            new ColInfo(Cols.EXTRACT_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128));
+    public static TableInfo REF_PARSE_ERROR_TYPES =
+            new TableInfo("ref_parse_error_types", new ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER), new ColInfo(Cols.PARSE_ERROR_DESCRIPTION, Types.VARCHAR, 128));
+    public static TableInfo REF_PARSE_EXCEPTION_TYPES =
+            new TableInfo("ref_parse_exception_types", new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER), new ColInfo(Cols.PARSE_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128));
+    public static TableInfo MIME_TABLE = new TableInfo("mimes", new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
+            new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12));
+    private static CommonTokenCountManager COMMON_TOKEN_COUNT_MANAGER;
+    private static Pattern FILE_NAME_CLEANER = Pattern.compile("\\.(json|txt)(\\.(bz2|gz|zip))?$");
+    private static LanguageIDWrapper LANG_ID = new LanguageIDWrapper();
+    protected IDBWriter writer;
+    AnalyzerManager analyzerManager;
+    int maxContentLength = 10000000;
+    int maxContentLengthForLangId = 50000;
+    int maxTokens = 200000;
+
+    CompositeTextStatsCalculator compositeTextStatsCalculator;
+    private String lastExtractExtension = null;
+
+    public ProfilerBase(IDBWriter writer) {
+        this.writer = writer;
+        LanguageIDWrapper.setMaxTextLength(maxContentLengthForLangId);
+        this.compositeTextStatsCalculator = initAnalyzersAndTokenCounter(maxTokens, LANG_ID);
+    }
+
+    private static Map<String, Cols> initTags() {
+        //simplify this mess
+        Map<String, Cols> tmp = new HashMap<>();
+        tmp.put("A", Cols.TAGS_A);
+        tmp.put("B", Cols.TAGS_B);
+        tmp.put("DIV", Cols.TAGS_DIV);
+        tmp.put("I", Cols.TAGS_I);
+        tmp.put("IMG", Cols.TAGS_IMG);
+        tmp.put("LI", Cols.TAGS_LI);
+        tmp.put("OL", Cols.TAGS_OL);
+        tmp.put("P", Cols.TAGS_P);
+        tmp.put("TABLE", Cols.TAGS_TABLE);
+        tmp.put("TD", Cols.TAGS_TD);
+        tmp.put("TITLE", Cols.TAGS_TITLE);
+        tmp.put("TR", Cols.TAGS_TR);
+        tmp.put("U", Cols.TAGS_U);
+        tmp.put("UL", Cols.TAGS_UL);
+        return Collections.unmodifiableMap(tmp);
+    }
+
+    /**
+     * @param p               path to the common_tokens directory.  If this is null, try to load from classPath
+     * @param defaultLangCode this is the language code to use if a common_words list doesn't exist for the detected language; can be <code>null</code>
+     * @throws IOException
+     */
+    public static void loadCommonTokens(Path p, String defaultLangCode) throws IOException {
+        COMMON_TOKEN_COUNT_MANAGER = new CommonTokenCountManager(p, defaultLangCode);
+    }
+
+    private static String getFileName(String path) {
+        if (path == null) {
+            return "";
+        }
+        //filenameUtils checks for a null byte in the path.
+        //it will throw an IllegalArgumentException if there is a null byte.
+        //given that we're recording names and not using them on a file path
+        //we should ignore this.
+        try {
+            return FilenameUtils.getName(path);
+        } catch (IllegalArgumentException e) {
+            LOG.warn("{} in {}", e.getMessage(), path);
+        }
+        path = path.replaceAll("\u0000", " ");
+        try {
+            return FilenameUtils.getName(path);
+        } catch (IllegalArgumentException e) {
+            LOG.warn("Again: {} in {}", e.getMessage(), path);
+        }
+        //give up
+        return "";
+    }
+
+    /**
+     * Get the content and record in the data {@link Cols#CONTENT_TRUNCATED_AT_MAX_LEN} whether the string was truncated
+     *
+     * @param contentTags
+     * @param maxLength
+     * @param data
+     * @return
+     */
+    protected static String truncateContent(ContentTags contentTags, int maxLength, Map<Cols, String> data) {
+        data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "FALSE");
+        if (contentTags == null) {
+            return "";
+        }
+        String c = contentTags.getContent();
+        if (maxLength > -1 && c.length() > maxLength) {
+            c = c.substring(0, maxLength);
+            data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "TRUE");
+        }
+        return c;
+
+    }
+
+    protected static ContentTags getContent(EvalFilePaths evalFilePaths, Metadata metadata) {
+        if (metadata == null) {
+            return ContentTags.EMPTY_CONTENT_TAGS;
+        }
+        return parseContentAndTags(evalFilePaths, metadata);
+    }
+
+    /**
+     * @param list
+     * @return empty list if input list is empty or null
+     */
+    static List<Integer> countAttachments(List<Metadata> list) {
+        List<Integer> ret = new ArrayList<>();
+        if (list == null || list.size() == 0) {
+            return ret;
+        }
+        //container document attachment count = list.size()-1
+        ret.add(list.size() - 1);
+
+        Map<String, Integer> counts = new HashMap<>();
+        for (int i = 1; i < list.size(); i++) {
+            String path = list
+                    .get(i)
+                    .get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
+            if (path == null) {
+                //shouldn't ever happen
+                continue;
+            }
+            String[] parts = path.split("/");
+            StringBuilder parent = new StringBuilder();
+            for (int end = 1; end < parts.length - 1; end++) {
+                parent.setLength(0);
+                join("/", parent, parts, 1, end);
+                String parentPath = parent.toString();
+                Integer count = counts.get(parentPath);
+                if (count == null) {
+                    count = 1;
+                } else {
+                    count++;
+                }
+                counts.put(parentPath, count);
+            }
+        }
+
+        for (int i = 1; i < list.size(); i++) {
+            Integer count = counts.get(list
+                    .get(i)
+                    .get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+            if (count == null) {
+                count = 0;
+            }
+            ret.add(i, count);
+        }
+        return ret;
+
+
+    }
+
+    private static void join(String delimiter, StringBuilder sb, String[] parts, int start, int end) {
+        for (int i = start; i <= end; i++) {
+            sb.append(delimiter);
+            sb.append(parts[i]);
+        }
+    }
+
+    private static ContentTags parseContentAndTags(EvalFilePaths evalFilePaths, Metadata metadata) {
+        String s = metadata.get(TikaCoreProperties.TIKA_CONTENT);
+        if (s == null || s.isEmpty()) {
+            return ContentTags.EMPTY_CONTENT_TAGS;
+        }
+
+        String handlerClass = metadata.get(TikaCoreProperties.TIKA_CONTENT_HANDLER);
+        if (evalFilePaths
+                .getExtractFile()
+                .getFileName()
+                .toString()
+                .toLowerCase(Locale.ENGLISH)
+                .endsWith(".html")) {
+            try {
+                return ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet());
+            } catch (IOException | SAXException e) {
+                LOG.warn("Problem parsing html in {}; backing off to treat 
string as text", evalFilePaths
+                        .getExtractFile()
+                        .toAbsolutePath()
+                        .toString(), e);
+
+                return new ContentTags(s, true);
+            }
+        } else if (evalFilePaths
+                .getExtractFile()
+                .getFileName()
+                .toString()
+                .toLowerCase(Locale.ENGLISH)
+                .endsWith(".xhtml") || (handlerClass != null && 
handlerClass.equals(ToXMLContentHandler.class.getSimpleName()))) {
+            try {
+                return ContentTagParser.parseXML(s, UC_TAGS_OF_INTEREST.keySet());
+            } catch (TikaException | IOException | SAXException e) {
+                LOG.warn("Problem parsing xhtml in {}; backing off to html 
parser", evalFilePaths
+                        .getExtractFile()
+                        .toAbsolutePath()
+                        .toString(), e);
+                try {
+                    ContentTags contentTags = ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet());
+                    contentTags.setParseException(true);
+                    return contentTags;
+                } catch (IOException | SAXException e2) {
+                    LOG.warn("Problem parsing html in {}; backing off to treat 
string as text", evalFilePaths
+                            .getExtractFile()
+                            .toAbsolutePath()
+                            .toString(), e2);
+                }
+                return new ContentTags(s, true);
+            }
+        }
+        return new ContentTags(s);
+    }
+
+    private CompositeTextStatsCalculator initAnalyzersAndTokenCounter(int maxTokens, LanguageIDWrapper langIder) {
+        analyzerManager = AnalyzerManager.newInstance(maxTokens);
+        List<TextStatsCalculator> calculators = new ArrayList<>();
+        calculators.add(new CommonTokens(COMMON_TOKEN_COUNT_MANAGER));
+        calculators.add(new TokenEntropy());
+        calculators.add(new TokenLengths());
+        calculators.add(new TopNTokens(10));
+        calculators.add(new BasicTokenCountStatsCalculator());
+        calculators.add(new ContentLengthCalculator());
+        calculators.add(new UnicodeBlockCounter(maxContentLengthForLangId));
+
+        return new CompositeTextStatsCalculator(calculators, analyzerManager.getGeneralAnalyzer(), langIder);
+    }
+
+    /**
+     * Truncate the content string if greater than this length to this length
+     *
+     * @param maxContentLength
+     */
+    public void setMaxContentLength(int maxContentLength) {
+        this.maxContentLength = maxContentLength;
+    }
+
+    /**
+     * Truncate content string if greater than this length to this length for lang id
+     *
+     * @param maxContentLengthForLangId
+     */
+    public void setMaxContentLengthForLangId(int maxContentLengthForLangId) {
+        this.maxContentLengthForLangId = maxContentLengthForLangId;
+        LanguageIDWrapper.setMaxTextLength(maxContentLengthForLangId);
+    }
+
+    /**
+     * Add a LimitTokenCountFilterFactory if &gt; -1
+     *
+     * @param maxTokens
+     */
+    public void setMaxTokens(int maxTokens) {
+        this.maxTokens = maxTokens;
+        initAnalyzersAndTokenCounter(maxTokens, new LanguageIDWrapper());
+    }
+
+    protected void writeExtractException(TableInfo extractExceptionTable, String containerId, String filePath, ExtractReaderException.TYPE type) throws IOException {
+        Map<Cols, String> data = new HashMap<>();
+        data.put(Cols.CONTAINER_ID, containerId);
+        data.put(Cols.FILE_PATH, filePath);
+        data.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(type.ordinal()));
+        writer.writeRow(extractExceptionTable, data);
+
+    }
+
+    protected void writeProfileData(EvalFilePaths fps, int i, ContentTags contentTags, Metadata m, String fileId, String containerId, List<Integer> numAttachments,
+                                    TableInfo profileTable) {
+
+        Map<Cols, String> data = new HashMap<>();
+        data.put(Cols.ID, fileId);
+        data.put(Cols.CONTAINER_ID, containerId);
+        data.put(Cols.MD5, m.get(DIGEST_KEY));
+
+        if (i < numAttachments.size()) {
+            data.put(Cols.NUM_ATTACHMENTS, Integer.toString(numAttachments.get(i)));
+        }
+        data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m));
+        data.put(Cols.NUM_METADATA_VALUES, Integer.toString(countMetadataValues(m)));
+
+        Integer nPages = m.getInt(PagedText.N_PAGES);
+        if (nPages != null) {
+            data.put(Cols.NUM_PAGES, Integer.toString(nPages));
+        }
+        Integer nOCRPages = m.getInt(PDF.OCR_PAGE_COUNT);
+        if (nOCRPages != null) {
+            data.put(Cols.NUM_OCR_PAGES, Integer.toString(nOCRPages));
+        }
+
+        //if the outer wrapper document
+        if (i == 0) {
+            data.put(Cols.IS_EMBEDDED, FALSE);
+            data.put(Cols.FILE_NAME, fps
+                    .getRelativeSourceFilePath()
+                    .getFileName()
+                    .toString());
+            data.put(Cols.EMBEDDED_DEPTH, "0");
+        } else {
+            data.put(Cols.IS_EMBEDDED, TRUE);
+            String embeddedFilePath = m.get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH);
+            if (!StringUtils.isBlank(embeddedFilePath)) {
+                data.put(Cols.FILE_NAME, getFileName(embeddedFilePath));
+                data.put(Cols.EMBEDDED_FILE_PATH, embeddedFilePath);
+            }
+            if (!StringUtils.isBlank(m.get(TikaCoreProperties.EMBEDDED_DEPTH))) {
+                data.put(Cols.EMBEDDED_DEPTH, m.get(TikaCoreProperties.EMBEDDED_DEPTH));
+            }
+            if (!StringUtils.isBlank(m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE))) {
+                data.put(Cols.ATTACHMENT_TYPE, m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+            }
+        }
+        String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME));
+        ext = (ext == null) ? "" : ext.toLowerCase(Locale.US);
+        data.put(Cols.FILE_EXTENSION, ext);
+        long srcFileLen = getSourceFileLength(m);
+        if (srcFileLen > NON_EXISTENT_FILE_LENGTH) {
+            data.put(Cols.LENGTH, Long.toString(srcFileLen));
+        } else {
+            data.put(Cols.LENGTH, "");
+        }
+        int numMetadataValues = countMetadataValues(m);
+        data.put(Cols.NUM_METADATA_VALUES, Integer.toString(numMetadataValues));
+
+        data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m));
+
+        String content = contentTags.getContent();
+        if (content == null || content.isBlank()) {
+            data.put(Cols.HAS_CONTENT, FALSE);
+        } else {
+            data.put(Cols.HAS_CONTENT, TRUE);
+        }
+        getFileTypes(m, data);
+        try {
+            writer.writeRow(profileTable, data);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    protected void writeExceptionData(String fileId, Metadata m, TableInfo exceptionTable) {
+        Map<Cols, String> data = new HashMap<>();
+        getExceptionStrings(m, data);
+        if (data
+                .keySet()
+                .size() > 0) {
+            try {
+                data.put(Cols.ID, fileId);
+                writer.writeRow(exceptionTable, data);
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    protected Map<Class, Object> calcTextStats(ContentTags contentTags) {
+/*        if (contentTags == ContentTags.EMPTY_CONTENT_TAGS) {
+            return Collections.EMPTY_MAP;
+        }*/
+        Map<Cols, String> data = new HashMap<>();
+        String content = truncateContent(contentTags, maxContentLength, data);
+        if (content == null || content.isBlank()) {
+            content = "";
+        }
+        return compositeTextStatsCalculator.calculate(content);
+    }
+
+    /**
+     * Checks to see if metadata is null or content is empty (null or only 
whitespace).
+     * If any of these, then this does no processing, and the fileId is not
+     * entered into the content table.
+     *
+     * @param fileId
+     * @param textStats
+     * @param contentsTable
+     */
+    protected void writeContentData(String fileId, Map<Class, Object> textStats, TableInfo contentsTable) throws IOException {
+        Map<Cols, String> data = new HashMap<>();
+        data.put(Cols.ID, fileId);
+        if (textStats.containsKey(ContentLengthCalculator.class)) {
+            int length = (int) textStats.get(ContentLengthCalculator.class);
+            if (length == 0) {
+                return;
+            }
+            data.put(Cols.CONTENT_LENGTH, Integer.toString(length));
+        }
+        langid(textStats, data);
+
+        writeTokenCounts(textStats, data);
+        CommonTokenResult commonTokenResult = (CommonTokenResult) textStats.get(CommonTokens.class);
+        if (commonTokenResult != null) {
+            data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode());
+            data.put(Cols.NUM_UNIQUE_COMMON_TOKENS, Integer.toString(commonTokenResult.getUniqueCommonTokens()));
+            data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getCommonTokens()));
+            data.put(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS, Integer.toString(commonTokenResult.getUniqueAlphabeticTokens()));
+            data.put(Cols.NUM_ALPHABETIC_TOKENS, Integer.toString(commonTokenResult.getAlphabeticTokens()));
+            double oov = commonTokenResult.getAlphabeticTokens() > 0 ? commonTokenResult.getOOV() : -1.0;
+            data.put(Cols.OOV, Double.toString(oov));
+        }
+        TokenCounts tokenCounts = (TokenCounts) textStats.get(BasicTokenCountStatsCalculator.class);
+        if (tokenCounts != null) {
+
+            data.put(Cols.NUM_UNIQUE_TOKENS, Integer.toString(tokenCounts.getTotalUniqueTokens()));
+            data.put(Cols.NUM_TOKENS, Integer.toString(tokenCounts.getTotalTokens()));
+        }
+        if (textStats.get(TokenEntropy.class) != null) {
+            data.put(Cols.TOKEN_ENTROPY_RATE, Double.toString((Double) textStats.get(TokenEntropy.class)));
+        }
+
+
+        SummaryStatistics summStats = (SummaryStatistics) textStats.get(TokenLengths.class);
+        if (summStats != null) {
+            data.put(Cols.TOKEN_LENGTH_SUM, Integer.toString((int) summStats.getSum()));
+
+            data.put(Cols.TOKEN_LENGTH_MEAN, Double.toString(summStats.getMean()));
+
+            data.put(Cols.TOKEN_LENGTH_STD_DEV, Double.toString(summStats.getStandardDeviation()));
+        }
+        unicodeBlocks(textStats, data);
+        try {
+            writer.writeRow(contentsTable, data);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    void writeTagData(String fileId, ContentTags contentTags, TableInfo tagsTable) {
+        Map<String, Integer> tags = contentTags.getTags();
+        if (tags.size() == 0 && contentTags.getParseException() == false) {
+            return;
+        }
+        Map<Cols, String> data = new HashMap<>();
+        data.put(Cols.ID, fileId);
+
+        for (Map.Entry<String, Cols> e : UC_TAGS_OF_INTEREST.entrySet()) {
+            Integer count = tags.get(e.getKey());
+            if (count == null) {
+                data.put(e.getValue(), ZERO);
+            } else {
+                data.put(e.getValue(), Integer.toString(count));
+            }
+        }
+
+        if (contentTags.getParseException()) {
+            data.put(Cols.TAGS_PARSE_EXCEPTION, TRUE);
+        } else {
+            data.put(Cols.TAGS_PARSE_EXCEPTION, FALSE);
+        }
+        try {
+            writer.writeRow(tagsTable, data);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    String getTime(Metadata m) {
+        String elapsed = "-1";
+
+        String v = m.get(TikaCoreProperties.PARSE_TIME_MILLIS);
+        if (v != null) {
+            return v;
+        }
+        return elapsed;
+    }
+
+    int countMetadataValues(Metadata m) {
+        if (m == null) {
+            return 0;
+        }
+        int i = 0;
+        for (String n : m.names()) {
+            i += m.getValues(n).length;
+        }
+        return i;
+    }
+
+    void getExceptionStrings(Metadata metadata, Map<Cols, String> data) {
+
+        String fullTrace = metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION);
+        if (fullTrace == null) {
+            fullTrace = metadata.get(CONTAINER_EXCEPTION_1X);
+        }
+
+        if (fullTrace == null) {
+            fullTrace = metadata.get(TikaCoreProperties.EMBEDDED_EXCEPTION);
+        }
+
+        if (fullTrace != null) {
+            //check for "expected" exceptions...exceptions
+            //that can't be fixed.
+            //Do not store trace for "expected" exceptions
+
+            Matcher matcher = ACCESS_PERMISSION_EXCEPTION.matcher(fullTrace);
+            if (matcher.find()) {
+                data.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal()));
+                return;
+            }
+            matcher = ENCRYPTION_EXCEPTION.matcher(fullTrace);
+            if (matcher.find()) {
+                data.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(EXCEPTION_TYPE.ENCRYPTION.ordinal()));
+                return;
+            }
+
+            data.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(EXCEPTION_TYPE.RUNTIME.ordinal()));
+
+            data.put(Cols.ORIG_STACK_TRACE, fullTrace);
+            //TikaExceptions can have object ids, as in the "@2b1ea6ee" in:
+            //org.apache.tika.exception.TikaException: TIKA-198: Illegal
+            //IOException from org.apache.tika.parser.microsoft.OfficeParser@2b1ea6ee
+            //For reporting purposes, let's snip off the object id so that we can more
+            //easily count exceptions.
+            String sortTrace = EvalExceptionUtils.normalize(fullTrace);
+            data.put(Cols.SORT_STACK_TRACE, sortTrace);
+        }
+    }
+
+    void unicodeBlocks(Map<Class, Object> tokenStats, Map<Cols, String> data) {
+
+        Map<String, MutableInt> blocks = (Map<String, MutableInt>) tokenStats.get(UnicodeBlockCounter.class);
+        List<Pair<String, Integer>> pairs = new ArrayList<>();
+        for (Map.Entry<String, MutableInt> e : blocks.entrySet()) {
+            pairs.add(Pair.of(e.getKey(), e
+                    .getValue()
+                    .intValue()));
+        }
+        pairs.sort((o1, o2) -> o2
+                .getValue()
+                .compareTo(o1.getValue()));
+        StringBuilder sb = new StringBuilder();
+
+        for (int i = 0; i < 20 && i < pairs.size(); i++) {
+            if (i > 0) {
+                sb.append(" | ");
+            }
+            sb
+                    .append(pairs
+                            .get(i)
+                            .getKey())
+                    .append(": ")
+                    .append(pairs
+                            .get(i)
+                            .getValue());
+        }
+        data.put(Cols.UNICODE_CHAR_BLOCKS, sb.toString());
+    }
+
+    void langid(Map<Class, Object> stats, Map<Cols, String> data) {
+        List<LanguageResult> probabilities = (List<LanguageResult>) stats.get(LanguageIDWrapper.class);
+
+        if (probabilities.size() > 0) {
+            data.put(Cols.LANG_ID_1, probabilities
+                    .get(0)
+                    .getLanguage());
+            data.put(Cols.LANG_ID_PROB_1, Double.toString(probabilities
+                    .get(0)
+                    .getRawScore()));
+        }
+        if (probabilities.size() > 1) {
+            data.put(Cols.LANG_ID_2, probabilities
+                    .get(1)
+                    .getLanguage());
+            data.put(Cols.LANG_ID_PROB_2, Double.toString(probabilities
+                    .get(1)
+                    .getRawScore()));
+        }
+    }
+
+    void getFileTypes(Metadata metadata, Map<Cols, String> output) {
+        if (metadata == null) {
+            return;
+        }
+        String type = metadata.get(Metadata.CONTENT_TYPE);
+        if (type == null) {
+            return;
+        }
+        int mimeId = writer.getMimeId(type);
+        output.put(Cols.MIME_ID, Integer.toString(mimeId));
+    }
+
+    void writeTokenCounts(Map<Class, Object> textStats, Map<Cols, String> data) {
+        TokenIntPair[] tokenIntPairs = (TokenIntPair[]) textStats.get(TopNTokens.class);
+        int i = 0;
+        StringBuilder sb = new StringBuilder();
+        for (TokenIntPair t : tokenIntPairs) {
+            if (i++ > 0) {
+                sb.append(" | ");
+            }
+            sb
+                    .append(t.getToken())
+                    .append(": ")
+                    .append(t.getValue());
+        }
+
+        data.put(Cols.TOP_N_TOKENS, sb.toString());
+    }
+
+    public void closeWriter() throws IOException {
+        writer.close();
+    }
+
+    /**
+     * @param metadata metadata gathered for the extract file during the crawl
+     * @param extracts root directory of the extracts being crawled
+     * @return eval file paths for files when crawling an extract directory
+     */
+    protected EvalFilePaths getPathsFromExtractCrawl(Metadata metadata, Path extracts) {
+        String relExtractFilePath = metadata.get(FSProperties.FS_REL_PATH);
+        Matcher m = FILE_NAME_CLEANER.matcher(relExtractFilePath);
+        Path relativeSourceFilePath = Paths.get(m.replaceAll(""));
+        //first, try resolving the relative extract file path against the extract dir
+        Path extractFile = extracts.resolve(relExtractFilePath);
+        if (!Files.isRegularFile(extractFile)) {
+            //if that doesn't work, try to find the right extract file.
+            //This is necessary if crawling extractsA and trying to find a file in
+            //extractsB that is not in the same format: json vs txt or compressed
+            extractFile = findFile(extracts, relativeSourceFilePath);
+        }
+        return new EvalFilePaths(relativeSourceFilePath, extractFile);
+    }
+
+    //call this if the crawler is crawling through the src directory
+    protected EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir, Path extracts) {
+        Path relativeSourceFilePath = Paths.get(metadata.get(FSProperties.FS_REL_PATH));
+        Path extractFile = findFile(extracts, relativeSourceFilePath);
+        Path inputFile = srcDir.resolve(relativeSourceFilePath);
+        long srcLen = -1L;
+        //try to get the length of the source file in case there was an error
+        //in both extracts
+        try {
+            srcLen = Files.size(inputFile);
+        } catch (IOException e) {
+            LOG.warn("Couldn't get length for: {}", 
inputFile.toAbsolutePath());
+        }
+        return new EvalFilePaths(relativeSourceFilePath, extractFile, srcLen);
+    }
+
+    /**
+     * @param extractRootDir root directory of the extracts
+     * @param relativeSourceFilePath path of the source file relative to its root
+     * @return the extract file, or null if one couldn't be found
+     */
+    private Path findFile(Path extractRootDir, Path relativeSourceFilePath) {
+        String relSrcFilePathString = relativeSourceFilePath.toString();
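+        //optimization: extracts usually share one extension, so first retry
+        //the last extension+compression combination that matched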
+        if (lastExtractExtension != null) {
+            Path candidate = extractRootDir.resolve(relSrcFilePathString + lastExtractExtension);
+            if (Files.isRegularFile(candidate)) {
+                return candidate;
+            }
+        }
+        for (String ext : EXTRACT_EXTENSIONS) {
+            for (String compress : COMPRESSION_EXTENSIONS) {
+                Path candidate = extractRootDir.resolve(relSrcFilePathString + ext + compress);
+                if (Files.isRegularFile(candidate)) {
+                    lastExtractExtension = ext + compress;
+                    return candidate;
+                }
+            }
+        }
+        return null;
+    }
+
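+    /**
+     * Returns the source file length recorded during the crawl if known;
+     * otherwise falls back to the Content-Length in the extract's first
+     * metadata object.
+     */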
+    protected long getSourceFileLength(EvalFilePaths fps, List<Metadata> metadataList) {
+        if (fps.getSourceFileLength() > NON_EXISTENT_FILE_LENGTH) {
+            return fps.getSourceFileLength();
+        }
+        return getSourceFileLength(metadataList);
+    }
+
+    long getSourceFileLength(List<Metadata> metadataList) {
+        if (metadataList == null || metadataList.size() < 1) {
+            return NON_EXISTENT_FILE_LENGTH;
+        }
+        return getSourceFileLength(metadataList.get(0));
+    }
+
+    long getSourceFileLength(Metadata m) {
+        String lenString = m.get(Metadata.CONTENT_LENGTH);
+        if (lenString == null) {
+            return NON_EXISTENT_FILE_LENGTH;
+        }
+        try {
+            return Long.parseLong(lenString);
+        } catch (NumberFormatException e) {
+            //swallow
+        }
+        return NON_EXISTENT_FILE_LENGTH;
+    }
+
+    protected long getFileLength(Path p) {
+        if (p != null && Files.isRegularFile(p)) {
+            try {
+                return Files.size(p);
+            } catch (IOException e) {
+                //swallow
+            }
+        }
+        return NON_EXISTENT_FILE_LENGTH;
+    }
+
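+    /**
+     * Broad categories of parse exceptions recorded in the profiling tables.
+     */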
+    public enum EXCEPTION_TYPE {
+        RUNTIME, ENCRYPTION, ACCESS_PERMISSION, UNSUPPORTED_VERSION,
+    }
+
+    /**
+     * Types of parse error recorded when information about a parse
+     * error was gathered from the log file.
+     */
+    public enum PARSE_ERROR_TYPE {
+        OOM, TIMEOUT
+    }
+
+
+}
+
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/StatusReporter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/StatusReporter.java
new file mode 100644
index 000000000..8df9824ed
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/StatusReporter.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+import java.text.NumberFormat;
+import java.util.Locale;
+import java.util.concurrent.Callable;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.util.DurationFormatUtils;
+
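+/**
+ * Periodically logs the progress of a profiling run (documents processed,
+ * throughput, active workers, queued files) until all workers have finished.
+ */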
+public class StatusReporter implements Callable<Integer> {
+
+    public static final int COMPLETED_VAL = 3;
+    private static final Logger LOGGER = LoggerFactory.getLogger(StatusReporter.class);
+    private final AtomicInteger filesQueued;
+    private final AtomicInteger filesProcessed;
+    private final AtomicInteger activeWorkers;
+    private final AtomicBoolean crawlerIsActive;
+    private final long start;
+    private final NumberFormat numberFormat = NumberFormat.getNumberInstance(Locale.ROOT);
+
+
+    public StatusReporter(AtomicInteger filesQueued, AtomicInteger filesProcessed, AtomicInteger activeWorkers, AtomicBoolean crawlerIsActive) {
+        this.filesQueued = filesQueued;
+        this.filesProcessed = filesProcessed;
+        this.activeWorkers = activeWorkers;
+        this.crawlerIsActive = crawlerIsActive;
+        this.start = System.currentTimeMillis();
+    }
+
+    @Override
+    public Integer call() throws Exception {
+        while (true) {
+
+            try {
+                Thread.sleep(1000);
+            } catch (InterruptedException e) {
+                LOGGER.info("Interrupted?");
+                //expected
+                return COMPLETED_VAL;
+            }
+            report();
+            if (activeWorkers.get() == 0) {
+                LOGGER.info("Completed successfully.");
+                return COMPLETED_VAL;
+            }
+        }
+    }
+
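+    //logs one round of status: documents processed, throughput so far,
+    //active workers, and files the crawler has enqueued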
+    private void report() {
+        int cnt = filesProcessed.get();
+        long elapsed = System.currentTimeMillis() - start;
+        double elapsedSecs = (double) elapsed / (double) 1000;
+        int avg = (elapsedSecs > 5 || cnt > 100) ? (int) ((double) cnt / elapsedSecs) : -1;
+
+        String elapsedString = DurationFormatUtils.formatMillis(System.currentTimeMillis() - start);
+        String docsPerSec = avg > -1 ? String.format(Locale.ROOT, " (%s docs per sec)", numberFormat.format(avg)) : "";
+        String msg = String.format(Locale.ROOT, "Processed %s documents in %s%s.", numberFormat.format(cnt), elapsedString, docsPerSec);
+        LOGGER.info(msg);
+
+        int stillAlive = activeWorkers.get();
+        if (stillAlive == 1) {
+            msg = "There is one file processor still active.";
+        } else {
+            msg = "There are " + numberFormat.format(stillAlive) + " file 
processors still active.";
+        }
+        LOGGER.info(msg);
+
+        int queued = filesQueued.get();
+
+        if (queued == 1) {
+            msg = "The crawler has enqueued 1 file.";
+        } else {
+            msg = "The crawler has enqueued " + numberFormat.format(queued) + 
" files.";
+        }
+        LOGGER.info(msg);
+
+        if (!crawlerIsActive.get()) {
+            msg = "The directory crawler has completed its crawl.\n";
+            LOGGER.info(msg);
+        }
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
index a897461ee..91aecd832 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
@@ -117,7 +117,7 @@ public class TikaEvalCLI {
                 CommandLine commandLine = defaultCLIParser.parse(FileProfiler.OPTIONS, updatedArgs);
                 if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
                     System.out.println("Please specify either the default -db or the full -jdbc, not both");
-                    ExtractProfiler.USAGE();
+                    FileProfiler.USAGE();
                     return;
                 }
             } catch (ParseException e) {
@@ -154,109 +154,7 @@ public class TikaEvalCLI {
     }
 
     private void handleProfile(String[] subsetArgs) throws Exception {
-        List<String> argList = new ArrayList(Arrays.asList(subsetArgs));
-
-        boolean containsBC = false;
-        String inputDir = null;
-        String extracts = null;
-        String alterExtract = null;
-        //confirm there's a batch-config file
-        for (int i = 0; i < argList.size(); i++) {
-            String arg = argList.get(i);
-            switch (arg) {
-                case "-bc":
-                    containsBC = true;
-                    break;
-                case "-inputDir":
-                    if (i + 1 >= argList.size()) {
-                        System.err.println("Must specify directory after 
-inputDir");
-                        ExtractProfiler.USAGE();
-                        return;
-                    }
-                    inputDir = argList.get(i + 1);
-                    i++;
-                    break;
-                case "-extracts":
-                    if (i + 1 >= argList.size()) {
-                        System.err.println("Must specify directory after 
-extracts");
-                        ExtractProfiler.USAGE();
-                        return;
-                    }
-                    extracts = argList.get(i + 1);
-                    i++;
-                    break;
-                case "-alterExtract":
-                    if (i + 1 >= argList.size()) {
-                        System.err.println("Must specify type 'as_is', 
'first_only' or " + "'concatenate_content' after -alterExtract");
-                        ExtractComparer.USAGE();
-                        return;
-                    }
-                    alterExtract = argList.get(i + 1);
-                    i++;
-                    break;
-            }
-        }
-
-        if (alterExtract != null && !alterExtract.equals("as_is") && !alterExtract.equals("concatenate_content") && !alterExtract.equals("first_only")) {
-            System.out.println("Sorry, I don't understand:" + alterExtract + ". The values must be one of: as_is, first_only, concatenate_content");
-            ExtractProfiler.USAGE();
-            return;
-        }
-
-        //need to specify each in this commandline
-        //if only extracts is passed to tika-batch,
-        //the crawler will see no inputDir and start crawling "input".
-        //this allows the user to specify either extracts or inputDir
-        if (extracts == null && inputDir != null) {
-            argList.add("-extracts");
-            argList.add(inputDir);
-        } else if (inputDir == null && extracts != null) {
-            argList.add("-inputDir");
-            argList.add(extracts);
-        }
-
-        Path tmpBCConfig = null;
-        try {
-            tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
-            if (!containsBC) {
-                try (InputStream is = this
-                        .getClass()
-                        .getResourceAsStream("/tika-eval-profiler-config.xml")) {
-                    Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
-                }
-                argList.add("-bc");
-                argList.add(tmpBCConfig
-                        .toAbsolutePath()
-                        .toString());
-            }
-
-            String[] updatedArgs = argList.toArray(new String[0]);
-            DefaultParser defaultCLIParser = new DefaultParser();
-            try {
-                CommandLine commandLine = defaultCLIParser.parse(ExtractProfiler.OPTIONS, updatedArgs);
-                if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
-                    System.out.println("Please specify either the default -db or the full -jdbc, not both");
-                    ExtractProfiler.USAGE();
-                    return;
-                }
-            } catch (ParseException e) {
-                System.out.println(e.getMessage() + "\n");
-                ExtractProfiler.USAGE();
-                return;
-            }
-
-            // lazy delete because main() calls System.exit()
-            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
-                tmpBCConfig
-                        .toFile()
-                        .deleteOnExit();
-            }
-            FSBatchProcessCLI.main(updatedArgs);
-        } finally {
-            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
-                Files.delete(tmpBCConfig);
-            }
-        }
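+        //TIKA-4342: tika-batch has been removed from the Profile command;
+        //profiling is delegated to ExtractProfileRunner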
+        ExtractProfileRunner.main(subsetArgs);
     }
 
     private void handleCompare(String[] subsetArgs) throws Exception {
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractProfilerBuilder.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractProfilerBuilder.java
deleted file mode 100644
index c59d53016..000000000
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractProfilerBuilder.java
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.app.batch;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.eval.app.AbstractProfiler;
-import org.apache.tika.eval.app.ExtractProfiler;
-import org.apache.tika.eval.app.db.TableInfo;
-import org.apache.tika.util.PropsUtil;
-
-
-public class ExtractProfilerBuilder extends EvalConsumerBuilder {
-
-    public final static String TABLE_PREFIX_KEY = "tablePrefix";
-
-    private final List<TableInfo> tableInfos;
-    private final List<TableInfo> refTableInfos;
-
-    public ExtractProfilerBuilder() {
-        List<TableInfo> tableInfos = new ArrayList();
-        tableInfos.add(AbstractProfiler.MIME_TABLE);
-        tableInfos.add(ExtractProfiler.CONTAINER_TABLE);
-        tableInfos.add(ExtractProfiler.PROFILE_TABLE);
-        tableInfos.add(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
-        tableInfos.add(ExtractProfiler.EXCEPTION_TABLE);
-        tableInfos.add(ExtractProfiler.CONTENTS_TABLE);
-        tableInfos.add(ExtractProfiler.TAGS_TABLE);
-        tableInfos.add(ExtractProfiler.EMBEDDED_FILE_PATH_TABLE);
-        this.tableInfos = Collections.unmodifiableList(tableInfos);
-
-        List<TableInfo> refTableInfos = new ArrayList<>();
-        refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
-        refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
-        refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES);
-        this.refTableInfos = Collections.unmodifiableList(refTableInfos);
-    }
-
-    @Override
-    public FileResourceConsumer build() throws IOException, SQLException {
-        Path extracts = PropsUtil.getPath(localAttrs.get("extracts"), null);
-        if (extracts == null) {
-            throw new RuntimeException("Must specify \"extracts\" -- directory 
to crawl");
-        }
-        if (!Files.isDirectory(extracts)) {
-            throw new RuntimeException("ROOT DIRECTORY DOES NOT EXIST: " + 
extracts.toAbsolutePath());
-        }
-
-        Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
-
-        //we _could_ set this to extracts (if not null)
-        //here, but the Crawler defaults to "input" if nothing is passed
-        //so this won't work
-        if (inputDir == null) {
-            throw new RuntimeException("Must specify -inputDir");
-        }
-        if (extracts == null && inputDir != null) {
-            extracts = inputDir;
-        }
-        return parameterizeProfiler(new ExtractProfiler(queue, inputDir, extracts, buildExtractReader(localAttrs), getDBWriter(tableInfos)));
-    }
-
-
-    @Override
-    protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
-        String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY);
-        if (tableNamePrefix != null && !tableNamePrefix.equals("null")) {
-            for (TableInfo tableInfo : tableInfos) {
-                tableInfo.setNamePrefix(tableNamePrefix);
-            }
-        }
-    }
-
-
-    @Override
-    protected List<TableInfo> getRefTableInfos() {
-        return refTableInfos;
-    }
-
-    @Override
-    protected List<TableInfo> getNonRefTableInfos() {
-        return tableInfos;
-    }
-
-    @Override
-    protected TableInfo getMimeTable() {
-        return AbstractProfiler.MIME_TABLE;
-    }
-
-    @Override
-    protected void addErrorLogTablePairs(DBConsumersManager manager) {
-        Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), 
null);
-        if (errorLog == null) {
-            return;
-        }
-        manager.addErrorLogTablePair(errorLog, ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
-    }
-}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileResource.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileResource.java
new file mode 100644
index 000000000..e4702cb8e
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileResource.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.batch;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+
+
+/**
+ * This is a basic interface to handle a logical "file".
+ * This should enable code-agnostic handling of files from different
+ * sources: file system, database, etc.
+ */
+public interface FileResource {
+
+    //The literal lowercased extension of a file.  This may or may not
+    //have any relationship to the actual type of the file.
+    public static final Property FILE_EXTENSION = Property.internalText("tika:file_ext");
+
+    /**
+     * This is only used in logging to identify which file
+     * may have caused problems.  While it is probably best
+     * to use unique ids for the sake of debugging, it is not
+     * necessary that the ids be unique.  This id
+     * is never used as a hash key by the batch processors, for example.
+     *
+     * @return an id for a FileResource
+     */
+    public String getResourceId();
+
+    /**
+     * This gets the metadata available before the parsing of the file.
+     * This will typically be "external" metadata: file name,
+     * file size, file location, data stream, etc.  That is, things
+     * that are known about the file from outside information, not
+     * file-internal metadata.
+     *
+     * @return Metadata
+     */
+    public Metadata getMetadata();
+
+    /**
+     * @return an InputStream for the FileResource
+     * @throws java.io.IOException
+     */
+    public InputStream openInputStream() throws IOException;
+
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/PathResource.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/PathResource.java
new file mode 100644
index 000000000..20f67798a
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/PathResource.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.batch;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.metadata.Metadata;
+
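+/**
+ * A FileResource backed by a java.nio.file.Path. The resource id is recorded
+ * as the file's relative path so that profilers can map extracts back to
+ * their source files.
+ */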
+public class PathResource implements FileResource {
+
+    private final Path path;
+    private final String resourceId;
+    private final Metadata metadata = new Metadata();
+
+    public PathResource(Path path, String resourceId) {
+        this.path = path;
+        this.resourceId = resourceId;
+        metadata.set(FSProperties.FS_REL_PATH, resourceId);
+    }
+
+    @Override
+    public String getResourceId() {
+        return resourceId;
+    }
+
+    @Override
+    public Metadata getMetadata() {
+        return metadata;
+    }
+
+    @Override
+    public InputStream openInputStream() throws IOException {
+        return Files.newInputStream(path);
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/EvalConfigTest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/EvalConfigTest.java
new file mode 100644
index 000000000..395c90fe6
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/EvalConfigTest.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.app;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.net.URISyntaxException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import org.junit.jupiter.api.Test;
+
+public class EvalConfigTest {
+
+    @Test
+    public void testBasic() throws Exception {
+        EvalConfig evalConfig = EvalConfig.load(getConfig("eval-config-basic.json"));
+        assertEquals(20000, evalConfig.getMaxExtractLength());
+        assertNull(evalConfig.getErrorLogFile());
+        assertNull(evalConfig.getJdbcString());
+    }
+
+    private Path getConfig(String fileName) throws URISyntaxException {
+        return Paths.get(EvalConfigTest.class.getResource("/eval-configs/" + 
fileName).toURI());
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/ProfilerBatchTest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/ProfilerBatchTest.java
index be58e0ed2..3d6e93ad3 100644
--- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/ProfilerBatchTest.java
+++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/ProfilerBatchTest.java
@@ -21,7 +21,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.io.IOException;
-import java.nio.file.DirectoryStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
@@ -30,79 +29,75 @@ import java.sql.ResultSet;
 import java.sql.SQLException;
 import java.sql.Statement;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 
+import org.apache.commons.io.FileUtils;
 import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.eval.app.db.Cols;
 import org.apache.tika.eval.app.db.H2Util;
 import org.apache.tika.eval.app.db.TableInfo;
-import org.apache.tika.eval.app.io.ExtractReaderException;
 
-@Disabled
 public class ProfilerBatchTest {
 
-    public final static String COMPARER_PROCESS_CLASS = "org.apache.tika.batch.fs.FSBatchProcessCLI";
-    private final static String profileTable = ExtractProfiler.PROFILE_TABLE.getName();
-    private final static String exTable = ExtractProfiler.EXCEPTION_TABLE.getName();
-    private final static String fpCol = Cols.FILE_PATH.name();
-    private static Path dbDir;
-    private static Connection conn;
+    private static Connection CONN;
+    private static Path DB_DIR;
+    private static Path DB;
 
     @BeforeAll
     public static void setUp() throws Exception {
+        DB_DIR = Files.createTempDirectory("profiler-test");
+        Path extractsRoot = Paths.get(ComparerBatchTest.class
+                .getResource("/test-dirs/extractsA")
+                .toURI());
 
         Path inputRoot = Paths.get(ComparerBatchTest.class
-                .getResource("/test-dirs/extractsA")
+                .getResource("/test-dirs/raw_input")
                 .toURI());
-        dbDir = Files.createTempDirectory(inputRoot, "tika-test-db-dir-");
-        Map<String, String> args = new HashMap<>();
-        Path db = dbDir.resolve("profiler_test");
-        args.put("-db", db.toString());
-
-        //for debugging, you can use this to select only one file pair to load
-        //args.put("-includeFilePat", "file8.*");
-
-       /* BatchProcessTestExecutor ex = new BatchProcessTestExecutor(COMPARER_PROCESS_CLASS, args,
-                "/single-file-profiler-crawl-input-config.xml");
-        StreamStrings streamStrings = ex.execute();
-        System.out.println(streamStrings.getErrString());
-        System.out.println(streamStrings.getOutString());*/
-        H2Util dbUtil = new H2Util(db);
-        conn = dbUtil.getConnection();
-    }
 
-    @AfterAll
-    public static void tearDown() throws IOException {
+        DB = DB_DIR.resolve("mydb");
+        String[] args = new String[]{
+                "-i", inputRoot.toAbsolutePath().toString(),
+                "-e", extractsRoot.toAbsolutePath().toString(),
+                "-d", "jdbc:h2:file:" + DB.toAbsolutePath().toString()
+        };
+
+        ExtractProfileRunner.main(args);
+    }
 
-        try {
-            conn.close();
-        } catch (SQLException e) {
-            throw new RuntimeException(e);
-        }
-        //TODO: if/when we turn this back on, use @TempDir instead of this
-
-        DirectoryStream<Path> dStream = Files.newDirectoryStream(dbDir);
-        for (Path p : dStream) {
-            Files.delete(p);
-        }
-        dStream.close();
-        Files.delete(dbDir);
+    @AfterAll
+    public static void tearDown() throws IOException {
+        //the db is populated once in setUp(), so delete it only after all
+        //tests have run; per-test connections are closed in tearDownEach()
+        FileUtils.deleteDirectory(DB_DIR.toFile());
+    }
+
+    @BeforeEach
+    public void setUpEach() throws SQLException {
+        H2Util dbUtil = new H2Util(DB);
+        CONN = dbUtil.getConnection();
+    }
+
+    @AfterEach
+    public void tearDownEach() throws SQLException {
+        CONN.close();
     }
 
     @Test
     public void testSimpleDBWriteAndRead() throws Exception {
-
         Statement st = null;
         List<String> fNameList = new ArrayList<>();
         try {
             String sql = "select * from " + 
ExtractProfiler.CONTAINER_TABLE.getName();
-            st = conn.createStatement();
+            st = CONN.createStatement();
             ResultSet rs = st.executeQuery(sql);
             while (rs.next()) {
                 String fileName = rs.getString(Cols.FILE_PATH.name());
@@ -113,17 +108,19 @@ public class ProfilerBatchTest {
                 st.close();
             }
         }
+        /*
         debugTable(ExtractProfiler.CONTAINER_TABLE);
         debugTable(ExtractProfiler.PROFILE_TABLE);
         debugTable(ExtractProfiler.CONTENTS_TABLE);
         debugTable(ExtractProfiler.EXCEPTION_TABLE);
-        debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
-        assertEquals(10, fNameList.size());
+        debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);*/
+        assertEquals(17, fNameList.size());
         assertTrue(fNameList.contains("file1.pdf"), "file1.pdf");
         assertTrue(fNameList.contains("file2_attachANotB.doc"), 
"file2_attachANotB.doc");
         assertTrue(fNameList.contains("file3_attachBNotA.doc"), 
"file3_attachBNotA.doc");
         assertTrue(fNameList.contains("file4_emptyB.pdf"), "file4_emptyB.pdf");
         assertTrue(fNameList.contains("file7_badJson.pdf"), 
"file4_emptyB.pdf");
+        assertTrue(fNameList.contains("file9_noextract.txt"), 
"file9_noextract.txt");
     }
 
     @Test
@@ -131,43 +128,29 @@ public class ProfilerBatchTest {
         String sql =
                 "select EXTRACT_EXCEPTION_ID from extract_exceptions e" + " 
join containers c on c.container_id = e.container_id " + " where 
c.file_path='file9_noextract.txt'";
 
-        assertEquals("missing extract: file9_noextract.txt", "0", 
getSingleResult(sql));
-        debugTable(ExtractProfiler.CONTAINER_TABLE);
+        /*debugTable(ExtractProfiler.CONTAINER_TABLE);
         debugTable(ExtractProfiler.PROFILE_TABLE);
         debugTable(ExtractProfiler.CONTENTS_TABLE);
         debugTable(ExtractProfiler.EXCEPTION_TABLE);
-        debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
-
-        sql = "select EXTRACT_EXCEPTION_ID from errors e" + " join containers 
c on c.container_id = e.container_id " + " where 
c.file_path='file5_emptyA.pdf'";
-        assertEquals("empty extract: file5_emptyA.pdf", "1", 
getSingleResult(sql));
-
-        sql = "select EXTRACT_EXCEPTION_ID from errors e" + " join containers 
c on c.container_id = e.container_id " + " where 
c.file_path='file7_badJson.pdf'";
-        assertEquals("extract error:file7_badJson.pdf", "2", 
getSingleResult(sql));
-
-    }
-
-    @Test
-    public void testParseErrors() throws Exception {
-        debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
-        String sql = "select file_path from errors where container_id is null";
-        assertEquals("file10_permahang.txt", getSingleResult(sql));
-
-        sql = "select extract_error_id from extract_exceptions " + "where 
file_path='file11_oom.txt'";
-        
assertEquals(Integer.toString(ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE.ordinal()),
 getSingleResult(sql));
+        debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);*/
+        assertEquals("0", getSingleResult(sql), "missing extract: 
file9_noextract.txt");
 
-        sql = "select parse_error_id from extract_exceptions where 
file_path='file11_oom.txt'";
-        
assertEquals(Integer.toString(AbstractProfiler.PARSE_ERROR_TYPE.OOM.ordinal()), 
getSingleResult(sql));
+        sql = "select EXTRACT_EXCEPTION_ID from extract_exceptions e" + " join 
containers c on c.container_id = e.container_id " + " where 
c.file_path='file5_emptyA.pdf'";
+        assertEquals("1", getSingleResult(sql), "empty extract: 
file5_emptyA.pdf");
 
+        sql = "select EXTRACT_EXCEPTION_ID from extract_exceptions e" + " join 
containers c on c.container_id = e.container_id " + " where 
c.file_path='file7_badJson.pdf'";
+        assertEquals("2", getSingleResult(sql), "extract 
error:file7_badJson.pdf");
     }
 
     @Test
+    @Disabled("create actual unit test")
     public void testParseExceptions() throws Exception {
         debugTable(ExtractProfiler.EXCEPTION_TABLE);
     }
 
     private String getSingleResult(String sql) throws Exception {
         Statement st = null;
-        st = conn.createStatement();
+        st = CONN.createStatement();
         ResultSet rs = st.executeQuery(sql);
         int hits = 0;
         String val = "";
@@ -188,7 +171,7 @@ public class ProfilerBatchTest {
         Statement st = null;
         try {
             String sql = "select * from " + table.getName();
-            st = conn.createStatement();
+            st = CONN.createStatement();
             ResultSet rs = st.executeQuery(sql);
             int colCount = rs
                     .getMetaData()
diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
index fff15e3dc..4d7d4bb2b 100644
--- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
+++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
@@ -98,20 +98,13 @@ public class TikaEvalCLITest extends TikaTest {
     private static void profile() throws IOException {
         List<String> args = new ArrayList<>();
         args.add("Profile");
-        args.add("-extracts");
+        args.add("-e");
         args.add(ProcessUtils.escapeCommandLine(extractsDir
                 .resolve("extractsA")
                 .toAbsolutePath()
                 .toString()));
-        //add these just to confirm this info doesn't cause problems w cli
-        args.add("-maxTokens");
-        args.add("10000000");
-        args.add("-maxContentLength");
-        args.add("100000000");
-        args.add("-maxContentLengthForLangId");
-        args.add("100000");
 
-        args.add("-db");
+        args.add("-d");
         args.add(ProcessUtils.escapeCommandLine(profileDBDir
                 .toAbsolutePath()
                 .toString() + "/" + dbName));
diff --git a/tika-eval/tika-eval-app/src/test/resources/eval-configs/eval-config-basic.json b/tika-eval/tika-eval-app/src/test/resources/eval-configs/eval-config-basic.json
new file mode 100644
index 000000000..b4af28df3
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/resources/eval-configs/eval-config-basic.json
@@ -0,0 +1,3 @@
+{
+  "maxExtractLength" : 20000
+}
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file10_permahang.txt b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file10_permahang.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file12_es.txt b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file12_es.txt
new file mode 100644
index 000000000..5ffd824a9
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file12_es.txt
@@ -0,0 +1,6 @@
+[
+  {
+    "Content-Type": "text/plain",
+    "X-TIKA:content": "El zorro marrón rápido saltó sobre el perro. El zorro 
marrón rápido saltó sobre el perro. El zorro marrón rápido saltó sobre el perro"
+  }
+]
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file13_attachANotB.doc b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file13_attachANotB.doc
new file mode 100644
index 000000000..15bc592a5
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file13_attachANotB.doc
@@ -0,0 +1,12 @@
+[
+  {
+    "Content-Type": "text/plain",
+    "_comment": "simplified",
+    "X-TIKA:content": 
"调整每一个心脏和每个声音,投标每个护理提取;让大家一起欢乐,赞美老拿骚.调整每一个心脏和每个声音,投标每个护理提取;让大家一起欢乐,赞美老拿骚 狐狸狐狸狐狸 
"
+  },
+  {
+    "Content-Type": "text/plain",
+    "X-TIKA:embedded_resource_path": "inner.txt",
+    "X-TIKA:content": "attachment contents"
+  }
+]
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file14_diffAttachOrder b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file14_diffAttachOrder
new file mode 100644
index 000000000..25f0db9a1
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file14_diffAttachOrder
@@ -0,0 +1,21 @@
+[
+  {
+    "Content-Type": "text/plain",
+    "X-TIKA:content": "the quick brown fox fox fox jumped over the lazy lazy 
dog",
+    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8351"
+  },
+  {
+    "Content-Type": "text/plain",
+    "X-TIKA:embedded_resource_path": "/0",
+    "X-TIKA:content": "a b c d e f g h i j k l m n",
+    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8354",
+    "X-TIKA:embedded_depth": "1"
+  },
+  {
+    "Content-Type": "text/plain",
+    "X-TIKA:embedded_resource_path": "/1",
+    "X-TIKA:content": "o p q r s t u v w x y z",
+    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8353",
+    "X-TIKA:embedded_depth": "1"
+  }
+]
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file15_tags b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file15_tags
new file mode 100644
index 000000000..5af73db80
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file15_tags
@@ -0,0 +1,41 @@
+[
+  {
+    "Content-Length": "34824",
+    "Content-Type": "application/pdf",
+    "Last-Modified": "2007-09-15T09:02:31Z",
+    "X-Parsed-By": [
+      "org.apache.tika.parser.DefaultParser",
+      "org.apache.tika.parser.pdf.PDFParser"
+    ],
+    "X-TIKA:content_handler": "ToXMLContentHandler",
+    "X-TIKA:content": "\u003chtml 
xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta 
name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta 
name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" 
/\u003e\n\u003cmeta name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\" 
/\u003e\n\u003cmeta name\u003d\"access_permission:modify_annotations\" 
content\u003d\"true\" /\u003e\n\u003cmeta 
name\u003d\"access_permission:can_print_degra [...]
+    "X-TIKA:parse_time_millis": "500",
+    "access_permission:assemble_document": "true",
+    "access_permission:can_modify": "true",
+    "access_permission:can_print": "true",
+    "access_permission:can_print_degraded": "true",
+    "access_permission:extract_content": "true",
+    "access_permission:extract_for_accessibility": "true",
+    "access_permission:fill_in_form": "true",
+    "access_permission:modify_annotations": "true",
+    "dc:creator": "Bertrand DelacrΘtaz",
+    "dc:format": "application/pdf; version\u003d1.3",
+    "dc:title": "Apache Tika - Apache Tika",
+    "dcterms:created": "2007-09-15T09:02:31Z",
+    "dcterms:modified": "2007-09-15T09:02:31Z",
+    "meta:author": "Bertrand DelacrΘtaz",
+    "meta:creation-date": "2007-09-15T09:02:31Z",
+    "meta:save-date": "2007-09-15T09:02:31Z",
+    "pdf:PDFVersion": "1.3",
+    "pdf:docinfo:created": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:creator": "Bertrand DelacrΘtaz",
+    "pdf:docinfo:creator_tool": "Firefox",
+    "pdf:docinfo:modified": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext",
+    "pdf:docinfo:title": "Apache Tika - Apache Tika",
+    "pdf:encrypted": "false",
+    "resourceName": "testPDF.pdf",
+    "xmp:CreatorTool": "Firefox",
+    "xmpTPg:NPages": "1"
+  }
+]
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file16_badTags b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file16_badTags
new file mode 100644
index 000000000..5c6272e43
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file16_badTags
@@ -0,0 +1,41 @@
+[
+  {
+    "Content-Length": "34824",
+    "Content-Type": "application/pdf",
+    "Last-Modified": "2007-09-15T09:02:31Z",
+    "X-Parsed-By": [
+      "org.apache.tika.parser.DefaultParser",
+      "org.apache.tika.parser.pdf.PDFParser"
+    ],
+    "X-TIKA:content_handler": "ToXMLContentHandler",
+    "X-TIKA:content": "\u003chtml 
xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta 
name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta 
name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" meta 
name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\" 
/\u003e\n\u003c\u003c\u003c\u003c\u003cmeta 
name\u003d\"access_permission:modify_annotations\" content\u003d\"true\" 
/\u003e\n\u003cmeta name\u003d\"access_permission:can_pr [...]
+    "X-TIKA:parse_time_millis": "500",
+    "access_permission:assemble_document": "true",
+    "access_permission:can_modify": "true",
+    "access_permission:can_print": "true",
+    "access_permission:can_print_degraded": "true",
+    "access_permission:extract_content": "true",
+    "access_permission:extract_for_accessibility": "true",
+    "access_permission:fill_in_form": "true",
+    "access_permission:modify_annotations": "true",
+    "dc:creator": "Bertrand DelacrΘtaz",
+    "dc:format": "application/pdf; version\u003d1.3",
+    "dc:title": "Apache Tika - Apache Tika",
+    "dcterms:created": "2007-09-15T09:02:31Z",
+    "dcterms:modified": "2007-09-15T09:02:31Z",
+    "meta:author": "Bertrand DelacrΘtaz",
+    "meta:creation-date": "2007-09-15T09:02:31Z",
+    "meta:save-date": "2007-09-15T09:02:31Z",
+    "pdf:PDFVersion": "1.3",
+    "pdf:docinfo:created": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:creator": "Bertrand DelacrΘtaz",
+    "pdf:docinfo:creator_tool": "Firefox",
+    "pdf:docinfo:modified": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext",
+    "pdf:docinfo:title": "Apache Tika - Apache Tika",
+    "pdf:encrypted": "false",
+    "resourceName": "testPDF.pdf",
+    "xmp:CreatorTool": "Firefox",
+    "xmpTPg:NPages": "1"
+  }
+]
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file17_tagsOutOfOrder b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file17_tagsOutOfOrder
new file mode 100644
index 000000000..97afec8ad
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file17_tagsOutOfOrder
@@ -0,0 +1,41 @@
+[
+  {
+    "Content-Length": "34824",
+    "Content-Type": "application/pdf",
+    "Last-Modified": "2007-09-15T09:02:31Z",
+    "X-Parsed-By": [
+      "org.apache.tika.parser.DefaultParser",
+      "org.apache.tika.parser.pdf.PDFParser"
+    ],
+    "X-TIKA:content_handler": "ToXMLContentHandler",
+    "X-TIKA:content": "\u003chtml 
xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta 
name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta 
name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" 
/\u003e\n\u003cmeta name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\" 
/\u003e\n\u003cmeta name\u003d\"access_permission:modify_annotations\" 
content\u003d\"true\" /\u003e\n\u003cmeta 
name\u003d\"access_permission:can_print_degra [...]
+    "X-TIKA:parse_time_millis": "500",
+    "access_permission:assemble_document": "true",
+    "access_permission:can_modify": "true",
+    "access_permission:can_print": "true",
+    "access_permission:can_print_degraded": "true",
+    "access_permission:extract_content": "true",
+    "access_permission:extract_for_accessibility": "true",
+    "access_permission:fill_in_form": "true",
+    "access_permission:modify_annotations": "true",
+    "dc:creator": "Bertrand DelacrΘtaz",
+    "dc:format": "application/pdf; version\u003d1.3",
+    "dc:title": "Apache Tika - Apache Tika",
+    "dcterms:created": "2007-09-15T09:02:31Z",
+    "dcterms:modified": "2007-09-15T09:02:31Z",
+    "meta:author": "Bertrand DelacrΘtaz",
+    "meta:creation-date": "2007-09-15T09:02:31Z",
+    "meta:save-date": "2007-09-15T09:02:31Z",
+    "pdf:PDFVersion": "1.3",
+    "pdf:docinfo:created": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:creator": "Bertrand DelacrΘtaz",
+    "pdf:docinfo:creator_tool": "Firefox",
+    "pdf:docinfo:modified": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext",
+    "pdf:docinfo:title": "Apache Tika - Apache Tika",
+    "pdf:encrypted": "false",
+    "resourceName": "testPDF.pdf",
+    "xmp:CreatorTool": "Firefox",
+    "xmpTPg:NPages": "1"
+  }
+]
\ No newline at end of file
