This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4342
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 303623e67e451392eab1249d47463f2861507127
Author: tallison <talli...@apache.org>
AuthorDate: Tue Jul 8 15:27:27 2025 -0400

    TIKA-4342 -- remove tika-batch from ExtractProfiler
---
 .../java/org/apache/tika/eval/app/EvalConfig.java  |  89 ++++-
 .../tika/eval/app/ExctractProfileRunner.java       | 204 -----------
 .../apache/tika/eval/app/ExtractProfileRunner.java | 374 +++++++++++++++++++++
 .../org/apache/tika/eval/app/ExtractProfiler.java  |   1 -
 .../org/apache/tika/eval/app/FileProfiler.java     |  19 ++
 .../org/apache/tika/eval/app/ProfilerBase.java     |   3 -
 .../org/apache/tika/eval/app/StatusReporter.java   | 102 ++++++
 .../java/org/apache/tika/eval/app/TikaEvalCLI.java |   4 +-
 .../eval/app/batch/ExtractProfilerBuilder.java     | 120 -------
 .../apache/tika/eval/app/batch/PathResource.java   |  21 +-
 .../org/apache/tika/eval/app/EvalConfigTest.java   |  42 +++
 .../apache/tika/eval/app/ProfilerBatchTest.java    | 117 +++----
 .../org/apache/tika/eval/app/TikaEvalCLITest.java  |  11 +-
 .../resources/eval-configs/eval-config-basic.json  |   3 +
 .../test-dirs/raw_input/file10_permahang.txt       |   0
 .../resources/test-dirs/raw_input/file12_es.txt    |   6 +
 .../test-dirs/raw_input/file13_attachANotB.doc     |  12 +
 .../test-dirs/raw_input/file14_diffAttachOrder     |  21 ++
 .../test/resources/test-dirs/raw_input/file15_tags |  41 +++
 .../resources/test-dirs/raw_input/file16_badTags   |  41 +++
 .../test-dirs/raw_input/file17_tagsOutOfOrder      |  41 +++
 tika-parent/pom.xml                                |   2 +-
 22 files changed, 859 insertions(+), 415 deletions(-)

diff --git 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
index 416d7bb6e..5525180ed 100644
--- 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
+++ 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
@@ -1,13 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.tika.eval.app;
 
+import java.nio.file.Path;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
 public class EvalConfig {
 
-    long minExtractLength = 0;
-    long maxExtractLength = 10_000_000;
-    String jdbcString = null;
-    int maxFilesToAdd = -1;
-    int maxTokens = 200000;
-    int maxContentLength = 5_000_000;
-    int numThreads = 4;
+    private long minExtractLength = 0;
+    private long maxExtractLength = 2_000_000;
+    private String jdbcString = null;
+    private String jdbcDriverClass = null;
+    private boolean forceDrop = true;
+    private int maxFilesToAdd = -1;
+    private int maxTokens = 200000;
+
+    private int maxContentLength = 5_000_000;
+    private int numWorkers = 4;
+    private Path errorLogFile = null;
+
+
+    public static EvalConfig load(Path path) throws Exception {
+        return new ObjectMapper().readValue(path.toFile(), EvalConfig.class);
+    }
+
+    public long getMinExtractLength() {
+        return minExtractLength;
+    }
+
+    public long getMaxExtractLength() {
+        return maxExtractLength;
+    }
+
+    public String getJdbcString() {
+        return jdbcString;
+    }
+
+    public String getJdbcDriverClass() {
+        return jdbcDriverClass;
+    }
+
+    public boolean isForceDrop() {
+        return forceDrop;
+    }
+
+    public int getMaxFilesToAdd() {
+        return maxFilesToAdd;
+    }
+
+    public int getMaxTokens() {
+        return maxTokens;
+    }
+
+    public int getMaxContentLength() {
+        return maxContentLength;
+    }
+
+    public int getNumWorkers() {
+        return numWorkers;
+    }
+
+    public Path getErrorLogFile() {
+        return errorLogFile;
+    }
 
+    @Override
+    public String toString() {
+        return "EvalConfig{" + "minExtractLength=" + minExtractLength + ", 
maxExtractLength=" + maxExtractLength + ", jdbcString='" + jdbcString + '\'' + 
", jdbcDriverClass='" +
+                jdbcDriverClass + '\'' + ", forceDrop=" + forceDrop + ", 
maxFilesToAdd=" + maxFilesToAdd + ", maxTokens=" + maxTokens + ", 
maxContentLength=" + maxContentLength +
+                ", numThreads=" + numWorkers + ", errorLogFile=" + 
errorLogFile + '}';
+    }
 }
diff --git 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExctractProfileRunner.java
 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExctractProfileRunner.java
deleted file mode 100644
index 20cb33c1c..000000000
--- 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExctractProfileRunner.java
+++ /dev/null
@@ -1,204 +0,0 @@
-package org.apache.tika.eval.app;
-
-import java.io.IOException;
-import java.nio.file.FileVisitResult;
-import java.nio.file.FileVisitor;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.nio.file.attribute.BasicFileAttributes;
-import java.util.List;
-import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.ExecutorCompletionService;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.DefaultParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.eval.app.batch.FileResource;
-import org.apache.tika.eval.app.batch.PathResource;
-import org.apache.tika.eval.app.db.JDBCUtil;
-import org.apache.tika.eval.app.db.MimeBuffer;
-import org.apache.tika.eval.app.io.ExtractReader;
-import org.apache.tika.eval.app.io.IDBWriter;
-
-public class ExctractProfileRunner {
-
-    private static final Logger LOG = 
LoggerFactory.getLogger(ExctractProfileRunner.class);
-    private static final PathResource SEMAPHORE = new 
PathResource(Paths.get("/"), "STOP");
-
-    static Options OPTIONS;
-
-    static {
-
-        OPTIONS = new Options()
-                
.addOption(Option.builder("e").longOpt("extracts").hasArg().desc("required: 
directory of extracts").build())
-                
.addOption(Option.builder("i").longOpt("inputDir").hasArg().desc("optional: 
directory for original binary input documents."
-                        + " If not specified, -extracts is crawled as 
is.").build())
-                
.addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db 
path").build())
-                
.addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json 
config file").build())
-                ;
-    }
-    public static void main(String[] args) throws Exception {
-        DefaultParser defaultCLIParser = new DefaultParser();
-        CommandLine commandLine = defaultCLIParser.parse(OPTIONS, args);
-        EvalConfig evalConfig = commandLine.hasOption('c') ? 
EvalConfig.load(Paths.get(commandLine.getOptionValue('c'))) : new EvalConfig();
-        Path extractsDir = commandLine.hasOption('e') ? 
Paths.get(commandLine.getOptionValue('e')) : Paths.get(USAGE_FAIL("Must specify 
extracts dir: -i"));
-        Path inputDir = commandLine.hasOption('i') ? 
Paths.get(commandLine.getOptionValue('i')) : extractsDir;
-        String dbPath = commandLine.hasOption('d') ? 
commandLine.getOptionValue('d') : USAGE_FAIL("Must specify the db name: -d");
-        execute(inputDir, extractsDir, dbPath, evalConfig);
-    }
-
-    private static void execute(Path inputDir, Path extractsDir, String 
dbPath, EvalConfig evalConfig) {
-
-        ArrayBlockingQueue<FileResource> queue = new 
ArrayBlockingQueue<>(1000);
-        DirectoryWalker fileWalker = new DirectoryWalker(inputDir, queue);
-        ExecutorService executorService = 
Executors.newFixedThreadPool(evalConfig.numThreads + 1);
-        ExecutorCompletionService<Integer> executorCompletionService = new 
ExecutorCompletionService<>(executorService);
-        executorCompletionService.submit(fileWalker);
-        IDBWriter dbWriter = buildDBWriter();
-        for (int i = 0; i < evalConfig.numThreads; i++) {
-            ExtractReader extractReader = new 
ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS, 
evalConfig.minExtractLength, evalConfig.maxExtractLength);
-
-            ExtractProfiler extractProfiler = new ExtractProfiler(inputDir, 
extractsDir, extractReader, dbWriter);
-            executorCompletionService.submit(new ProfileWorker(queue, 
extractProfiler));
-        }
-
-        int finished = 0;
-        try {
-            while (finished < evalConfig.numThreads + 1) {
-                //blocking
-                Future<Integer> future = executorCompletionService.take();
-                Integer result = future.get();
-                if (result != null) {
-                    finished++;
-                }
-
-            }
-        } catch (InterruptedException e) {
-            LOG.info("interrupted", e);
-        } catch (ExecutionException e) {
-            throw new RuntimeException(e);
-        } finally {
-            executorService.shutdownNow();
-        }
-
-    }
-
-    private static IDBWriter buildDBWriter(String connectionString, String 
driverClass) {
-        MimeBuffer mimeBuffer = null;
-        JDBCUtil dbUtil = new JDBCUtil(connectionString, driverClass);
-        //Step 1. Used to be update table infos with prefixes
-        updateTableInfosWithPrefixes(localAttrs);
-
-        JDBCUtil.CREATE_TABLE createRegularTable = (forceDrop) ? 
JDBCUtil.CREATE_TABLE.DROP_IF_EXISTS : JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS;
-
-        JDBCUtil.CREATE_TABLE createRefTable = (forceDrop) ? 
JDBCUtil.CREATE_TABLE.DROP_IF_EXISTS : JDBCUtil.CREATE_TABLE.SKIP_IF_EXISTS;
-
-        //step 2. create the tables
-        dbUtil.createTables(getNonRefTableInfos(), 
JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS);
-        dbUtil.createTables(getRefTableInfos(), 
JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS);
-
-        //step 3. create mime buffer
-        this.mimeBuffer = new MimeBuffer(dbUtil.getConnection(), 
getMimeTable(), TikaConfig.getDefaultConfig());
-
-        //step 4. populate the reference tables
-        populateRefTables();
-
-        return mimeBuffer;
-
-
-    }
-
-    private static void USAGE() {
-        HelpFormatter helpFormatter = new HelpFormatter();
-        helpFormatter.printHelp(80, "java -jar tika-eval-app-x.y.z.jar 
FileProfiler -e docs -d mydb [-i inputDir, -c config.json]",
-                "Tool: Profile", OPTIONS, "");
-    }
-
-    private static String USAGE_FAIL(String msg) {
-        USAGE();
-        throw new IllegalArgumentException(msg);
-    }
-
-    private static class ProfileWorker implements Callable<Integer> {
-
-        private final ArrayBlockingQueue<FileResource> queue;
-        private final ExtractProfiler extractProfiler;
-        ProfileWorker(ArrayBlockingQueue<FileResource> queue, ExtractProfiler 
extractProfiler) {
-            this.queue = queue;
-            this.extractProfiler = extractProfiler;
-        }
-
-        @Override
-        public Integer call() throws Exception {
-            while (true) {
-                FileResource resource = queue.poll(1, TimeUnit.SECONDS);
-                if (resource == null) {
-                    LOG.info("ExtractProfileWorker waiting on queue");
-                    continue;
-                }
-                if (resource == SEMAPHORE) {
-                    LOG.debug("worker hit semaphore and is stopping");
-                    //hangs
-                    queue.put(resource);
-                    return 1;
-                }
-                extractProfiler.processFileResource(resource);
-            }
-        }
-    }
-
-    private static class DirectoryWalker implements Callable<Integer> {
-        private final Path startDir;
-        private final ArrayBlockingQueue<FileResource> queue;
-
-        public DirectoryWalker(Path startDir, ArrayBlockingQueue<FileResource> 
queue) {
-            this.startDir = startDir;
-            this.queue = queue;
-        }
-
-        @Override
-        public Integer call() throws Exception {
-            Files.walkFileTree(startDir, new FileVisitor<Path>() {
-                @Override
-                public FileVisitResult preVisitDirectory(Path dir, 
BasicFileAttributes attrs) throws IOException {
-                    return FileVisitResult.CONTINUE;
-                }
-
-                @Override
-                public FileVisitResult visitFile(Path file, 
BasicFileAttributes attrs) throws IOException {
-                    //blocking
-                    try {
-                        queue.put(new PathResource(file, 
startDir.relativize(file).toString()));
-                    } catch (InterruptedException e) {
-                        return FileVisitResult.TERMINATE;
-                    }
-                    return FileVisitResult.CONTINUE;
-                }
-
-                @Override
-                public FileVisitResult visitFileFailed(Path file, IOException 
exc) throws IOException {
-                    return FileVisitResult.CONTINUE;
-                }
-
-                @Override
-                public FileVisitResult postVisitDirectory(Path dir, 
IOException exc) throws IOException {
-                    return FileVisitResult.CONTINUE;
-                }
-            });
-            return 0;
-        }
-    }
-}
diff --git 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java
 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java
new file mode 100644
index 000000000..cd80a3df3
--- /dev/null
+++ 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java
@@ -0,0 +1,374 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+import java.io.IOException;
+import java.nio.file.FileVisitResult;
+import java.nio.file.FileVisitor;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.eval.app.batch.DBConsumersManager;
+import org.apache.tika.eval.app.batch.FileResource;
+import org.apache.tika.eval.app.batch.PathResource;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.JDBCUtil;
+import org.apache.tika.eval.app.db.MimeBuffer;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.DBWriter;
+import org.apache.tika.eval.app.io.ExtractReader;
+import org.apache.tika.eval.app.io.ExtractReaderException;
+import org.apache.tika.eval.app.io.IDBWriter;
+
+public class ExtractProfileRunner {
+
+    private static final Logger LOG = 
LoggerFactory.getLogger(ExtractProfileRunner.class);
+    private static final PathResource SEMAPHORE = new 
PathResource(Paths.get("/"), "STOP");
+    private static final int DIR_WALKER_COMPLETED_VALUE = 2;
+    private static final int PROFILE_WORKER_COMPLETED_VALUE = 1;
+
+    static Options OPTIONS;
+
+    static {
+
+        OPTIONS = new Options()
+                
.addOption(Option.builder("e").longOpt("extracts").hasArg().desc("required: 
directory of extracts").build())
+                
.addOption(Option.builder("i").longOpt("inputDir").hasArg().desc("optional: 
directory for original binary input documents."
+                        + " If not specified, -extracts is crawled as 
is.").build())
+                
.addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db 
path").build())
+                
.addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json 
config file").build())
+                ;
+    }
+    public static void main(String[] args) throws Exception {
+        DefaultParser defaultCLIParser = new DefaultParser();
+        CommandLine commandLine = defaultCLIParser.parse(OPTIONS, args);
+        EvalConfig evalConfig = commandLine.hasOption('c') ? 
EvalConfig.load(Paths.get(commandLine.getOptionValue('c'))) : new EvalConfig();
+        Path extractsDir = commandLine.hasOption('e') ? 
Paths.get(commandLine.getOptionValue('e')) : Paths.get(USAGE_FAIL("Must specify 
extracts dir: -i"));
+        Path inputDir = commandLine.hasOption('i') ? 
Paths.get(commandLine.getOptionValue('i')) : extractsDir;
+        String dbPath = commandLine.hasOption('d') ? 
commandLine.getOptionValue('d') : USAGE_FAIL("Must specify the db name: -d");
+        String jdbcString = getJdbcConnectionString(dbPath);
+        execute(inputDir, extractsDir, jdbcString, evalConfig);
+    }
+
+    private static String getJdbcConnectionString(String dbPath) {
+        if (dbPath.startsWith("jdbc:")) {
+            return dbPath;
+        }
+        //default to h2
+        Path p = Paths.get(dbPath);
+        return "jdbc:h2:file:" + p.toAbsolutePath();
+
+    }
+
+    private static void execute(Path inputDir, Path extractsDir, String 
dbPath, EvalConfig evalConfig) throws SQLException, IOException {
+
+        //parameterize this? if necessary
+        try {
+            ProfilerBase.loadCommonTokens(null, null);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+
+        JDBCUtil jdbcUtil = new JDBCUtil(dbPath, 
evalConfig.getJdbcDriverClass());
+        ExtractProfilerBuilder builder = new ExtractProfilerBuilder();
+        MimeBuffer mimeBuffer = initTables(jdbcUtil, builder, dbPath, 
evalConfig);
+        builder.populateRefTables(jdbcUtil, mimeBuffer);
+
+        AtomicInteger enqueued = new AtomicInteger(0);
+        AtomicInteger processed = new AtomicInteger(0);
+        AtomicInteger activeWorkers = new 
AtomicInteger(evalConfig.getNumWorkers());
+        AtomicBoolean crawlerActive = new AtomicBoolean(true);
+
+
+
+        ArrayBlockingQueue<FileResource> queue = new 
ArrayBlockingQueue<>(1000);
+        ExecutorService executorService = 
Executors.newFixedThreadPool(evalConfig.getNumWorkers() + 2);
+        ExecutorCompletionService<Integer> executorCompletionService = new 
ExecutorCompletionService<>(executorService);
+
+        StatusReporter statusReporter = new StatusReporter(enqueued, 
processed, activeWorkers, crawlerActive);
+        executorCompletionService.submit(statusReporter);
+
+        DirectoryWalker directoryWalker = new DirectoryWalker(inputDir, queue, 
enqueued);
+        executorCompletionService.submit(directoryWalker);
+        for (int i = 0; i < evalConfig.getNumWorkers(); i++) {
+            ExtractReader extractReader = new 
ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS, 
evalConfig.getMinExtractLength(), evalConfig.getMaxExtractLength());
+            ExtractProfiler extractProfiler = new ExtractProfiler(inputDir, 
extractsDir, extractReader, builder.getDBWriter(builder.tableInfos, jdbcUtil, 
mimeBuffer));
+            executorCompletionService.submit(new ProfileWorker(queue, 
extractProfiler, processed));
+        }
+
+        int finished = 0;
+        try {
+            while (finished < evalConfig.getNumWorkers() + 2) {
+                //blocking
+                Future<Integer> future = executorCompletionService.take();
+                Integer result = future.get();
+                if (result != null) {
+                    //if the dir walker has finished
+                    if (result == DIR_WALKER_COMPLETED_VALUE) {
+                        queue.put(SEMAPHORE);
+                        crawlerActive.set(false);
+                    } else if (result == PROFILE_WORKER_COMPLETED_VALUE) {
+                        activeWorkers.decrementAndGet();
+                    }
+                    finished++;
+                }
+            }
+        } catch (InterruptedException e) {
+            LOG.info("interrupted", e);
+        } catch (ExecutionException e) {
+            throw new RuntimeException(e);
+        } finally {
+            mimeBuffer.close();
+            executorService.shutdownNow();
+        }
+
+    }
+
+    private static MimeBuffer initTables(JDBCUtil jdbcUtil, 
ExtractProfilerBuilder builder, String connectionString, EvalConfig evalConfig) 
throws SQLException, IOException {
+
+        //step 1. create the tables
+        jdbcUtil.createTables(builder.getNonRefTableInfos(), 
JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS);
+        jdbcUtil.createTables(builder.getRefTableInfos(), 
JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS);
+
+        //step 2. create mime buffer
+        return new MimeBuffer(jdbcUtil.getConnection(), 
builder.getMimeTable(), TikaConfig.getDefaultConfig());
+    }
+
+    private static void USAGE() {
+        HelpFormatter helpFormatter = new HelpFormatter();
+        helpFormatter.printHelp(80, "java -jar tika-eval-app-x.y.z.jar 
FileProfiler -e docs -d mydb [-i inputDir, -c config.json]",
+                "Tool: Profile", OPTIONS, "");
+    }
+
+    private static String USAGE_FAIL(String msg) {
+        USAGE();
+        throw new IllegalArgumentException(msg);
+    }
+
+    private static class ProfileWorker implements Callable<Integer> {
+
+        private final ArrayBlockingQueue<FileResource> queue;
+        private final ExtractProfiler extractProfiler;
+        private final AtomicInteger processed;
+
+        ProfileWorker(ArrayBlockingQueue<FileResource> queue, ExtractProfiler 
extractProfiler, AtomicInteger processed) {
+            this.queue = queue;
+            this.extractProfiler = extractProfiler;
+            this.processed = processed;
+        }
+
+        @Override
+        public Integer call() throws Exception {
+            while (true) {
+                FileResource resource = queue.poll(1, TimeUnit.SECONDS);
+                if (resource == null) {
+                    LOG.info("ExtractProfileWorker waiting on queue");
+                    continue;
+                }
+                if (resource == SEMAPHORE) {
+                    LOG.debug("worker hit semaphore and is stopping");
+                    extractProfiler.closeWriter();
+                    //hangs
+                    queue.put(resource);
+                    return PROFILE_WORKER_COMPLETED_VALUE;
+                }
+                extractProfiler.processFileResource(resource);
+                processed.incrementAndGet();
+            }
+        }
+    }
+
+    private static class DirectoryWalker implements Callable<Integer> {
+        private final Path startDir;
+        private final ArrayBlockingQueue<FileResource> queue;
+        private final AtomicInteger enqueued;
+
+        public DirectoryWalker(Path startDir, ArrayBlockingQueue<FileResource> 
queue, AtomicInteger enqueued) {
+            this.startDir = startDir;
+            this.queue = queue;
+            this.enqueued = enqueued;
+        }
+
+        @Override
+        public Integer call() throws Exception {
+            Files.walkFileTree(startDir, new FileVisitor<Path>() {
+                @Override
+                public FileVisitResult preVisitDirectory(Path dir, 
BasicFileAttributes attrs) throws IOException {
+                    return FileVisitResult.CONTINUE;
+                }
+
+                @Override
+                public FileVisitResult visitFile(Path file, 
BasicFileAttributes attrs) throws IOException {
+                    if (Files.isDirectory(file)) {
+                        return FileVisitResult.CONTINUE;
+                    }
+                    try {
+                        //blocking
+                        queue.put(new PathResource(file, 
startDir.relativize(file).toString()));
+                        enqueued.incrementAndGet();
+                    } catch (InterruptedException e) {
+                        return FileVisitResult.TERMINATE;
+                    }
+                    return FileVisitResult.CONTINUE;
+                }
+
+                @Override
+                public FileVisitResult visitFileFailed(Path file, IOException 
exc) throws IOException {
+                    return FileVisitResult.CONTINUE;
+                }
+
+                @Override
+                public FileVisitResult postVisitDirectory(Path dir, 
IOException exc) throws IOException {
+                    return FileVisitResult.CONTINUE;
+                }
+            });
+            return DIR_WALKER_COMPLETED_VALUE;
+        }
+    }
+
+    private static class ExtractProfilerBuilder {
+        private final List<TableInfo> tableInfos;
+        private final List<TableInfo> refTableInfos;
+
+        public ExtractProfilerBuilder() {
+            List<TableInfo> tableInfos = new ArrayList();
+            tableInfos.add(AbstractProfiler.MIME_TABLE);
+            tableInfos.add(ExtractProfiler.CONTAINER_TABLE);
+            tableInfos.add(ExtractProfiler.PROFILE_TABLE);
+            tableInfos.add(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
+            tableInfos.add(ExtractProfiler.EXCEPTION_TABLE);
+            tableInfos.add(ExtractProfiler.CONTENTS_TABLE);
+            tableInfos.add(ExtractProfiler.TAGS_TABLE);
+            tableInfos.add(ExtractProfiler.EMBEDDED_FILE_PATH_TABLE);
+            this.tableInfos = Collections.unmodifiableList(tableInfos);
+
+            List<TableInfo> refTableInfos = new ArrayList<>();
+            refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
+            refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
+            refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES);
+            this.refTableInfos = Collections.unmodifiableList(refTableInfos);
+        }
+
+
+        protected List<TableInfo> getRefTableInfos() {
+            return refTableInfos;
+        }
+
+        protected List<TableInfo> getNonRefTableInfos() {
+            return tableInfos;
+        }
+
+        protected TableInfo getMimeTable() {
+            return AbstractProfiler.MIME_TABLE;
+        }
+
+        public void populateRefTables(JDBCUtil dbUtil, MimeBuffer mimeBuffer) 
throws IOException, SQLException {
+            boolean refTablesPopulated = true;
+            try {
+                Connection connection = dbUtil.getConnection();
+                for (TableInfo tableInfo : getRefTableInfos()) {
+                    int rows = 0;
+                    try (ResultSet rs = connection
+                            .createStatement()
+                            .executeQuery("select * from " + 
tableInfo.getName())) {
+                        while (rs.next()) {
+                            rows++;
+                        }
+                    }
+                    if (rows == 0) {
+                        refTablesPopulated = false;
+                        break;
+                    }
+
+                }
+            } catch (SQLException e) {
+                //swallow
+            }
+            if (refTablesPopulated) {
+                LOG.info("ref tables are already populated");
+                return;
+            }
+
+            IDBWriter writer = getDBWriter(getRefTableInfos(), dbUtil, 
mimeBuffer);
+            Map<Cols, String> m = new HashMap<>();
+            for (AbstractProfiler.PARSE_ERROR_TYPE t : 
AbstractProfiler.PARSE_ERROR_TYPE.values()) {
+                m.clear();
+                m.put(Cols.PARSE_ERROR_ID, Integer.toString(t.ordinal()));
+                m.put(Cols.PARSE_ERROR_DESCRIPTION, t.name());
+                writer.writeRow(AbstractProfiler.REF_PARSE_ERROR_TYPES, m);
+            }
+
+            for (AbstractProfiler.EXCEPTION_TYPE t : 
AbstractProfiler.EXCEPTION_TYPE.values()) {
+                m.clear();
+                m.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(t.ordinal()));
+                m.put(Cols.PARSE_EXCEPTION_DESCRIPTION, t.name());
+                writer.writeRow(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES, m);
+            }
+
+            for (ExtractReaderException.TYPE t : 
ExtractReaderException.TYPE.values()) {
+                m.clear();
+                m.put(Cols.EXTRACT_EXCEPTION_ID, 
Integer.toString(t.ordinal()));
+                m.put(Cols.EXTRACT_EXCEPTION_DESCRIPTION, t.name());
+                writer.writeRow(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES, 
m);
+            }
+            writer.close();
+        }
+
+        protected IDBWriter getDBWriter(List<TableInfo> tableInfos, JDBCUtil 
dbUtil, MimeBuffer mimeBuffer) throws IOException, SQLException {
+            Connection conn = dbUtil.getConnection();
+            return new DBWriter(conn, tableInfos, dbUtil, mimeBuffer);
+        }
+
+
+        protected void addErrorLogTablePairs(DBConsumersManager manager, 
EvalConfig evalConfig) {
+            Path errorLog = evalConfig.getErrorLogFile();
+            if (errorLog == null) {
+                return;
+            }
+            manager.addErrorLogTablePair(errorLog, 
ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
+        }
+    }
+}
diff --git 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
index a2a4986f3..9b0f482f6 100644
--- 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
+++ 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
@@ -22,7 +22,6 @@ import java.sql.Types;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.concurrent.ArrayBlockingQueue;
 
 import org.apache.commons.cli.Options;
 
diff --git 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java
 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java
index edc431e4a..925452094 100644
--- 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java
+++ 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java
@@ -73,7 +73,26 @@ public class FileProfiler extends AbstractProfiler {
     public static TableInfo FILE_MIME_TABLE =
             new TableInfo("file_mimes", new ColInfo(Cols.MIME_ID, 
Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 
256),
                     new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12));
+    static Options OPTIONS;
 
+    static {
+
+        Option inputDir = new Option("inputDir", true, "optional: directory 
for original binary input documents." + " If not specified, -extracts is 
crawled as is.");
+
+        OPTIONS = new Options()
+                .addOption(inputDir)
+                .addOption("bc", true, "optional: tika-batch config file")
+                .addOption("numConsumers", true, "optional: number of consumer 
threads")
+                .addOption("db", true, "db file to which to write results")
+                .addOption("jdbc", true, "EXPERT: full jdbc connection string. 
Must specify this or -db <h2db>")
+                .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or 
specify via -Djdbc.driver")
+                .addOption("tablePrefix", true, "EXPERT: optional prefix for 
table names")
+                .addOption("drop", false, "drop tables if they exist")
+                .addOption("maxFilesToAdd", true, "maximum number of files to 
add to the crawler")
+
+        ;
+
+    }
 
     private final Path inputDir;
 
diff --git 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
index f5d5f1a15..19a7d680f 100644
--- 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
+++ 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
@@ -29,7 +29,6 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
-import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -42,8 +41,6 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.SAXException;
 
-import org.apache.tika.batch.FileResource;
-import org.apache.tika.batch.FileResourceConsumer;
 import org.apache.tika.batch.fs.FSProperties;
 import org.apache.tika.eval.app.db.ColInfo;
 import org.apache.tika.eval.app.db.Cols;
diff --git 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/StatusReporter.java
 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/StatusReporter.java
new file mode 100644
index 000000000..8df9824ed
--- /dev/null
+++ 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/StatusReporter.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+import java.text.NumberFormat;
+import java.util.Locale;
+import java.util.concurrent.Callable;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.util.DurationFormatUtils;
+
+public class StatusReporter implements Callable<Integer> {
+
+    public static final int COMPLETED_VAL = 3;
+    private static final Logger LOGGER = 
LoggerFactory.getLogger(StatusReporter.class);
+    private final AtomicInteger filesQueued;
+    private final AtomicInteger filesProcessed;
+    private final AtomicInteger activeWorkers;
+    private final AtomicBoolean crawlerIsActive;
+    private final long start;
+    private final NumberFormat numberFormat = 
NumberFormat.getNumberInstance(Locale.ROOT);
+
+
+    public StatusReporter(AtomicInteger filesQueued, AtomicInteger 
filesProcessed, AtomicInteger activeWorkers, AtomicBoolean crawlerIsActive) {
+        this.filesQueued = filesQueued;
+        this.filesProcessed = filesProcessed;
+        this.activeWorkers = activeWorkers;
+        this.crawlerIsActive = crawlerIsActive;
+        this.start = System.currentTimeMillis();
+    }
+
+    @Override
+    public Integer call() throws Exception {
+        while (true) {
+
+            try {
+                Thread.sleep(1000);
+            } catch (InterruptedException e) {
+                //expected at shutdown; restore the interrupt status per convention
+                Thread.currentThread().interrupt();
+                return COMPLETED_VAL;
+            }
+            report();
+            if (activeWorkers.get() == 0) {
+                LOGGER.info("Completed successfully.");
+                return COMPLETED_VAL;
+            }
+        }
+    }
+
+    private void report() {
+        int cnt = filesProcessed.get();
+        long elapsed = System.currentTimeMillis() - start;
+        double elapsedSecs = (double) elapsed / (double) 1000;
+        int avg = (elapsedSecs > 5 || cnt > 100) ? (int) ((double) cnt / 
elapsedSecs) : -1;
+
+        String elapsedString = 
DurationFormatUtils.formatMillis(System.currentTimeMillis() - start);
+        String docsPerSec = avg > -1 ? String.format(Locale.ROOT, " (%s docs 
per sec)", numberFormat.format(avg)) : "";
+        String msg = String.format(Locale.ROOT, "Processed %s documents in 
%s%s.", numberFormat.format(cnt), elapsedString, docsPerSec);
+        LOGGER.info(msg);
+
+        int stillAlive = activeWorkers.get();
+        if (stillAlive == 1) {
+            msg = "There is one file processor still active.";
+        } else {
+            msg = "There are " + numberFormat.format(stillAlive) + " file 
processors still active.";
+        }
+        LOGGER.info(msg);
+
+        int queued = filesQueued.get();
+
+        if (queued == 1) {
+            msg = "The crawler has enqueued 1 file.";
+        } else {
+            msg = "The crawler has enqueued " + numberFormat.format(queued) + 
" files.";
+        }
+        LOGGER.info(msg);
+
+        if (! crawlerIsActive.get()) {
+            msg = "The directory crawler has completed its crawl.\n";
+            LOGGER.info(msg);
+        }
+    }
+}
diff --git 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
index bae1435e7..91aecd832 100644
--- 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
+++ 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
@@ -117,7 +117,7 @@ public class TikaEvalCLI {
                 CommandLine commandLine = 
defaultCLIParser.parse(FileProfiler.OPTIONS, updatedArgs);
                 if (commandLine.hasOption("db") && 
commandLine.hasOption("jdbc")) {
                     System.out.println("Please specify either the default -db 
or the full -jdbc, not both");
-                    ExtractProfiler.USAGE();
+                    FileProfiler.USAGE();
                     return;
                 }
             } catch (ParseException e) {
@@ -154,7 +154,7 @@ public class TikaEvalCLI {
     }
 
     private void handleProfile(String[] subsetArgs) throws Exception {
-        ExctractProfileRunner.main(subsetArgs);
+        ExtractProfileRunner.main(subsetArgs);
     }
 
     private void handleCompare(String[] subsetArgs) throws Exception {
diff --git 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractProfilerBuilder.java
 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractProfilerBuilder.java
deleted file mode 100644
index c59d53016..000000000
--- 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractProfilerBuilder.java
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.app.batch;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.eval.app.AbstractProfiler;
-import org.apache.tika.eval.app.ExtractProfiler;
-import org.apache.tika.eval.app.db.TableInfo;
-import org.apache.tika.util.PropsUtil;
-
-
-public class ExtractProfilerBuilder extends EvalConsumerBuilder {
-
-    public final static String TABLE_PREFIX_KEY = "tablePrefix";
-
-    private final List<TableInfo> tableInfos;
-    private final List<TableInfo> refTableInfos;
-
-    public ExtractProfilerBuilder() {
-        List<TableInfo> tableInfos = new ArrayList();
-        tableInfos.add(AbstractProfiler.MIME_TABLE);
-        tableInfos.add(ExtractProfiler.CONTAINER_TABLE);
-        tableInfos.add(ExtractProfiler.PROFILE_TABLE);
-        tableInfos.add(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
-        tableInfos.add(ExtractProfiler.EXCEPTION_TABLE);
-        tableInfos.add(ExtractProfiler.CONTENTS_TABLE);
-        tableInfos.add(ExtractProfiler.TAGS_TABLE);
-        tableInfos.add(ExtractProfiler.EMBEDDED_FILE_PATH_TABLE);
-        this.tableInfos = Collections.unmodifiableList(tableInfos);
-
-        List<TableInfo> refTableInfos = new ArrayList<>();
-        refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
-        refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
-        refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES);
-        this.refTableInfos = Collections.unmodifiableList(refTableInfos);
-    }
-
-    @Override
-    public FileResourceConsumer build() throws IOException, SQLException {
-        Path extracts = PropsUtil.getPath(localAttrs.get("extracts"), null);
-        if (extracts == null) {
-            throw new RuntimeException("Must specify \"extracts\" -- directory 
to crawl");
-        }
-        if (!Files.isDirectory(extracts)) {
-            throw new RuntimeException("ROOT DIRECTORY DOES NOT EXIST: " + 
extracts.toAbsolutePath());
-        }
-
-        Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
-
-        //we _could_ set this to extracts (if not null)
-        //here, but the Crawler defaults to "input" if nothing is passed
-        //so this won't work
-        if (inputDir == null) {
-            throw new RuntimeException("Must specify -inputDir");
-        }
-        if (extracts == null && inputDir != null) {
-            extracts = inputDir;
-        }
-        return parameterizeProfiler(new ExtractProfiler(queue, inputDir, 
extracts, buildExtractReader(localAttrs), getDBWriter(tableInfos)));
-    }
-
-
-    @Override
-    protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
-        String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY);
-        if (tableNamePrefix != null && !tableNamePrefix.equals("null")) {
-            for (TableInfo tableInfo : tableInfos) {
-                tableInfo.setNamePrefix(tableNamePrefix);
-            }
-        }
-    }
-
-
-    @Override
-    protected List<TableInfo> getRefTableInfos() {
-        return refTableInfos;
-    }
-
-    @Override
-    protected List<TableInfo> getNonRefTableInfos() {
-        return tableInfos;
-    }
-
-    @Override
-    protected TableInfo getMimeTable() {
-        return AbstractProfiler.MIME_TABLE;
-    }
-
-    @Override
-    protected void addErrorLogTablePairs(DBConsumersManager manager) {
-        Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), 
null);
-        if (errorLog == null) {
-            return;
-        }
-        manager.addErrorLogTablePair(errorLog, 
ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
-    }
-}
diff --git 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/PathResource.java
 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/PathResource.java
index 9d64317aa..20f67798a 100644
--- 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/PathResource.java
+++ 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/PathResource.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.tika.eval.app.batch;
 
 
@@ -6,15 +22,18 @@ import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 
+import org.apache.tika.batch.fs.FSProperties;
 import org.apache.tika.metadata.Metadata;
 
 public class PathResource implements FileResource {
 
     private final Path path;
     private final String resourceId;
+    private final Metadata metadata = new Metadata();
     public PathResource(Path path, String resourceId) {
         this.path = path;
         this.resourceId = resourceId;
+        metadata.set(FSProperties.FS_REL_PATH, resourceId);
     }
     @Override
     public String getResourceId() {
@@ -23,7 +42,7 @@ public class PathResource implements FileResource {
 
     @Override
     public Metadata getMetadata() {
-        return new Metadata();
+        return metadata;
     }
 
     @Override
diff --git 
a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/EvalConfigTest.java
 
b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/EvalConfigTest.java
new file mode 100644
index 000000000..395c90fe6
--- /dev/null
+++ 
b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/EvalConfigTest.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.app;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.net.URISyntaxException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import org.junit.jupiter.api.Test;
+
+public class EvalConfigTest {
+
+    @Test
+    public void testBasic() throws Exception {
+        EvalConfig evalConfig = 
EvalConfig.load(getConfig("eval-config-basic.json"));
+        assertEquals(20000, evalConfig.getMaxExtractLength());
+        assertNull(evalConfig.getErrorLogFile());
+        assertNull(evalConfig.getJdbcString());
+    }
+
+    private Path getConfig(String fileName) throws URISyntaxException {
+        return Paths.get(EvalConfigTest.class.getResource("/eval-configs/" + 
fileName).toURI());
+    }
+}
diff --git 
a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/ProfilerBatchTest.java
 
b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/ProfilerBatchTest.java
index be58e0ed2..3d6e93ad3 100644
--- 
a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/ProfilerBatchTest.java
+++ 
b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/ProfilerBatchTest.java
@@ -21,7 +21,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.io.IOException;
-import java.nio.file.DirectoryStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
@@ -30,79 +29,75 @@ import java.sql.ResultSet;
 import java.sql.SQLException;
 import java.sql.Statement;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 
-import org.junit.jupiter.api.AfterAll;
+import org.apache.commons.io.FileUtils;
+import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.eval.app.db.Cols;
 import org.apache.tika.eval.app.db.H2Util;
 import org.apache.tika.eval.app.db.TableInfo;
-import org.apache.tika.eval.app.io.ExtractReaderException;
 
-@Disabled
 public class ProfilerBatchTest {
 
-    public final static String COMPARER_PROCESS_CLASS = 
"org.apache.tika.batch.fs.FSBatchProcessCLI";
-    private final static String profileTable = 
ExtractProfiler.PROFILE_TABLE.getName();
-    private final static String exTable = 
ExtractProfiler.EXCEPTION_TABLE.getName();
-    private final static String fpCol = Cols.FILE_PATH.name();
-    private static Path dbDir;
-    private static Connection conn;
+    private static Connection CONN;
+    private static Path DB_DIR;
+    private static Path DB;
 
     @BeforeAll
     public static void setUp() throws Exception {
+        DB_DIR = Files.createTempDirectory("profiler-test");
+        Path extractsRoot = Paths.get(ComparerBatchTest.class
+                .getResource("/test-dirs/extractsA")
+                .toURI());
 
         Path inputRoot = Paths.get(ComparerBatchTest.class
-                .getResource("/test-dirs/extractsA")
+                .getResource("/test-dirs/raw_input")
                 .toURI());
-        dbDir = Files.createTempDirectory(inputRoot, "tika-test-db-dir-");
-        Map<String, String> args = new HashMap<>();
-        Path db = dbDir.resolve("profiler_test");
-        args.put("-db", db.toString());
-
-        //for debugging, you can use this to select only one file pair to load
-        //args.put("-includeFilePat", "file8.*");
-
-       /* BatchProcessTestExecutor ex = new 
BatchProcessTestExecutor(COMPARER_PROCESS_CLASS, args,
-                "/single-file-profiler-crawl-input-config.xml");
-        StreamStrings streamStrings = ex.execute();
-        System.out.println(streamStrings.getErrString());
-        System.out.println(streamStrings.getOutString());*/
-        H2Util dbUtil = new H2Util(db);
-        conn = dbUtil.getConnection();
-    }
 
-    @AfterAll
-    public static void tearDown() throws IOException {
+        DB = DB_DIR.resolve("mydb");
+        String[] args = new String[]{
+            "-i", inputRoot.toAbsolutePath().toString(),
+            "-e", extractsRoot.toAbsolutePath().toString(),
+                "-d", "jdbc:h2:file:" + DB.toAbsolutePath().toString()
+        };
+
+        ExtractProfileRunner.main(args);
+    }
 
+    @org.junit.jupiter.api.AfterAll
+    public static void tearDown() throws IOException {
         try {
-            conn.close();
+            CONN.close();
         } catch (SQLException e) {
             throw new RuntimeException(e);
         }
-        //TODO: if/when we turn this back on, use @TempDir instead of this
+        FileUtils.deleteDirectory(DB_DIR.toFile());
 
-        DirectoryStream<Path> dStream = Files.newDirectoryStream(dbDir);
-        for (Path p : dStream) {
-            Files.delete(p);
-        }
-        dStream.close();
-        Files.delete(dbDir);
+    }
+
+    @BeforeEach
+    public void setUpEach() throws SQLException {
+        H2Util dbUtil = new H2Util(DB);
+        CONN = dbUtil.getConnection();
+    }
+
+    @AfterEach
+    public void tearDownEach() throws SQLException {
+        CONN.close();
     }
 
     @Test
     public void testSimpleDBWriteAndRead() throws Exception {
-
         Statement st = null;
         List<String> fNameList = new ArrayList<>();
         try {
             String sql = "select * from " + 
ExtractProfiler.CONTAINER_TABLE.getName();
-            st = conn.createStatement();
+            st = CONN.createStatement();
             ResultSet rs = st.executeQuery(sql);
             while (rs.next()) {
                 String fileName = rs.getString(Cols.FILE_PATH.name());
@@ -113,17 +108,19 @@ public class ProfilerBatchTest {
                 st.close();
             }
         }
+        /*
         debugTable(ExtractProfiler.CONTAINER_TABLE);
         debugTable(ExtractProfiler.PROFILE_TABLE);
         debugTable(ExtractProfiler.CONTENTS_TABLE);
         debugTable(ExtractProfiler.EXCEPTION_TABLE);
-        debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
-        assertEquals(10, fNameList.size());
+        debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);*/
+        assertEquals(17, fNameList.size());
         assertTrue(fNameList.contains("file1.pdf"), "file1.pdf");
         assertTrue(fNameList.contains("file2_attachANotB.doc"), 
"file2_attachANotB.doc");
         assertTrue(fNameList.contains("file3_attachBNotA.doc"), 
"file3_attachBNotA.doc");
         assertTrue(fNameList.contains("file4_emptyB.pdf"), "file4_emptyB.pdf");
         assertTrue(fNameList.contains("file7_badJson.pdf"), 
"file4_emptyB.pdf");
+        assertTrue(fNameList.contains("file9_noextract.txt"), 
"file9_noextract.txt");
     }
 
     @Test
@@ -131,43 +128,29 @@ public class ProfilerBatchTest {
         String sql =
                 "select EXTRACT_EXCEPTION_ID from extract_exceptions e" + " 
join containers c on c.container_id = e.container_id " + " where 
c.file_path='file9_noextract.txt'";
 
-        assertEquals("missing extract: file9_noextract.txt", "0", 
getSingleResult(sql));
-        debugTable(ExtractProfiler.CONTAINER_TABLE);
+        /*debugTable(ExtractProfiler.CONTAINER_TABLE);
         debugTable(ExtractProfiler.PROFILE_TABLE);
         debugTable(ExtractProfiler.CONTENTS_TABLE);
         debugTable(ExtractProfiler.EXCEPTION_TABLE);
-        debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
-
-        sql = "select EXTRACT_EXCEPTION_ID from errors e" + " join containers 
c on c.container_id = e.container_id " + " where 
c.file_path='file5_emptyA.pdf'";
-        assertEquals("empty extract: file5_emptyA.pdf", "1", 
getSingleResult(sql));
-
-        sql = "select EXTRACT_EXCEPTION_ID from errors e" + " join containers 
c on c.container_id = e.container_id " + " where 
c.file_path='file7_badJson.pdf'";
-        assertEquals("extract error:file7_badJson.pdf", "2", 
getSingleResult(sql));
-
-    }
-
-    @Test
-    public void testParseErrors() throws Exception {
-        debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
-        String sql = "select file_path from errors where container_id is null";
-        assertEquals("file10_permahang.txt", getSingleResult(sql));
-
-        sql = "select extract_error_id from extract_exceptions " + "where 
file_path='file11_oom.txt'";
-        
assertEquals(Integer.toString(ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE.ordinal()),
 getSingleResult(sql));
+        debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);*/
+        assertEquals("0", getSingleResult(sql), "missing extract: 
file9_noextract.txt");
 
-        sql = "select parse_error_id from extract_exceptions where 
file_path='file11_oom.txt'";
-        
assertEquals(Integer.toString(AbstractProfiler.PARSE_ERROR_TYPE.OOM.ordinal()), 
getSingleResult(sql));
+        sql = "select EXTRACT_EXCEPTION_ID from extract_exceptions e" + " join 
containers c on c.container_id = e.container_id " + " where 
c.file_path='file5_emptyA.pdf'";
+        assertEquals("1", getSingleResult(sql), "empty extract: 
file5_emptyA.pdf");
 
+        sql = "select EXTRACT_EXCEPTION_ID from extract_exceptions e" + " join 
containers c on c.container_id = e.container_id " + " where 
c.file_path='file7_badJson.pdf'";
+        assertEquals("2", getSingleResult(sql), "extract 
error:file7_badJson.pdf");
     }
 
     @Test
+    @Disabled("create actual unit test")
     public void testParseExceptions() throws Exception {
         debugTable(ExtractProfiler.EXCEPTION_TABLE);
     }
 
     private String getSingleResult(String sql) throws Exception {
         Statement st = null;
-        st = conn.createStatement();
+        st = CONN.createStatement();
         ResultSet rs = st.executeQuery(sql);
         int hits = 0;
         String val = "";
@@ -188,7 +171,7 @@ public class ProfilerBatchTest {
         Statement st = null;
         try {
             String sql = "select * from " + table.getName();
-            st = conn.createStatement();
+            st = CONN.createStatement();
             ResultSet rs = st.executeQuery(sql);
             int colCount = rs
                     .getMetaData()
diff --git 
a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
 
b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
index fff15e3dc..4d7d4bb2b 100644
--- 
a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
+++ 
b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
@@ -98,20 +98,13 @@ public class TikaEvalCLITest extends TikaTest {
     private static void profile() throws IOException {
         List<String> args = new ArrayList<>();
         args.add("Profile");
-        args.add("-extracts");
+        args.add("-e");
         args.add(ProcessUtils.escapeCommandLine(extractsDir
                 .resolve("extractsA")
                 .toAbsolutePath()
                 .toString()));
-        //add these just to confirm this info doesn't cause problems w cli
-        args.add("-maxTokens");
-        args.add("10000000");
-        args.add("-maxContentLength");
-        args.add("100000000");
-        args.add("-maxContentLengthForLangId");
-        args.add("100000");
 
-        args.add("-db");
+        args.add("-d");
         args.add(ProcessUtils.escapeCommandLine(profileDBDir
                 .toAbsolutePath()
                 .toString() + "/" + dbName));
diff --git 
a/tika-eval/tika-eval-app/src/test/resources/eval-configs/eval-config-basic.json
 
b/tika-eval/tika-eval-app/src/test/resources/eval-configs/eval-config-basic.json
new file mode 100644
index 000000000..b4af28df3
--- /dev/null
+++ 
b/tika-eval/tika-eval-app/src/test/resources/eval-configs/eval-config-basic.json
@@ -0,0 +1,3 @@
+{
+  "maxExtractLength" : 20000
+}
\ No newline at end of file
diff --git 
a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file10_permahang.txt
 
b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file10_permahang.txt
new file mode 100644
index 000000000..e69de29bb
diff --git 
a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file12_es.txt 
b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file12_es.txt
new file mode 100644
index 000000000..5ffd824a9
--- /dev/null
+++ 
b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file12_es.txt
@@ -0,0 +1,6 @@
+[
+  {
+    "Content-Type": "text/plain",
+    "X-TIKA:content": "El zorro marrón rápido saltó sobre el perro. El zorro 
marrón rápido saltó sobre el perro. El zorro marrón rápido saltó sobre el perro"
+  }
+]
\ No newline at end of file
diff --git 
a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file13_attachANotB.doc
 
b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file13_attachANotB.doc
new file mode 100644
index 000000000..15bc592a5
--- /dev/null
+++ 
b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file13_attachANotB.doc
@@ -0,0 +1,12 @@
+[
+  {
+    "Content-Type": "text/plain",
+    "_comment": "simplified",
+    "X-TIKA:content": 
"调整每一个心脏和每个声音,投标每个护理提取;让大家一起欢乐,赞美老拿骚.调整每一个心脏和每个声音,投标每个护理提取;让大家一起欢乐,赞美老拿骚 狐狸狐狸狐狸 
"
+  },
+  {
+    "Content-Type": "text/plain",
+    "X-TIKA:embedded_resource_path": "inner.txt",
+    "X-TIKA:content": "attachment contents"
+  }
+]
\ No newline at end of file
diff --git 
a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file14_diffAttachOrder
 
b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file14_diffAttachOrder
new file mode 100644
index 000000000..25f0db9a1
--- /dev/null
+++ 
b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file14_diffAttachOrder
@@ -0,0 +1,21 @@
+[
+  {
+    "Content-Type": "text/plain",
+    "X-TIKA:content": "the quick brown fox fox fox jumped over the lazy lazy 
dog",
+    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8351"
+  },
+  {
+    "Content-Type": "text/plain",
+    "X-TIKA:embedded_resource_path": "/0",
+    "X-TIKA:content": "a b c d e f g h i j k l m n",
+    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8354",
+    "X-TIKA:embedded_depth": "1"
+  },
+  {
+    "Content-Type": "text/plain",
+    "X-TIKA:embedded_resource_path": "/1",
+    "X-TIKA:content": "o p q r s t u v w x y z",
+    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8353",
+    "X-TIKA:embedded_depth": "1"
+  }
+]
\ No newline at end of file
diff --git 
a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file15_tags 
b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file15_tags
new file mode 100644
index 000000000..5af73db80
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file15_tags
@@ -0,0 +1,41 @@
+[
+  {
+    "Content-Length": "34824",
+    "Content-Type": "application/pdf",
+    "Last-Modified": "2007-09-15T09:02:31Z",
+    "X-Parsed-By": [
+      "org.apache.tika.parser.DefaultParser",
+      "org.apache.tika.parser.pdf.PDFParser"
+    ],
+    "X-TIKA:content_handler": "ToXMLContentHandler",
+    "X-TIKA:content": "\u003chtml 
xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta 
name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta 
name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" 
/\u003e\n\u003cmeta name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\" 
/\u003e\n\u003cmeta name\u003d\"access_permission:modify_annotations\" 
content\u003d\"true\" /\u003e\n\u003cmeta 
name\u003d\"access_permission:can_print_degra [...]
+    "X-TIKA:parse_time_millis": "500",
+    "access_permission:assemble_document": "true",
+    "access_permission:can_modify": "true",
+    "access_permission:can_print": "true",
+    "access_permission:can_print_degraded": "true",
+    "access_permission:extract_content": "true",
+    "access_permission:extract_for_accessibility": "true",
+    "access_permission:fill_in_form": "true",
+    "access_permission:modify_annotations": "true",
+    "dc:creator": "Bertrand DelacrΘtaz",
+    "dc:format": "application/pdf; version\u003d1.3",
+    "dc:title": "Apache Tika - Apache Tika",
+    "dcterms:created": "2007-09-15T09:02:31Z",
+    "dcterms:modified": "2007-09-15T09:02:31Z",
+    "meta:author": "Bertrand DelacrΘtaz",
+    "meta:creation-date": "2007-09-15T09:02:31Z",
+    "meta:save-date": "2007-09-15T09:02:31Z",
+    "pdf:PDFVersion": "1.3",
+    "pdf:docinfo:created": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:creator": "Bertrand DelacrΘtaz",
+    "pdf:docinfo:creator_tool": "Firefox",
+    "pdf:docinfo:modified": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext",
+    "pdf:docinfo:title": "Apache Tika - Apache Tika",
+    "pdf:encrypted": "false",
+    "resourceName": "testPDF.pdf",
+    "xmp:CreatorTool": "Firefox",
+    "xmpTPg:NPages": "1"
+  }
+]
\ No newline at end of file
diff --git 
a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file16_badTags 
b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file16_badTags
new file mode 100644
index 000000000..5c6272e43
--- /dev/null
+++ 
b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file16_badTags
@@ -0,0 +1,41 @@
+[
+  {
+    "Content-Length": "34824",
+    "Content-Type": "application/pdf",
+    "Last-Modified": "2007-09-15T09:02:31Z",
+    "X-Parsed-By": [
+      "org.apache.tika.parser.DefaultParser",
+      "org.apache.tika.parser.pdf.PDFParser"
+    ],
+    "X-TIKA:content_handler": "ToXMLContentHandler",
+    "X-TIKA:content": "\u003chtml 
xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta 
name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta 
name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" meta 
name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\" 
/\u003e\n\u003c\u003c\u003c\u003c\u003cmeta 
name\u003d\"access_permission:modify_annotations\" content\u003d\"true\" 
/\u003e\n\u003cmeta name\u003d\"access_permission:can_pr [...]
+    "X-TIKA:parse_time_millis": "500",
+    "access_permission:assemble_document": "true",
+    "access_permission:can_modify": "true",
+    "access_permission:can_print": "true",
+    "access_permission:can_print_degraded": "true",
+    "access_permission:extract_content": "true",
+    "access_permission:extract_for_accessibility": "true",
+    "access_permission:fill_in_form": "true",
+    "access_permission:modify_annotations": "true",
+    "dc:creator": "Bertrand DelacrΘtaz",
+    "dc:format": "application/pdf; version\u003d1.3",
+    "dc:title": "Apache Tika - Apache Tika",
+    "dcterms:created": "2007-09-15T09:02:31Z",
+    "dcterms:modified": "2007-09-15T09:02:31Z",
+    "meta:author": "Bertrand DelacrΘtaz",
+    "meta:creation-date": "2007-09-15T09:02:31Z",
+    "meta:save-date": "2007-09-15T09:02:31Z",
+    "pdf:PDFVersion": "1.3",
+    "pdf:docinfo:created": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:creator": "Bertrand DelacrΘtaz",
+    "pdf:docinfo:creator_tool": "Firefox",
+    "pdf:docinfo:modified": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext",
+    "pdf:docinfo:title": "Apache Tika - Apache Tika",
+    "pdf:encrypted": "false",
+    "resourceName": "testPDF.pdf",
+    "xmp:CreatorTool": "Firefox",
+    "xmpTPg:NPages": "1"
+  }
+]
\ No newline at end of file
diff --git 
a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file17_tagsOutOfOrder
 
b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file17_tagsOutOfOrder
new file mode 100644
index 000000000..97afec8ad
--- /dev/null
+++ 
b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file17_tagsOutOfOrder
@@ -0,0 +1,41 @@
+[
+  {
+    "Content-Length": "34824",
+    "Content-Type": "application/pdf",
+    "Last-Modified": "2007-09-15T09:02:31Z",
+    "X-Parsed-By": [
+      "org.apache.tika.parser.DefaultParser",
+      "org.apache.tika.parser.pdf.PDFParser"
+    ],
+    "X-TIKA:content_handler": "ToXMLContentHandler",
+    "X-TIKA:content": "\u003chtml 
xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta 
name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta 
name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" 
/\u003e\n\u003cmeta name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\" 
/\u003e\n\u003cmeta name\u003d\"access_permission:modify_annotations\" 
content\u003d\"true\" /\u003e\n\u003cmeta 
name\u003d\"access_permission:can_print_degra [...]
+    "X-TIKA:parse_time_millis": "500",
+    "access_permission:assemble_document": "true",
+    "access_permission:can_modify": "true",
+    "access_permission:can_print": "true",
+    "access_permission:can_print_degraded": "true",
+    "access_permission:extract_content": "true",
+    "access_permission:extract_for_accessibility": "true",
+    "access_permission:fill_in_form": "true",
+    "access_permission:modify_annotations": "true",
+    "dc:creator": "Bertrand DelacrΘtaz",
+    "dc:format": "application/pdf; version\u003d1.3",
+    "dc:title": "Apache Tika - Apache Tika",
+    "dcterms:created": "2007-09-15T09:02:31Z",
+    "dcterms:modified": "2007-09-15T09:02:31Z",
+    "meta:author": "Bertrand DelacrΘtaz",
+    "meta:creation-date": "2007-09-15T09:02:31Z",
+    "meta:save-date": "2007-09-15T09:02:31Z",
+    "pdf:PDFVersion": "1.3",
+    "pdf:docinfo:created": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:creator": "Bertrand DelacrΘtaz",
+    "pdf:docinfo:creator_tool": "Firefox",
+    "pdf:docinfo:modified": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext",
+    "pdf:docinfo:title": "Apache Tika - Apache Tika",
+    "pdf:encrypted": "false",
+    "resourceName": "testPDF.pdf",
+    "xmp:CreatorTool": "Firefox",
+    "xmpTPg:NPages": "1"
+  }
+]
\ No newline at end of file
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index e0ba1333d..2037f1aba 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -399,7 +399,7 @@
     <jsoup.version>1.21.1</jsoup.version>
     <jsr305.version>3.0.2</jsr305.version>
     <junit4.version>4.13.2</junit4.version>
-    <junit5.version>6.0.0-M1</junit5.version>
+    <junit5.version>5.13.3</junit5.version>
     <juniversalchardet.version>2.5.0</juniversalchardet.version>
     <junrar.version>7.5.5</junrar.version>
     <jwarc.version>0.31.1</jwarc.version>

Reply via email to