This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 4c5200a45 TIKA-4342 -- remove tika-batch from tika-eval's Profile command (#2272)
4c5200a45 is described below
commit 4c5200a453f35a01ba4d50335143204b483a397a
Author: Tim Allison <[email protected]>
AuthorDate: Tue Jul 8 15:55:39 2025 -0400
TIKA-4342 -- remove tika-batch from tika-eval's Profile command (#2272)
* TIKA-4342 -- remove tika-batch from ExtractProfiler
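
With tika-batch gone, the Profile command is driven entirely by the commons-cli options that ExtractProfileRunner defines below: -e/--extracts and -d/--db, plus the optional -i/--inputDir and -c/--config. A rough sketch of an invocation under the new options (the jar version and paths are illustrative placeholders, not taken from this commit):

    java -jar tika-eval-app-x.y.z.jar Profile -e /data/extracts -d /data/profiledb -c eval-config.json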
---
.../java/org/apache/tika/eval/app/EvalConfig.java | 88 +++
.../apache/tika/eval/app/ExtractProfileRunner.java | 374 ++++++++++
.../org/apache/tika/eval/app/ExtractProfiler.java | 50 +-
.../org/apache/tika/eval/app/ProfilerBase.java | 813 +++++++++++++++++++++
.../org/apache/tika/eval/app/StatusReporter.java | 102 +++
.../java/org/apache/tika/eval/app/TikaEvalCLI.java | 106 +--
.../eval/app/batch/ExtractProfilerBuilder.java | 120 ---
.../apache/tika/eval/app/batch/FileResource.java | 66 ++
.../apache/tika/eval/app/batch/PathResource.java | 52 ++
.../org/apache/tika/eval/app/EvalConfigTest.java | 42 ++
.../apache/tika/eval/app/ProfilerBatchTest.java | 117 ++-
.../org/apache/tika/eval/app/TikaEvalCLITest.java | 11 +-
.../resources/eval-configs/eval-config-basic.json | 3 +
.../test-dirs/raw_input/file10_permahang.txt | 0
.../resources/test-dirs/raw_input/file12_es.txt | 6 +
.../test-dirs/raw_input/file13_attachANotB.doc | 12 +
.../test-dirs/raw_input/file14_diffAttachOrder | 21 +
.../test/resources/test-dirs/raw_input/file15_tags | 41 ++
.../resources/test-dirs/raw_input/file16_badTags | 41 ++
.../test-dirs/raw_input/file17_tagsOutOfOrder | 41 ++
20 files changed, 1761 insertions(+), 345 deletions(-)
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
new file mode 100644
index 000000000..5525180ed
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+import java.nio.file.Path;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+public class EvalConfig {
+
+ private long minExtractLength = 0;
+ private long maxExtractLength = 2_000_000;
+ private String jdbcString = null;
+ private String jdbcDriverClass = null;
+ private boolean forceDrop = true;
+ private int maxFilesToAdd = -1;
+ private int maxTokens = 200000;
+
+ private int maxContentLength = 5_000_000;
+ private int numWorkers = 4;
+ private Path errorLogFile = null;
+
+
+ public static EvalConfig load(Path path) throws Exception {
+ return new ObjectMapper().readValue(path.toFile(), EvalConfig.class);
+ }
+
+ public long getMinExtractLength() {
+ return minExtractLength;
+ }
+
+ public long getMaxExtractLength() {
+ return maxExtractLength;
+ }
+
+ public String getJdbcString() {
+ return jdbcString;
+ }
+
+ public String getJdbcDriverClass() {
+ return jdbcDriverClass;
+ }
+
+ public boolean isForceDrop() {
+ return forceDrop;
+ }
+
+ public int getMaxFilesToAdd() {
+ return maxFilesToAdd;
+ }
+
+ public int getMaxTokens() {
+ return maxTokens;
+ }
+
+ public int getMaxContentLength() {
+ return maxContentLength;
+ }
+
+ public int getNumWorkers() {
+ return numWorkers;
+ }
+
+ public Path getErrorLogFile() {
+ return errorLogFile;
+ }
+
+ @Override
+ public String toString() {
+        return "EvalConfig{" + "minExtractLength=" + minExtractLength + ", maxExtractLength=" + maxExtractLength + ", jdbcString='" + jdbcString + '\'' + ", jdbcDriverClass='" +
+                jdbcDriverClass + '\'' + ", forceDrop=" + forceDrop + ", maxFilesToAdd=" + maxFilesToAdd + ", maxTokens=" + maxTokens + ", maxContentLength=" + maxContentLength +
+                ", numWorkers=" + numWorkers + ", errorLogFile=" + errorLogFile + '}';
+ }
+}
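
EvalConfig above is deserialized directly from a JSON file via Jackson's ObjectMapper (see EvalConfig.load and the eval-config-basic.json test resource added in this commit). Assuming Jackson can bind the private fields shown (the class exposes getters only, so this presumably relies on field visibility rather than setters), a config file would look something like the illustrative sketch below; the values are made up, not defaults from this commit:

    {
      "numWorkers": 8,
      "maxExtractLength": 2000000,
      "jdbcString": "jdbc:h2:file:/data/profiledb"
    }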
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java
new file mode 100644
index 000000000..cd80a3df3
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java
@@ -0,0 +1,374 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+import java.io.IOException;
+import java.nio.file.FileVisitResult;
+import java.nio.file.FileVisitor;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.eval.app.batch.DBConsumersManager;
+import org.apache.tika.eval.app.batch.FileResource;
+import org.apache.tika.eval.app.batch.PathResource;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.JDBCUtil;
+import org.apache.tika.eval.app.db.MimeBuffer;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.DBWriter;
+import org.apache.tika.eval.app.io.ExtractReader;
+import org.apache.tika.eval.app.io.ExtractReaderException;
+import org.apache.tika.eval.app.io.IDBWriter;
+
+public class ExtractProfileRunner {
+
+    private static final Logger LOG = LoggerFactory.getLogger(ExtractProfileRunner.class);
+    private static final PathResource SEMAPHORE = new PathResource(Paths.get("/"), "STOP");
+    private static final int DIR_WALKER_COMPLETED_VALUE = 2;
+    private static final int PROFILE_WORKER_COMPLETED_VALUE = 1;
+
+ static Options OPTIONS;
+
+    static {
+
+        OPTIONS = new Options()
+                .addOption(Option.builder("e").longOpt("extracts").hasArg().desc("required: directory of extracts").build())
+                .addOption(Option.builder("i").longOpt("inputDir").hasArg().desc("optional: directory for original binary input documents."
+                        + " If not specified, -extracts is crawled as is.").build())
+                .addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db path").build())
+                .addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json config file").build())
+        ;
+    }
+ public static void main(String[] args) throws Exception {
+ DefaultParser defaultCLIParser = new DefaultParser();
+ CommandLine commandLine = defaultCLIParser.parse(OPTIONS, args);
+        EvalConfig evalConfig = commandLine.hasOption('c') ? EvalConfig.load(Paths.get(commandLine.getOptionValue('c'))) : new EvalConfig();
+        Path extractsDir = commandLine.hasOption('e') ? Paths.get(commandLine.getOptionValue('e')) : Paths.get(USAGE_FAIL("Must specify extracts dir: -e"));
+        Path inputDir = commandLine.hasOption('i') ? Paths.get(commandLine.getOptionValue('i')) : extractsDir;
+        String dbPath = commandLine.hasOption('d') ? commandLine.getOptionValue('d') : USAGE_FAIL("Must specify the db name: -d");
+ String jdbcString = getJdbcConnectionString(dbPath);
+ execute(inputDir, extractsDir, jdbcString, evalConfig);
+ }
+
+ private static String getJdbcConnectionString(String dbPath) {
+ if (dbPath.startsWith("jdbc:")) {
+ return dbPath;
+ }
+ //default to h2
+ Path p = Paths.get(dbPath);
+ return "jdbc:h2:file:" + p.toAbsolutePath();
+
+ }
+
+    private static void execute(Path inputDir, Path extractsDir, String dbPath, EvalConfig evalConfig) throws SQLException, IOException {
+
+        //parameterize this? if necessary
+        try {
+            ProfilerBase.loadCommonTokens(null, null);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+
+        JDBCUtil jdbcUtil = new JDBCUtil(dbPath, evalConfig.getJdbcDriverClass());
+        ExtractProfilerBuilder builder = new ExtractProfilerBuilder();
+        MimeBuffer mimeBuffer = initTables(jdbcUtil, builder, dbPath, evalConfig);
+ builder.populateRefTables(jdbcUtil, mimeBuffer);
+
+        AtomicInteger enqueued = new AtomicInteger(0);
+        AtomicInteger processed = new AtomicInteger(0);
+        AtomicInteger activeWorkers = new AtomicInteger(evalConfig.getNumWorkers());
+        AtomicBoolean crawlerActive = new AtomicBoolean(true);
+
+
+
+        ArrayBlockingQueue<FileResource> queue = new ArrayBlockingQueue<>(1000);
+        ExecutorService executorService = Executors.newFixedThreadPool(evalConfig.getNumWorkers() + 2);
+        ExecutorCompletionService<Integer> executorCompletionService = new ExecutorCompletionService<>(executorService);
+
+        StatusReporter statusReporter = new StatusReporter(enqueued, processed, activeWorkers, crawlerActive);
+        executorCompletionService.submit(statusReporter);
+
+        DirectoryWalker directoryWalker = new DirectoryWalker(inputDir, queue, enqueued);
+        executorCompletionService.submit(directoryWalker);
+        for (int i = 0; i < evalConfig.getNumWorkers(); i++) {
+            ExtractReader extractReader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS, evalConfig.getMinExtractLength(), evalConfig.getMaxExtractLength());
+            ExtractProfiler extractProfiler = new ExtractProfiler(inputDir, extractsDir, extractReader, builder.getDBWriter(builder.tableInfos, jdbcUtil, mimeBuffer));
+            executorCompletionService.submit(new ProfileWorker(queue, extractProfiler, processed));
+        }
+
+ int finished = 0;
+ try {
+ while (finished < evalConfig.getNumWorkers() + 2) {
+ //blocking
+ Future<Integer> future = executorCompletionService.take();
+ Integer result = future.get();
+ if (result != null) {
+ //if the dir walker has finished
+ if (result == DIR_WALKER_COMPLETED_VALUE) {
+ queue.put(SEMAPHORE);
+ crawlerActive.set(false);
+ } else if (result == PROFILE_WORKER_COMPLETED_VALUE) {
+ activeWorkers.decrementAndGet();
+ }
+ finished++;
+ }
+ }
+ } catch (InterruptedException e) {
+ LOG.info("interrupted", e);
+ } catch (ExecutionException e) {
+ throw new RuntimeException(e);
+ } finally {
+ mimeBuffer.close();
+ executorService.shutdownNow();
+ }
+
+ }
+
+    private static MimeBuffer initTables(JDBCUtil jdbcUtil, ExtractProfilerBuilder builder, String connectionString, EvalConfig evalConfig) throws SQLException, IOException {
+
+        //step 1. create the tables
+        jdbcUtil.createTables(builder.getNonRefTableInfos(), JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS);
+        jdbcUtil.createTables(builder.getRefTableInfos(), JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS);
+
+        //step 2. create mime buffer
+        return new MimeBuffer(jdbcUtil.getConnection(), builder.getMimeTable(), TikaConfig.getDefaultConfig());
+    }
+
+    private static void USAGE() {
+        HelpFormatter helpFormatter = new HelpFormatter();
+        helpFormatter.printHelp(80, "java -jar tika-eval-app-x.y.z.jar Profile -e docs -d mydb [-i inputDir, -c config.json]",
+                "Tool: Profile", OPTIONS, "");
+    }
+
+ private static String USAGE_FAIL(String msg) {
+ USAGE();
+ throw new IllegalArgumentException(msg);
+ }
+
+ private static class ProfileWorker implements Callable<Integer> {
+
+ private final ArrayBlockingQueue<FileResource> queue;
+ private final ExtractProfiler extractProfiler;
+ private final AtomicInteger processed;
+
+        ProfileWorker(ArrayBlockingQueue<FileResource> queue, ExtractProfiler extractProfiler, AtomicInteger processed) {
+ this.queue = queue;
+ this.extractProfiler = extractProfiler;
+ this.processed = processed;
+ }
+
+ @Override
+ public Integer call() throws Exception {
+ while (true) {
+ FileResource resource = queue.poll(1, TimeUnit.SECONDS);
+ if (resource == null) {
+ LOG.info("ExtractProfileWorker waiting on queue");
+ continue;
+ }
+ if (resource == SEMAPHORE) {
+ LOG.debug("worker hit semaphore and is stopping");
+ extractProfiler.closeWriter();
+                    //put the semaphore back on the queue so any remaining workers also stop
+                    queue.put(resource);
+ return PROFILE_WORKER_COMPLETED_VALUE;
+ }
+ extractProfiler.processFileResource(resource);
+ processed.incrementAndGet();
+ }
+ }
+ }
+
+ private static class DirectoryWalker implements Callable<Integer> {
+ private final Path startDir;
+ private final ArrayBlockingQueue<FileResource> queue;
+ private final AtomicInteger enqueued;
+
+        public DirectoryWalker(Path startDir, ArrayBlockingQueue<FileResource> queue, AtomicInteger enqueued) {
+ this.startDir = startDir;
+ this.queue = queue;
+ this.enqueued = enqueued;
+ }
+
+ @Override
+ public Integer call() throws Exception {
+            Files.walkFileTree(startDir, new FileVisitor<Path>() {
+                @Override
+                public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
+                    return FileVisitResult.CONTINUE;
+                }
+
+                @Override
+                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
+                    if (Files.isDirectory(file)) {
+                        return FileVisitResult.CONTINUE;
+                    }
+                    try {
+                        //blocking
+                        queue.put(new PathResource(file, startDir.relativize(file).toString()));
+                        enqueued.incrementAndGet();
+                    } catch (InterruptedException e) {
+                        return FileVisitResult.TERMINATE;
+                    }
+                    return FileVisitResult.CONTINUE;
+                }
+
+                @Override
+                public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
+                    return FileVisitResult.CONTINUE;
+                }
+
+                @Override
+                public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
+                    return FileVisitResult.CONTINUE;
+                }
+            });
+ return DIR_WALKER_COMPLETED_VALUE;
+ }
+ }
+
+ private static class ExtractProfilerBuilder {
+ private final List<TableInfo> tableInfos;
+ private final List<TableInfo> refTableInfos;
+
+ public ExtractProfilerBuilder() {
+            List<TableInfo> tableInfos = new ArrayList<>();
+ tableInfos.add(AbstractProfiler.MIME_TABLE);
+ tableInfos.add(ExtractProfiler.CONTAINER_TABLE);
+ tableInfos.add(ExtractProfiler.PROFILE_TABLE);
+ tableInfos.add(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
+ tableInfos.add(ExtractProfiler.EXCEPTION_TABLE);
+ tableInfos.add(ExtractProfiler.CONTENTS_TABLE);
+ tableInfos.add(ExtractProfiler.TAGS_TABLE);
+ tableInfos.add(ExtractProfiler.EMBEDDED_FILE_PATH_TABLE);
+ this.tableInfos = Collections.unmodifiableList(tableInfos);
+
+ List<TableInfo> refTableInfos = new ArrayList<>();
+ refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
+ refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
+ refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES);
+ this.refTableInfos = Collections.unmodifiableList(refTableInfos);
+ }
+
+
+ protected List<TableInfo> getRefTableInfos() {
+ return refTableInfos;
+ }
+
+ protected List<TableInfo> getNonRefTableInfos() {
+ return tableInfos;
+ }
+
+ protected TableInfo getMimeTable() {
+ return AbstractProfiler.MIME_TABLE;
+ }
+
+        public void populateRefTables(JDBCUtil dbUtil, MimeBuffer mimeBuffer) throws IOException, SQLException {
+ boolean refTablesPopulated = true;
+ try {
+ Connection connection = dbUtil.getConnection();
+ for (TableInfo tableInfo : getRefTableInfos()) {
+ int rows = 0;
+ try (ResultSet rs = connection
+ .createStatement()
+                            .executeQuery("select * from " + tableInfo.getName())) {
+ while (rs.next()) {
+ rows++;
+ }
+ }
+ if (rows == 0) {
+ refTablesPopulated = false;
+ break;
+ }
+
+ }
+ } catch (SQLException e) {
+ //swallow
+ }
+ if (refTablesPopulated) {
+ LOG.info("ref tables are already populated");
+ return;
+ }
+
+            IDBWriter writer = getDBWriter(getRefTableInfos(), dbUtil, mimeBuffer);
+ Map<Cols, String> m = new HashMap<>();
+            for (AbstractProfiler.PARSE_ERROR_TYPE t : AbstractProfiler.PARSE_ERROR_TYPE.values()) {
+ m.clear();
+ m.put(Cols.PARSE_ERROR_ID, Integer.toString(t.ordinal()));
+ m.put(Cols.PARSE_ERROR_DESCRIPTION, t.name());
+ writer.writeRow(AbstractProfiler.REF_PARSE_ERROR_TYPES, m);
+ }
+
+            for (AbstractProfiler.EXCEPTION_TYPE t : AbstractProfiler.EXCEPTION_TYPE.values()) {
+ m.clear();
+ m.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(t.ordinal()));
+ m.put(Cols.PARSE_EXCEPTION_DESCRIPTION, t.name());
+ writer.writeRow(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES, m);
+ }
+
+            for (ExtractReaderException.TYPE t : ExtractReaderException.TYPE.values()) {
+                m.clear();
+                m.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(t.ordinal()));
+                m.put(Cols.EXTRACT_EXCEPTION_DESCRIPTION, t.name());
+                writer.writeRow(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES, m);
+ }
+ writer.close();
+ }
+
+        protected IDBWriter getDBWriter(List<TableInfo> tableInfos, JDBCUtil dbUtil, MimeBuffer mimeBuffer) throws IOException, SQLException {
+ Connection conn = dbUtil.getConnection();
+ return new DBWriter(conn, tableInfos, dbUtil, mimeBuffer);
+ }
+
+
+        protected void addErrorLogTablePairs(DBConsumersManager manager, EvalConfig evalConfig) {
+ Path errorLog = evalConfig.getErrorLogFile();
+ if (errorLog == null) {
+ return;
+ }
+            manager.addErrorLogTablePair(errorLog, ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
+ }
+ }
+}
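
A note for reviewers skimming the runner above: shutdown is coordinated with a poison-pill sentinel. When DirectoryWalker returns DIR_WALKER_COMPLETED_VALUE, the main loop enqueues SEMAPHORE, and each ProfileWorker that takes it puts it back before returning so the remaining workers also see it. A minimal, self-contained sketch of that pattern (illustrative only, not code from this commit):

    import java.util.concurrent.ArrayBlockingQueue;

    public class PoisonPillSketch {
        //sentinel instance; workers compare by identity (==), as the runner does with SEMAPHORE
        private static final String STOP = new String("STOP");

        public static void main(String[] args) throws InterruptedException {
            ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<>(10);
            Runnable worker = () -> {
                try {
                    while (true) {
                        String item = queue.take();
                        if (item == STOP) {
                            queue.put(item); //pass the pill on so the next worker stops too
                            return;
                        }
                        System.out.println(Thread.currentThread().getName() + " processed " + item);
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            };
            Thread w1 = new Thread(worker);
            Thread w2 = new Thread(worker);
            w1.start();
            w2.start();
            for (String item : new String[]{"a", "b", "c"}) {
                queue.put(item);
            }
            queue.put(STOP); //the "crawler" is done
            w1.join();
            w2.join();
        }
    }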
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
index 22889d73b..9b0f482f6 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
@@ -22,13 +22,10 @@ import java.sql.Types;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import java.util.concurrent.ArrayBlockingQueue;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
-import org.apache.tika.batch.FileResource;
+import org.apache.tika.eval.app.batch.FileResource;
import org.apache.tika.eval.app.db.ColInfo;
import org.apache.tika.eval.app.db.Cols;
import org.apache.tika.eval.app.db.TableInfo;
@@ -39,7 +36,7 @@ import org.apache.tika.eval.core.util.ContentTags;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-public class ExtractProfiler extends AbstractProfiler {
+public class ExtractProfiler extends ProfilerBase {
private final static String FIELD = "f";
public static TableInfo EXTRACT_EXCEPTION_TABLE =
@@ -76,56 +73,19 @@ public class ExtractProfiler extends AbstractProfiler {
new ColInfo(Cols.TAGS_PARSE_EXCEPTION, Types.BOOLEAN));
static Options OPTIONS;
-    static {
-        //By the time this commandline is parsed, there should be both an extracts and an inputDir
-        Option extracts = new Option("extracts", true, "directory for extract files");
-        extracts.setRequired(true);
-
-        Option inputDir = new Option("inputDir", true, "optional: directory for original binary input documents." + " If not specified, -extracts is crawled as is.");
-
-        OPTIONS = new Options()
-                .addOption(extracts)
-                .addOption(inputDir)
-                .addOption("bc", "optional: tika-batch config file")
-                .addOption("numConsumers", true, "optional: number of consumer threads")
-                .addOption(new Option("alterExtract", true,
-                        "for json-formatted extract files, " + "process full metadata list ('as_is'=default), " + "take just the first/container document ('first_only'), " +
-                                "concatenate all content into the first metadata item ('concatenate_content')"))
-                .addOption("minExtractLength", true, "minimum extract length to process (in bytes)")
-                .addOption("maxExtractLength", true, "maximum extract length to process (in bytes)")
-                .addOption("db", true, "db file to which to write results")
-                .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
-                .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
-                .addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
-                .addOption("drop", false, "drop tables if they exist")
-                .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
-                .addOption("maxTokens", true, "maximum tokens to process, default=200000")
-                .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
-                .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
-                .addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result")
-
-        ;
-
-    }
-
private final Path inputDir;
private final Path extracts;
private final ExtractReader extractReader;
-    public ExtractProfiler(ArrayBlockingQueue<FileResource> queue, Path inputDir, Path extracts, ExtractReader extractReader, IDBWriter dbWriter) {
-        super(queue, dbWriter);
+
+    ExtractProfiler(Path inputDir, Path extracts, ExtractReader extractReader, IDBWriter dbWriter) {
+        super(dbWriter);
this.inputDir = inputDir;
this.extracts = extracts;
this.extractReader = extractReader;
}
-    public static void USAGE() {
-        HelpFormatter helpFormatter = new HelpFormatter();
-        helpFormatter.printHelp(80, "java -jar tika-eval-x.y.jar Profile -extracts extracts -db mydb [-inputDir input]", "Tool: Profile", ExtractProfiler.OPTIONS,
-                "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
-    }
-
- @Override
public boolean processFileResource(FileResource fileResource) {
Metadata metadata = fileResource.getMetadata();
EvalFilePaths fps = null;
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
new file mode 100644
index 000000000..19a7d680f
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
@@ -0,0 +1,813 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.app;
+
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.sql.Types;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.lang3.mutable.MutableInt;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.eval.app.db.ColInfo;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.ExtractReaderException;
+import org.apache.tika.eval.app.io.IDBWriter;
+import org.apache.tika.eval.core.langid.LanguageIDWrapper;
+import org.apache.tika.eval.core.textstats.BasicTokenCountStatsCalculator;
+import org.apache.tika.eval.core.textstats.CommonTokens;
+import org.apache.tika.eval.core.textstats.CompositeTextStatsCalculator;
+import org.apache.tika.eval.core.textstats.ContentLengthCalculator;
+import org.apache.tika.eval.core.textstats.TextStatsCalculator;
+import org.apache.tika.eval.core.textstats.TokenEntropy;
+import org.apache.tika.eval.core.textstats.TokenLengths;
+import org.apache.tika.eval.core.textstats.TopNTokens;
+import org.apache.tika.eval.core.textstats.UnicodeBlockCounter;
+import org.apache.tika.eval.core.tokens.AnalyzerManager;
+import org.apache.tika.eval.core.tokens.CommonTokenCountManager;
+import org.apache.tika.eval.core.tokens.CommonTokenResult;
+import org.apache.tika.eval.core.tokens.TokenCounts;
+import org.apache.tika.eval.core.tokens.TokenIntPair;
+import org.apache.tika.eval.core.util.ContentTagParser;
+import org.apache.tika.eval.core.util.ContentTags;
+import org.apache.tika.eval.core.util.EvalExceptionUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.language.detect.LanguageResult;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PDF;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.apache.tika.utils.StringUtils;
+
+public abstract class ProfilerBase {
+
+ public static final String TRUE = Boolean.toString(true);
+ public static final String FALSE = Boolean.toString(false);
+ protected static final AtomicInteger ID = new AtomicInteger();
+    static final long NON_EXISTENT_FILE_LENGTH = -1L;
+    final static int FILE_PATH_MAX_LEN = 1024; //max len for varchar for file_path
+    //Container exception key from the 1.x branch
+    private static final Property CONTAINER_EXCEPTION_1X = Property.externalText("X-TIKA" + ":EXCEPTION:runtime");
+    private static final Logger LOG = LoggerFactory.getLogger(ProfilerBase.class);
+    private static final String[] EXTRACT_EXTENSIONS = {".json", ".txt", ""};
+    private static final String[] COMPRESSION_EXTENSIONS = {"", ".bz2", ".gzip", ".zip",};
+ private static final String ZERO = "0";
+ private static final String UNKNOWN_EXTENSION = "unk";
+ //make this configurable
+ private static final String DIGEST_KEY = "X-TIKA:digest:MD5";
+ private static final Map<String, Cols> UC_TAGS_OF_INTEREST = initTags();
+    private final static Pattern ACCESS_PERMISSION_EXCEPTION = Pattern.compile("org\\.apache\\.tika\\.exception\\.AccessPermissionException");
+    private final static Pattern ENCRYPTION_EXCEPTION = Pattern.compile("org\\.apache\\.tika\\.exception\\.EncryptedDocumentException");
+    public static TableInfo REF_EXTRACT_EXCEPTION_TYPES = new TableInfo("ref_extract_exception_types", new ColInfo(Cols.EXTRACT_EXCEPTION_ID, Types.INTEGER),
+            new ColInfo(Cols.EXTRACT_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128));
+    public static TableInfo REF_PARSE_ERROR_TYPES =
+            new TableInfo("ref_parse_error_types", new ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER), new ColInfo(Cols.PARSE_ERROR_DESCRIPTION, Types.VARCHAR, 128));
+    public static TableInfo REF_PARSE_EXCEPTION_TYPES =
+            new TableInfo("ref_parse_exception_types", new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER), new ColInfo(Cols.PARSE_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128));
+    public static TableInfo MIME_TABLE = new TableInfo("mimes", new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
+            new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12));
+    private static CommonTokenCountManager COMMON_TOKEN_COUNT_MANAGER;
+    private static Pattern FILE_NAME_CLEANER = Pattern.compile("\\.(json|txt)(\\.(bz2|gz|zip))?$");
+ private static LanguageIDWrapper LANG_ID = new LanguageIDWrapper();
+ protected IDBWriter writer;
+ AnalyzerManager analyzerManager;
+ int maxContentLength = 10000000;
+ int maxContentLengthForLangId = 50000;
+ int maxTokens = 200000;
+
+ CompositeTextStatsCalculator compositeTextStatsCalculator;
+ private String lastExtractExtension = null;
+
+ public ProfilerBase(IDBWriter writer) {
+ this.writer = writer;
+ LanguageIDWrapper.setMaxTextLength(maxContentLengthForLangId);
+        this.compositeTextStatsCalculator = initAnalyzersAndTokenCounter(maxTokens, LANG_ID);
+ }
+
+ private static Map<String, Cols> initTags() {
+ //simplify this mess
+ Map<String, Cols> tmp = new HashMap<>();
+ tmp.put("A", Cols.TAGS_A);
+ tmp.put("B", Cols.TAGS_B);
+ tmp.put("DIV", Cols.TAGS_DIV);
+ tmp.put("I", Cols.TAGS_I);
+ tmp.put("IMG", Cols.TAGS_IMG);
+ tmp.put("LI", Cols.TAGS_LI);
+ tmp.put("OL", Cols.TAGS_OL);
+ tmp.put("P", Cols.TAGS_P);
+ tmp.put("TABLE", Cols.TAGS_TABLE);
+ tmp.put("TD", Cols.TAGS_TD);
+ tmp.put("TITLE", Cols.TAGS_TITLE);
+ tmp.put("TR", Cols.TAGS_TR);
+ tmp.put("U", Cols.TAGS_U);
+ tmp.put("UL", Cols.TAGS_UL);
+ return Collections.unmodifiableMap(tmp);
+ }
+
+    /**
+     * @param p               path to the common_tokens directory. If this is null, try to load from classPath
+     * @param defaultLangCode this is the language code to use if a common_words list doesn't exist for the
+     *                        detected language; can be <code>null</code>
+     * @throws IOException
+     */
+    public static void loadCommonTokens(Path p, String defaultLangCode) throws IOException {
+        COMMON_TOKEN_COUNT_MANAGER = new CommonTokenCountManager(p, defaultLangCode);
+    }
+
+ private static String getFileName(String path) {
+ if (path == null) {
+ return "";
+ }
+ //filenameUtils checks for a null byte in the path.
+ //it will throw an IllegalArgumentException if there is a null byte.
+ //given that we're recording names and not using them on a file path
+ //we should ignore this.
+ try {
+ return FilenameUtils.getName(path);
+ } catch (IllegalArgumentException e) {
+ LOG.warn("{} in {}", e.getMessage(), path);
+ }
+ path = path.replaceAll("\u0000", " ");
+ try {
+ return FilenameUtils.getName(path);
+ } catch (IllegalArgumentException e) {
+ LOG.warn("Again: {} in {}", e.getMessage(), path);
+ }
+ //give up
+ return "";
+ }
+
+    /**
+     * Get the content and record in the data {@link Cols#CONTENT_TRUNCATED_AT_MAX_LEN} whether the string was truncated
+     *
+     * @param contentTags
+     * @param maxLength
+     * @param data
+     * @return
+     */
+    protected static String truncateContent(ContentTags contentTags, int maxLength, Map<Cols, String> data) {
+ data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "FALSE");
+ if (contentTags == null) {
+ return "";
+ }
+ String c = contentTags.getContent();
+ if (maxLength > -1 && c.length() > maxLength) {
+ c = c.substring(0, maxLength);
+ data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "TRUE");
+ }
+ return c;
+
+ }
+
+    protected static ContentTags getContent(EvalFilePaths evalFilePaths, Metadata metadata) {
+ if (metadata == null) {
+ return ContentTags.EMPTY_CONTENT_TAGS;
+ }
+ return parseContentAndTags(evalFilePaths, metadata);
+ }
+
+ /**
+ * @param list
+ * @return empty list if input list is empty or null
+ */
+ static List<Integer> countAttachments(List<Metadata> list) {
+ List<Integer> ret = new ArrayList<>();
+ if (list == null || list.size() == 0) {
+ return ret;
+ }
+ //container document attachment count = list.size()-1
+ ret.add(list.size() - 1);
+
+ Map<String, Integer> counts = new HashMap<>();
+ for (int i = 1; i < list.size(); i++) {
+ String path = list
+ .get(i)
+ .get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
+ if (path == null) {
+ //shouldn't ever happen
+ continue;
+ }
+ String[] parts = path.split("/");
+ StringBuilder parent = new StringBuilder();
+ for (int end = 1; end < parts.length - 1; end++) {
+ parent.setLength(0);
+ join("/", parent, parts, 1, end);
+ String parentPath = parent.toString();
+ Integer count = counts.get(parentPath);
+ if (count == null) {
+ count = 1;
+ } else {
+ count++;
+ }
+ counts.put(parentPath, count);
+ }
+ }
+
+ for (int i = 1; i < list.size(); i++) {
+ Integer count = counts.get(list
+ .get(i)
+ .get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+ if (count == null) {
+ count = 0;
+ }
+ ret.add(i, count);
+ }
+ return ret;
+
+
+ }
+
+    private static void join(String delimiter, StringBuilder sb, String[] parts, int start, int end) {
+ for (int i = start; i <= end; i++) {
+ sb.append(delimiter);
+ sb.append(parts[i]);
+ }
+ }
+
+    private static ContentTags parseContentAndTags(EvalFilePaths evalFilePaths, Metadata metadata) {
+ String s = metadata.get(TikaCoreProperties.TIKA_CONTENT);
+ if (s == null || s.isEmpty()) {
+ return ContentTags.EMPTY_CONTENT_TAGS;
+ }
+
+        String handlerClass = metadata.get(TikaCoreProperties.TIKA_CONTENT_HANDLER);
+ if (evalFilePaths
+ .getExtractFile()
+ .getFileName()
+ .toString()
+ .toLowerCase(Locale.ENGLISH)
+ .endsWith(".html")) {
+            try {
+                return ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet());
+            } catch (IOException | SAXException e) {
+                LOG.warn("Problem parsing html in {}; backing off to treat string as text", evalFilePaths
+                        .getExtractFile()
+                        .toAbsolutePath()
+                        .toString(), e);
+
+                return new ContentTags(s, true);
+            }
+        } else if (evalFilePaths
+                .getExtractFile()
+                .getFileName()
+                .toString()
+                .toLowerCase(Locale.ENGLISH)
+                .endsWith(".xhtml") || (handlerClass != null && handlerClass.equals(ToXMLContentHandler.class.getSimpleName()))) {
+            try {
+                return ContentTagParser.parseXML(s, UC_TAGS_OF_INTEREST.keySet());
+            } catch (TikaException | IOException | SAXException e) {
+                LOG.warn("Problem parsing xhtml in {}; backing off to html parser", evalFilePaths
+                        .getExtractFile()
+                        .toAbsolutePath()
+                        .toString(), e);
+                try {
+                    ContentTags contentTags = ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet());
+                    contentTags.setParseException(true);
+                    return contentTags;
+                } catch (IOException | SAXException e2) {
+                    LOG.warn("Problem parsing html in {}; backing off to treat string as text", evalFilePaths
+                            .getExtractFile()
+                            .toAbsolutePath()
+                            .toString(), e2);
+                }
+                return new ContentTags(s, true);
+            }
+        }
+ return new ContentTags(s);
+ }
+
+    private CompositeTextStatsCalculator initAnalyzersAndTokenCounter(int maxTokens, LanguageIDWrapper langIder) {
+ analyzerManager = AnalyzerManager.newInstance(maxTokens);
+ List<TextStatsCalculator> calculators = new ArrayList<>();
+ calculators.add(new CommonTokens(COMMON_TOKEN_COUNT_MANAGER));
+ calculators.add(new TokenEntropy());
+ calculators.add(new TokenLengths());
+ calculators.add(new TopNTokens(10));
+ calculators.add(new BasicTokenCountStatsCalculator());
+ calculators.add(new ContentLengthCalculator());
+ calculators.add(new UnicodeBlockCounter(maxContentLengthForLangId));
+
+        return new CompositeTextStatsCalculator(calculators, analyzerManager.getGeneralAnalyzer(), langIder);
+ }
+
+ /**
+ * Truncate the content string if greater than this length to this length
+ *
+ * @param maxContentLength
+ */
+ public void setMaxContentLength(int maxContentLength) {
+ this.maxContentLength = maxContentLength;
+ }
+
+ /**
+     * Truncate content string if greater than this length to this length for lang id
+ *
+ * @param maxContentLengthForLangId
+ */
+ public void setMaxContentLengthForLangId(int maxContentLengthForLangId) {
+ this.maxContentLengthForLangId = maxContentLengthForLangId;
+ LanguageIDWrapper.setMaxTextLength(maxContentLengthForLangId);
+ }
+
+ /**
+ * Add a LimitTokenCountFilterFactory if > -1
+ *
+ * @param maxTokens
+ */
+    public void setMaxTokens(int maxTokens) {
+        this.maxTokens = maxTokens;
+        this.compositeTextStatsCalculator = initAnalyzersAndTokenCounter(maxTokens, new LanguageIDWrapper());
+    }
+
+    protected void writeExtractException(TableInfo extractExceptionTable, String containerId, String filePath, ExtractReaderException.TYPE type) throws IOException {
+ Map<Cols, String> data = new HashMap<>();
+ data.put(Cols.CONTAINER_ID, containerId);
+ data.put(Cols.FILE_PATH, filePath);
+ data.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(type.ordinal()));
+ writer.writeRow(extractExceptionTable, data);
+
+ }
+
+    protected void writeProfileData(EvalFilePaths fps, int i, ContentTags contentTags, Metadata m, String fileId, String containerId, List<Integer> numAttachments,
+                                    TableInfo profileTable) {
+
+ Map<Cols, String> data = new HashMap<>();
+ data.put(Cols.ID, fileId);
+ data.put(Cols.CONTAINER_ID, containerId);
+ data.put(Cols.MD5, m.get(DIGEST_KEY));
+
+        if (i < numAttachments.size()) {
+            data.put(Cols.NUM_ATTACHMENTS, Integer.toString(numAttachments.get(i)));
+        }
+        data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m));
+        data.put(Cols.NUM_METADATA_VALUES, Integer.toString(countMetadataValues(m)));
+
+ Integer nPages = m.getInt(PagedText.N_PAGES);
+ if (nPages != null) {
+ data.put(Cols.NUM_PAGES, Integer.toString(nPages));
+ }
+ Integer nOCRPages = m.getInt(PDF.OCR_PAGE_COUNT);
+ if (nOCRPages != null) {
+ data.put(Cols.NUM_OCR_PAGES, Integer.toString(nOCRPages));
+ }
+
+ //if the outer wrapper document
+ if (i == 0) {
+ data.put(Cols.IS_EMBEDDED, FALSE);
+ data.put(Cols.FILE_NAME, fps
+ .getRelativeSourceFilePath()
+ .getFileName()
+ .toString());
+ data.put(Cols.EMBEDDED_DEPTH, "0");
+ } else {
+ data.put(Cols.IS_EMBEDDED, TRUE);
+            String embeddedFilePath = m.get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH);
+            if (!StringUtils.isBlank(embeddedFilePath)) {
+                data.put(Cols.FILE_NAME, getFileName(embeddedFilePath));
+                data.put(Cols.EMBEDDED_FILE_PATH, embeddedFilePath);
+            }
+            if (!StringUtils.isBlank(m.get(TikaCoreProperties.EMBEDDED_DEPTH))) {
+                data.put(Cols.EMBEDDED_DEPTH, m.get(TikaCoreProperties.EMBEDDED_DEPTH));
+            }
+            if (!StringUtils.isBlank(m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE))) {
+                data.put(Cols.ATTACHMENT_TYPE, m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+            }
+ }
+ String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME));
+ ext = (ext == null) ? "" : ext.toLowerCase(Locale.US);
+ data.put(Cols.FILE_EXTENSION, ext);
+ long srcFileLen = getSourceFileLength(m);
+ if (srcFileLen > NON_EXISTENT_FILE_LENGTH) {
+ data.put(Cols.LENGTH, Long.toString(srcFileLen));
+ } else {
+ data.put(Cols.LENGTH, "");
+ }
+        int numMetadataValues = countMetadataValues(m);
+        data.put(Cols.NUM_METADATA_VALUES, Integer.toString(numMetadataValues));
+
+        data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m));
+
+ String content = contentTags.getContent();
+ if (content == null || content.isBlank()) {
+ data.put(Cols.HAS_CONTENT, FALSE);
+ } else {
+ data.put(Cols.HAS_CONTENT, TRUE);
+ }
+ getFileTypes(m, data);
+ try {
+ writer.writeRow(profileTable, data);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+    protected void writeExceptionData(String fileId, Metadata m, TableInfo exceptionTable) {
+ Map<Cols, String> data = new HashMap<>();
+ getExceptionStrings(m, data);
+ if (data
+ .keySet()
+ .size() > 0) {
+ try {
+ data.put(Cols.ID, fileId);
+ writer.writeRow(exceptionTable, data);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ protected Map<Class, Object> calcTextStats(ContentTags contentTags) {
+/* if (contentTags == ContentTags.EMPTY_CONTENT_TAGS) {
+ return Collections.EMPTY_MAP;
+ }*/
+ Map<Cols, String> data = new HashMap<>();
+ String content = truncateContent(contentTags, maxContentLength, data);
+ if (content == null || content.isBlank()) {
+ content = "";
+ }
+ return compositeTextStatsCalculator.calculate(content);
+ }
+
+    /**
+     * Checks to see if metadata is null or content is empty (null or only whitespace).
+     * If any of these, then this does no processing, and the fileId is not
+     * entered into the content table.
+     *
+     * @param fileId
+     * @param textStats
+     * @param contentsTable
+     */
+    protected void writeContentData(String fileId, Map<Class, Object> textStats, TableInfo contentsTable) throws IOException {
+ Map<Cols, String> data = new HashMap<>();
+ data.put(Cols.ID, fileId);
+ if (textStats.containsKey(ContentLengthCalculator.class)) {
+ int length = (int) textStats.get(ContentLengthCalculator.class);
+ if (length == 0) {
+ return;
+ }
+ data.put(Cols.CONTENT_LENGTH, Integer.toString(length));
+ }
+ langid(textStats, data);
+
+ writeTokenCounts(textStats, data);
+        CommonTokenResult commonTokenResult = (CommonTokenResult) textStats.get(CommonTokens.class);
+        if (commonTokenResult != null) {
+            data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode());
+            data.put(Cols.NUM_UNIQUE_COMMON_TOKENS, Integer.toString(commonTokenResult.getUniqueCommonTokens()));
+            data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getCommonTokens()));
+            data.put(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS, Integer.toString(commonTokenResult.getUniqueAlphabeticTokens()));
+            data.put(Cols.NUM_ALPHABETIC_TOKENS, Integer.toString(commonTokenResult.getAlphabeticTokens()));
+            double oov = commonTokenResult.getAlphabeticTokens() > 0 ? commonTokenResult.getOOV() : -1.0;
+            data.put(Cols.OOV, Double.toString(oov));
+ }
+        TokenCounts tokenCounts = (TokenCounts) textStats.get(BasicTokenCountStatsCalculator.class);
+        if (tokenCounts != null) {
+
+            data.put(Cols.NUM_UNIQUE_TOKENS, Integer.toString(tokenCounts.getTotalUniqueTokens()));
+            data.put(Cols.NUM_TOKENS, Integer.toString(tokenCounts.getTotalTokens()));
+        }
+        if (textStats.get(TokenEntropy.class) != null) {
+            data.put(Cols.TOKEN_ENTROPY_RATE, Double.toString((Double) textStats.get(TokenEntropy.class)));
+        }
+
+
+        SummaryStatistics summStats = (SummaryStatistics) textStats.get(TokenLengths.class);
+        if (summStats != null) {
+            data.put(Cols.TOKEN_LENGTH_SUM, Integer.toString((int) summStats.getSum()));
+
+            data.put(Cols.TOKEN_LENGTH_MEAN, Double.toString(summStats.getMean()));
+
+            data.put(Cols.TOKEN_LENGTH_STD_DEV, Double.toString(summStats.getStandardDeviation()));
+        }
+ unicodeBlocks(textStats, data);
+ try {
+ writer.writeRow(contentsTable, data);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+    void writeTagData(String fileId, ContentTags contentTags, TableInfo tagsTable) {
+        Map<String, Integer> tags = contentTags.getTags();
+        if (tags.size() == 0 && !contentTags.getParseException()) {
+ return;
+ }
+ Map<Cols, String> data = new HashMap<>();
+ data.put(Cols.ID, fileId);
+
+ for (Map.Entry<String, Cols> e : UC_TAGS_OF_INTEREST.entrySet()) {
+ Integer count = tags.get(e.getKey());
+ if (count == null) {
+ data.put(e.getValue(), ZERO);
+ } else {
+ data.put(e.getValue(), Integer.toString(count));
+ }
+ }
+
+ if (contentTags.getParseException()) {
+ data.put(Cols.TAGS_PARSE_EXCEPTION, TRUE);
+ } else {
+ data.put(Cols.TAGS_PARSE_EXCEPTION, FALSE);
+ }
+ try {
+ writer.writeRow(tagsTable, data);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ String getTime(Metadata m) {
+ String elapsed = "-1";
+
+ String v = m.get(TikaCoreProperties.PARSE_TIME_MILLIS);
+ if (v != null) {
+ return v;
+ }
+ return elapsed;
+ }
+
+ int countMetadataValues(Metadata m) {
+ if (m == null) {
+ return 0;
+ }
+ int i = 0;
+ for (String n : m.names()) {
+ i += m.getValues(n).length;
+ }
+ return i;
+ }
+
+ void getExceptionStrings(Metadata metadata, Map<Cols, String> data) {
+
+        String fullTrace = metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION);
+ if (fullTrace == null) {
+ fullTrace = metadata.get(CONTAINER_EXCEPTION_1X);
+ }
+
+ if (fullTrace == null) {
+ fullTrace = metadata.get(TikaCoreProperties.EMBEDDED_EXCEPTION);
+ }
+
+ if (fullTrace != null) {
+ //check for "expected" exceptions...exceptions
+ //that can't be fixed.
+ //Do not store trace for "expected" exceptions
+
+            Matcher matcher = ACCESS_PERMISSION_EXCEPTION.matcher(fullTrace);
+            if (matcher.find()) {
+                data.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal()));
+                return;
+            }
+            matcher = ENCRYPTION_EXCEPTION.matcher(fullTrace);
+            if (matcher.find()) {
+                data.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(EXCEPTION_TYPE.ENCRYPTION.ordinal()));
+                return;
+            }
+
+            data.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(EXCEPTION_TYPE.RUNTIME.ordinal()));
+
+ data.put(Cols.ORIG_STACK_TRACE, fullTrace);
+            //TikaExceptions can have object ids, as in the "@2b1ea6ee" in:
+            //org.apache.tika.exception.TikaException: TIKA-198: Illegal
+            //IOException from org.apache.tika.parser.microsoft.OfficeParser@2b1ea6ee
+            //For reporting purposes, let's snip off the object id so that we can more
+            //easily count exceptions.
+            String sortTrace = EvalExceptionUtils.normalize(fullTrace);
+            data.put(Cols.SORT_STACK_TRACE, sortTrace);
+ }
+ }
+
+ void unicodeBlocks(Map<Class, Object> tokenStats, Map<Cols, String> data) {
+
+        Map<String, MutableInt> blocks = (Map<String, MutableInt>) tokenStats.get(UnicodeBlockCounter.class);
+ List<Pair<String, Integer>> pairs = new ArrayList<>();
+ for (Map.Entry<String, MutableInt> e : blocks.entrySet()) {
+ pairs.add(Pair.of(e.getKey(), e
+ .getValue()
+ .intValue()));
+ }
+ pairs.sort((o1, o2) -> o2
+ .getValue()
+ .compareTo(o1.getValue()));
+ StringBuilder sb = new StringBuilder();
+
+ for (int i = 0; i < 20 && i < pairs.size(); i++) {
+ if (i > 0) {
+ sb.append(" | ");
+ }
+ sb
+ .append(pairs
+ .get(i)
+ .getKey())
+ .append(": ")
+ .append(pairs
+ .get(i)
+ .getValue());
+ }
+ data.put(Cols.UNICODE_CHAR_BLOCKS, sb.toString());
+ }
+
+ void langid(Map<Class, Object> stats, Map<Cols, String> data) {
+        List<LanguageResult> probabilities = (List<LanguageResult>) stats.get(LanguageIDWrapper.class);
+
+ if (probabilities.size() > 0) {
+ data.put(Cols.LANG_ID_1, probabilities
+ .get(0)
+ .getLanguage());
+ data.put(Cols.LANG_ID_PROB_1, Double.toString(probabilities
+ .get(0)
+ .getRawScore()));
+ }
+ if (probabilities.size() > 1) {
+ data.put(Cols.LANG_ID_2, probabilities
+ .get(1)
+ .getLanguage());
+ data.put(Cols.LANG_ID_PROB_2, Double.toString(probabilities
+ .get(1)
+ .getRawScore()));
+ }
+ }
+
+ void getFileTypes(Metadata metadata, Map<Cols, String> output) {
+ if (metadata == null) {
+ return;
+ }
+ String type = metadata.get(Metadata.CONTENT_TYPE);
+ if (type == null) {
+ return;
+ }
+ int mimeId = writer.getMimeId(type);
+ output.put(Cols.MIME_ID, Integer.toString(mimeId));
+ }
+
+    void writeTokenCounts(Map<Class, Object> textStats, Map<Cols, String> data) {
+        TokenIntPair[] tokenIntPairs = (TokenIntPair[]) textStats.get(TopNTokens.class);
+ int i = 0;
+ StringBuilder sb = new StringBuilder();
+ for (TokenIntPair t : tokenIntPairs) {
+ if (i++ > 0) {
+ sb.append(" | ");
+ }
+ sb
+ .append(t.getToken())
+ .append(": ")
+ .append(t.getValue());
+ }
+
+ data.put(Cols.TOP_N_TOKENS, sb.toString());
+ }
+
+ public void closeWriter() throws IOException {
+ writer.close();
+ }
+
+ /**
+ * @param metadata
+ * @param extracts
+ * @return evalfilepaths for files if crawling an extract directory
+ */
+    protected EvalFilePaths getPathsFromExtractCrawl(Metadata metadata, Path extracts) {
+ String relExtractFilePath = metadata.get(FSProperties.FS_REL_PATH);
+ Matcher m = FILE_NAME_CLEANER.matcher(relExtractFilePath);
+ Path relativeSourceFilePath = Paths.get(m.replaceAll(""));
+ //just try slapping the relextractfilepath on the extractdir
+ Path extractFile = extracts.resolve(relExtractFilePath);
+        if (!Files.isRegularFile(extractFile)) {
+            //if that doesn't work, try to find the right extract file.
+            //This is necessary if crawling extractsA and trying to find a file in
+            //extractsB that is not in the same format: json vs txt or compressed
+            extractFile = findFile(extracts, relativeSourceFilePath);
+        }
+ return new EvalFilePaths(relativeSourceFilePath, extractFile);
+ }
+
+ //call this if the crawler is crawling through the src directory
+    protected EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir, Path extracts) {
+        Path relativeSourceFilePath = Paths.get(metadata.get(FSProperties.FS_REL_PATH));
+ Path extractFile = findFile(extracts, relativeSourceFilePath);
+ Path inputFile = srcDir.resolve(relativeSourceFilePath);
+        long srcLen = -1L;
+ //try to get the length of the source file in case there was an error
+ //in both extracts
+ try {
+ srcLen = Files.size(inputFile);
+ } catch (IOException e) {
+            LOG.warn("Couldn't get length for: {}", inputFile.toAbsolutePath());
+ }
+ return new EvalFilePaths(relativeSourceFilePath, extractFile, srcLen);
+ }
+
+ /**
+ * @param extractRootDir
+ * @param relativeSourceFilePath
+ * @return extractFile or null if couldn't find one.
+ */
+ private Path findFile(Path extractRootDir, Path relativeSourceFilePath) {
+ String relSrcFilePathString = relativeSourceFilePath.toString();
+ if (lastExtractExtension != null) {
+            Path candidate = extractRootDir.resolve(relSrcFilePathString + lastExtractExtension);
+ if (Files.isRegularFile(candidate)) {
+ return candidate;
+ }
+ }
+ for (String ext : EXTRACT_EXTENSIONS) {
+ for (String compress : COMPRESSION_EXTENSIONS) {
+                Path candidate = extractRootDir.resolve(relSrcFilePathString + ext + compress);
+ if (Files.isRegularFile(candidate)) {
+ lastExtractExtension = ext + compress;
+ return candidate;
+ }
+ }
+ }
+ return null;
+ }
+
+    protected long getSourceFileLength(EvalFilePaths fps, List<Metadata> metadataList) {
+ if (fps.getSourceFileLength() > NON_EXISTENT_FILE_LENGTH) {
+ return fps.getSourceFileLength();
+ }
+ return getSourceFileLength(metadataList);
+ }
+
+ long getSourceFileLength(List<Metadata> metadataList) {
+ if (metadataList == null || metadataList.size() < 1) {
+ return NON_EXISTENT_FILE_LENGTH;
+ }
+ return getSourceFileLength(metadataList.get(0));
+ }
+
+ long getSourceFileLength(Metadata m) {
+ String lenString = m.get(Metadata.CONTENT_LENGTH);
+ if (lenString == null) {
+ return NON_EXISTENT_FILE_LENGTH;
+ }
+ try {
+ return Long.parseLong(lenString);
+ } catch (NumberFormatException e) {
+ //swallow
+ }
+ return NON_EXISTENT_FILE_LENGTH;
+ }
+
+ protected long getFileLength(Path p) {
+ if (p != null && Files.isRegularFile(p)) {
+ try {
+ return Files.size(p);
+ } catch (IOException e) {
+ //swallow
+ }
+ }
+ return NON_EXISTENT_FILE_LENGTH;
+ }
+
+ public enum EXCEPTION_TYPE {
+ RUNTIME, ENCRYPTION, ACCESS_PERMISSION, UNSUPPORTED_VERSION,
+ }
+
+ /**
+ * If information was gathered from the log file about
+ * a parse error
+ */
+ public enum PARSE_ERROR_TYPE {
+ OOM, TIMEOUT
+ }
+
+
+}
+
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/StatusReporter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/StatusReporter.java
new file mode 100644
index 000000000..8df9824ed
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/StatusReporter.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+import java.text.NumberFormat;
+import java.util.Locale;
+import java.util.concurrent.Callable;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.util.DurationFormatUtils;
+
+public class StatusReporter implements Callable<Integer> {
+
+ public static final int COMPLETED_VAL = 3;
+    private static final Logger LOGGER = LoggerFactory.getLogger(StatusReporter.class);
+ private final AtomicInteger filesQueued;
+ private final AtomicInteger filesProcessed;
+ private final AtomicInteger activeWorkers;
+ private final AtomicBoolean crawlerIsActive;
+ private final long start;
+    private final NumberFormat numberFormat = NumberFormat.getNumberInstance(Locale.ROOT);
+
+
+    public StatusReporter(AtomicInteger filesQueued, AtomicInteger filesProcessed, AtomicInteger activeWorkers, AtomicBoolean crawlerIsActive) {
+ this.filesQueued = filesQueued;
+ this.filesProcessed = filesProcessed;
+ this.activeWorkers = activeWorkers;
+ this.crawlerIsActive = crawlerIsActive;
+ this.start = System.currentTimeMillis();
+ }
+
+ @Override
+ public Integer call() throws Exception {
+ while (true) {
+
+ try {
+ Thread.sleep(1000);
+ } catch (InterruptedException e) {
+ LOGGER.info("Interrupted?");
+ //expected
+ return COMPLETED_VAL;
+ }
+ report();
+ if (activeWorkers.get() == 0) {
+ LOGGER.info("Completed successfully.");
+ return COMPLETED_VAL;
+ }
+ }
+ }
+
+ private void report() {
+ int cnt = filesProcessed.get();
+ long elapsed = System.currentTimeMillis() - start;
+ double elapsedSecs = (double) elapsed / (double) 1000;
+        int avg = (elapsedSecs > 5 || cnt > 100) ? (int) ((double) cnt / elapsedSecs) : -1;
+
+        String elapsedString = DurationFormatUtils.formatMillis(System.currentTimeMillis() - start);
+        String docsPerSec = avg > -1 ? String.format(Locale.ROOT, " (%s docs per sec)", numberFormat.format(avg)) : "";
+        String msg = String.format(Locale.ROOT, "Processed %s documents in %s%s.", numberFormat.format(cnt), elapsedString, docsPerSec);
+ LOGGER.info(msg);
+
+ int stillAlive = activeWorkers.get();
+ if (stillAlive == 1) {
+ msg = "There is one file processor still active.";
+ } else {
+            msg = "There are " + numberFormat.format(stillAlive) + " file processors still active.";
+ }
+ LOGGER.info(msg);
+
+ int queued = filesQueued.get();
+
+ if (queued == 1) {
+ msg = "The crawler has enqueued 1 file.";
+ } else {
+            msg = "The crawler has enqueued " + numberFormat.format(queued) + " files.";
+ }
+ LOGGER.info(msg);
+
+ if (! crawlerIsActive.get()) {
+ msg = "The directory crawler has completed its crawl.\n";
+ LOGGER.info(msg);
+ }
+ }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
index a897461ee..91aecd832 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
@@ -117,7 +117,7 @@ public class TikaEvalCLI {
            CommandLine commandLine = defaultCLIParser.parse(FileProfiler.OPTIONS, updatedArgs);
            if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
                System.out.println("Please specify either the default -db or the full -jdbc, not both");
- ExtractProfiler.USAGE();
+ FileProfiler.USAGE();
return;
}
} catch (ParseException e) {
@@ -154,109 +154,7 @@ public class TikaEvalCLI {
}
private void handleProfile(String[] subsetArgs) throws Exception {
- List<String> argList = new ArrayList(Arrays.asList(subsetArgs));
-
- boolean containsBC = false;
- String inputDir = null;
- String extracts = null;
- String alterExtract = null;
- //confirm there's a batch-config file
- for (int i = 0; i < argList.size(); i++) {
- String arg = argList.get(i);
- switch (arg) {
- case "-bc":
- containsBC = true;
- break;
- case "-inputDir":
- if (i + 1 >= argList.size()) {
-                        System.err.println("Must specify directory after -inputDir");
- ExtractProfiler.USAGE();
- return;
- }
- inputDir = argList.get(i + 1);
- i++;
- break;
- case "-extracts":
- if (i + 1 >= argList.size()) {
- System.err.println("Must specify directory after
-extracts");
- ExtractProfiler.USAGE();
- return;
- }
- extracts = argList.get(i + 1);
- i++;
- break;
- case "-alterExtract":
- if (i + 1 >= argList.size()) {
- System.err.println("Must specify type 'as_is',
'first_only' or " + "'concatenate_content' after -alterExtract");
- ExtractComparer.USAGE();
- return;
- }
- alterExtract = argList.get(i + 1);
- i++;
- break;
- }
- }
-
- if (alterExtract != null && !alterExtract.equals("as_is") &&
!alterExtract.equals("concatenate_content") &&
!alterExtract.equals("first_only")) {
- System.out.println("Sorry, I don't understand:" + alterExtract +
". The values must be one of: as_is, first_only, concatenate_content");
- ExtractProfiler.USAGE();
- return;
- }
-
- //need to specify each in this commandline
- //if only extracts is passed to tika-batch,
- //the crawler will see no inputDir and start crawling "input".
- //this allows the user to specify either extracts or inputDir
- if (extracts == null && inputDir != null) {
- argList.add("-extracts");
- argList.add(inputDir);
- } else if (inputDir == null && extracts != null) {
- argList.add("-inputDir");
- argList.add(extracts);
- }
-
- Path tmpBCConfig = null;
- try {
- tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
- if (!containsBC) {
- try (InputStream is = this
- .getClass()
-
.getResourceAsStream("/tika-eval-profiler-config.xml")) {
- Files.copy(is, tmpBCConfig,
StandardCopyOption.REPLACE_EXISTING);
- }
- argList.add("-bc");
- argList.add(tmpBCConfig
- .toAbsolutePath()
- .toString());
- }
-
- String[] updatedArgs = argList.toArray(new String[0]);
- DefaultParser defaultCLIParser = new DefaultParser();
- try {
- CommandLine commandLine =
defaultCLIParser.parse(ExtractProfiler.OPTIONS, updatedArgs);
- if (commandLine.hasOption("db") &&
commandLine.hasOption("jdbc")) {
- System.out.println("Please specify either the default -db
or the full -jdbc, not both");
- ExtractProfiler.USAGE();
- return;
- }
- } catch (ParseException e) {
- System.out.println(e.getMessage() + "\n");
- ExtractProfiler.USAGE();
- return;
- }
-
- // lazy delete because main() calls System.exit()
- if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
- tmpBCConfig
- .toFile()
- .deleteOnExit();
- }
- FSBatchProcessCLI.main(updatedArgs);
- } finally {
- if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
- Files.delete(tmpBCConfig);
- }
- }
+ ExtractProfileRunner.main(subsetArgs);
}
private void handleCompare(String[] subsetArgs) throws Exception {
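
For reference: the Profile command above now delegates directly to ExtractProfileRunner. Based on the flags exercised in ProfilerBatchTest later in this commit, a direct invocation looks roughly like the following sketch; the paths are illustrative only:

    //hypothetical paths -- mirrors the flags used in ProfilerBatchTest
    String[] args = new String[]{
            "-i", "/data/raw_input",           //directory of original input files
            "-e", "/data/extracts",            //directory of extracts to profile
            "-d", "jdbc:h2:file:/data/mydb"    //where to write the profiling tables
    };
    ExtractProfileRunner.main(args);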
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractProfilerBuilder.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractProfilerBuilder.java
deleted file mode 100644
index c59d53016..000000000
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractProfilerBuilder.java
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.app.batch;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.eval.app.AbstractProfiler;
-import org.apache.tika.eval.app.ExtractProfiler;
-import org.apache.tika.eval.app.db.TableInfo;
-import org.apache.tika.util.PropsUtil;
-
-
-public class ExtractProfilerBuilder extends EvalConsumerBuilder {
-
- public final static String TABLE_PREFIX_KEY = "tablePrefix";
-
- private final List<TableInfo> tableInfos;
- private final List<TableInfo> refTableInfos;
-
- public ExtractProfilerBuilder() {
- List<TableInfo> tableInfos = new ArrayList();
- tableInfos.add(AbstractProfiler.MIME_TABLE);
- tableInfos.add(ExtractProfiler.CONTAINER_TABLE);
- tableInfos.add(ExtractProfiler.PROFILE_TABLE);
- tableInfos.add(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
- tableInfos.add(ExtractProfiler.EXCEPTION_TABLE);
- tableInfos.add(ExtractProfiler.CONTENTS_TABLE);
- tableInfos.add(ExtractProfiler.TAGS_TABLE);
- tableInfos.add(ExtractProfiler.EMBEDDED_FILE_PATH_TABLE);
- this.tableInfos = Collections.unmodifiableList(tableInfos);
-
- List<TableInfo> refTableInfos = new ArrayList<>();
- refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
- refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
- refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES);
- this.refTableInfos = Collections.unmodifiableList(refTableInfos);
- }
-
- @Override
- public FileResourceConsumer build() throws IOException, SQLException {
- Path extracts = PropsUtil.getPath(localAttrs.get("extracts"), null);
- if (extracts == null) {
- throw new RuntimeException("Must specify \"extracts\" -- directory
to crawl");
- }
- if (!Files.isDirectory(extracts)) {
- throw new RuntimeException("ROOT DIRECTORY DOES NOT EXIST: " +
extracts.toAbsolutePath());
- }
-
- Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
-
- //we _could_ set this to extracts (if not null)
- //here, but the Crawler defaults to "input" if nothing is passed
- //so this won't work
- if (inputDir == null) {
- throw new RuntimeException("Must specify -inputDir");
- }
- if (extracts == null && inputDir != null) {
- extracts = inputDir;
- }
-        return parameterizeProfiler(new ExtractProfiler(queue, inputDir, extracts, buildExtractReader(localAttrs), getDBWriter(tableInfos)));
- }
-
-
- @Override
- protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
- String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY);
- if (tableNamePrefix != null && !tableNamePrefix.equals("null")) {
- for (TableInfo tableInfo : tableInfos) {
- tableInfo.setNamePrefix(tableNamePrefix);
- }
- }
- }
-
-
- @Override
- protected List<TableInfo> getRefTableInfos() {
- return refTableInfos;
- }
-
- @Override
- protected List<TableInfo> getNonRefTableInfos() {
- return tableInfos;
- }
-
- @Override
- protected TableInfo getMimeTable() {
- return AbstractProfiler.MIME_TABLE;
- }
-
- @Override
- protected void addErrorLogTablePairs(DBConsumersManager manager) {
-        Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null);
- if (errorLog == null) {
- return;
- }
-        manager.addErrorLogTablePair(errorLog, ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
- }
-}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileResource.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileResource.java
new file mode 100644
index 000000000..e4702cb8e
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileResource.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.batch;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+
+
+/**
+ * This is a basic interface to handle a logical "file".
+ * This should enable code-agnostic handling of files from different
+ * sources: file system, database, etc.
+ */
+public interface FileResource {
+
+ //The literal lowercased extension of a file. This may or may not
+ //have any relationship to the actual type of the file.
+    public static final Property FILE_EXTENSION = Property.internalText("tika:file_ext");
+
+ /**
+ * This is only used in logging to identify which file
+ * may have caused problems. While it is probably best
+ * to use unique ids for the sake of debugging, it is not
+ * necessary that the ids be unique. This id
+ * is never used as a hashkey by the batch processors, for example.
+ *
+ * @return an id for a FileResource
+ */
+ public String getResourceId();
+
+ /**
+ * This gets the metadata available before the parsing of the file.
+ * This will typically be "external" metadata: file name,
+ * file size, file location, data stream, etc. That is, things
+ * that are known about the file from outside information, not
+ * file-internal metadata.
+ *
+ * @return Metadata
+ */
+ public Metadata getMetadata();
+
+    /**
+     * @return an InputStream for the FileResource
+     * @throws java.io.IOException if the stream cannot be opened
+     */
+    public InputStream openInputStream() throws IOException;
+
+}
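
The interface is deliberately source-agnostic ("file system, database, etc."). As a purely hypothetical illustration -- not part of this commit -- a resource backed by an in-memory byte array could look like this:

    //Hypothetical example only; shows a non-filesystem FileResource,
    //e.g. content fetched from a database row.
    import java.io.ByteArrayInputStream;
    import java.io.InputStream;

    import org.apache.tika.metadata.Metadata;

    public class BytesResource implements FileResource {

        private final byte[] bytes;
        private final String resourceId;
        private final Metadata metadata = new Metadata();

        public BytesResource(byte[] bytes, String resourceId) {
            this.bytes = bytes;
            this.resourceId = resourceId;
        }

        @Override
        public String getResourceId() {
            return resourceId;
        }

        @Override
        public Metadata getMetadata() {
            return metadata;
        }

        @Override
        public InputStream openInputStream() {
            return new ByteArrayInputStream(bytes);
        }
    }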
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/PathResource.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/PathResource.java
new file mode 100644
index 000000000..20f67798a
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/PathResource.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.batch;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.metadata.Metadata;
+
+public class PathResource implements FileResource {
+
+ private final Path path;
+ private final String resourceId;
+    private final Metadata metadata = new Metadata();
+
+    public PathResource(Path path, String resourceId) {
+        this.path = path;
+        this.resourceId = resourceId;
+        metadata.set(FSProperties.FS_REL_PATH, resourceId);
+    }
+
+    @Override
+ public String getResourceId() {
+ return resourceId;
+ }
+
+ @Override
+ public Metadata getMetadata() {
+ return metadata;
+ }
+
+ @Override
+ public InputStream openInputStream() throws IOException {
+ return Files.newInputStream(path);
+ }
+}
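
A quick usage sketch for the new class; the path and resource id below are hypothetical:

    import java.io.InputStream;
    import java.nio.file.Path;
    import java.nio.file.Paths;

    public class PathResourceDemo {
        public static void main(String[] args) throws Exception {
            Path p = Paths.get("/data/extracts/file1.pdf.json"); //hypothetical extract
            FileResource resource = new PathResource(p, "file1.pdf.json");
            try (InputStream is = resource.openInputStream()) {
                //the resource id is used only for logging, never as a key
                System.out.println("opened " + resource.getResourceId());
            }
        }
    }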
diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/EvalConfigTest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/EvalConfigTest.java
new file mode 100644
index 000000000..395c90fe6
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/EvalConfigTest.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.app;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.net.URISyntaxException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import org.junit.jupiter.api.Test;
+
+public class EvalConfigTest {
+
+ @Test
+ public void testBasic() throws Exception {
+        EvalConfig evalConfig = EvalConfig.load(getConfig("eval-config-basic.json"));
+ assertEquals(20000, evalConfig.getMaxExtractLength());
+ assertNull(evalConfig.getErrorLogFile());
+ assertNull(evalConfig.getJdbcString());
+ }
+
+ private Path getConfig(String fileName) throws URISyntaxException {
+        return Paths.get(EvalConfigTest.class.getResource("/eval-configs/" + fileName).toURI());
+ }
+}
diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/ProfilerBatchTest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/ProfilerBatchTest.java
index be58e0ed2..3d6e93ad3 100644
--- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/ProfilerBatchTest.java
+++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/ProfilerBatchTest.java
@@ -21,7 +21,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
-import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
@@ -30,79 +29,75 @@ import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
-import org.junit.jupiter.api.AfterAll;
+import org.apache.commons.io.FileUtils;
+import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.apache.tika.eval.app.db.Cols;
import org.apache.tika.eval.app.db.H2Util;
import org.apache.tika.eval.app.db.TableInfo;
-import org.apache.tika.eval.app.io.ExtractReaderException;
-@Disabled
public class ProfilerBatchTest {
-    public final static String COMPARER_PROCESS_CLASS = "org.apache.tika.batch.fs.FSBatchProcessCLI";
-    private final static String profileTable = ExtractProfiler.PROFILE_TABLE.getName();
-    private final static String exTable = ExtractProfiler.EXCEPTION_TABLE.getName();
- private final static String fpCol = Cols.FILE_PATH.name();
- private static Path dbDir;
- private static Connection conn;
+ private static Connection CONN;
+ private static Path DB_DIR;
+ private static Path DB;
@BeforeAll
public static void setUp() throws Exception {
+ DB_DIR = Files.createTempDirectory("profiler-test");
+ Path extractsRoot = Paths.get(ComparerBatchTest.class
+ .getResource("/test-dirs/extractsA")
+ .toURI());
Path inputRoot = Paths.get(ComparerBatchTest.class
- .getResource("/test-dirs/extractsA")
+ .getResource("/test-dirs/raw_input")
.toURI());
- dbDir = Files.createTempDirectory(inputRoot, "tika-test-db-dir-");
- Map<String, String> args = new HashMap<>();
- Path db = dbDir.resolve("profiler_test");
- args.put("-db", db.toString());
-
- //for debugging, you can use this to select only one file pair to load
- //args.put("-includeFilePat", "file8.*");
-
-        /* BatchProcessTestExecutor ex = new BatchProcessTestExecutor(COMPARER_PROCESS_CLASS, args,
- "/single-file-profiler-crawl-input-config.xml");
- StreamStrings streamStrings = ex.execute();
- System.out.println(streamStrings.getErrString());
- System.out.println(streamStrings.getOutString());*/
- H2Util dbUtil = new H2Util(db);
- conn = dbUtil.getConnection();
- }
- @AfterAll
- public static void tearDown() throws IOException {
+ DB = DB_DIR.resolve("mydb");
+ String[] args = new String[]{
+ "-i", inputRoot.toAbsolutePath().toString(),
+ "-e", extractsRoot.toAbsolutePath().toString(),
+ "-d", "jdbc:h2:file:" + DB.toAbsolutePath().toString()
+ };
+
+ ExtractProfileRunner.main(args);
+ }
+    @AfterAll
+    public static void tearDown() throws IOException {
-        try {
-            conn.close();
-        } catch (SQLException e) {
-            throw new RuntimeException(e);
-        }
-        //TODO: if/when we turn this back on, use @TempDir instead of this
+        //the per-test connection is closed in tearDownEach()
+        FileUtils.deleteDirectory(DB_DIR.toFile());
-        DirectoryStream<Path> dStream = Files.newDirectoryStream(dbDir);
-        for (Path p : dStream) {
-            Files.delete(p);
-        }
-        dStream.close();
-        Files.delete(dbDir);
+    }
+
+ @BeforeEach
+ public void setUpEach() throws SQLException {
+ H2Util dbUtil = new H2Util(DB);
+ CONN = dbUtil.getConnection();
+ }
+
+ @AfterEach
+ public void tearDownEach() throws SQLException {
+ CONN.close();
}
@Test
public void testSimpleDBWriteAndRead() throws Exception {
-
Statement st = null;
List<String> fNameList = new ArrayList<>();
try {
String sql = "select * from " +
ExtractProfiler.CONTAINER_TABLE.getName();
- st = conn.createStatement();
+ st = CONN.createStatement();
ResultSet rs = st.executeQuery(sql);
while (rs.next()) {
String fileName = rs.getString(Cols.FILE_PATH.name());
@@ -113,17 +108,19 @@ public class ProfilerBatchTest {
st.close();
}
}
+ /*
debugTable(ExtractProfiler.CONTAINER_TABLE);
debugTable(ExtractProfiler.PROFILE_TABLE);
debugTable(ExtractProfiler.CONTENTS_TABLE);
debugTable(ExtractProfiler.EXCEPTION_TABLE);
- debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
- assertEquals(10, fNameList.size());
+ debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);*/
+ assertEquals(17, fNameList.size());
assertTrue(fNameList.contains("file1.pdf"), "file1.pdf");
assertTrue(fNameList.contains("file2_attachANotB.doc"),
"file2_attachANotB.doc");
assertTrue(fNameList.contains("file3_attachBNotA.doc"),
"file3_attachBNotA.doc");
assertTrue(fNameList.contains("file4_emptyB.pdf"), "file4_emptyB.pdf");
assertTrue(fNameList.contains("file7_badJson.pdf"),
"file4_emptyB.pdf");
+ assertTrue(fNameList.contains("file9_noextract.txt"),
"file9_noextract.txt");
}
@Test
@@ -131,43 +128,29 @@ public class ProfilerBatchTest {
         String sql =
                 "select EXTRACT_EXCEPTION_ID from extract_exceptions e" + " join containers c on c.container_id = e.container_id " + " where c.file_path='file9_noextract.txt'";
-        assertEquals("missing extract: file9_noextract.txt", "0", getSingleResult(sql));
-        debugTable(ExtractProfiler.CONTAINER_TABLE);
+        /*debugTable(ExtractProfiler.CONTAINER_TABLE);
         debugTable(ExtractProfiler.PROFILE_TABLE);
         debugTable(ExtractProfiler.CONTENTS_TABLE);
         debugTable(ExtractProfiler.EXCEPTION_TABLE);
-        debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
-
-        sql = "select EXTRACT_EXCEPTION_ID from errors e" + " join containers c on c.container_id = e.container_id " + " where c.file_path='file5_emptyA.pdf'";
-        assertEquals("empty extract: file5_emptyA.pdf", "1", getSingleResult(sql));
-
-        sql = "select EXTRACT_EXCEPTION_ID from errors e" + " join containers c on c.container_id = e.container_id " + " where c.file_path='file7_badJson.pdf'";
-        assertEquals("extract error:file7_badJson.pdf", "2", getSingleResult(sql));
-
-    }
-
-    @Test
-    public void testParseErrors() throws Exception {
-        debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
-        String sql = "select file_path from errors where container_id is null";
-        assertEquals("file10_permahang.txt", getSingleResult(sql));
-
-        sql = "select extract_error_id from extract_exceptions " + "where file_path='file11_oom.txt'";
-        assertEquals(Integer.toString(ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE.ordinal()), getSingleResult(sql));
+        debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);*/
+        assertEquals("0", getSingleResult(sql), "missing extract: file9_noextract.txt");
-        sql = "select parse_error_id from extract_exceptions where file_path='file11_oom.txt'";
-        assertEquals(Integer.toString(AbstractProfiler.PARSE_ERROR_TYPE.OOM.ordinal()), getSingleResult(sql));
+        sql = "select EXTRACT_EXCEPTION_ID from extract_exceptions e" + " join containers c on c.container_id = e.container_id " + " where c.file_path='file5_emptyA.pdf'";
+        assertEquals("1", getSingleResult(sql), "empty extract: file5_emptyA.pdf");
+        sql = "select EXTRACT_EXCEPTION_ID from extract_exceptions e" + " join containers c on c.container_id = e.container_id " + " where c.file_path='file7_badJson.pdf'";
+        assertEquals("2", getSingleResult(sql), "extract error:file7_badJson.pdf");
     }
@Test
+ @Disabled("create actual unit test")
public void testParseExceptions() throws Exception {
debugTable(ExtractProfiler.EXCEPTION_TABLE);
}
private String getSingleResult(String sql) throws Exception {
Statement st = null;
- st = conn.createStatement();
+ st = CONN.createStatement();
ResultSet rs = st.executeQuery(sql);
int hits = 0;
String val = "";
@@ -188,7 +171,7 @@ public class ProfilerBatchTest {
Statement st = null;
try {
String sql = "select * from " + table.getName();
- st = conn.createStatement();
+ st = CONN.createStatement();
ResultSet rs = st.executeQuery(sql);
int colCount = rs
.getMetaData()
diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
index fff15e3dc..4d7d4bb2b 100644
--- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
+++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
@@ -98,20 +98,13 @@ public class TikaEvalCLITest extends TikaTest {
private static void profile() throws IOException {
List<String> args = new ArrayList<>();
args.add("Profile");
- args.add("-extracts");
+ args.add("-e");
args.add(ProcessUtils.escapeCommandLine(extractsDir
.resolve("extractsA")
.toAbsolutePath()
.toString()));
- //add these just to confirm this info doesn't cause problems w cli
- args.add("-maxTokens");
- args.add("10000000");
- args.add("-maxContentLength");
- args.add("100000000");
- args.add("-maxContentLengthForLangId");
- args.add("100000");
- args.add("-db");
+ args.add("-d");
args.add(ProcessUtils.escapeCommandLine(profileDBDir
.toAbsolutePath()
.toString() + "/" + dbName));
diff --git a/tika-eval/tika-eval-app/src/test/resources/eval-configs/eval-config-basic.json b/tika-eval/tika-eval-app/src/test/resources/eval-configs/eval-config-basic.json
new file mode 100644
index 000000000..b4af28df3
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/resources/eval-configs/eval-config-basic.json
@@ -0,0 +1,3 @@
+{
+ "maxExtractLength" : 20000
+}
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file10_permahang.txt b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file10_permahang.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file12_es.txt b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file12_es.txt
new file mode 100644
index 000000000..5ffd824a9
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file12_es.txt
@@ -0,0 +1,6 @@
+[
+ {
+ "Content-Type": "text/plain",
+ "X-TIKA:content": "El zorro marrón rápido saltó sobre el perro. El zorro
marrón rápido saltó sobre el perro. El zorro marrón rápido saltó sobre el perro"
+ }
+]
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file13_attachANotB.doc b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file13_attachANotB.doc
new file mode 100644
index 000000000..15bc592a5
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file13_attachANotB.doc
@@ -0,0 +1,12 @@
+[
+ {
+ "Content-Type": "text/plain",
+ "_comment": "simplified",
+ "X-TIKA:content":
"调整每一个心脏和每个声音,投标每个护理提取;让大家一起欢乐,赞美老拿骚.调整每一个心脏和每个声音,投标每个护理提取;让大家一起欢乐,赞美老拿骚 狐狸狐狸狐狸
"
+ },
+ {
+ "Content-Type": "text/plain",
+ "X-TIKA:embedded_resource_path": "inner.txt",
+ "X-TIKA:content": "attachment contents"
+ }
+]
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file14_diffAttachOrder b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file14_diffAttachOrder
new file mode 100644
index 000000000..25f0db9a1
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file14_diffAttachOrder
@@ -0,0 +1,21 @@
+[
+ {
+ "Content-Type": "text/plain",
+ "X-TIKA:content": "the quick brown fox fox fox jumped over the lazy lazy
dog",
+ "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8351"
+ },
+ {
+ "Content-Type": "text/plain",
+ "X-TIKA:embedded_resource_path": "/0",
+ "X-TIKA:content": "a b c d e f g h i j k l m n",
+ "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8354",
+ "X-TIKA:embedded_depth": "1"
+ },
+ {
+ "Content-Type": "text/plain",
+ "X-TIKA:embedded_resource_path": "/1",
+ "X-TIKA:content": "o p q r s t u v w x y z",
+ "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8353",
+ "X-TIKA:embedded_depth": "1"
+ }
+]
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file15_tags b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file15_tags
new file mode 100644
index 000000000..5af73db80
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file15_tags
@@ -0,0 +1,41 @@
+[
+ {
+ "Content-Length": "34824",
+ "Content-Type": "application/pdf",
+ "Last-Modified": "2007-09-15T09:02:31Z",
+ "X-Parsed-By": [
+ "org.apache.tika.parser.DefaultParser",
+ "org.apache.tika.parser.pdf.PDFParser"
+ ],
+ "X-TIKA:content_handler": "ToXMLContentHandler",
+ "X-TIKA:content": "\u003chtml
xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta
name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta
name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\"
/\u003e\n\u003cmeta name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\"
/\u003e\n\u003cmeta name\u003d\"access_permission:modify_annotations\"
content\u003d\"true\" /\u003e\n\u003cmeta
name\u003d\"access_permission:can_print_degra [...]
+ "X-TIKA:parse_time_millis": "500",
+ "access_permission:assemble_document": "true",
+ "access_permission:can_modify": "true",
+ "access_permission:can_print": "true",
+ "access_permission:can_print_degraded": "true",
+ "access_permission:extract_content": "true",
+ "access_permission:extract_for_accessibility": "true",
+ "access_permission:fill_in_form": "true",
+ "access_permission:modify_annotations": "true",
+ "dc:creator": "Bertrand DelacrΘtaz",
+ "dc:format": "application/pdf; version\u003d1.3",
+ "dc:title": "Apache Tika - Apache Tika",
+ "dcterms:created": "2007-09-15T09:02:31Z",
+ "dcterms:modified": "2007-09-15T09:02:31Z",
+ "meta:author": "Bertrand DelacrΘtaz",
+ "meta:creation-date": "2007-09-15T09:02:31Z",
+ "meta:save-date": "2007-09-15T09:02:31Z",
+ "pdf:PDFVersion": "1.3",
+ "pdf:docinfo:created": "2007-09-15T09:02:31Z",
+ "pdf:docinfo:creator": "Bertrand DelacrΘtaz",
+ "pdf:docinfo:creator_tool": "Firefox",
+ "pdf:docinfo:modified": "2007-09-15T09:02:31Z",
+ "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext",
+ "pdf:docinfo:title": "Apache Tika - Apache Tika",
+ "pdf:encrypted": "false",
+ "resourceName": "testPDF.pdf",
+ "xmp:CreatorTool": "Firefox",
+ "xmpTPg:NPages": "1"
+ }
+]
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file16_badTags b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file16_badTags
new file mode 100644
index 000000000..5c6272e43
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file16_badTags
@@ -0,0 +1,41 @@
+[
+ {
+ "Content-Length": "34824",
+ "Content-Type": "application/pdf",
+ "Last-Modified": "2007-09-15T09:02:31Z",
+ "X-Parsed-By": [
+ "org.apache.tika.parser.DefaultParser",
+ "org.apache.tika.parser.pdf.PDFParser"
+ ],
+ "X-TIKA:content_handler": "ToXMLContentHandler",
+ "X-TIKA:content": "\u003chtml
xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta
name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta
name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" meta
name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\"
/\u003e\n\u003c\u003c\u003c\u003c\u003cmeta
name\u003d\"access_permission:modify_annotations\" content\u003d\"true\"
/\u003e\n\u003cmeta name\u003d\"access_permission:can_pr [...]
+ "X-TIKA:parse_time_millis": "500",
+ "access_permission:assemble_document": "true",
+ "access_permission:can_modify": "true",
+ "access_permission:can_print": "true",
+ "access_permission:can_print_degraded": "true",
+ "access_permission:extract_content": "true",
+ "access_permission:extract_for_accessibility": "true",
+ "access_permission:fill_in_form": "true",
+ "access_permission:modify_annotations": "true",
+ "dc:creator": "Bertrand DelacrΘtaz",
+ "dc:format": "application/pdf; version\u003d1.3",
+ "dc:title": "Apache Tika - Apache Tika",
+ "dcterms:created": "2007-09-15T09:02:31Z",
+ "dcterms:modified": "2007-09-15T09:02:31Z",
+ "meta:author": "Bertrand DelacrΘtaz",
+ "meta:creation-date": "2007-09-15T09:02:31Z",
+ "meta:save-date": "2007-09-15T09:02:31Z",
+ "pdf:PDFVersion": "1.3",
+ "pdf:docinfo:created": "2007-09-15T09:02:31Z",
+ "pdf:docinfo:creator": "Bertrand DelacrΘtaz",
+ "pdf:docinfo:creator_tool": "Firefox",
+ "pdf:docinfo:modified": "2007-09-15T09:02:31Z",
+ "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext",
+ "pdf:docinfo:title": "Apache Tika - Apache Tika",
+ "pdf:encrypted": "false",
+ "resourceName": "testPDF.pdf",
+ "xmp:CreatorTool": "Firefox",
+ "xmpTPg:NPages": "1"
+ }
+]
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file17_tagsOutOfOrder b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file17_tagsOutOfOrder
new file mode 100644
index 000000000..97afec8ad
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file17_tagsOutOfOrder
@@ -0,0 +1,41 @@
+[
+ {
+ "Content-Length": "34824",
+ "Content-Type": "application/pdf",
+ "Last-Modified": "2007-09-15T09:02:31Z",
+ "X-Parsed-By": [
+ "org.apache.tika.parser.DefaultParser",
+ "org.apache.tika.parser.pdf.PDFParser"
+ ],
+ "X-TIKA:content_handler": "ToXMLContentHandler",
+ "X-TIKA:content": "\u003chtml
xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta
name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta
name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\"
/\u003e\n\u003cmeta name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\"
/\u003e\n\u003cmeta name\u003d\"access_permission:modify_annotations\"
content\u003d\"true\" /\u003e\n\u003cmeta
name\u003d\"access_permission:can_print_degra [...]
+ "X-TIKA:parse_time_millis": "500",
+ "access_permission:assemble_document": "true",
+ "access_permission:can_modify": "true",
+ "access_permission:can_print": "true",
+ "access_permission:can_print_degraded": "true",
+ "access_permission:extract_content": "true",
+ "access_permission:extract_for_accessibility": "true",
+ "access_permission:fill_in_form": "true",
+ "access_permission:modify_annotations": "true",
+ "dc:creator": "Bertrand DelacrΘtaz",
+ "dc:format": "application/pdf; version\u003d1.3",
+ "dc:title": "Apache Tika - Apache Tika",
+ "dcterms:created": "2007-09-15T09:02:31Z",
+ "dcterms:modified": "2007-09-15T09:02:31Z",
+ "meta:author": "Bertrand DelacrΘtaz",
+ "meta:creation-date": "2007-09-15T09:02:31Z",
+ "meta:save-date": "2007-09-15T09:02:31Z",
+ "pdf:PDFVersion": "1.3",
+ "pdf:docinfo:created": "2007-09-15T09:02:31Z",
+ "pdf:docinfo:creator": "Bertrand DelacrΘtaz",
+ "pdf:docinfo:creator_tool": "Firefox",
+ "pdf:docinfo:modified": "2007-09-15T09:02:31Z",
+ "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext",
+ "pdf:docinfo:title": "Apache Tika - Apache Tika",
+ "pdf:encrypted": "false",
+ "resourceName": "testPDF.pdf",
+ "xmp:CreatorTool": "Firefox",
+ "xmpTPg:NPages": "1"
+ }
+]
\ No newline at end of file