This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new a7e3fd7cc TIKA-4450 -- remove tika-batch from ExtractComparer (#2273)
a7e3fd7cc is described below
commit a7e3fd7cc93a24de9b53e97524bb532b5a4e92f7
Author: Tim Allison <[email protected]>
AuthorDate: Tue Jul 8 17:49:27 2025 -0400
TIKA-4450 -- remove tika-batch from ExtractComparer (#2273)
---
.../org/apache/tika/eval/app/ExtractComparer.java | 46 +--
.../tika/eval/app/ExtractComparerRunner.java | 386 +++++++++++++++++++++
.../org/apache/tika/eval/app/ExtractProfiler.java | 1 +
.../org/apache/tika/eval/app/ProfilerBase.java | 3 +
.../java/org/apache/tika/eval/app/TikaEvalCLI.java | 100 +-----
.../eval/app/batch/ExtractComparerBuilder.java | 163 ---------
.../apache/tika/eval/app/SimpleComparerTest.java | 4 +-
.../org/apache/tika/eval/app/TikaEvalCLITest.java | 13 +-
8 files changed, 402 insertions(+), 314 deletions(-)
diff --git
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
index 6f2865bf0..538231d6c 100644
---
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
+++
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
@@ -24,15 +24,13 @@ import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
-import java.util.concurrent.ArrayBlockingQueue;
import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.io.FilenameUtils;
-import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.eval.app.batch.FileResource;
import org.apache.tika.eval.app.db.ColInfo;
import org.apache.tika.eval.app.db.Cols;
import org.apache.tika.eval.app.db.TableInfo;
@@ -48,7 +46,7 @@ import org.apache.tika.eval.core.util.ContentTags;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-public class ExtractComparer extends AbstractProfiler {
+public class ExtractComparer extends ProfilerBase {
private static final String DIGEST_KEY_PREFIX =
TikaCoreProperties.TIKA_META_PREFIX + "digest" +
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
private final static String FIELD_A = "fa";
@@ -76,40 +74,8 @@ public class ExtractComparer extends AbstractProfiler {
public static TableInfo EXTRACT_EXCEPTION_TABLE_B = new
TableInfo("extract_exceptions_b",
ExtractProfiler.EXTRACT_EXCEPTION_TABLE.getColInfos());
static Options OPTIONS;
- static {
- Option extractsA = new Option("extractsA", true, "directory for
extractsA files");
- extractsA.setRequired(true);
-
- Option extractsB = new Option("extractsB", true, "directory for
extractsB files");
- extractsB.setRequired(true);
-
- Option inputDir = new Option("inputDir", true,
- "optional: directory of original binary input files if it
exists " + "or can be the same as -extractsA or -extractsB. If not specified,
-inputDir=-extractsA");
-
-
- OPTIONS = new Options()
- .addOption(extractsA)
- .addOption(extractsB)
- .addOption(inputDir)
- .addOption("bc", "optional: tika-batch config file")
- .addOption("numConsumers", true, "optional: number of consumer
threads")
- .addOption(new Option("alterExtract", true,
- "for json-formatted extract files, " + "process full
metadata list ('as_is'=default), " + "take just the first/container document
('first_only'), " +
- "concatenate all content into the first
metadata item ('concatenate_content')"))
- .addOption("minExtractLength", true, "minimum extract length
to process (in bytes)")
- .addOption("maxExtractLength", true, "maximum extract length
to process (in bytes)")
- .addOption("db", true, "db file to which to write results")
- .addOption("jdbc", true, "EXPERT: full jdbc connection string.
Must specify this or -db <h2db>")
- .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or
specify via -Djdbc.driver")
- .addOption("tablePrefixA", true, "EXPERT: optional prefix for
table names for A")
- .addOption("tablePrefixB", true, "EXPERT: optional prefix for
table names for B")
- .addOption("drop", false, "drop tables if they exist")
- .addOption("maxFilesToAdd", true, "maximum number of files to
add to the crawler")
- .addOption("maxTokens", true, "maximum tokens to process,
default=200000")
- .addOption("maxContentLength", true, "truncate content beyond
this length for calculating 'contents' stats, default=1000000")
- .addOption("maxContentLengthForLangId", true, "truncate
content beyond this length for language id, default=50000")
- .addOption("defaultLangCode", true, "which language to use for
common words if no 'common words' " + "file exists for the langid result");
- }
+
+
//need to parameterize?
private final Path inputDir;
@@ -118,8 +84,8 @@ public class ExtractComparer extends AbstractProfiler {
private final TokenContraster tokenContraster = new TokenContraster();
private final ExtractReader extractReader;
- public ExtractComparer(ArrayBlockingQueue<FileResource> queue, Path
inputDir, Path extractsA, Path extractsB, ExtractReader extractReader,
IDBWriter writer) {
- super(queue, writer);
+ public ExtractComparer(Path inputDir, Path extractsA, Path extractsB,
ExtractReader extractReader, IDBWriter writer) {
+ super(writer);
this.inputDir = inputDir;
this.extractsA = extractsA;
this.extractsB = extractsB;
diff --git
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java
new file mode 100644
index 000000000..25e55ee61
--- /dev/null
+++
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java
@@ -0,0 +1,386 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+import java.io.IOException;
+import java.nio.file.FileVisitResult;
+import java.nio.file.FileVisitor;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.eval.app.batch.FileResource;
+import org.apache.tika.eval.app.batch.PathResource;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.JDBCUtil;
+import org.apache.tika.eval.app.db.MimeBuffer;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.DBWriter;
+import org.apache.tika.eval.app.io.ExtractReader;
+import org.apache.tika.eval.app.io.ExtractReaderException;
+import org.apache.tika.eval.app.io.IDBWriter;
+
+public class ExtractComparerRunner {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(ExtractComparerRunner.class);
+ private static final PathResource SEMAPHORE = new
PathResource(Paths.get("/"), "STOP");
+ private static final int DIR_WALKER_COMPLETED_VALUE = 2;
+ private static final int COMPARER_WORKER_COMPLETED_VALUE = 1;
+
+ static Options OPTIONS;
+
+ static {
+
+ OPTIONS = new Options()
+
.addOption(Option.builder("a").longOpt("extractsA").hasArg().desc("required:
directory of 'A' extracts").build())
+
.addOption(Option.builder("b").longOpt("extractsB").hasArg().desc("required:
directory of 'B' extracts").build())
+
.addOption(Option.builder("i").longOpt("inputDir").hasArg().desc("optional:
directory for original binary input documents."
+ + " If not specified, -extracts is crawled as
is.").build())
+
.addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db
path").build())
+
.addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json
config file").build())
+ ;
+ }
+ public static void main(String[] args) throws Exception {
+ DefaultParser defaultCLIParser = new DefaultParser();
+ CommandLine commandLine = defaultCLIParser.parse(OPTIONS, args);
+ EvalConfig evalConfig = commandLine.hasOption('c') ?
EvalConfig.load(Paths.get(commandLine.getOptionValue('c'))) : new EvalConfig();
+ Path extractsADir = commandLine.hasOption('a') ?
Paths.get(commandLine.getOptionValue('a')) : Paths.get(USAGE_FAIL("Must specify
extractsA dir: -a"));
+ Path extractsBDir = commandLine.hasOption('b') ?
Paths.get(commandLine.getOptionValue('b')) : Paths.get(USAGE_FAIL("Must specify
extractsB dir: -b"));
+ Path inputDir = commandLine.hasOption('i') ?
Paths.get(commandLine.getOptionValue('i')) : extractsADir;
+ String dbPath = commandLine.hasOption('d') ?
commandLine.getOptionValue('d') : USAGE_FAIL("Must specify the db name: -d");
+ String jdbcString = getJdbcConnectionString(dbPath);
+ execute(inputDir, extractsADir, extractsBDir, jdbcString, evalConfig);
+ }
+
+ private static String getJdbcConnectionString(String dbPath) {
+ if (dbPath.startsWith("jdbc:")) {
+ return dbPath;
+ }
+ //default to h2
+ Path p = Paths.get(dbPath);
+ return "jdbc:h2:file:" + p.toAbsolutePath();
+
+ }
+
+ private static void execute(Path inputDir, Path extractsA, Path extractsB,
String dbPath, EvalConfig evalConfig) throws SQLException, IOException {
+
+        //parameterize this? if necessary
+ try {
+ ProfilerBase.loadCommonTokens(null, null);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ JDBCUtil jdbcUtil = new JDBCUtil(dbPath,
evalConfig.getJdbcDriverClass());
+ ExtractComparerBuilder builder = new ExtractComparerBuilder();
+ MimeBuffer mimeBuffer = initTables(jdbcUtil, builder, dbPath,
evalConfig);
+ builder.populateRefTables(jdbcUtil, mimeBuffer);
+
+ AtomicInteger enqueued = new AtomicInteger(0);
+ AtomicInteger processed = new AtomicInteger(0);
+ AtomicInteger activeWorkers = new
AtomicInteger(evalConfig.getNumWorkers());
+ AtomicBoolean crawlerActive = new AtomicBoolean(true);
+
+ ArrayBlockingQueue<FileResource> queue = new
ArrayBlockingQueue<>(1000);
+ ExecutorService executorService =
Executors.newFixedThreadPool(evalConfig.getNumWorkers() + 2);
+ ExecutorCompletionService<Integer> executorCompletionService = new
ExecutorCompletionService<>(executorService);
+
+ StatusReporter statusReporter = new StatusReporter(enqueued,
processed, activeWorkers, crawlerActive);
+ executorCompletionService.submit(statusReporter);
+
+ DirectoryWalker directoryWalker = new DirectoryWalker(inputDir, queue,
enqueued);
+ executorCompletionService.submit(directoryWalker);
+ for (int i = 0; i < evalConfig.getNumWorkers(); i++) {
+ ExtractReader extractReader = new
ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS,
evalConfig.getMinExtractLength(), evalConfig.getMaxExtractLength());
+ ExtractComparer extractComparer = new ExtractComparer(inputDir,
extractsA, extractsB, extractReader,
+ builder.getDBWriter(builder.getNonRefTableInfos(),
jdbcUtil, mimeBuffer));
+ executorCompletionService.submit(new ComparerWorker(queue,
extractComparer, processed));
+ }
+
+ int finished = 0;
+ try {
+ while (finished < evalConfig.getNumWorkers() + 2) {
+ //blocking
+ Future<Integer> future = executorCompletionService.take();
+ Integer result = future.get();
+ if (result != null) {
+ //if the dir walker has finished
+ if (result == DIR_WALKER_COMPLETED_VALUE) {
+ queue.put(SEMAPHORE);
+ crawlerActive.set(false);
+ } else if (result == COMPARER_WORKER_COMPLETED_VALUE) {
+ activeWorkers.decrementAndGet();
+ }
+ finished++;
+ }
+ }
+ } catch (InterruptedException e) {
+ LOG.info("interrupted", e);
+ } catch (ExecutionException e) {
+ throw new RuntimeException(e);
+ } finally {
+ mimeBuffer.close();
+ executorService.shutdownNow();
+ }
+
+ }
+
+ private static MimeBuffer initTables(JDBCUtil jdbcUtil,
ExtractComparerBuilder builder, String connectionString, EvalConfig evalConfig)
throws SQLException, IOException {
+
+ //step 1. create the tables
+ jdbcUtil.createTables(builder.getNonRefTableInfos(),
JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS);
+ jdbcUtil.createTables(builder.getRefTableInfos(),
JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS);
+
+ //step 2. create mime buffer
+ return new MimeBuffer(jdbcUtil.getConnection(),
builder.getMimeTable(), TikaConfig.getDefaultConfig());
+ }
+
+ private static void USAGE() {
+ HelpFormatter helpFormatter = new HelpFormatter();
+        helpFormatter.printHelp(80, "java -jar tika-eval-app-x.y.z.jar
Compare -a extractsA -b extractsB -d mydb [-i inputDir, -c config.json]",
"Tool: Compare", OPTIONS, "");
+ }
+
+ private static String USAGE_FAIL(String msg) {
+ USAGE();
+ throw new IllegalArgumentException(msg);
+ }
+
+ private static class ComparerWorker implements Callable<Integer> {
+
+ private final ArrayBlockingQueue<FileResource> queue;
+ private final ExtractComparer extractComparer;
+ private final AtomicInteger processed;
+
+ ComparerWorker(ArrayBlockingQueue<FileResource> queue, ExtractComparer
extractComparer, AtomicInteger processed) {
+ this.queue = queue;
+ this.extractComparer = extractComparer;
+ this.processed = processed;
+ }
+
+ @Override
+ public Integer call() throws Exception {
+ while (true) {
+ FileResource resource = queue.poll(1, TimeUnit.SECONDS);
+ if (resource == null) {
+                    LOG.info("ComparerWorker waiting on queue");
+ continue;
+ }
+ if (resource == SEMAPHORE) {
+ LOG.debug("worker hit semaphore and is stopping");
+ extractComparer.closeWriter();
+                    // re-enqueue the stop semaphore so the remaining workers also see it and shut down
+ queue.put(resource);
+ return COMPARER_WORKER_COMPLETED_VALUE;
+ }
+ extractComparer.processFileResource(resource);
+ processed.incrementAndGet();
+ }
+ }
+ }
+
+ private static class DirectoryWalker implements Callable<Integer> {
+ private final Path startDir;
+ private final ArrayBlockingQueue<FileResource> queue;
+ private final AtomicInteger enqueued;
+
+ public DirectoryWalker(Path startDir, ArrayBlockingQueue<FileResource>
queue, AtomicInteger enqueued) {
+ this.startDir = startDir;
+ this.queue = queue;
+ this.enqueued = enqueued;
+ }
+
+ @Override
+ public Integer call() throws Exception {
+ Files.walkFileTree(startDir, new FileVisitor<Path>() {
+ @Override
+ public FileVisitResult preVisitDirectory(Path dir,
BasicFileAttributes attrs) throws IOException {
+ return FileVisitResult.CONTINUE;
+ }
+
+ @Override
+ public FileVisitResult visitFile(Path file,
BasicFileAttributes attrs) throws IOException {
+ if (Files.isDirectory(file)) {
+ return FileVisitResult.CONTINUE;
+ }
+ try {
+ //blocking
+ queue.put(new PathResource(file,
startDir.relativize(file).toString()));
+ enqueued.incrementAndGet();
+ } catch (InterruptedException e) {
+ return FileVisitResult.TERMINATE;
+ }
+ return FileVisitResult.CONTINUE;
+ }
+
+ @Override
+ public FileVisitResult visitFileFailed(Path file, IOException
exc) throws IOException {
+ return FileVisitResult.CONTINUE;
+ }
+
+ @Override
+ public FileVisitResult postVisitDirectory(Path dir,
IOException exc) throws IOException {
+ return FileVisitResult.CONTINUE;
+ }
+ });
+ return DIR_WALKER_COMPLETED_VALUE;
+ }
+ }
+
+ private static class ExtractComparerBuilder {
+ private final List<TableInfo> tableInfosA;
+ private final List<TableInfo> tableInfosB;
+ private final List<TableInfo> tableInfosAandB;
+ private final List<TableInfo> refTableInfos;
+
+ public ExtractComparerBuilder() {
+ List<TableInfo> tableInfosA = new ArrayList<>();
+ List<TableInfo> tableInfosB = new ArrayList<>();
+ List<TableInfo> tableInfosAandB = new ArrayList<>();
+ tableInfosA.add(ExtractComparer.PROFILES_A);
+ tableInfosA.add(ExtractComparer.EXCEPTION_TABLE_A);
+ tableInfosA.add(ExtractComparer.TAGS_TABLE_A);
+ tableInfosA.add(ExtractComparer.CONTENTS_TABLE_A);
+ tableInfosA.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_A);
+ tableInfosA.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_A);
+
+ tableInfosB.add(ExtractComparer.PROFILES_B);
+ tableInfosB.add(ExtractComparer.EXCEPTION_TABLE_B);
+ tableInfosB.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
+ tableInfosB.add(ExtractComparer.TAGS_TABLE_B);
+ tableInfosB.add(ExtractComparer.CONTENTS_TABLE_B);
+ tableInfosB.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_B);
+
+ tableInfosAandB.add(ExtractComparer.COMPARISON_CONTAINERS);
+ tableInfosAandB.add(ExtractComparer.CONTENT_COMPARISONS);
+ tableInfosAandB.add(AbstractProfiler.MIME_TABLE);
+
+ List<TableInfo> refTableInfos = new ArrayList<>();
+ refTableInfos.add(ExtractComparer.REF_PAIR_NAMES);
+ refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
+ refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
+ refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES);
+
+ this.tableInfosA = Collections.unmodifiableList(tableInfosA);
+ this.tableInfosB = Collections.unmodifiableList(tableInfosB);
+ this.tableInfosAandB =
Collections.unmodifiableList(tableInfosAandB);
+ this.refTableInfos = Collections.unmodifiableList(refTableInfos);
+ }
+
+
+ protected List<TableInfo> getRefTableInfos() {
+ return refTableInfos;
+ }
+
+ protected List<TableInfo> getNonRefTableInfos() {
+ List<TableInfo> allNonRefTables = new ArrayList<>();
+ allNonRefTables.addAll(tableInfosA);
+ allNonRefTables.addAll(tableInfosB);
+ allNonRefTables.addAll(tableInfosAandB);
+ return Collections.unmodifiableList(allNonRefTables);
+ }
+
+ protected TableInfo getMimeTable() {
+ return AbstractProfiler.MIME_TABLE;
+ }
+
+ public void populateRefTables(JDBCUtil dbUtil, MimeBuffer mimeBuffer)
throws IOException, SQLException {
+ boolean refTablesPopulated = true;
+ try {
+ Connection connection = dbUtil.getConnection();
+ for (TableInfo tableInfo : getRefTableInfos()) {
+ int rows = 0;
+ try (ResultSet rs = connection
+ .createStatement()
+ .executeQuery("select * from " +
tableInfo.getName())) {
+ while (rs.next()) {
+ rows++;
+ }
+ }
+ if (rows == 0) {
+ refTablesPopulated = false;
+ break;
+ }
+
+ }
+ } catch (SQLException e) {
+ //swallow
+ }
+ if (refTablesPopulated) {
+ LOG.info("ref tables are already populated");
+ return;
+ }
+
+ IDBWriter writer = getDBWriter(getRefTableInfos(), dbUtil,
mimeBuffer);
+ Map<Cols, String> m = new HashMap<>();
+ for (AbstractProfiler.PARSE_ERROR_TYPE t :
AbstractProfiler.PARSE_ERROR_TYPE.values()) {
+ m.clear();
+ m.put(Cols.PARSE_ERROR_ID, Integer.toString(t.ordinal()));
+ m.put(Cols.PARSE_ERROR_DESCRIPTION, t.name());
+ writer.writeRow(AbstractProfiler.REF_PARSE_ERROR_TYPES, m);
+ }
+
+ for (AbstractProfiler.EXCEPTION_TYPE t :
AbstractProfiler.EXCEPTION_TYPE.values()) {
+ m.clear();
+ m.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(t.ordinal()));
+ m.put(Cols.PARSE_EXCEPTION_DESCRIPTION, t.name());
+ writer.writeRow(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES, m);
+ }
+
+ for (ExtractReaderException.TYPE t :
ExtractReaderException.TYPE.values()) {
+ m.clear();
+ m.put(Cols.EXTRACT_EXCEPTION_ID,
Integer.toString(t.ordinal()));
+ m.put(Cols.EXTRACT_EXCEPTION_DESCRIPTION, t.name());
+ writer.writeRow(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES,
m);
+ }
+ writer.close();
+ }
+
+ protected IDBWriter getDBWriter(List<TableInfo> tableInfos, JDBCUtil
dbUtil, MimeBuffer mimeBuffer) throws IOException, SQLException {
+ Connection conn = dbUtil.getConnection();
+ return new DBWriter(conn, tableInfos, dbUtil, mimeBuffer);
+ }
+ }
+}
diff --git
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
index 9b0f482f6..680e50535 100644
---
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
+++
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
@@ -86,6 +86,7 @@ public class ExtractProfiler extends ProfilerBase {
}
+ @Override
public boolean processFileResource(FileResource fileResource) {
Metadata metadata = fileResource.getMetadata();
EvalFilePaths fps = null;
diff --git
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
index 19a7d680f..7325dc535 100644
---
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
+++
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
@@ -42,6 +42,7 @@ import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.eval.app.batch.FileResource;
import org.apache.tika.eval.app.db.ColInfo;
import org.apache.tika.eval.app.db.Cols;
import org.apache.tika.eval.app.db.TableInfo;
@@ -796,6 +797,8 @@ public abstract class ProfilerBase {
return NON_EXISTENT_FILE_LENGTH;
}
+ public abstract boolean processFileResource(FileResource fileResource);
+
public enum EXCEPTION_TYPE {
RUNTIME, ENCRYPTION, ACCESS_PERMISSION, UNSUPPORTED_VERSION,
}
diff --git
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
index 91aecd832..b44b0cf4a 100644
---
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
+++
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
@@ -158,105 +158,7 @@ public class TikaEvalCLI {
}
private void handleCompare(String[] subsetArgs) throws Exception {
- List<String> argList = new ArrayList(Arrays.asList(subsetArgs));
-
- boolean containsBC = false;
- String inputDir = null;
- String extractsA = null;
- String alterExtract = null;
- //confirm there's a batch-config file
- for (int i = 0; i < argList.size(); i++) {
- String arg = argList.get(i);
- switch (arg) {
- case "-bc":
- containsBC = true;
- break;
- case "-inputDir":
- if (i + 1 >= argList.size()) {
- System.err.println("Must specify directory after
-inputDir");
- ExtractComparer.USAGE();
- return;
- }
- inputDir = argList.get(i + 1);
- i++;
- break;
- case "-extractsA":
- if (i + 1 >= argList.size()) {
- System.err.println("Must specify directory after
-extractsA");
- ExtractComparer.USAGE();
- return;
- }
- extractsA = argList.get(i + 1);
- i++;
- break;
- case "-alterExtract":
- if (i + 1 >= argList.size()) {
- System.err.println("Must specify type 'as_is',
'first_only' or " + "'concatenate_content' after -alterExtract");
- ExtractComparer.USAGE();
- return;
- }
- alterExtract = argList.get(i + 1);
- i++;
- break;
- }
- }
- if (alterExtract != null && !alterExtract.equals("as_is") &&
!alterExtract.equals("concatenate_content") &&
!alterExtract.equals("first_only")) {
- System.out.println("Sorry, I don't understand:" + alterExtract +
". The values must be one of: as_is, first_only, concatenate_content");
- ExtractComparer.USAGE();
- return;
- }
-
- //need to specify each in the commandline that goes into tika-batch
- //if only extracts is passed to tika-batch,
- //the crawler will see no inputDir and start crawling "input".
- //if the user doesn't specify inputDir, crawl extractsA
- if (inputDir == null && extractsA != null) {
- argList.add("-inputDir");
- argList.add(extractsA);
- }
-
- Path tmpBCConfig = null;
- try {
- tmpBCConfig = Files.createTempFile("tika-eval", ".xml");
- if (!containsBC) {
- try (InputStream is = this
- .getClass()
-
.getResourceAsStream("/tika-eval-comparison-config.xml")) {
- Files.copy(is, tmpBCConfig,
StandardCopyOption.REPLACE_EXISTING);
- }
- argList.add("-bc");
- argList.add(tmpBCConfig
- .toAbsolutePath()
- .toString());
-
- }
- String[] updatedArgs = argList.toArray(new String[0]);
- DefaultParser defaultCLIParser = new DefaultParser();
- try {
- CommandLine commandLine =
defaultCLIParser.parse(ExtractComparer.OPTIONS, updatedArgs);
- if (commandLine.hasOption("db") &&
commandLine.hasOption("jdbc")) {
- System.out.println("Please specify either the default -db
or the full -jdbc, not both");
- ExtractComparer.USAGE();
- return;
- }
- } catch (ParseException e) {
- System.out.println(e.getMessage() + "\n");
- ExtractComparer.USAGE();
- return;
- }
-
- // lazy delete because main() calls System.exit()
- if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
- tmpBCConfig
- .toFile()
- .deleteOnExit();
- }
- FSBatchProcessCLI.main(updatedArgs);
- } finally {
- if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
- Files.delete(tmpBCConfig);
- }
- }
+ ExtractComparerRunner.main(subsetArgs);
}
private void handleReport(String[] subsetArgs) throws Exception {
diff --git
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractComparerBuilder.java
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractComparerBuilder.java
deleted file mode 100644
index 6788de49e..000000000
---
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractComparerBuilder.java
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.app.batch;
-
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.eval.app.AbstractProfiler;
-import org.apache.tika.eval.app.ExtractComparer;
-import org.apache.tika.eval.app.db.TableInfo;
-import org.apache.tika.util.PropsUtil;
-
-public class ExtractComparerBuilder extends EvalConsumerBuilder {
- public final static String TABLE_PREFIX_A_KEY = "tablePrefixA";
- public final static String TABLE_PREFIX_B_KEY = "tablePrefixB";
-
- private final List<TableInfo> tableInfosA;
- private final List<TableInfo> tableInfosB;
- private final List<TableInfo> tableInfosAandB;
- private final List<TableInfo> refTableInfos;
-
- public ExtractComparerBuilder() {
- List<TableInfo> tableInfosA = new ArrayList<>();
- List<TableInfo> tableInfosB = new ArrayList<>();
- List<TableInfo> tableInfosAandB = new ArrayList<>();
- tableInfosA.add(ExtractComparer.PROFILES_A);
- tableInfosA.add(ExtractComparer.EXCEPTION_TABLE_A);
- tableInfosA.add(ExtractComparer.TAGS_TABLE_A);
- tableInfosA.add(ExtractComparer.CONTENTS_TABLE_A);
- tableInfosA.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_A);
- tableInfosA.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_A);
-
- tableInfosB.add(ExtractComparer.PROFILES_B);
- tableInfosB.add(ExtractComparer.EXCEPTION_TABLE_B);
- tableInfosB.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
- tableInfosB.add(ExtractComparer.TAGS_TABLE_B);
- tableInfosB.add(ExtractComparer.CONTENTS_TABLE_B);
- tableInfosB.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_B);
-
- tableInfosAandB.add(ExtractComparer.COMPARISON_CONTAINERS);
- tableInfosAandB.add(ExtractComparer.CONTENT_COMPARISONS);
- tableInfosAandB.add(AbstractProfiler.MIME_TABLE);
-
- List<TableInfo> refTableInfos = new ArrayList<>();
- refTableInfos.add(ExtractComparer.REF_PAIR_NAMES);
- refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
- refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
- refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES);
-
- this.tableInfosA = Collections.unmodifiableList(tableInfosA);
- this.tableInfosB = Collections.unmodifiableList(tableInfosB);
- this.tableInfosAandB = Collections.unmodifiableList(tableInfosAandB);
- this.refTableInfos = Collections.unmodifiableList(refTableInfos);
- }
-
- @Override
- public FileResourceConsumer build() throws IOException, SQLException {
- Path extractsA = PropsUtil.getPath(localAttrs.get("extractsA"), null);
- if (extractsA == null) {
- throw new RuntimeException("Must specify \"extractsA\" --
directory for 'A' extracts");
- }
- Path extractsB = PropsUtil.getPath(localAttrs.get("extractsB"), null);
- if (extractsB == null) {
- throw new RuntimeException("Must specify \"extractsB\" --
directory for 'B' extracts");
- }
-
- Path inputRootDir = PropsUtil.getPath(localAttrs.get("inputDir"),
null);
-
- if (inputRootDir == null) {
- //this is for the sake of the crawler
- throw new RuntimeException("Must specify an -inputDir");
- }
-
- return parameterizeProfiler(new ExtractComparer(queue, inputRootDir,
extractsA, extractsB, buildExtractReader(localAttrs),
getDBWriter(getNonRefTableInfos())));
- }
-
-
- @Override
- protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
- String tablePrefixA = localAttrs.get(TABLE_PREFIX_A_KEY);
-
- String tablePrefixB = localAttrs.get(TABLE_PREFIX_B_KEY);
-
- tablePrefixA = (tablePrefixA == null || tablePrefixA.endsWith("_")) ?
tablePrefixA : tablePrefixA + "_";
- tablePrefixB = (tablePrefixB == null || tablePrefixB.endsWith("_")) ?
tablePrefixB : tablePrefixB + "_";
-
- if (tablePrefixA != null) {
- for (TableInfo tableInfo : tableInfosA) {
- tableInfo.setNamePrefix(tablePrefixA);
- }
- }
-
- if (tablePrefixB != null) {
- for (TableInfo tableInfo : tableInfosB) {
- tableInfo.setNamePrefix(tablePrefixB);
- }
- }
-
- if (tablePrefixA != null || tablePrefixB != null) {
- String aAndB = (tablePrefixA == null) ? "" : tablePrefixA;
- aAndB = (tablePrefixB == null) ? aAndB : aAndB + tablePrefixB;
- for (TableInfo tableInfo : tableInfosAandB) {
- tableInfo.setNamePrefix(aAndB);
- }
- }
- }
-
- @Override
- protected List<TableInfo> getRefTableInfos() {
- return refTableInfos;
- }
-
- @Override
- protected List<TableInfo> getNonRefTableInfos() {
- List<TableInfo> allNonRefTables = new ArrayList<>();
- allNonRefTables.addAll(tableInfosA);
- allNonRefTables.addAll(tableInfosB);
- allNonRefTables.addAll(tableInfosAandB);
- return Collections.unmodifiableList(allNonRefTables);
- }
-
- @Override
- protected TableInfo getMimeTable() {
- return AbstractProfiler.MIME_TABLE;
- }
-
- @Override
- protected void addErrorLogTablePairs(DBConsumersManager manager) {
- Path errorLogA = PropsUtil.getPath(localAttrs.get("errorLogFileA"),
null);
- if (errorLogA == null) {
- return;
- }
- manager.addErrorLogTablePair(errorLogA,
ExtractComparer.EXTRACT_EXCEPTION_TABLE_A);
- Path errorLogB = PropsUtil.getPath(localAttrs.get("errorLogFileB"),
null);
- if (errorLogB == null) {
- return;
- }
- manager.addErrorLogTablePair(errorLogB,
ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
-
- }
-
-}
diff --git
a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
index 721d106a2..ae9363f52 100644
---
a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
+++
b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
@@ -56,7 +56,7 @@ public class SimpleComparerTest extends TikaTest {
@BeforeAll
public static void staticSetUp() throws Exception {
WRITER = new MockDBWriter();
- AbstractProfiler.loadCommonTokens(Paths.get(SimpleComparerTest.class
+ ProfilerBase.loadCommonTokens(Paths.get(SimpleComparerTest.class
.getResource("/common_tokens")
.toURI()), "en");
}
@@ -64,7 +64,7 @@ public class SimpleComparerTest extends TikaTest {
@BeforeEach
public void setUp() throws Exception {
WRITER.clear();
- comparer = new ExtractComparer(null, null, Paths.get("extractsA"),
Paths.get("extractsB"),
+ comparer = new ExtractComparer(null, Paths.get("extractsA"),
Paths.get("extractsB"),
new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS,
IGNORE_LENGTH, IGNORE_LENGTH), WRITER);
}
diff --git
a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
index 4d7d4bb2b..6bb22f9a6 100644
---
a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
+++
b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
@@ -68,25 +68,18 @@ public class TikaEvalCLITest extends TikaTest {
private static void compare() throws IOException {
List<String> args = new ArrayList<>();
args.add("Compare");
- args.add("-extractsA");
+ args.add("-a");
args.add(ProcessUtils.escapeCommandLine(extractsDir
.resolve("extractsA")
.toAbsolutePath()
.toString()));
- args.add("-extractsB");
+ args.add("-b");
args.add(ProcessUtils.escapeCommandLine(extractsDir
.resolve("extractsB")
.toAbsolutePath()
.toString()));
- //add these just to confirm this info doesn't cause problems w cli
- args.add("-maxTokens");
- args.add("10000000");
- args.add("-maxContentLength");
- args.add("100000000");
- args.add("-maxContentLengthForLangId");
- args.add("100000");
- args.add("-db");
+ args.add("-d");
args.add(ProcessUtils.escapeCommandLine(compareDBDir
.toAbsolutePath()
.toString() + "/" + dbName));