This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4450
in repository https://gitbox.apache.org/repos/asf/tika.git
commit e4b7b14405ac567caa83cbd2b589fd875cdd03b0
Author: tallison <talli...@apache.org>
AuthorDate: Tue Jul 8 17:29:55 2025 -0400

    TIKA-4450 -- remove tika-batch from ExtractComparer
---
 .../org/apache/tika/eval/app/ExtractComparer.java  |  46 +--
 .../tika/eval/app/ExtractComparerRunner.java       | 386 +++++++++++++++++++++
 .../org/apache/tika/eval/app/ExtractProfiler.java  |   1 +
 .../org/apache/tika/eval/app/ProfilerBase.java     |   3 +
 .../java/org/apache/tika/eval/app/TikaEvalCLI.java | 100 +-----
 .../eval/app/batch/ExtractComparerBuilder.java     | 163 ---------
 .../apache/tika/eval/app/SimpleComparerTest.java   |   4 +-
 .../org/apache/tika/eval/app/TikaEvalCLITest.java  |  13 +-
 8 files changed, 402 insertions(+), 314 deletions(-)

diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
index 6f2865bf0..538231d6c 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
@@ -24,15 +24,13 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.concurrent.ArrayBlockingQueue;
 
 import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
 import org.apache.commons.cli.Options;
 import org.apache.commons.io.FilenameUtils;
 
-import org.apache.tika.batch.FileResource;
 import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.eval.app.batch.FileResource;
 import org.apache.tika.eval.app.db.ColInfo;
 import org.apache.tika.eval.app.db.Cols;
 import org.apache.tika.eval.app.db.TableInfo;
@@ -48,7 +46,7 @@ import org.apache.tika.eval.core.util.ContentTags;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 
-public class ExtractComparer extends AbstractProfiler {
+public class ExtractComparer extends ProfilerBase {
 
     private static final String DIGEST_KEY_PREFIX = TikaCoreProperties.TIKA_META_PREFIX + "digest" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
     private final static String FIELD_A = "fa";
@@ -76,40 +74,8 @@ public class ExtractComparer extends ProfilerBase {
     public static TableInfo EXTRACT_EXCEPTION_TABLE_B = new TableInfo("extract_exceptions_b", ExtractProfiler.EXTRACT_EXCEPTION_TABLE.getColInfos());
 
     static Options OPTIONS;
-    static {
-        Option extractsA = new Option("extractsA", true, "directory for extractsA files");
-        extractsA.setRequired(true);
-
-        Option extractsB = new Option("extractsB", true, "directory for extractsB files");
-        extractsB.setRequired(true);
-
-        Option inputDir = new Option("inputDir", true,
-                "optional: directory of original binary input files if it exists " + "or can be the same as -extractsA or -extractsB. If not specified, -inputDir=-extractsA");
-
-
-        OPTIONS = new Options()
-                .addOption(extractsA)
-                .addOption(extractsB)
-                .addOption(inputDir)
-                .addOption("bc", "optional: tika-batch config file")
-                .addOption("numConsumers", true, "optional: number of consumer threads")
-                .addOption(new Option("alterExtract", true,
-                        "for json-formatted extract files, " + "process full metadata list ('as_is'=default), " + "take just the first/container document ('first_only'), " +
-                                "concatenate all content into the first metadata item ('concatenate_content')"))
-                .addOption("minExtractLength", true, "minimum extract length to process (in bytes)")
-                .addOption("maxExtractLength", true, "maximum extract length to process (in bytes)")
-                .addOption("db", true, "db file to which to write results")
-                .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
-                .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
-                .addOption("tablePrefixA", true, "EXPERT: optional prefix for table names for A")
-                .addOption("tablePrefixB", true, "EXPERT: optional prefix for table names for B")
-                .addOption("drop", false, "drop tables if they exist")
-                .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
-                .addOption("maxTokens", true, "maximum tokens to process, default=200000")
-                .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
-                .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
-                .addOption("defaultLangCode", true, "which language to use for common words if no 'common words' " + "file exists for the langid result");
-    }
+
+    //need to parameterize?
 
     private final Path inputDir;
 
@@ -118,8 +84,8 @@ public class ExtractComparer extends ProfilerBase {
     private final TokenContraster tokenContraster = new TokenContraster();
     private final ExtractReader extractReader;
 
-    public ExtractComparer(ArrayBlockingQueue<FileResource> queue, Path inputDir, Path extractsA, Path extractsB, ExtractReader extractReader, IDBWriter writer) {
-        super(queue, writer);
+    public ExtractComparer(Path inputDir, Path extractsA, Path extractsB, ExtractReader extractReader, IDBWriter writer) {
+        super(writer);
         this.inputDir = inputDir;
         this.extractsA = extractsA;
         this.extractsB = extractsB;
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java
new file mode 100644
index 000000000..25e55ee61
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java
@@ -0,0 +1,386 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+import java.io.IOException;
+import java.nio.file.FileVisitResult;
+import java.nio.file.FileVisitor;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.eval.app.batch.FileResource;
+import org.apache.tika.eval.app.batch.PathResource;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.JDBCUtil;
+import org.apache.tika.eval.app.db.MimeBuffer;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.DBWriter;
+import org.apache.tika.eval.app.io.ExtractReader;
+import org.apache.tika.eval.app.io.ExtractReaderException;
+import org.apache.tika.eval.app.io.IDBWriter;
+
+public class ExtractComparerRunner {
+
+    private static final Logger LOG = LoggerFactory.getLogger(ExtractComparerRunner.class);
+    private static final PathResource SEMAPHORE = new PathResource(Paths.get("/"), "STOP");
+    private static final int DIR_WALKER_COMPLETED_VALUE = 2;
+    private static final int COMPARER_WORKER_COMPLETED_VALUE = 1;
+
+    static Options OPTIONS;
+
+    static {
+
+        OPTIONS = new Options()
+                .addOption(Option.builder("a").longOpt("extractsA").hasArg().desc("required: directory of 'A' extracts").build())
+                .addOption(Option.builder("b").longOpt("extractsB").hasArg().desc("required: directory of 'B' extracts").build())
+                .addOption(Option.builder("i").longOpt("inputDir").hasArg().desc("optional: directory for original binary input documents." +
+                        " If not specified, -extractsA is crawled as is.").build())
+                .addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db path").build())
+                .addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json config file").build())
+                ;
+    }
+
+    public static void main(String[] args) throws Exception {
+        DefaultParser defaultCLIParser = new DefaultParser();
+        CommandLine commandLine = defaultCLIParser.parse(OPTIONS, args);
+        EvalConfig evalConfig = commandLine.hasOption('c') ? EvalConfig.load(Paths.get(commandLine.getOptionValue('c'))) : new EvalConfig();
+        Path extractsADir = commandLine.hasOption('a') ? Paths.get(commandLine.getOptionValue('a')) : Paths.get(USAGE_FAIL("Must specify extractsA dir: -a"));
+        Path extractsBDir = commandLine.hasOption('b') ? Paths.get(commandLine.getOptionValue('b')) : Paths.get(USAGE_FAIL("Must specify extractsB dir: -b"));
+        Path inputDir = commandLine.hasOption('i') ? Paths.get(commandLine.getOptionValue('i')) : extractsADir;
+        String dbPath = commandLine.hasOption('d') ? commandLine.getOptionValue('d') : USAGE_FAIL("Must specify the db name: -d");
+        String jdbcString = getJdbcConnectionString(dbPath);
+        execute(inputDir, extractsADir, extractsBDir, jdbcString, evalConfig);
+    }
+
+    private static String getJdbcConnectionString(String dbPath) {
+        if (dbPath.startsWith("jdbc:")) {
+            return dbPath;
+        }
+        //default to h2
+        Path p = Paths.get(dbPath);
+        return "jdbc:h2:file:" + p.toAbsolutePath();
+
+    }
+
+    private static void execute(Path inputDir, Path extractsA, Path extractsB, String dbPath, EvalConfig evalConfig) throws SQLException, IOException {
+
+        //parameterize this? if necessary
+        try {
+            ProfilerBase.loadCommonTokens(null, null);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+
+        JDBCUtil jdbcUtil = new JDBCUtil(dbPath, evalConfig.getJdbcDriverClass());
+        ExtractComparerBuilder builder = new ExtractComparerBuilder();
+        MimeBuffer mimeBuffer = initTables(jdbcUtil, builder, dbPath, evalConfig);
+        builder.populateRefTables(jdbcUtil, mimeBuffer);
+
+        AtomicInteger enqueued = new AtomicInteger(0);
+        AtomicInteger processed = new AtomicInteger(0);
+        AtomicInteger activeWorkers = new AtomicInteger(evalConfig.getNumWorkers());
+        AtomicBoolean crawlerActive = new AtomicBoolean(true);
+
+        ArrayBlockingQueue<FileResource> queue = new ArrayBlockingQueue<>(1000);
+        ExecutorService executorService = Executors.newFixedThreadPool(evalConfig.getNumWorkers() + 2);
+        ExecutorCompletionService<Integer> executorCompletionService = new ExecutorCompletionService<>(executorService);
+
+        StatusReporter statusReporter = new StatusReporter(enqueued, processed, activeWorkers, crawlerActive);
+        executorCompletionService.submit(statusReporter);
+
+        DirectoryWalker directoryWalker = new DirectoryWalker(inputDir, queue, enqueued);
+        executorCompletionService.submit(directoryWalker);
+        for (int i = 0; i < evalConfig.getNumWorkers(); i++) {
+            ExtractReader extractReader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS, evalConfig.getMinExtractLength(), evalConfig.getMaxExtractLength());
+            ExtractComparer extractComparer = new ExtractComparer(inputDir, extractsA, extractsB, extractReader,
+                    builder.getDBWriter(builder.getNonRefTableInfos(), jdbcUtil, mimeBuffer));
+            executorCompletionService.submit(new ComparerWorker(queue, extractComparer, processed));
+        }
+
+        int finished = 0;
+        try {
+            while (finished < evalConfig.getNumWorkers() + 2) {
+                //blocking
+                Future<Integer> future = executorCompletionService.take();
+                Integer result = future.get();
+                if (result != null) {
+                    //if the dir walker has finished
+                    if (result == DIR_WALKER_COMPLETED_VALUE) {
+                        queue.put(SEMAPHORE);
+                        crawlerActive.set(false);
+                    } else if (result == COMPARER_WORKER_COMPLETED_VALUE) {
+                        activeWorkers.decrementAndGet();
+                    }
+                    finished++;
+                }
+            }
+        } catch (InterruptedException e) {
+            LOG.info("interrupted", e);
+        } catch (ExecutionException e) {
+            throw new RuntimeException(e);
+        } finally {
+            mimeBuffer.close();
+            executorService.shutdownNow();
+        }
+
+    }
+
+    private static MimeBuffer initTables(JDBCUtil jdbcUtil, ExtractComparerBuilder builder, String connectionString, EvalConfig evalConfig) throws SQLException, IOException {
+
+        //step 1. create the tables
+        jdbcUtil.createTables(builder.getNonRefTableInfos(), JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS);
+        jdbcUtil.createTables(builder.getRefTableInfos(), JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS);
+
+        //step 2. create mime buffer
+        return new MimeBuffer(jdbcUtil.getConnection(), builder.getMimeTable(), TikaConfig.getDefaultConfig());
+    }
+
+    private static void USAGE() {
+        HelpFormatter helpFormatter = new HelpFormatter();
+        helpFormatter.printHelp(80, "java -jar tika-eval-app-x.y.z.jar Compare -a extractsA -b extractsB -d mydb [-i inputDir, -c config.json]",
+                "Tool: Compare", OPTIONS, "");
+    }
+
+    private static String USAGE_FAIL(String msg) {
+        USAGE();
+        throw new IllegalArgumentException(msg);
+    }
+
+    private static class ComparerWorker implements Callable<Integer> {
+
+        private final ArrayBlockingQueue<FileResource> queue;
+        private final ExtractComparer extractComparer;
+        private final AtomicInteger processed;
+
+        ComparerWorker(ArrayBlockingQueue<FileResource> queue, ExtractComparer extractComparer, AtomicInteger processed) {
+            this.queue = queue;
+            this.extractComparer = extractComparer;
+            this.processed = processed;
+        }
+
+        @Override
+        public Integer call() throws Exception {
+            while (true) {
+                FileResource resource = queue.poll(1, TimeUnit.SECONDS);
+                if (resource == null) {
+                    LOG.info("ComparerWorker waiting on queue");
+                    continue;
+                }
+                if (resource == SEMAPHORE) {
+                    LOG.debug("worker hit semaphore and is stopping");
+                    extractComparer.closeWriter();
+                    //hangs if the queue is full; re-enqueue the semaphore so the remaining workers also stop
+                    queue.put(resource);
+                    return COMPARER_WORKER_COMPLETED_VALUE;
+                }
+                extractComparer.processFileResource(resource);
+                processed.incrementAndGet();
+            }
+        }
+    }
+
+    private static class DirectoryWalker implements Callable<Integer> {
+        private final Path startDir;
+        private final ArrayBlockingQueue<FileResource> queue;
+        private final AtomicInteger enqueued;
+
+        public DirectoryWalker(Path startDir, ArrayBlockingQueue<FileResource> queue, AtomicInteger enqueued) {
+            this.startDir = startDir;
+            this.queue = queue;
+            this.enqueued = enqueued;
+        }
+
+        @Override
+        public Integer call() throws Exception {
+            Files.walkFileTree(startDir, new FileVisitor<Path>() {
+                @Override
+                public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
+                    return FileVisitResult.CONTINUE;
+                }
+
+                @Override
+                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
+                    if (Files.isDirectory(file)) {
+                        return FileVisitResult.CONTINUE;
+                    }
+                    try {
+                        //blocking
+                        queue.put(new PathResource(file, startDir.relativize(file).toString()));
+                        enqueued.incrementAndGet();
+                    } catch (InterruptedException e) {
+                        return FileVisitResult.TERMINATE;
+                    }
+                    return FileVisitResult.CONTINUE;
+                }
+
+                @Override
+                public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
+                    return FileVisitResult.CONTINUE;
+                }
+
+                @Override
+                public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
+                    return FileVisitResult.CONTINUE;
+                }
+            });
+            return DIR_WALKER_COMPLETED_VALUE;
+        }
+    }
+
+    private static class ExtractComparerBuilder {
+        private final List<TableInfo> tableInfosA;
+        private final List<TableInfo> tableInfosB;
+        private final List<TableInfo> tableInfosAandB;
+        private final List<TableInfo> refTableInfos;
+
+        public ExtractComparerBuilder() {
+            List<TableInfo> tableInfosA = new ArrayList<>();
+            List<TableInfo> tableInfosB = new ArrayList<>();
+            List<TableInfo> tableInfosAandB = new ArrayList<>();
+            tableInfosA.add(ExtractComparer.PROFILES_A);
+            tableInfosA.add(ExtractComparer.EXCEPTION_TABLE_A);
+            tableInfosA.add(ExtractComparer.TAGS_TABLE_A);
+            tableInfosA.add(ExtractComparer.CONTENTS_TABLE_A);
+            tableInfosA.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_A);
+            tableInfosA.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_A);
+
+            tableInfosB.add(ExtractComparer.PROFILES_B);
+            tableInfosB.add(ExtractComparer.EXCEPTION_TABLE_B);
+            tableInfosB.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
+            tableInfosB.add(ExtractComparer.TAGS_TABLE_B);
+            tableInfosB.add(ExtractComparer.CONTENTS_TABLE_B);
+            tableInfosB.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_B);
+
+            tableInfosAandB.add(ExtractComparer.COMPARISON_CONTAINERS);
+            tableInfosAandB.add(ExtractComparer.CONTENT_COMPARISONS);
+            tableInfosAandB.add(AbstractProfiler.MIME_TABLE);
+
+            List<TableInfo> refTableInfos = new ArrayList<>();
+            refTableInfos.add(ExtractComparer.REF_PAIR_NAMES);
+            refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
+            refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
+            refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES);
+
+            this.tableInfosA = Collections.unmodifiableList(tableInfosA);
+            this.tableInfosB = Collections.unmodifiableList(tableInfosB);
+            this.tableInfosAandB = Collections.unmodifiableList(tableInfosAandB);
+            this.refTableInfos = Collections.unmodifiableList(refTableInfos);
+        }
+
+
+        protected List<TableInfo> getRefTableInfos() {
+            return refTableInfos;
+        }
+
+        protected List<TableInfo> getNonRefTableInfos() {
+            List<TableInfo> allNonRefTables = new ArrayList<>();
+            allNonRefTables.addAll(tableInfosA);
+            allNonRefTables.addAll(tableInfosB);
+            allNonRefTables.addAll(tableInfosAandB);
+            return Collections.unmodifiableList(allNonRefTables);
+        }
+
+        protected TableInfo getMimeTable() {
+            return AbstractProfiler.MIME_TABLE;
+        }
+
+        public void populateRefTables(JDBCUtil dbUtil, MimeBuffer mimeBuffer) throws IOException, SQLException {
+            boolean refTablesPopulated = true;
+            try {
+                Connection connection = dbUtil.getConnection();
+                for (TableInfo tableInfo : getRefTableInfos()) {
+                    int rows = 0;
+                    try (ResultSet rs = connection
+                            .createStatement()
+                            .executeQuery("select * from " + tableInfo.getName())) {
+                        while (rs.next()) {
+                            rows++;
+                        }
+                    }
+                    if (rows == 0) {
+                        refTablesPopulated = false;
+                        break;
+                    }
+
+                }
+            } catch (SQLException e) {
+                //swallow
+            }
+            if (refTablesPopulated) {
+                LOG.info("ref tables are already populated");
+                return;
+            }
+
+            IDBWriter writer = getDBWriter(getRefTableInfos(), dbUtil, mimeBuffer);
+            Map<Cols, String> m = new HashMap<>();
+            for (AbstractProfiler.PARSE_ERROR_TYPE t : AbstractProfiler.PARSE_ERROR_TYPE.values()) {
+                m.clear();
+                m.put(Cols.PARSE_ERROR_ID, Integer.toString(t.ordinal()));
+                m.put(Cols.PARSE_ERROR_DESCRIPTION, t.name());
+                writer.writeRow(AbstractProfiler.REF_PARSE_ERROR_TYPES, m);
+            }
+
+            for (AbstractProfiler.EXCEPTION_TYPE t : AbstractProfiler.EXCEPTION_TYPE.values()) {
+                m.clear();
+                m.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(t.ordinal()));
+                m.put(Cols.PARSE_EXCEPTION_DESCRIPTION, t.name());
+                writer.writeRow(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES, m);
+            }
+
+            for (ExtractReaderException.TYPE t : ExtractReaderException.TYPE.values()) {
+                m.clear();
+                m.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(t.ordinal()));
+                m.put(Cols.EXTRACT_EXCEPTION_DESCRIPTION, t.name());
+                writer.writeRow(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES, m);
+            }
+            writer.close();
+        }
+
+        protected IDBWriter getDBWriter(List<TableInfo> tableInfos, JDBCUtil dbUtil, MimeBuffer mimeBuffer) throws IOException, SQLException {
+            Connection conn = dbUtil.getConnection();
+            return new DBWriter(conn, tableInfos, dbUtil, mimeBuffer);
+        }
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
index 9b0f482f6..680e50535 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
@@ -86,6 +86,7 @@ public class ExtractProfiler extends ProfilerBase {
 
     }
 
+    @Override
     public boolean processFileResource(FileResource fileResource) {
         Metadata metadata = fileResource.getMetadata();
         EvalFilePaths fps = null;
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
index 19a7d680f..7325dc535 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
@@ -42,6 +42,7 @@ import org.slf4j.LoggerFactory;
 import org.xml.sax.SAXException;
 
 import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.eval.app.batch.FileResource;
 import org.apache.tika.eval.app.db.ColInfo;
 import org.apache.tika.eval.app.db.Cols;
 import org.apache.tika.eval.app.db.TableInfo;
@@ -796,6 +797,8 @@ public abstract class ProfilerBase {
         return NON_EXISTENT_FILE_LENGTH;
     }
 
+    public abstract boolean processFileResource(FileResource fileResource);
+
     public enum EXCEPTION_TYPE {
         RUNTIME, ENCRYPTION, ACCESS_PERMISSION, UNSUPPORTED_VERSION,
     }
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
index 91aecd832..b44b0cf4a 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
@@ -158,105 +158,7 @@ public class TikaEvalCLI {
     }
 
     private void handleCompare(String[] subsetArgs) throws Exception {
-        List<String> argList = new ArrayList(Arrays.asList(subsetArgs));
-
-        boolean containsBC = false;
-        String inputDir = null;
-        String extractsA = null;
-        String alterExtract = null;
-        //confirm there's a batch-config file
-        for (int i = 0; i < argList.size(); i++) {
-            String arg = argList.get(i);
-            switch (arg) {
-                case "-bc":
-                    containsBC = true;
-                    break;
-                case "-inputDir":
-                    if (i + 1 >= argList.size()) {
-                        System.err.println("Must specify directory after -inputDir");
-                        ExtractComparer.USAGE();
-                        return;
-                    }
-                    inputDir = argList.get(i + 1);
-                    i++;
-                    break;
-                case "-extractsA":
-                    if (i + 1 >= argList.size()) {
-                        System.err.println("Must specify directory after -extractsA");
-                        ExtractComparer.USAGE();
-                        return;
-                    }
-                    extractsA = argList.get(i + 1);
-                    i++;
-                    break;
-                case "-alterExtract":
-                    if (i + 1 >= argList.size()) {
-                        System.err.println("Must specify type 'as_is', 'first_only' or " + "'concatenate_content' after -alterExtract");
-                        ExtractComparer.USAGE();
-                        return;
-                    }
-                    alterExtract = argList.get(i + 1);
-                    i++;
-                    break;
-            }
-        }
-        if (alterExtract != null && !alterExtract.equals("as_is") && !alterExtract.equals("concatenate_content") && !alterExtract.equals("first_only")) {
-            System.out.println("Sorry, I don't understand:" + alterExtract + ". The values must be one of: as_is, first_only, concatenate_content");
-            ExtractComparer.USAGE();
-            return;
-        }
-
-        //need to specify each in the commandline that goes into tika-batch
-        //if only extracts is passed to tika-batch,
-        //the crawler will see no inputDir and start crawling "input".
-        //if the user doesn't specify inputDir, crawl extractsA
-        if (inputDir == null && extractsA != null) {
-            argList.add("-inputDir");
-            argList.add(extractsA);
-        }
-
-        Path tmpBCConfig = null;
-        try {
-            tmpBCConfig = Files.createTempFile("tika-eval", ".xml");
-            if (!containsBC) {
-                try (InputStream is = this
-                        .getClass()
-                        .getResourceAsStream("/tika-eval-comparison-config.xml")) {
-                    Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
-                }
-                argList.add("-bc");
-                argList.add(tmpBCConfig
-                        .toAbsolutePath()
-                        .toString());
-
-            }
-            String[] updatedArgs = argList.toArray(new String[0]);
-            DefaultParser defaultCLIParser = new DefaultParser();
-            try {
-                CommandLine commandLine = defaultCLIParser.parse(ExtractComparer.OPTIONS, updatedArgs);
-                if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
-                    System.out.println("Please specify either the default -db or the full -jdbc, not both");
-                    ExtractComparer.USAGE();
-                    return;
-                }
-            } catch (ParseException e) {
-                System.out.println(e.getMessage() + "\n");
-                ExtractComparer.USAGE();
-                return;
-            }
-
-            // lazy delete because main() calls System.exit()
-            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
-                tmpBCConfig
-                        .toFile()
-                        .deleteOnExit();
-            }
-            FSBatchProcessCLI.main(updatedArgs);
-        } finally {
-            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
-                Files.delete(tmpBCConfig);
-            }
-        }
+        ExtractComparerRunner.main(subsetArgs);
     }
 
     private void handleReport(String[] subsetArgs) throws Exception {
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractComparerBuilder.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractComparerBuilder.java
deleted file mode 100644
index 6788de49e..000000000
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractComparerBuilder.java
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.app.batch;
-
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.eval.app.AbstractProfiler;
-import org.apache.tika.eval.app.ExtractComparer;
-import org.apache.tika.eval.app.db.TableInfo;
-import org.apache.tika.util.PropsUtil;
-
-public class ExtractComparerBuilder extends EvalConsumerBuilder {
-    public final static String TABLE_PREFIX_A_KEY = "tablePrefixA";
-    public final static String TABLE_PREFIX_B_KEY = "tablePrefixB";
-
-    private final List<TableInfo> tableInfosA;
-    private final List<TableInfo> tableInfosB;
-    private final List<TableInfo> tableInfosAandB;
-    private final List<TableInfo> refTableInfos;
-
-    public ExtractComparerBuilder() {
-        List<TableInfo> tableInfosA = new ArrayList<>();
-        List<TableInfo> tableInfosB = new ArrayList<>();
-        List<TableInfo> tableInfosAandB = new ArrayList<>();
-        tableInfosA.add(ExtractComparer.PROFILES_A);
-        tableInfosA.add(ExtractComparer.EXCEPTION_TABLE_A);
-        tableInfosA.add(ExtractComparer.TAGS_TABLE_A);
-        tableInfosA.add(ExtractComparer.CONTENTS_TABLE_A);
-        tableInfosA.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_A);
-        tableInfosA.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_A);
-
-        tableInfosB.add(ExtractComparer.PROFILES_B);
-        tableInfosB.add(ExtractComparer.EXCEPTION_TABLE_B);
-        tableInfosB.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
-        tableInfosB.add(ExtractComparer.TAGS_TABLE_B);
-        tableInfosB.add(ExtractComparer.CONTENTS_TABLE_B);
-        tableInfosB.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_B);
-
-        tableInfosAandB.add(ExtractComparer.COMPARISON_CONTAINERS);
-        tableInfosAandB.add(ExtractComparer.CONTENT_COMPARISONS);
-        tableInfosAandB.add(AbstractProfiler.MIME_TABLE);
-
-        List<TableInfo> refTableInfos = new ArrayList<>();
-        refTableInfos.add(ExtractComparer.REF_PAIR_NAMES);
-        refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
-        refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
-        refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES);
-
-        this.tableInfosA = Collections.unmodifiableList(tableInfosA);
-        this.tableInfosB = Collections.unmodifiableList(tableInfosB);
-        this.tableInfosAandB = Collections.unmodifiableList(tableInfosAandB);
-        this.refTableInfos = Collections.unmodifiableList(refTableInfos);
-    }
-
-    @Override
-    public FileResourceConsumer build() throws IOException, SQLException {
-        Path extractsA = PropsUtil.getPath(localAttrs.get("extractsA"), null);
-        if (extractsA == null) {
-            throw new RuntimeException("Must specify \"extractsA\" -- directory for 'A' extracts");
-        }
-        Path extractsB = PropsUtil.getPath(localAttrs.get("extractsB"), null);
-        if (extractsB == null) {
-            throw new RuntimeException("Must specify \"extractsB\" -- directory for 'B' extracts");
-        }
-
-        Path inputRootDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
-
-        if (inputRootDir == null) {
-            //this is for the sake of the crawler
-            throw new RuntimeException("Must specify an -inputDir");
-        }
-
-        return parameterizeProfiler(new ExtractComparer(queue, inputRootDir, extractsA, extractsB, buildExtractReader(localAttrs), getDBWriter(getNonRefTableInfos())));
-    }
-
-
-    @Override
-    protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
-        String tablePrefixA = localAttrs.get(TABLE_PREFIX_A_KEY);
-
-        String tablePrefixB = localAttrs.get(TABLE_PREFIX_B_KEY);
-
-        tablePrefixA = (tablePrefixA == null || tablePrefixA.endsWith("_")) ? tablePrefixA : tablePrefixA + "_";
-        tablePrefixB = (tablePrefixB == null || tablePrefixB.endsWith("_")) ? tablePrefixB : tablePrefixB + "_";
-
-        if (tablePrefixA != null) {
-            for (TableInfo tableInfo : tableInfosA) {
-                tableInfo.setNamePrefix(tablePrefixA);
-            }
-        }
-
-        if (tablePrefixB != null) {
-            for (TableInfo tableInfo : tableInfosB) {
-                tableInfo.setNamePrefix(tablePrefixB);
-            }
-        }
-
-        if (tablePrefixA != null || tablePrefixB != null) {
-            String aAndB = (tablePrefixA == null) ? "" : tablePrefixA;
-            aAndB = (tablePrefixB == null) ? aAndB : aAndB + tablePrefixB;
-            for (TableInfo tableInfo : tableInfosAandB) {
-                tableInfo.setNamePrefix(aAndB);
-            }
-        }
-    }
-
-    @Override
-    protected List<TableInfo> getRefTableInfos() {
-        return refTableInfos;
-    }
-
-    @Override
-    protected List<TableInfo> getNonRefTableInfos() {
-        List<TableInfo> allNonRefTables = new ArrayList<>();
-        allNonRefTables.addAll(tableInfosA);
-        allNonRefTables.addAll(tableInfosB);
-        allNonRefTables.addAll(tableInfosAandB);
-        return Collections.unmodifiableList(allNonRefTables);
-    }
-
-    @Override
-    protected TableInfo getMimeTable() {
-        return AbstractProfiler.MIME_TABLE;
-    }
-
-    @Override
-    protected void addErrorLogTablePairs(DBConsumersManager manager) {
-        Path errorLogA = PropsUtil.getPath(localAttrs.get("errorLogFileA"), null);
-        if (errorLogA == null) {
-            return;
-        }
-        manager.addErrorLogTablePair(errorLogA, ExtractComparer.EXTRACT_EXCEPTION_TABLE_A);
-        Path errorLogB = PropsUtil.getPath(localAttrs.get("errorLogFileB"), null);
-        if (errorLogB == null) {
-            return;
-        }
-        manager.addErrorLogTablePair(errorLogB, ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
-
-    }
-
-}
diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
index 721d106a2..ae9363f52 100644
--- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
+++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
@@ -56,7 +56,7 @@ public class SimpleComparerTest extends TikaTest {
     @BeforeAll
     public static void staticSetUp() throws Exception {
        WRITER = new MockDBWriter();
-        AbstractProfiler.loadCommonTokens(Paths.get(SimpleComparerTest.class
+        ProfilerBase.loadCommonTokens(Paths.get(SimpleComparerTest.class
                 .getResource("/common_tokens")
                 .toURI()), "en");
     }
@@ -64,7 +64,7 @@ public class SimpleComparerTest extends TikaTest {
     @BeforeEach
     public void setUp() throws Exception {
         WRITER.clear();
-        comparer = new ExtractComparer(null, null, Paths.get("extractsA"), Paths.get("extractsB"),
+        comparer = new ExtractComparer(null, Paths.get("extractsA"), Paths.get("extractsB"),
                 new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS, IGNORE_LENGTH, IGNORE_LENGTH), WRITER);
     }
diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
index 4d7d4bb2b..6bb22f9a6 100644
--- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
+++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
@@ -68,25 +68,18 @@ public class TikaEvalCLITest extends TikaTest {
     private static void compare() throws IOException {
         List<String> args = new ArrayList<>();
         args.add("Compare");
-        args.add("-extractsA");
+        args.add("-a");
         args.add(ProcessUtils.escapeCommandLine(extractsDir
                 .resolve("extractsA")
                 .toAbsolutePath()
                 .toString()));
-        args.add("-extractsB");
+        args.add("-b");
         args.add(ProcessUtils.escapeCommandLine(extractsDir
                 .resolve("extractsB")
                 .toAbsolutePath()
                 .toString()));
-        //add these just to confirm this info doesn't cause problems w cli
-        args.add("-maxTokens");
-        args.add("10000000");
-        args.add("-maxContentLength");
-        args.add("100000000");
-        args.add("-maxContentLengthForLangId");
-        args.add("100000");
-        args.add("-db");
+        args.add("-d");
         args.add(ProcessUtils.escapeCommandLine(compareDBDir
                 .toAbsolutePath()
                 .toString() + "/" + dbName));
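
A note on the design: the new ExtractComparerRunner replaces tika-batch's orchestration with a small hand-rolled producer/consumer loop. DirectoryWalker fills a bounded ArrayBlockingQueue, N ComparerWorkers drain it, and a single poison-pill SEMAPHORE, re-enqueued by each worker that sees it, shuts the pool down. A minimal, self-contained sketch of that coordination pattern (class name, queue size, and the println stand-in for processFileResource() are illustrative, not from the commit):

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;

public class PoisonPillSketch {

    //unique sentinel; workers test identity (==), so any dedicated instance works
    private static final Path SEMAPHORE = Paths.get("STOP");

    public static void main(String[] args) throws Exception {
        Path root = Paths.get(args.length > 0 ? args[0] : ".");
        ArrayBlockingQueue<Path> queue = new ArrayBlockingQueue<>(1000);
        int numWorkers = 4;
        ExecutorService pool = Executors.newFixedThreadPool(numWorkers + 1);

        //producer: crawl the tree and enqueue every file; put() blocks when the queue is full
        pool.submit(() -> {
            try (Stream<Path> paths = Files.walk(root)) {
                Iterator<Path> it = paths.filter(Files::isRegularFile).iterator();
                while (it.hasNext()) {
                    queue.put(it.next());
                }
            }
            //one pill is enough -- each worker puts it back before exiting
            queue.put(SEMAPHORE);
            return null;
        });

        //consumers: poll with a timeout so a slow producer doesn't look like a shutdown
        for (int i = 0; i < numWorkers; i++) {
            pool.submit(() -> {
                while (true) {
                    Path p = queue.poll(1, TimeUnit.SECONDS);
                    if (p == null) {
                        continue; //queue momentarily empty; keep waiting for work or the pill
                    }
                    if (p == SEMAPHORE) {
                        queue.put(p); //re-enqueue so the remaining workers also stop
                        return null;
                    }
                    System.out.println("processing " + p); //stand-in for processFileResource()
                }
            });
        }
        pool.shutdown();
        pool.awaitTermination(1, TimeUnit.HOURS);
    }
}

Because each worker puts the pill back before returning, the walker only ever posts one sentinel regardless of how many workers are configured; the last worker simply leaves it in the queue.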
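
Since handleCompare() now just delegates to ExtractComparerRunner.main(...), a comparison can also be driven programmatically. A sketch with placeholder paths (per getJdbcConnectionString() above, a bare -d path is expanded to an H2 JDBC URL):

//equivalent to: java -jar tika-eval-app-x.y.z.jar Compare -a <extractsA> -b <extractsB> -d <db>
ExtractComparerRunner.main(new String[]{
        "-a", "/data/extractsA",    //placeholder: directory of 'A' extracts
        "-b", "/data/extractsB",    //placeholder: directory of 'B' extracts
        "-d", "/data/comparisons"   //placeholder: expanded to jdbc:h2:file:/data/comparisons
});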