This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4342 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 303623e67e451392eab1249d47463f2861507127 Author: tallison <talli...@apache.org> AuthorDate: Tue Jul 8 15:27:27 2025 -0400 TIKA-4342 -- remove tika-batch from ExtractProfiler --- .../java/org/apache/tika/eval/app/EvalConfig.java | 89 ++++- .../tika/eval/app/ExctractProfileRunner.java | 204 ----------- .../apache/tika/eval/app/ExtractProfileRunner.java | 374 +++++++++++++++++++++ .../org/apache/tika/eval/app/ExtractProfiler.java | 1 - .../org/apache/tika/eval/app/FileProfiler.java | 19 ++ .../org/apache/tika/eval/app/ProfilerBase.java | 3 - .../org/apache/tika/eval/app/StatusReporter.java | 102 ++++++ .../java/org/apache/tika/eval/app/TikaEvalCLI.java | 4 +- .../eval/app/batch/ExtractProfilerBuilder.java | 120 ------- .../apache/tika/eval/app/batch/PathResource.java | 21 +- .../org/apache/tika/eval/app/EvalConfigTest.java | 42 +++ .../apache/tika/eval/app/ProfilerBatchTest.java | 117 +++---- .../org/apache/tika/eval/app/TikaEvalCLITest.java | 11 +- .../resources/eval-configs/eval-config-basic.json | 3 + .../test-dirs/raw_input/file10_permahang.txt | 0 .../resources/test-dirs/raw_input/file12_es.txt | 6 + .../test-dirs/raw_input/file13_attachANotB.doc | 12 + .../test-dirs/raw_input/file14_diffAttachOrder | 21 ++ .../test/resources/test-dirs/raw_input/file15_tags | 41 +++ .../resources/test-dirs/raw_input/file16_badTags | 41 +++ .../test-dirs/raw_input/file17_tagsOutOfOrder | 41 +++ tika-parent/pom.xml | 2 +- 22 files changed, 859 insertions(+), 415 deletions(-) diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java index 416d7bb6e..5525180ed 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java @@ -1,13 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.tika.eval.app; +import java.nio.file.Path; + +import com.fasterxml.jackson.databind.ObjectMapper; + public class EvalConfig { - long minExtractLength = 0; - long maxExtractLength = 10_000_000; - String jdbcString = null; - int maxFilesToAdd = -1; - int maxTokens = 200000; - int maxContentLength = 5_000_000; - int numThreads = 4; + private long minExtractLength = 0; + private long maxExtractLength = 2_000_000; + private String jdbcString = null; + private String jdbcDriverClass = null; + private boolean forceDrop = true; + private int maxFilesToAdd = -1; + private int maxTokens = 200000; + + private int maxContentLength = 5_000_000; + private int numWorkers = 4; + private Path errorLogFile = null; + + + public static EvalConfig load(Path path) throws Exception { + return new ObjectMapper().readValue(path.toFile(), EvalConfig.class); + } + + public long getMinExtractLength() { + return minExtractLength; + } + + public long getMaxExtractLength() { + return maxExtractLength; + } + + public String getJdbcString() { + return jdbcString; + } + + public String getJdbcDriverClass() { + return jdbcDriverClass; + } + + public boolean isForceDrop() { + return forceDrop; + } + + public int getMaxFilesToAdd() { + return maxFilesToAdd; + } + + public int getMaxTokens() { + 
return maxTokens; + } + + public int getMaxContentLength() { + return maxContentLength; + } + + public int getNumWorkers() { + return numWorkers; + } + + public Path getErrorLogFile() { + return errorLogFile; + } + @Override + public String toString() { + return "EvalConfig{" + "minExtractLength=" + minExtractLength + ", maxExtractLength=" + maxExtractLength + ", jdbcString='" + jdbcString + '\'' + ", jdbcDriverClass='" + + jdbcDriverClass + '\'' + ", forceDrop=" + forceDrop + ", maxFilesToAdd=" + maxFilesToAdd + ", maxTokens=" + maxTokens + ", maxContentLength=" + maxContentLength + + ", numThreads=" + numWorkers + ", errorLogFile=" + errorLogFile + '}'; + } } diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExctractProfileRunner.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExctractProfileRunner.java deleted file mode 100644 index 20cb33c1c..000000000 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExctractProfileRunner.java +++ /dev/null @@ -1,204 +0,0 @@ -package org.apache.tika.eval.app; - -import java.io.IOException; -import java.nio.file.FileVisitResult; -import java.nio.file.FileVisitor; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.nio.file.attribute.BasicFileAttributes; -import java.util.List; -import java.util.concurrent.ArrayBlockingQueue; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorCompletionService; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.TimeUnit; - -import org.apache.commons.cli.CommandLine; -import org.apache.commons.cli.DefaultParser; -import org.apache.commons.cli.HelpFormatter; -import org.apache.commons.cli.Option; -import org.apache.commons.cli.Options; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import 
org.apache.tika.config.TikaConfig; -import org.apache.tika.eval.app.batch.FileResource; -import org.apache.tika.eval.app.batch.PathResource; -import org.apache.tika.eval.app.db.JDBCUtil; -import org.apache.tika.eval.app.db.MimeBuffer; -import org.apache.tika.eval.app.io.ExtractReader; -import org.apache.tika.eval.app.io.IDBWriter; - -public class ExctractProfileRunner { - - private static final Logger LOG = LoggerFactory.getLogger(ExctractProfileRunner.class); - private static final PathResource SEMAPHORE = new PathResource(Paths.get("/"), "STOP"); - - static Options OPTIONS; - - static { - - OPTIONS = new Options() - .addOption(Option.builder("e").longOpt("extracts").hasArg().desc("required: directory of extracts").build()) - .addOption(Option.builder("i").longOpt("inputDir").hasArg().desc("optional: directory for original binary input documents." - + " If not specified, -extracts is crawled as is.").build()) - .addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db path").build()) - .addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json config file").build()) - ; - } - public static void main(String[] args) throws Exception { - DefaultParser defaultCLIParser = new DefaultParser(); - CommandLine commandLine = defaultCLIParser.parse(OPTIONS, args); - EvalConfig evalConfig = commandLine.hasOption('c') ? EvalConfig.load(Paths.get(commandLine.getOptionValue('c'))) : new EvalConfig(); - Path extractsDir = commandLine.hasOption('e') ? Paths.get(commandLine.getOptionValue('e')) : Paths.get(USAGE_FAIL("Must specify extracts dir: -i")); - Path inputDir = commandLine.hasOption('i') ? Paths.get(commandLine.getOptionValue('i')) : extractsDir; - String dbPath = commandLine.hasOption('d') ? 
commandLine.getOptionValue('d') : USAGE_FAIL("Must specify the db name: -d"); - execute(inputDir, extractsDir, dbPath, evalConfig); - } - - private static void execute(Path inputDir, Path extractsDir, String dbPath, EvalConfig evalConfig) { - - ArrayBlockingQueue<FileResource> queue = new ArrayBlockingQueue<>(1000); - DirectoryWalker fileWalker = new DirectoryWalker(inputDir, queue); - ExecutorService executorService = Executors.newFixedThreadPool(evalConfig.numThreads + 1); - ExecutorCompletionService<Integer> executorCompletionService = new ExecutorCompletionService<>(executorService); - executorCompletionService.submit(fileWalker); - IDBWriter dbWriter = buildDBWriter(); - for (int i = 0; i < evalConfig.numThreads; i++) { - ExtractReader extractReader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS, evalConfig.minExtractLength, evalConfig.maxExtractLength); - - ExtractProfiler extractProfiler = new ExtractProfiler(inputDir, extractsDir, extractReader, dbWriter); - executorCompletionService.submit(new ProfileWorker(queue, extractProfiler)); - } - - int finished = 0; - try { - while (finished < evalConfig.numThreads + 1) { - //blocking - Future<Integer> future = executorCompletionService.take(); - Integer result = future.get(); - if (result != null) { - finished++; - } - - } - } catch (InterruptedException e) { - LOG.info("interrupted", e); - } catch (ExecutionException e) { - throw new RuntimeException(e); - } finally { - executorService.shutdownNow(); - } - - } - - private static IDBWriter buildDBWriter(String connectionString, String driverClass) { - MimeBuffer mimeBuffer = null; - JDBCUtil dbUtil = new JDBCUtil(connectionString, driverClass); - //Step 1. Used to be update table infos with prefixes - updateTableInfosWithPrefixes(localAttrs); - - JDBCUtil.CREATE_TABLE createRegularTable = (forceDrop) ? JDBCUtil.CREATE_TABLE.DROP_IF_EXISTS : JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS; - - JDBCUtil.CREATE_TABLE createRefTable = (forceDrop) ? 
JDBCUtil.CREATE_TABLE.DROP_IF_EXISTS : JDBCUtil.CREATE_TABLE.SKIP_IF_EXISTS; - - //step 2. create the tables - dbUtil.createTables(getNonRefTableInfos(), JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS); - dbUtil.createTables(getRefTableInfos(), JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS); - - //step 3. create mime buffer - this.mimeBuffer = new MimeBuffer(dbUtil.getConnection(), getMimeTable(), TikaConfig.getDefaultConfig()); - - //step 4. populate the reference tables - populateRefTables(); - - return mimeBuffer; - - - } - - private static void USAGE() { - HelpFormatter helpFormatter = new HelpFormatter(); - helpFormatter.printHelp(80, "java -jar tika-eval-app-x.y.z.jar FileProfiler -e docs -d mydb [-i inputDir, -c config.json]", - "Tool: Profile", OPTIONS, ""); - } - - private static String USAGE_FAIL(String msg) { - USAGE(); - throw new IllegalArgumentException(msg); - } - - private static class ProfileWorker implements Callable<Integer> { - - private final ArrayBlockingQueue<FileResource> queue; - private final ExtractProfiler extractProfiler; - ProfileWorker(ArrayBlockingQueue<FileResource> queue, ExtractProfiler extractProfiler) { - this.queue = queue; - this.extractProfiler = extractProfiler; - } - - @Override - public Integer call() throws Exception { - while (true) { - FileResource resource = queue.poll(1, TimeUnit.SECONDS); - if (resource == null) { - LOG.info("ExtractProfileWorker waiting on queue"); - continue; - } - if (resource == SEMAPHORE) { - LOG.debug("worker hit semaphore and is stopping"); - //hangs - queue.put(resource); - return 1; - } - extractProfiler.processFileResource(resource); - } - } - } - - private static class DirectoryWalker implements Callable<Integer> { - private final Path startDir; - private final ArrayBlockingQueue<FileResource> queue; - - public DirectoryWalker(Path startDir, ArrayBlockingQueue<FileResource> queue) { - this.startDir = startDir; - this.queue = queue; - } - - @Override - public Integer call() throws Exception { - 
Files.walkFileTree(startDir, new FileVisitor<Path>() { - @Override - public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException { - return FileVisitResult.CONTINUE; - } - - @Override - public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { - //blocking - try { - queue.put(new PathResource(file, startDir.relativize(file).toString())); - } catch (InterruptedException e) { - return FileVisitResult.TERMINATE; - } - return FileVisitResult.CONTINUE; - } - - @Override - public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException { - return FileVisitResult.CONTINUE; - } - - @Override - public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { - return FileVisitResult.CONTINUE; - } - }); - return 0; - } - } -} diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java new file mode 100644 index 000000000..cd80a3df3 --- /dev/null +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java @@ -0,0 +1,374 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.eval.app; + +import java.io.IOException; +import java.nio.file.FileVisitResult; +import java.nio.file.FileVisitor; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.attribute.BasicFileAttributes; +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.DefaultParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.tika.config.TikaConfig; +import org.apache.tika.eval.app.batch.DBConsumersManager; +import org.apache.tika.eval.app.batch.FileResource; +import org.apache.tika.eval.app.batch.PathResource; +import org.apache.tika.eval.app.db.Cols; +import org.apache.tika.eval.app.db.JDBCUtil; +import org.apache.tika.eval.app.db.MimeBuffer; +import org.apache.tika.eval.app.db.TableInfo; +import org.apache.tika.eval.app.io.DBWriter; +import org.apache.tika.eval.app.io.ExtractReader; +import org.apache.tika.eval.app.io.ExtractReaderException; +import org.apache.tika.eval.app.io.IDBWriter; + +public class ExtractProfileRunner { + + private 
static final Logger LOG = LoggerFactory.getLogger(ExtractProfileRunner.class); + private static final PathResource SEMAPHORE = new PathResource(Paths.get("/"), "STOP"); + private static final int DIR_WALKER_COMPLETED_VALUE = 2; + private static final int PROFILE_WORKER_COMPLETED_VALUE = 1; + + static Options OPTIONS; + + static { + + OPTIONS = new Options() + .addOption(Option.builder("e").longOpt("extracts").hasArg().desc("required: directory of extracts").build()) + .addOption(Option.builder("i").longOpt("inputDir").hasArg().desc("optional: directory for original binary input documents." + + " If not specified, -extracts is crawled as is.").build()) + .addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db path").build()) + .addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json config file").build()) + ; + } + public static void main(String[] args) throws Exception { + DefaultParser defaultCLIParser = new DefaultParser(); + CommandLine commandLine = defaultCLIParser.parse(OPTIONS, args); + EvalConfig evalConfig = commandLine.hasOption('c') ? EvalConfig.load(Paths.get(commandLine.getOptionValue('c'))) : new EvalConfig(); + Path extractsDir = commandLine.hasOption('e') ? Paths.get(commandLine.getOptionValue('e')) : Paths.get(USAGE_FAIL("Must specify extracts dir: -e")); + Path inputDir = commandLine.hasOption('i') ? Paths.get(commandLine.getOptionValue('i')) : extractsDir; + String dbPath = commandLine.hasOption('d') ? 
commandLine.getOptionValue('d') : USAGE_FAIL("Must specify the db name: -d"); + String jdbcString = getJdbcConnectionString(dbPath); + execute(inputDir, extractsDir, jdbcString, evalConfig); + } + + private static String getJdbcConnectionString(String dbPath) { + if (dbPath.startsWith("jdbc:")) { + return dbPath; + } + //default to h2 + Path p = Paths.get(dbPath); + return "jdbc:h2:file:" + p.toAbsolutePath(); + + } + + private static void execute(Path inputDir, Path extractsDir, String dbPath, EvalConfig evalConfig) throws SQLException, IOException { + + //parameterize this? if necessary + try { + ProfilerBase.loadCommonTokens(null, null); + } catch (IOException e) { + throw new RuntimeException(e); + } + + JDBCUtil jdbcUtil = new JDBCUtil(dbPath, evalConfig.getJdbcDriverClass()); + ExtractProfilerBuilder builder = new ExtractProfilerBuilder(); + MimeBuffer mimeBuffer = initTables(jdbcUtil, builder, dbPath, evalConfig); + builder.populateRefTables(jdbcUtil, mimeBuffer); + + AtomicInteger enqueued = new AtomicInteger(0); + AtomicInteger processed = new AtomicInteger(0); + AtomicInteger activeWorkers = new AtomicInteger(evalConfig.getNumWorkers()); + AtomicBoolean crawlerActive = new AtomicBoolean(true); + + + + ArrayBlockingQueue<FileResource> queue = new ArrayBlockingQueue<>(1000); + ExecutorService executorService = Executors.newFixedThreadPool(evalConfig.getNumWorkers() + 2); + ExecutorCompletionService<Integer> executorCompletionService = new ExecutorCompletionService<>(executorService); + + StatusReporter statusReporter = new StatusReporter(enqueued, processed, activeWorkers, crawlerActive); + executorCompletionService.submit(statusReporter); + + DirectoryWalker directoryWalker = new DirectoryWalker(inputDir, queue, enqueued); + executorCompletionService.submit(directoryWalker); + for (int i = 0; i < evalConfig.getNumWorkers(); i++) { + ExtractReader extractReader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS, evalConfig.getMinExtractLength(), 
evalConfig.getMaxExtractLength()); + ExtractProfiler extractProfiler = new ExtractProfiler(inputDir, extractsDir, extractReader, builder.getDBWriter(builder.tableInfos, jdbcUtil, mimeBuffer)); + executorCompletionService.submit(new ProfileWorker(queue, extractProfiler, processed)); + } + + int finished = 0; + try { + while (finished < evalConfig.getNumWorkers() + 2) { + //blocking + Future<Integer> future = executorCompletionService.take(); + Integer result = future.get(); + if (result != null) { + //if the dir walker has finished + if (result == DIR_WALKER_COMPLETED_VALUE) { + queue.put(SEMAPHORE); + crawlerActive.set(false); + } else if (result == PROFILE_WORKER_COMPLETED_VALUE) { + activeWorkers.decrementAndGet(); + } + finished++; + } + } + } catch (InterruptedException e) { + LOG.info("interrupted", e); + } catch (ExecutionException e) { + throw new RuntimeException(e); + } finally { + mimeBuffer.close(); + executorService.shutdownNow(); + } + + } + + private static MimeBuffer initTables(JDBCUtil jdbcUtil, ExtractProfilerBuilder builder, String connectionString, EvalConfig evalConfig) throws SQLException, IOException { + + //step 1. create the tables + jdbcUtil.createTables(builder.getNonRefTableInfos(), JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS); + jdbcUtil.createTables(builder.getRefTableInfos(), JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS); + + //step 2. 
create mime buffer + return new MimeBuffer(jdbcUtil.getConnection(), builder.getMimeTable(), TikaConfig.getDefaultConfig()); + } + + private static void USAGE() { + HelpFormatter helpFormatter = new HelpFormatter(); + helpFormatter.printHelp(80, "java -jar tika-eval-app-x.y.z.jar FileProfiler -e docs -d mydb [-i inputDir, -c config.json]", + "Tool: Profile", OPTIONS, ""); + } + + private static String USAGE_FAIL(String msg) { + USAGE(); + throw new IllegalArgumentException(msg); + } + + private static class ProfileWorker implements Callable<Integer> { + + private final ArrayBlockingQueue<FileResource> queue; + private final ExtractProfiler extractProfiler; + private final AtomicInteger processed; + + ProfileWorker(ArrayBlockingQueue<FileResource> queue, ExtractProfiler extractProfiler, AtomicInteger processed) { + this.queue = queue; + this.extractProfiler = extractProfiler; + this.processed = processed; + } + + @Override + public Integer call() throws Exception { + while (true) { + FileResource resource = queue.poll(1, TimeUnit.SECONDS); + if (resource == null) { + LOG.info("ExtractProfileWorker waiting on queue"); + continue; + } + if (resource == SEMAPHORE) { + LOG.debug("worker hit semaphore and is stopping"); + extractProfiler.closeWriter(); + //hangs + queue.put(resource); + return PROFILE_WORKER_COMPLETED_VALUE; + } + extractProfiler.processFileResource(resource); + processed.incrementAndGet(); + } + } + } + + private static class DirectoryWalker implements Callable<Integer> { + private final Path startDir; + private final ArrayBlockingQueue<FileResource> queue; + private final AtomicInteger enqueued; + + public DirectoryWalker(Path startDir, ArrayBlockingQueue<FileResource> queue, AtomicInteger enqueued) { + this.startDir = startDir; + this.queue = queue; + this.enqueued = enqueued; + } + + @Override + public Integer call() throws Exception { + Files.walkFileTree(startDir, new FileVisitor<Path>() { + @Override + public FileVisitResult preVisitDirectory(Path 
dir, BasicFileAttributes attrs) throws IOException { + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + if (Files.isDirectory(file)) { + return FileVisitResult.CONTINUE; + } + try { + //blocking + queue.put(new PathResource(file, startDir.relativize(file).toString())); + enqueued.incrementAndGet(); + } catch (InterruptedException e) { + return FileVisitResult.TERMINATE; + } + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException { + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { + return FileVisitResult.CONTINUE; + } + }); + return DIR_WALKER_COMPLETED_VALUE; + } + } + + private static class ExtractProfilerBuilder { + private final List<TableInfo> tableInfos; + private final List<TableInfo> refTableInfos; + + public ExtractProfilerBuilder() { + List<TableInfo> tableInfos = new ArrayList(); + tableInfos.add(AbstractProfiler.MIME_TABLE); + tableInfos.add(ExtractProfiler.CONTAINER_TABLE); + tableInfos.add(ExtractProfiler.PROFILE_TABLE); + tableInfos.add(ExtractProfiler.EXTRACT_EXCEPTION_TABLE); + tableInfos.add(ExtractProfiler.EXCEPTION_TABLE); + tableInfos.add(ExtractProfiler.CONTENTS_TABLE); + tableInfos.add(ExtractProfiler.TAGS_TABLE); + tableInfos.add(ExtractProfiler.EMBEDDED_FILE_PATH_TABLE); + this.tableInfos = Collections.unmodifiableList(tableInfos); + + List<TableInfo> refTableInfos = new ArrayList<>(); + refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES); + refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES); + refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES); + this.refTableInfos = Collections.unmodifiableList(refTableInfos); + } + + + protected List<TableInfo> getRefTableInfos() { + return refTableInfos; + } + + protected List<TableInfo> 
getNonRefTableInfos() { + return tableInfos; + } + + protected TableInfo getMimeTable() { + return AbstractProfiler.MIME_TABLE; + } + + public void populateRefTables(JDBCUtil dbUtil, MimeBuffer mimeBuffer) throws IOException, SQLException { + boolean refTablesPopulated = true; + try { + Connection connection = dbUtil.getConnection(); + for (TableInfo tableInfo : getRefTableInfos()) { + int rows = 0; + try (ResultSet rs = connection + .createStatement() + .executeQuery("select * from " + tableInfo.getName())) { + while (rs.next()) { + rows++; + } + } + if (rows == 0) { + refTablesPopulated = false; + break; + } + + } + } catch (SQLException e) { + //swallow + } + if (refTablesPopulated) { + LOG.info("ref tables are already populated"); + return; + } + + IDBWriter writer = getDBWriter(getRefTableInfos(), dbUtil, mimeBuffer); + Map<Cols, String> m = new HashMap<>(); + for (AbstractProfiler.PARSE_ERROR_TYPE t : AbstractProfiler.PARSE_ERROR_TYPE.values()) { + m.clear(); + m.put(Cols.PARSE_ERROR_ID, Integer.toString(t.ordinal())); + m.put(Cols.PARSE_ERROR_DESCRIPTION, t.name()); + writer.writeRow(AbstractProfiler.REF_PARSE_ERROR_TYPES, m); + } + + for (AbstractProfiler.EXCEPTION_TYPE t : AbstractProfiler.EXCEPTION_TYPE.values()) { + m.clear(); + m.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(t.ordinal())); + m.put(Cols.PARSE_EXCEPTION_DESCRIPTION, t.name()); + writer.writeRow(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES, m); + } + + for (ExtractReaderException.TYPE t : ExtractReaderException.TYPE.values()) { + m.clear(); + m.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(t.ordinal())); + m.put(Cols.EXTRACT_EXCEPTION_DESCRIPTION, t.name()); + writer.writeRow(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES, m); + } + writer.close(); + } + + protected IDBWriter getDBWriter(List<TableInfo> tableInfos, JDBCUtil dbUtil, MimeBuffer mimeBuffer) throws IOException, SQLException { + Connection conn = dbUtil.getConnection(); + return new DBWriter(conn, tableInfos, dbUtil, mimeBuffer); 
+ } + + + protected void addErrorLogTablePairs(DBConsumersManager manager, EvalConfig evalConfig) { + Path errorLog = evalConfig.getErrorLogFile(); + if (errorLog == null) { + return; + } + manager.addErrorLogTablePair(errorLog, ExtractProfiler.EXTRACT_EXCEPTION_TABLE); + } + } +} diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java index a2a4986f3..9b0f482f6 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java @@ -22,7 +22,6 @@ import java.sql.Types; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.concurrent.ArrayBlockingQueue; import org.apache.commons.cli.Options; diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java index edc431e4a..925452094 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java @@ -73,7 +73,26 @@ public class FileProfiler extends AbstractProfiler { public static TableInfo FILE_MIME_TABLE = new TableInfo("file_mimes", new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256), new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12)); + static Options OPTIONS; + static { + + Option inputDir = new Option("inputDir", true, "optional: directory for original binary input documents." 
+ " If not specified, -extracts is crawled as is."); + + OPTIONS = new Options() + .addOption(inputDir) + .addOption("bc", "optional: tika-batch config file") + .addOption("numConsumers", true, "optional: number of consumer threads") + .addOption("db", true, "db file to which to write results") + .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>") + .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver") + .addOption("tablePrefix", true, "EXPERT: optional prefix for table names") + .addOption("drop", false, "drop tables if they exist") + .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler") + + ; + + } private final Path inputDir; diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java index f5d5f1a15..19a7d680f 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java @@ -29,7 +29,6 @@ import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; -import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -42,8 +41,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; -import org.apache.tika.batch.FileResource; -import org.apache.tika.batch.FileResourceConsumer; import org.apache.tika.batch.fs.FSProperties; import org.apache.tika.eval.app.db.ColInfo; import org.apache.tika.eval.app.db.Cols; diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/StatusReporter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/StatusReporter.java new file mode 100644 index 000000000..8df9824ed --- /dev/null +++ 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.app;

import java.text.NumberFormat;
import java.util.Locale;
import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.util.DurationFormatUtils;

/**
 * Periodically (once per second) logs crawl/processing progress: documents
 * processed so far, throughput, number of active workers, and number of files
 * the crawler has enqueued.  Completes when {@code activeWorkers} drops to 0
 * or when the reporting thread is interrupted.
 * <p>
 * Intended to be submitted to an {@code ExecutorService}; the shared counters
 * are updated by the crawler and worker threads.
 */
public class StatusReporter implements Callable<Integer> {

    /** Value returned from {@link #call()} on normal or interrupted completion. */
    public static final int COMPLETED_VAL = 3;

    private static final Logger LOGGER = LoggerFactory.getLogger(StatusReporter.class);

    private final AtomicInteger filesQueued;
    private final AtomicInteger filesProcessed;
    private final AtomicInteger activeWorkers;
    private final AtomicBoolean crawlerIsActive;
    // Wall-clock start time; used to compute elapsed time and docs/sec.
    private final long start;
    private final NumberFormat numberFormat = NumberFormat.getNumberInstance(Locale.ROOT);

    /**
     * @param filesQueued     count of files the crawler has enqueued so far
     * @param filesProcessed  count of files fully processed so far
     * @param activeWorkers   number of file processors still running; reporting
     *                        stops when this reaches 0
     * @param crawlerIsActive whether the directory crawler is still crawling
     */
    public StatusReporter(AtomicInteger filesQueued, AtomicInteger filesProcessed, AtomicInteger activeWorkers, AtomicBoolean crawlerIsActive) {
        this.filesQueued = filesQueued;
        this.filesProcessed = filesProcessed;
        this.activeWorkers = activeWorkers;
        this.crawlerIsActive = crawlerIsActive;
        this.start = System.currentTimeMillis();
    }

    /**
     * Loops, logging a status report roughly once per second, until all
     * workers have finished or this thread is interrupted.
     *
     * @return {@link #COMPLETED_VAL} in either case
     */
    @Override
    public Integer call() throws Exception {
        while (true) {

            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                LOGGER.info("Interrupted?");
                // Interruption is the expected shutdown signal, but we must
                // restore the interrupt flag so the executor/caller can still
                // observe that this thread was interrupted.
                Thread.currentThread().interrupt();
                return COMPLETED_VAL;
            }
            report();
            if (activeWorkers.get() == 0) {
                LOGGER.info("Completed successfully.");
                return COMPLETED_VAL;
            }
        }
    }

    /** Logs one snapshot: processed count, throughput, workers, queue, crawler state. */
    private void report() {
        int cnt = filesProcessed.get();
        long elapsed = System.currentTimeMillis() - start;
        double elapsedSecs = (double) elapsed / (double) 1000;
        // Only report docs/sec once there is enough data for a meaningful average.
        int avg = (elapsedSecs > 5 || cnt > 100) ? (int) ((double) cnt / elapsedSecs) : -1;

        String elapsedString = DurationFormatUtils.formatMillis(System.currentTimeMillis() - start);
        String docsPerSec = avg > -1 ? String.format(Locale.ROOT, " (%s docs per sec)", numberFormat.format(avg)) : "";
        String msg = String.format(Locale.ROOT, "Processed %s documents in %s%s.", numberFormat.format(cnt), elapsedString, docsPerSec);
        LOGGER.info(msg);

        int stillAlive = activeWorkers.get();
        if (stillAlive == 1) {
            msg = "There is one file processor still active.";
        } else {
            msg = "There are " + numberFormat.format(stillAlive) + " file processors still active.";
        }
        LOGGER.info(msg);

        int queued = filesQueued.get();

        if (queued == 1) {
            msg = "The crawler has enqueued 1 file.";
        } else {
            msg = "The crawler has enqueued " + numberFormat.format(queued) + " files.";
        }
        LOGGER.info(msg);

        if (!crawlerIsActive.get()) {
            msg = "The directory crawler has completed its crawl.\n";
            LOGGER.info(msg);
        }
    }
}
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.eval.app.batch; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.sql.SQLException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Map; - -import org.apache.tika.batch.FileResourceConsumer; -import org.apache.tika.eval.app.AbstractProfiler; -import org.apache.tika.eval.app.ExtractProfiler; -import org.apache.tika.eval.app.db.TableInfo; -import org.apache.tika.util.PropsUtil; - - -public class ExtractProfilerBuilder extends EvalConsumerBuilder { - - public final static String TABLE_PREFIX_KEY = "tablePrefix"; - - private final List<TableInfo> tableInfos; - private final List<TableInfo> refTableInfos; - - public ExtractProfilerBuilder() { - List<TableInfo> tableInfos = new ArrayList(); - tableInfos.add(AbstractProfiler.MIME_TABLE); - tableInfos.add(ExtractProfiler.CONTAINER_TABLE); - tableInfos.add(ExtractProfiler.PROFILE_TABLE); - tableInfos.add(ExtractProfiler.EXTRACT_EXCEPTION_TABLE); - tableInfos.add(ExtractProfiler.EXCEPTION_TABLE); - tableInfos.add(ExtractProfiler.CONTENTS_TABLE); - tableInfos.add(ExtractProfiler.TAGS_TABLE); - tableInfos.add(ExtractProfiler.EMBEDDED_FILE_PATH_TABLE); - this.tableInfos = Collections.unmodifiableList(tableInfos); - - List<TableInfo> refTableInfos = new ArrayList<>(); - refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES); - refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES); - refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES); - this.refTableInfos = 
Collections.unmodifiableList(refTableInfos); - } - - @Override - public FileResourceConsumer build() throws IOException, SQLException { - Path extracts = PropsUtil.getPath(localAttrs.get("extracts"), null); - if (extracts == null) { - throw new RuntimeException("Must specify \"extracts\" -- directory to crawl"); - } - if (!Files.isDirectory(extracts)) { - throw new RuntimeException("ROOT DIRECTORY DOES NOT EXIST: " + extracts.toAbsolutePath()); - } - - Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null); - - //we _could_ set this to extracts (if not null) - //here, but the Crawler defaults to "input" if nothing is passed - //so this won't work - if (inputDir == null) { - throw new RuntimeException("Must specify -inputDir"); - } - if (extracts == null && inputDir != null) { - extracts = inputDir; - } - return parameterizeProfiler(new ExtractProfiler(queue, inputDir, extracts, buildExtractReader(localAttrs), getDBWriter(tableInfos))); - } - - - @Override - protected void updateTableInfosWithPrefixes(Map<String, String> attrs) { - String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY); - if (tableNamePrefix != null && !tableNamePrefix.equals("null")) { - for (TableInfo tableInfo : tableInfos) { - tableInfo.setNamePrefix(tableNamePrefix); - } - } - } - - - @Override - protected List<TableInfo> getRefTableInfos() { - return refTableInfos; - } - - @Override - protected List<TableInfo> getNonRefTableInfos() { - return tableInfos; - } - - @Override - protected TableInfo getMimeTable() { - return AbstractProfiler.MIME_TABLE; - } - - @Override - protected void addErrorLogTablePairs(DBConsumersManager manager) { - Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null); - if (errorLog == null) { - return; - } - manager.addErrorLogTablePair(errorLog, ExtractProfiler.EXTRACT_EXCEPTION_TABLE); - } -} diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/PathResource.java 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/PathResource.java index 9d64317aa..20f67798a 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/PathResource.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/PathResource.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.tika.eval.app.batch; @@ -6,15 +22,18 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; +import org.apache.tika.batch.fs.FSProperties; import org.apache.tika.metadata.Metadata; public class PathResource implements FileResource { private final Path path; private final String resourceId; + private final Metadata metadata = new Metadata(); public PathResource(Path path, String resourceId) { this.path = path; this.resourceId = resourceId; + metadata.set(FSProperties.FS_REL_PATH, resourceId); } @Override public String getResourceId() { @@ -23,7 +42,7 @@ public class PathResource implements FileResource { @Override public Metadata getMetadata() { - return new Metadata(); + return metadata; } @Override diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/EvalConfigTest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/EvalConfigTest.java new file mode 100644 index 000000000..395c90fe6 --- /dev/null +++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/EvalConfigTest.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package org.apache.tika.eval.app;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;

import java.net.URISyntaxException;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.junit.jupiter.api.Test;

/**
 * Verifies that {@link EvalConfig#load} reads a minimal JSON config,
 * applying the configured value and leaving unset fields null.
 */
public class EvalConfigTest {

    @Test
    public void testBasic() throws Exception {
        // Only maxExtractLength is set in the basic config; everything else
        // should stay at its unset/null default.
        EvalConfig config = EvalConfig.load(resolveConfig("eval-config-basic.json"));
        assertEquals(20000, config.getMaxExtractLength());
        assertNull(config.getErrorLogFile());
        assertNull(config.getJdbcString());
    }

    // Resolves a config file name to its path on the test classpath.
    private Path resolveConfig(String fileName) throws URISyntaxException {
        return Path.of(EvalConfigTest.class
                .getResource("/eval-configs/" + fileName)
                .toURI());
    }
}
org.apache.tika.eval.app.db.Cols; import org.apache.tika.eval.app.db.H2Util; import org.apache.tika.eval.app.db.TableInfo; -import org.apache.tika.eval.app.io.ExtractReaderException; -@Disabled public class ProfilerBatchTest { - public final static String COMPARER_PROCESS_CLASS = "org.apache.tika.batch.fs.FSBatchProcessCLI"; - private final static String profileTable = ExtractProfiler.PROFILE_TABLE.getName(); - private final static String exTable = ExtractProfiler.EXCEPTION_TABLE.getName(); - private final static String fpCol = Cols.FILE_PATH.name(); - private static Path dbDir; - private static Connection conn; + private static Connection CONN; + private static Path DB_DIR; + private static Path DB; @BeforeAll public static void setUp() throws Exception { + DB_DIR = Files.createTempDirectory("profiler-test"); + Path extractsRoot = Paths.get(ComparerBatchTest.class + .getResource("/test-dirs/extractsA") + .toURI()); Path inputRoot = Paths.get(ComparerBatchTest.class - .getResource("/test-dirs/extractsA") + .getResource("/test-dirs/raw_input") .toURI()); - dbDir = Files.createTempDirectory(inputRoot, "tika-test-db-dir-"); - Map<String, String> args = new HashMap<>(); - Path db = dbDir.resolve("profiler_test"); - args.put("-db", db.toString()); - - //for debugging, you can use this to select only one file pair to load - //args.put("-includeFilePat", "file8.*"); - - /* BatchProcessTestExecutor ex = new BatchProcessTestExecutor(COMPARER_PROCESS_CLASS, args, - "/single-file-profiler-crawl-input-config.xml"); - StreamStrings streamStrings = ex.execute(); - System.out.println(streamStrings.getErrString()); - System.out.println(streamStrings.getOutString());*/ - H2Util dbUtil = new H2Util(db); - conn = dbUtil.getConnection(); - } - @AfterAll - public static void tearDown() throws IOException { + DB = DB_DIR.resolve("mydb"); + String[] args = new String[]{ + "-i", inputRoot.toAbsolutePath().toString(), + "-e", extractsRoot.toAbsolutePath().toString(), + "-d", 
"jdbc:h2:file:" + DB.toAbsolutePath().toString() + }; + + ExtractProfileRunner.main(args); + } + @AfterEach + public void tearDown() throws IOException { try { - conn.close(); + CONN.close(); } catch (SQLException e) { throw new RuntimeException(e); } - //TODO: if/when we turn this back on, use @TempDir instead of this + FileUtils.deleteDirectory(DB_DIR.toFile()); - DirectoryStream<Path> dStream = Files.newDirectoryStream(dbDir); - for (Path p : dStream) { - Files.delete(p); - } - dStream.close(); - Files.delete(dbDir); + } + + @BeforeEach + public void setUpEach() throws SQLException { + H2Util dbUtil = new H2Util(DB); + CONN = dbUtil.getConnection(); + } + + @AfterEach + public void tearDownEach() throws SQLException { + CONN.close(); } @Test public void testSimpleDBWriteAndRead() throws Exception { - Statement st = null; List<String> fNameList = new ArrayList<>(); try { String sql = "select * from " + ExtractProfiler.CONTAINER_TABLE.getName(); - st = conn.createStatement(); + st = CONN.createStatement(); ResultSet rs = st.executeQuery(sql); while (rs.next()) { String fileName = rs.getString(Cols.FILE_PATH.name()); @@ -113,17 +108,19 @@ public class ProfilerBatchTest { st.close(); } } + /* debugTable(ExtractProfiler.CONTAINER_TABLE); debugTable(ExtractProfiler.PROFILE_TABLE); debugTable(ExtractProfiler.CONTENTS_TABLE); debugTable(ExtractProfiler.EXCEPTION_TABLE); - debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE); - assertEquals(10, fNameList.size()); + debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);*/ + assertEquals(17, fNameList.size()); assertTrue(fNameList.contains("file1.pdf"), "file1.pdf"); assertTrue(fNameList.contains("file2_attachANotB.doc"), "file2_attachANotB.doc"); assertTrue(fNameList.contains("file3_attachBNotA.doc"), "file3_attachBNotA.doc"); assertTrue(fNameList.contains("file4_emptyB.pdf"), "file4_emptyB.pdf"); assertTrue(fNameList.contains("file7_badJson.pdf"), "file4_emptyB.pdf"); + assertTrue(fNameList.contains("file9_noextract.txt"), 
"file9_noextract.txt"); } @Test @@ -131,43 +128,29 @@ public class ProfilerBatchTest { String sql = "select EXTRACT_EXCEPTION_ID from extract_exceptions e" + " join containers c on c.container_id = e.container_id " + " where c.file_path='file9_noextract.txt'"; - assertEquals("missing extract: file9_noextract.txt", "0", getSingleResult(sql)); - debugTable(ExtractProfiler.CONTAINER_TABLE); + /*debugTable(ExtractProfiler.CONTAINER_TABLE); debugTable(ExtractProfiler.PROFILE_TABLE); debugTable(ExtractProfiler.CONTENTS_TABLE); debugTable(ExtractProfiler.EXCEPTION_TABLE); - debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE); - - sql = "select EXTRACT_EXCEPTION_ID from errors e" + " join containers c on c.container_id = e.container_id " + " where c.file_path='file5_emptyA.pdf'"; - assertEquals("empty extract: file5_emptyA.pdf", "1", getSingleResult(sql)); - - sql = "select EXTRACT_EXCEPTION_ID from errors e" + " join containers c on c.container_id = e.container_id " + " where c.file_path='file7_badJson.pdf'"; - assertEquals("extract error:file7_badJson.pdf", "2", getSingleResult(sql)); - - } - - @Test - public void testParseErrors() throws Exception { - debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE); - String sql = "select file_path from errors where container_id is null"; - assertEquals("file10_permahang.txt", getSingleResult(sql)); - - sql = "select extract_error_id from extract_exceptions " + "where file_path='file11_oom.txt'"; - assertEquals(Integer.toString(ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE.ordinal()), getSingleResult(sql)); + debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);*/ + assertEquals("0", getSingleResult(sql), "missing extract: file9_noextract.txt"); - sql = "select parse_error_id from extract_exceptions where file_path='file11_oom.txt'"; - assertEquals(Integer.toString(AbstractProfiler.PARSE_ERROR_TYPE.OOM.ordinal()), getSingleResult(sql)); + sql = "select EXTRACT_EXCEPTION_ID from extract_exceptions e" + " join containers c on 
c.container_id = e.container_id " + " where c.file_path='file5_emptyA.pdf'"; + assertEquals("1", getSingleResult(sql), "empty extract: file5_emptyA.pdf"); + sql = "select EXTRACT_EXCEPTION_ID from extract_exceptions e" + " join containers c on c.container_id = e.container_id " + " where c.file_path='file7_badJson.pdf'"; + assertEquals("2", getSingleResult(sql), "extract error:file7_badJson.pdf"); } @Test + @Disabled("create actual unit test") public void testParseExceptions() throws Exception { debugTable(ExtractProfiler.EXCEPTION_TABLE); } private String getSingleResult(String sql) throws Exception { Statement st = null; - st = conn.createStatement(); + st = CONN.createStatement(); ResultSet rs = st.executeQuery(sql); int hits = 0; String val = ""; @@ -188,7 +171,7 @@ public class ProfilerBatchTest { Statement st = null; try { String sql = "select * from " + table.getName(); - st = conn.createStatement(); + st = CONN.createStatement(); ResultSet rs = st.executeQuery(sql); int colCount = rs .getMetaData() diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java index fff15e3dc..4d7d4bb2b 100644 --- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java +++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java @@ -98,20 +98,13 @@ public class TikaEvalCLITest extends TikaTest { private static void profile() throws IOException { List<String> args = new ArrayList<>(); args.add("Profile"); - args.add("-extracts"); + args.add("-e"); args.add(ProcessUtils.escapeCommandLine(extractsDir .resolve("extractsA") .toAbsolutePath() .toString())); - //add these just to confirm this info doesn't cause problems w cli - args.add("-maxTokens"); - args.add("10000000"); - args.add("-maxContentLength"); - args.add("100000000"); - args.add("-maxContentLengthForLangId"); - args.add("100000"); - 
args.add("-db"); + args.add("-d"); args.add(ProcessUtils.escapeCommandLine(profileDBDir .toAbsolutePath() .toString() + "/" + dbName)); diff --git a/tika-eval/tika-eval-app/src/test/resources/eval-configs/eval-config-basic.json b/tika-eval/tika-eval-app/src/test/resources/eval-configs/eval-config-basic.json new file mode 100644 index 000000000..b4af28df3 --- /dev/null +++ b/tika-eval/tika-eval-app/src/test/resources/eval-configs/eval-config-basic.json @@ -0,0 +1,3 @@ +{ + "maxExtractLength" : 20000 +} \ No newline at end of file diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file10_permahang.txt b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file10_permahang.txt new file mode 100644 index 000000000..e69de29bb diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file12_es.txt b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file12_es.txt new file mode 100644 index 000000000..5ffd824a9 --- /dev/null +++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file12_es.txt @@ -0,0 +1,6 @@ +[ + { + "Content-Type": "text/plain", + "X-TIKA:content": "El zorro marrón rápido saltó sobre el perro. El zorro marrón rápido saltó sobre el perro. 
El zorro marrón rápido saltó sobre el perro" + } +] \ No newline at end of file diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file13_attachANotB.doc b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file13_attachANotB.doc new file mode 100644 index 000000000..15bc592a5 --- /dev/null +++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file13_attachANotB.doc @@ -0,0 +1,12 @@ +[ + { + "Content-Type": "text/plain", + "_comment": "simplified", + "X-TIKA:content": "调整每一个心脏和每个声音,投标每个护理提取;让大家一起欢乐,赞美老拿骚.调整每一个心脏和每个声音,投标每个护理提取;让大家一起欢乐,赞美老拿骚 狐狸狐狸狐狸 " + }, + { + "Content-Type": "text/plain", + "X-TIKA:embedded_resource_path": "inner.txt", + "X-TIKA:content": "attachment contents" + } +] \ No newline at end of file diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file14_diffAttachOrder b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file14_diffAttachOrder new file mode 100644 index 000000000..25f0db9a1 --- /dev/null +++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file14_diffAttachOrder @@ -0,0 +1,21 @@ +[ + { + "Content-Type": "text/plain", + "X-TIKA:content": "the quick brown fox fox fox jumped over the lazy lazy dog", + "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8351" + }, + { + "Content-Type": "text/plain", + "X-TIKA:embedded_resource_path": "/0", + "X-TIKA:content": "a b c d e f g h i j k l m n", + "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8354", + "X-TIKA:embedded_depth": "1" + }, + { + "Content-Type": "text/plain", + "X-TIKA:embedded_resource_path": "/1", + "X-TIKA:content": "o p q r s t u v w x y z", + "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8353", + "X-TIKA:embedded_depth": "1" + } +] \ No newline at end of file diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file15_tags b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file15_tags new file mode 100644 index 000000000..5af73db80 
--- /dev/null +++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file15_tags @@ -0,0 +1,41 @@ +[ + { + "Content-Length": "34824", + "Content-Type": "application/pdf", + "Last-Modified": "2007-09-15T09:02:31Z", + "X-Parsed-By": [ + "org.apache.tika.parser.DefaultParser", + "org.apache.tika.parser.pdf.PDFParser" + ], + "X-TIKA:content_handler": "ToXMLContentHandler", + "X-TIKA:content": "\u003chtml xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" /\u003e\n\u003cmeta name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\" /\u003e\n\u003cmeta name\u003d\"access_permission:modify_annotations\" content\u003d\"true\" /\u003e\n\u003cmeta name\u003d\"access_permission:can_print_degra [...] + "X-TIKA:parse_time_millis": "500", + "access_permission:assemble_document": "true", + "access_permission:can_modify": "true", + "access_permission:can_print": "true", + "access_permission:can_print_degraded": "true", + "access_permission:extract_content": "true", + "access_permission:extract_for_accessibility": "true", + "access_permission:fill_in_form": "true", + "access_permission:modify_annotations": "true", + "dc:creator": "Bertrand DelacrΘtaz", + "dc:format": "application/pdf; version\u003d1.3", + "dc:title": "Apache Tika - Apache Tika", + "dcterms:created": "2007-09-15T09:02:31Z", + "dcterms:modified": "2007-09-15T09:02:31Z", + "meta:author": "Bertrand DelacrΘtaz", + "meta:creation-date": "2007-09-15T09:02:31Z", + "meta:save-date": "2007-09-15T09:02:31Z", + "pdf:PDFVersion": "1.3", + "pdf:docinfo:created": "2007-09-15T09:02:31Z", + "pdf:docinfo:creator": "Bertrand DelacrΘtaz", + "pdf:docinfo:creator_tool": "Firefox", + "pdf:docinfo:modified": "2007-09-15T09:02:31Z", + "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext", + "pdf:docinfo:title": "Apache Tika - Apache Tika", + 
"pdf:encrypted": "false", + "resourceName": "testPDF.pdf", + "xmp:CreatorTool": "Firefox", + "xmpTPg:NPages": "1" + } +] \ No newline at end of file diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file16_badTags b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file16_badTags new file mode 100644 index 000000000..5c6272e43 --- /dev/null +++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file16_badTags @@ -0,0 +1,41 @@ +[ + { + "Content-Length": "34824", + "Content-Type": "application/pdf", + "Last-Modified": "2007-09-15T09:02:31Z", + "X-Parsed-By": [ + "org.apache.tika.parser.DefaultParser", + "org.apache.tika.parser.pdf.PDFParser" + ], + "X-TIKA:content_handler": "ToXMLContentHandler", + "X-TIKA:content": "\u003chtml xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" meta name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\" /\u003e\n\u003c\u003c\u003c\u003c\u003cmeta name\u003d\"access_permission:modify_annotations\" content\u003d\"true\" /\u003e\n\u003cmeta name\u003d\"access_permission:can_pr [...] 
+ "X-TIKA:parse_time_millis": "500", + "access_permission:assemble_document": "true", + "access_permission:can_modify": "true", + "access_permission:can_print": "true", + "access_permission:can_print_degraded": "true", + "access_permission:extract_content": "true", + "access_permission:extract_for_accessibility": "true", + "access_permission:fill_in_form": "true", + "access_permission:modify_annotations": "true", + "dc:creator": "Bertrand DelacrΘtaz", + "dc:format": "application/pdf; version\u003d1.3", + "dc:title": "Apache Tika - Apache Tika", + "dcterms:created": "2007-09-15T09:02:31Z", + "dcterms:modified": "2007-09-15T09:02:31Z", + "meta:author": "Bertrand DelacrΘtaz", + "meta:creation-date": "2007-09-15T09:02:31Z", + "meta:save-date": "2007-09-15T09:02:31Z", + "pdf:PDFVersion": "1.3", + "pdf:docinfo:created": "2007-09-15T09:02:31Z", + "pdf:docinfo:creator": "Bertrand DelacrΘtaz", + "pdf:docinfo:creator_tool": "Firefox", + "pdf:docinfo:modified": "2007-09-15T09:02:31Z", + "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext", + "pdf:docinfo:title": "Apache Tika - Apache Tika", + "pdf:encrypted": "false", + "resourceName": "testPDF.pdf", + "xmp:CreatorTool": "Firefox", + "xmpTPg:NPages": "1" + } +] \ No newline at end of file diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file17_tagsOutOfOrder b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file17_tagsOutOfOrder new file mode 100644 index 000000000..97afec8ad --- /dev/null +++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/raw_input/file17_tagsOutOfOrder @@ -0,0 +1,41 @@ +[ + { + "Content-Length": "34824", + "Content-Type": "application/pdf", + "Last-Modified": "2007-09-15T09:02:31Z", + "X-Parsed-By": [ + "org.apache.tika.parser.DefaultParser", + "org.apache.tika.parser.pdf.PDFParser" + ], + "X-TIKA:content_handler": "ToXMLContentHandler", + "X-TIKA:content": "\u003chtml xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta 
name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" /\u003e\n\u003cmeta name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\" /\u003e\n\u003cmeta name\u003d\"access_permission:modify_annotations\" content\u003d\"true\" /\u003e\n\u003cmeta name\u003d\"access_permission:can_print_degra [...] + "X-TIKA:parse_time_millis": "500", + "access_permission:assemble_document": "true", + "access_permission:can_modify": "true", + "access_permission:can_print": "true", + "access_permission:can_print_degraded": "true", + "access_permission:extract_content": "true", + "access_permission:extract_for_accessibility": "true", + "access_permission:fill_in_form": "true", + "access_permission:modify_annotations": "true", + "dc:creator": "Bertrand DelacrΘtaz", + "dc:format": "application/pdf; version\u003d1.3", + "dc:title": "Apache Tika - Apache Tika", + "dcterms:created": "2007-09-15T09:02:31Z", + "dcterms:modified": "2007-09-15T09:02:31Z", + "meta:author": "Bertrand DelacrΘtaz", + "meta:creation-date": "2007-09-15T09:02:31Z", + "meta:save-date": "2007-09-15T09:02:31Z", + "pdf:PDFVersion": "1.3", + "pdf:docinfo:created": "2007-09-15T09:02:31Z", + "pdf:docinfo:creator": "Bertrand DelacrΘtaz", + "pdf:docinfo:creator_tool": "Firefox", + "pdf:docinfo:modified": "2007-09-15T09:02:31Z", + "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext", + "pdf:docinfo:title": "Apache Tika - Apache Tika", + "pdf:encrypted": "false", + "resourceName": "testPDF.pdf", + "xmp:CreatorTool": "Firefox", + "xmpTPg:NPages": "1" + } +] \ No newline at end of file diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index e0ba1333d..2037f1aba 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -399,7 +399,7 @@ <jsoup.version>1.21.1</jsoup.version> <jsr305.version>3.0.2</jsr305.version> <junit4.version>4.13.2</junit4.version> - <junit5.version>6.0.0-M1</junit5.version> + 
<junit5.version>5.13.3</junit5.version> <juniversalchardet.version>2.5.0</juniversalchardet.version> <junrar.version>7.5.5</junrar.version> <jwarc.version>0.31.1</jwarc.version>