This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4342
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 7b94cf0ffa6cfbd0ef337946564e516df69c5c85
Author:     tallison <talli...@apache.org>
AuthorDate: Thu May 22 09:39:06 2025 -0400

    TIKA-4342 -- WIP do not merge
---
 .../java/org/apache/tika/eval/app/EvalConfig.java  |  13 +++
 .../org/apache/tika/eval/app/ExtractProfiler.java  |  44 ++-------
 .../apache/tika/eval/app/FileProfileRunner.java    |  54 +++++++++++
 .../org/apache/tika/eval/app/FileProfiler.java     |  19 ----
 .../java/org/apache/tika/eval/app/TikaEvalCLI.java | 104 +--------------------
 .../apache/tika/eval/app/batch/FileResource.java   |  66 +++++++++++++
 6 files changed, 140 insertions(+), 160 deletions(-)

diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
new file mode 100644
index 000000000..416d7bb6e
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
@@ -0,0 +1,13 @@
+package org.apache.tika.eval.app;
+
+public class EvalConfig {
+
+    long minExtractLength = 0;
+    long maxExtractLength = 10_000_000;
+    String jdbcString = null;
+    int maxFilesToAdd = -1;
+    int maxTokens = 200000;
+    int maxContentLength = 5_000_000;
+    int numThreads = 4;
+
+}
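Note: main() in FileProfileRunner (added below) calls EvalConfig.load(Path) for the -c "tika-eval json config file" option, but no such loader is included in this WIP commit, and EvalConfig (like FileProfileRunner) is still missing the ASF license header. A minimal sketch of what load() might look like, assuming jackson-databind is on the classpath; the JSON keys would simply mirror the field names above, e.g. { "maxTokens": 500000, "numThreads": 8 }:

    import java.io.IOException;
    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Path;

    import com.fasterxml.jackson.annotation.JsonAutoDetect;
    import com.fasterxml.jackson.annotation.PropertyAccessor;
    import com.fasterxml.jackson.databind.ObjectMapper;

    public class EvalConfig {

        // ... fields as above ...

        // Hypothetical loader for the -c option; not part of this commit.
        public static EvalConfig load(Path path) throws IOException {
            ObjectMapper mapper = new ObjectMapper();
            // the fields above are package-private with no setters, so bind
            // JSON properties straight to the fields
            mapper.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY);
            try (InputStream is = Files.newInputStream(path)) {
                return mapper.readValue(is, EvalConfig.class);
            }
        }
    }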
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
index 22889d73b..844eac125 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.eval.app;
 
+import java.io.File;
 import java.io.IOException;
 import java.nio.file.Path;
 import java.sql.Types;
@@ -76,54 +77,21 @@ public class ExtractProfiler extends AbstractProfiler {
             new ColInfo(Cols.TAGS_PARSE_EXCEPTION, Types.BOOLEAN));
 
     static Options OPTIONS;
-    static {
-        //By the time this commandline is parsed, there should be both an extracts and an inputDir
-        Option extracts = new Option("extracts", true, "directory for extract files");
-        extracts.setRequired(true);
-
-        Option inputDir = new Option("inputDir", true, "optional: directory for original binary input documents." + " If not specified, -extracts is crawled as is.");
-
-        OPTIONS = new Options()
-                .addOption(extracts)
-                .addOption(inputDir)
-                .addOption("bc", "optional: tika-batch config file")
-                .addOption("numConsumers", true, "optional: number of consumer threads")
-                .addOption(new Option("alterExtract", true,
-                        "for json-formatted extract files, " + "process full metadata list ('as_is'=default), " + "take just the first/container document ('first_only'), " +
-                                "concatenate all content into the first metadata item ('concatenate_content')"))
-                .addOption("minExtractLength", true, "minimum extract length to process (in bytes)")
-                .addOption("maxExtractLength", true, "maximum extract length to process (in bytes)")
-                .addOption("db", true, "db file to which to write results")
-                .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
-                .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
-                .addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
-                .addOption("drop", false, "drop tables if they exist")
-                .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
-                .addOption("maxTokens", true, "maximum tokens to process, default=200000")
-                .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
-                .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
-                .addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result")
-
-        ;
-
-    }
     private final Path inputDir;
     private final Path extracts;
     private final ExtractReader extractReader;
 
-    public ExtractProfiler(ArrayBlockingQueue<FileResource> queue, Path inputDir, Path extracts, ExtractReader extractReader, IDBWriter dbWriter) {
+    public static ExtractProfiler build(ArrayBlockingQueue<FileResource> queue, Path inputDir, Path extracts, ExtractReader extractReader, IDBWriter dbWriter) {
+        return new ExtractProfiler(queue, inputDir, extracts, extractReader, dbWriter);
+    }
+
+    ExtractProfiler(ArrayBlockingQueue<FileResource> queue, Path inputDir, Path extracts, ExtractReader extractReader, IDBWriter dbWriter) {
         super(queue, dbWriter);
         this.inputDir = inputDir;
         this.extracts = extracts;
         this.extractReader = extractReader;
     }
 
-    public static void USAGE() {
-        HelpFormatter helpFormatter = new HelpFormatter();
-        helpFormatter.printHelp(80, "java -jar tika-eval-x.y.jar Profile -extracts extracts -db mydb [-inputDir input]", "Tool: Profile", ExtractProfiler.OPTIONS,
-                "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
-    }
 
     @Override
     public boolean processFileResource(FileResource fileResource) {
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfileRunner.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfileRunner.java
new file mode 100644
index 000000000..1e57b35ad
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfileRunner.java
@@ -0,0 +1,54 @@
+package org.apache.tika.eval.app;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+
+import org.apache.tika.batch.FileResource;
+
+public class FileProfileRunner {
+
+    static Options OPTIONS;
+
+    static {
+        OPTIONS = new Options()
+                .addOption(Option.builder("e").longOpt("extracts").hasArg().desc("required: directory of extracts").build())
+                .addOption(Option.builder("i").longOpt("inputDir").hasArg().desc("optional: directory for original binary input documents."
+                        + " If not specified, -extracts is crawled as is.").build())
+                .addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db path").build())
+                .addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json config file").build())
+        ;
+    }
+
+    public static void main(String[] args) throws Exception {
+        DefaultParser defaultCLIParser = new DefaultParser();
+        CommandLine commandLine = defaultCLIParser.parse(OPTIONS, args);
+        EvalConfig evalConfig = commandLine.hasOption('c') ? EvalConfig.load(Paths.get(commandLine.getOptionValue('c'))) : new EvalConfig();
+        Path extractsDir = commandLine.hasOption('e') ? Paths.get(commandLine.getOptionValue('e')) : Paths.get(USAGE_FAIL("Must specify extracts dir: -e"));
+        Path inputDir = commandLine.hasOption('i') ? Paths.get(commandLine.getOptionValue('i')) : extractsDir;
+        String dbPath = commandLine.hasOption('d') ? commandLine.getOptionValue('d') : USAGE_FAIL("Must specify the db name: -d");
+        execute(inputDir, extractsDir, dbPath, evalConfig);
+    }
+
+    private static void execute(Path inputDir, Path extractsDir, String dbPath, EvalConfig evalConfig) {
+
+        ArrayBlockingQueue<FileResource> queue = new ArrayBlockingQueue<>(1000);
+        FileWalker fileWalker = new FileWalker(queue);
+    }
+
+    private static void USAGE() {
+        HelpFormatter helpFormatter = new HelpFormatter();
+        helpFormatter.printHelp(80, "java -jar tika-eval-app-x.y.z.jar FileProfiler -e docs -d mydb [-i inputDir, -c config.json]",
+                "Tool: Profile", OPTIONS, "");
+    }
+
+    private static String USAGE_FAIL(String msg) {
+        USAGE();
+        throw new IllegalArgumentException(msg);
+    }
+}
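Note: execute() above is left dangling: dbPath is unused, nothing consumes the queue, and no FileWalker class appears in this commit's file list. A sketch of how the producer/consumer wiring might eventually look; FileWalker's constructor, its POISON sentinel, and the consumer body are assumptions, not part of this commit (additional imports: java.util.concurrent.ExecutorService, Executors, TimeUnit):

    private static void execute(Path inputDir, Path extractsDir, String dbPath, EvalConfig evalConfig) throws Exception {
        ArrayBlockingQueue<FileResource> queue = new ArrayBlockingQueue<>(1000);
        // one producer thread plus evalConfig.numThreads consumers
        ExecutorService executorService = Executors.newFixedThreadPool(evalConfig.numThreads + 1);
        // producer: crawls inputDir, enqueues one FileResource per file, then
        // enqueues one POISON sentinel per consumer so each consumer can exit
        executorService.execute(new FileWalker(inputDir, queue, evalConfig.numThreads));
        for (int i = 0; i < evalConfig.numThreads; i++) {
            executorService.execute(() -> {
                try {
                    FileResource fileResource;
                    while ((fileResource = queue.take()) != FileWalker.POISON) {
                        // profile fileResource and write rows to dbPath via an IDBWriter
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            });
        }
        executorService.shutdown();
        executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
    }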
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java
index 925452094..edc431e4a 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java
@@ -73,26 +73,7 @@ public class FileProfiler extends AbstractProfiler {
     public static TableInfo FILE_MIME_TABLE = new TableInfo("file_mimes", new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
             new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12));
 
-    static Options OPTIONS;
-
-    static {
-
-        Option inputDir = new Option("inputDir", true, "optional: directory for original binary input documents." + " If not specified, -extracts is crawled as is.");
-
-        OPTIONS = new Options()
-                .addOption(inputDir)
-                .addOption("bc", "optional: tika-batch config file")
-                .addOption("numConsumers", true, "optional: number of consumer threads")
-                .addOption("db", true, "db file to which to write results")
-                .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
-                .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
-                .addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
-                .addOption("drop", false, "drop tables if they exist")
-                .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
-
-        ;
-
-    }
 
     private final Path inputDir;
 
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
index a897461ee..770319708 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
@@ -154,109 +154,7 @@ public class TikaEvalCLI {
     }
 
     private void handleProfile(String[] subsetArgs) throws Exception {
-        List<String> argList = new ArrayList(Arrays.asList(subsetArgs));
-
-        boolean containsBC = false;
-        String inputDir = null;
-        String extracts = null;
-        String alterExtract = null;
-        //confirm there's a batch-config file
-        for (int i = 0; i < argList.size(); i++) {
-            String arg = argList.get(i);
-            switch (arg) {
-                case "-bc":
-                    containsBC = true;
-                    break;
-                case "-inputDir":
-                    if (i + 1 >= argList.size()) {
-                        System.err.println("Must specify directory after -inputDir");
-                        ExtractProfiler.USAGE();
-                        return;
-                    }
-                    inputDir = argList.get(i + 1);
-                    i++;
-                    break;
-                case "-extracts":
-                    if (i + 1 >= argList.size()) {
-                        System.err.println("Must specify directory after -extracts");
-                        ExtractProfiler.USAGE();
-                        return;
-                    }
-                    extracts = argList.get(i + 1);
-                    i++;
-                    break;
-                case "-alterExtract":
-                    if (i + 1 >= argList.size()) {
-                        System.err.println("Must specify type 'as_is', 'first_only' or " + "'concatenate_content' after -alterExtract");
-                        ExtractComparer.USAGE();
-                        return;
-                    }
-                    alterExtract = argList.get(i + 1);
-                    i++;
-                    break;
-            }
-        }
-
-        if (alterExtract != null && !alterExtract.equals("as_is") && !alterExtract.equals("concatenate_content") && !alterExtract.equals("first_only")) {
-            System.out.println("Sorry, I don't understand:" + alterExtract + ". The values must be one of: as_is, first_only, concatenate_content");
-            ExtractProfiler.USAGE();
-            return;
-        }
-
-        //need to specify each in this commandline
-        //if only extracts is passed to tika-batch,
-        //the crawler will see no inputDir and start crawling "input".
-        //this allows the user to specify either extracts or inputDir
-        if (extracts == null && inputDir != null) {
-            argList.add("-extracts");
-            argList.add(inputDir);
-        } else if (inputDir == null && extracts != null) {
-            argList.add("-inputDir");
-            argList.add(extracts);
-        }
-
-        Path tmpBCConfig = null;
-        try {
-            tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
-            if (!containsBC) {
-                try (InputStream is = this
-                        .getClass()
-                        .getResourceAsStream("/tika-eval-profiler-config.xml")) {
-                    Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
-                }
-                argList.add("-bc");
-                argList.add(tmpBCConfig
-                        .toAbsolutePath()
-                        .toString());
-            }
-
-            String[] updatedArgs = argList.toArray(new String[0]);
-            DefaultParser defaultCLIParser = new DefaultParser();
-            try {
-                CommandLine commandLine = defaultCLIParser.parse(ExtractProfiler.OPTIONS, updatedArgs);
-                if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
-                    System.out.println("Please specify either the default -db or the full -jdbc, not both");
-                    ExtractProfiler.USAGE();
-                    return;
-                }
-            } catch (ParseException e) {
-                System.out.println(e.getMessage() + "\n");
-                ExtractProfiler.USAGE();
-                return;
-            }
-
-            // lazy delete because main() calls System.exit()
-            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
-                tmpBCConfig
-                        .toFile()
-                        .deleteOnExit();
-            }
-            FSBatchProcessCLI.main(updatedArgs);
-        } finally {
-            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
-                Files.delete(tmpBCConfig);
-            }
-        }
+        FileProfileRunner.main(subsetArgs);
     }
 
     private void handleCompare(String[] subsetArgs) throws Exception {
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileResource.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileResource.java
new file mode 100644
index 000000000..dc0a35d7e
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileResource.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.batch;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+
+
+/**
+ * This is a basic interface to handle a logical "file".
+ * This should enable code-agnostic handling of files from different
+ * sources: file system, database, etc.
+ */
+public interface FileResource {
+
+    //The literal lowercased extension of a file.  This may or may not
+    //have any relationship to the actual type of the file.
+    public static final Property FILE_EXTENSION = Property.internalText("tika:file_ext");
+
+    /**
+     * This is only used in logging to identify which file
+     * may have caused problems.
+     * While it is probably best
+     * to use unique ids for the sake of debugging, it is not
+     * necessary that the ids be unique.  This id
+     * is never used as a hashkey by the batch processors, for example.
+     *
+     * @return an id for a FileResource
+     */
+    public String getResourceId();
+
+    /**
+     * This gets the metadata available before the parsing of the file.
+     * This will typically be "external" metadata: file name,
+     * file size, file location, data stream, etc.  That is, things
+     * that are known about the file from outside information, not
+     * file-internal metadata.
+     *
+     * @return Metadata
+     */
+    public Metadata getMetadata();
+
+    /**
+     * @return an InputStream for the FileResource
+     * @throws java.io.IOException
+     */
+    public InputStream openInputStream() throws IOException;
+
+}
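For orientation, a minimal file-system-backed implementation of this interface might look like the sketch below (illustrative only; tika-batch's FSFileResource plays roughly this role today). One more thing to reconcile before merge: the new file declares package org.apache.tika.batch, but it lives under .../org/apache/tika/eval/app/batch/, and FileProfileRunner imports org.apache.tika.batch.FileResource -- the package line, the path, and the import need to agree.

    import java.io.IOException;
    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.util.Locale;

    import org.apache.tika.metadata.Metadata;

    // Illustrative only: wraps a file on disk as a FileResource.
    public class FsFileResource implements FileResource {

        private final Path root;
        private final Path file;

        public FsFileResource(Path root, Path file) {
            this.root = root;
            this.file = file;
        }

        @Override
        public String getResourceId() {
            // relative path: human-readable in logs, unique enough for debugging
            return root.relativize(file).toString();
        }

        @Override
        public Metadata getMetadata() {
            Metadata metadata = new Metadata();
            String name = file.getFileName().toString();
            int dot = name.lastIndexOf('.');
            String ext = dot < 0 ? "" : name.substring(dot + 1).toLowerCase(Locale.ROOT);
            // the literal lowercased extension, per the FILE_EXTENSION contract above
            metadata.set(FILE_EXTENSION, ext);
            return metadata;
        }

        @Override
        public InputStream openInputStream() throws IOException {
            return Files.newInputStream(file);
        }
    }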