This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4342
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 7b94cf0ffa6cfbd0ef337946564e516df69c5c85
Author:     tallison <talli...@apache.org>
AuthorDate: Thu May 22 09:39:06 2025 -0400

    TIKA-4342 -- WIP do not merge
---
 .../java/org/apache/tika/eval/app/EvalConfig.java  |  13 +++
 .../org/apache/tika/eval/app/ExtractProfiler.java  |  44 ++-------
 .../apache/tika/eval/app/FileProfileRunner.java    |  54 +++++++++++
 .../org/apache/tika/eval/app/FileProfiler.java     |  19 ----
 .../java/org/apache/tika/eval/app/TikaEvalCLI.java | 104 +--------------------
 .../apache/tika/eval/app/batch/FileResource.java   |  66 +++++++++++++
 6 files changed, 140 insertions(+), 160 deletions(-)

diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
new file mode 100644
index 000000000..416d7bb6e
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
@@ -0,0 +1,13 @@
+package org.apache.tika.eval.app;
+
+public class EvalConfig {
+
+    long minExtractLength = 0;
+    long maxExtractLength = 10_000_000;
+    String jdbcString = null;
+    int maxFilesToAdd = -1;
+    int maxTokens = 200000;
+    int maxContentLength = 5_000_000;
+    int numThreads = 4;
+
+}
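Note: main() in FileProfileRunner (added below) calls EvalConfig.load(Path) for the -c "tika-eval json config file" option, but no such loader is included in this WIP commit, and EvalConfig (like FileProfileRunner) is still missing the ASF license header. A minimal sketch of what load() might look like, assuming jackson-databind is on the classpath; the JSON keys would simply mirror the field names above, e.g. { "maxTokens": 500000, "numThreads": 8 }:

    import java.io.IOException;
    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Path;

    import com.fasterxml.jackson.annotation.JsonAutoDetect;
    import com.fasterxml.jackson.annotation.PropertyAccessor;
    import com.fasterxml.jackson.databind.ObjectMapper;

    public class EvalConfig {

        // ... fields as above ...

        // Hypothetical loader for the -c option; not part of this commit.
        public static EvalConfig load(Path path) throws IOException {
            ObjectMapper mapper = new ObjectMapper();
            // the fields above are package-private with no setters, so bind
            // JSON properties straight to the fields
            mapper.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY);
            try (InputStream is = Files.newInputStream(path)) {
                return mapper.readValue(is, EvalConfig.class);
            }
        }
    }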
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
index 22889d73b..844eac125 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.eval.app;
 
+import java.io.File;
 import java.io.IOException;
 import java.nio.file.Path;
 import java.sql.Types;
@@ -76,54 +77,21 @@ public class ExtractProfiler extends AbstractProfiler {
             new ColInfo(Cols.TAGS_PARSE_EXCEPTION, Types.BOOLEAN));
 
     static Options OPTIONS;
-    static {
-        //By the time this commandline is parsed, there should be both an extracts and an inputDir
-        Option extracts = new Option("extracts", true, "directory for extract files");
-        extracts.setRequired(true);
-
-        Option inputDir = new Option("inputDir", true, "optional: directory for original binary input documents." + " If not specified, -extracts is crawled as is.");
-
-        OPTIONS = new Options()
-                .addOption(extracts)
-                .addOption(inputDir)
-                .addOption("bc", "optional: tika-batch config file")
-                .addOption("numConsumers", true, "optional: number of consumer threads")
-                .addOption(new Option("alterExtract", true,
-                        "for json-formatted extract files, " + "process full metadata list ('as_is'=default), " + "take just the first/container document ('first_only'), " +
-                                "concatenate all content into the first metadata item ('concatenate_content')"))
-                .addOption("minExtractLength", true, "minimum extract length to process (in bytes)")
-                .addOption("maxExtractLength", true, "maximum extract length to process (in bytes)")
-                .addOption("db", true, "db file to which to write results")
-                .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
-                .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
-                .addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
-                .addOption("drop", false, "drop tables if they exist")
-                .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
-                .addOption("maxTokens", true, "maximum tokens to process, default=200000")
-                .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
-                .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
-                .addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result")
-
-        ;
-
-    }
     private final Path inputDir;
     private final Path extracts;
     private final ExtractReader extractReader;
 
-    public ExtractProfiler(ArrayBlockingQueue<FileResource> queue, Path inputDir, Path extracts, ExtractReader extractReader, IDBWriter dbWriter) {
+    public static ExtractProfiler build(ArrayBlockingQueue<FileResource> queue, Path inputDir, Path extracts, ExtractReader extractReader, IDBWriter dbWriter) {
+        return new ExtractProfiler(queue, inputDir, extracts, extractReader, dbWriter);
+    }
+
+    ExtractProfiler(ArrayBlockingQueue<FileResource> queue, Path inputDir, Path extracts, ExtractReader extractReader, IDBWriter dbWriter) {
         super(queue, dbWriter);
         this.inputDir = inputDir;
         this.extracts = extracts;
         this.extractReader = extractReader;
     }
 
-    public static void USAGE() {
-        HelpFormatter helpFormatter = new HelpFormatter();
-        helpFormatter.printHelp(80, "java -jar tika-eval-x.y.jar Profile -extracts extracts -db mydb [-inputDir input]", "Tool: Profile", ExtractProfiler.OPTIONS,
-                "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
-    }
 
     @Override
     public boolean processFileResource(FileResource fileResource) {
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfileRunner.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfileRunner.java
new file mode 100644
index 000000000..1e57b35ad
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfileRunner.java
@@ -0,0 +1,54 @@
+package org.apache.tika.eval.app;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+
+import org.apache.tika.batch.FileResource;
+
+public class FileProfileRunner {
+
+    static Options OPTIONS;
+
+    static {
+        OPTIONS = new Options()
+                .addOption(Option.builder("e").longOpt("extracts").hasArg().desc("required: directory of extracts").build())
+                .addOption(Option.builder("i").longOpt("inputDir").hasArg().desc("optional: directory for original binary input documents."
+                        + " If not specified, -extracts is crawled as is.").build())
+                .addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db path").build())
+                .addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json config file").build())
+        ;
+    }
+
+    public static void main(String[] args) throws Exception {
+        DefaultParser defaultCLIParser = new DefaultParser();
+        CommandLine commandLine = defaultCLIParser.parse(OPTIONS, args);
+        EvalConfig evalConfig = commandLine.hasOption('c') ? EvalConfig.load(Paths.get(commandLine.getOptionValue('c'))) : new EvalConfig();
+        Path extractsDir = commandLine.hasOption('e') ? Paths.get(commandLine.getOptionValue('e')) : Paths.get(USAGE_FAIL("Must specify extracts dir: -e"));
+        Path inputDir = commandLine.hasOption('i') ? Paths.get(commandLine.getOptionValue('i')) : extractsDir;
+        String dbPath = commandLine.hasOption('d') ? commandLine.getOptionValue('d') : USAGE_FAIL("Must specify the db name: -d");
+        execute(inputDir, extractsDir, dbPath, evalConfig);
+    }
+
+    private static void execute(Path inputDir, Path extractsDir, String dbPath, EvalConfig evalConfig) {
+
+        ArrayBlockingQueue<FileResource> queue = new ArrayBlockingQueue<>(1000);
+        FileWalker fileWalker = new FileWalker(queue);
+    }
+
+    private static void USAGE() {
+        HelpFormatter helpFormatter = new HelpFormatter();
+        helpFormatter.printHelp(80, "java -jar tika-eval-app-x.y.z.jar FileProfiler -e docs -d mydb [-i inputDir, -c config.json]",
+                "Tool: Profile", OPTIONS, "");
+    }
+
+    private static String USAGE_FAIL(String msg) {
+        USAGE();
+        throw new IllegalArgumentException(msg);
+    }
+}
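Note: execute() above is left dangling: dbPath is unused, nothing consumes the queue, and no FileWalker class appears in this commit's file list. A sketch of how the producer/consumer wiring might eventually look; FileWalker's constructor, its POISON sentinel, and the consumer body are assumptions, not part of this commit (additional imports: java.util.concurrent.ExecutorService, Executors, TimeUnit):

    private static void execute(Path inputDir, Path extractsDir, String dbPath, EvalConfig evalConfig) throws Exception {
        ArrayBlockingQueue<FileResource> queue = new ArrayBlockingQueue<>(1000);
        // one producer thread plus evalConfig.numThreads consumers
        ExecutorService executorService = Executors.newFixedThreadPool(evalConfig.numThreads + 1);
        // producer: crawls inputDir, enqueues one FileResource per file, then
        // enqueues one POISON sentinel per consumer so each consumer can exit
        executorService.execute(new FileWalker(inputDir, queue, evalConfig.numThreads));
        for (int i = 0; i < evalConfig.numThreads; i++) {
            executorService.execute(() -> {
                try {
                    FileResource fileResource;
                    while ((fileResource = queue.take()) != FileWalker.POISON) {
                        // profile fileResource and write rows to dbPath via an IDBWriter
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            });
        }
        executorService.shutdown();
        executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
    }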
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java
index 925452094..edc431e4a 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java
@@ -73,26 +73,7 @@ public class FileProfiler extends AbstractProfiler {
     public static TableInfo FILE_MIME_TABLE = new TableInfo("file_mimes", new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
             new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12));
 
-    static Options OPTIONS;
-
-    static {
-
-        Option inputDir = new Option("inputDir", true, "optional: directory for original binary input documents." + " If not specified, -extracts is crawled as is.");
-
-        OPTIONS = new Options()
-                .addOption(inputDir)
-                .addOption("bc", "optional: tika-batch config file")
-                .addOption("numConsumers", true, "optional: number of consumer threads")
-                .addOption("db", true, "db file to which to write results")
-                .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
-                .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
-                .addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
-                .addOption("drop", false, "drop tables if they exist")
-                .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
-
-        ;
-
-    }
 
     private final Path inputDir;
 
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
index a897461ee..770319708 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
@@ -154,109 +154,7 @@ public class TikaEvalCLI {
     }
 
     private void handleProfile(String[] subsetArgs) throws Exception {
-        List<String> argList = new ArrayList(Arrays.asList(subsetArgs));
-
-        boolean containsBC = false;
-        String inputDir = null;
-        String extracts = null;
-        String alterExtract = null;
-        //confirm there's a batch-config file
-        for (int i = 0; i < argList.size(); i++) {
-            String arg = argList.get(i);
-            switch (arg) {
-                case "-bc":
-                    containsBC = true;
-                    break;
-                case "-inputDir":
-                    if (i + 1 >= argList.size()) {
-                        System.err.println("Must specify directory after -inputDir");
-                        ExtractProfiler.USAGE();
-                        return;
-                    }
-                    inputDir = argList.get(i + 1);
-                    i++;
-                    break;
-                case "-extracts":
-                    if (i + 1 >= argList.size()) {
-                        System.err.println("Must specify directory after -extracts");
-                        ExtractProfiler.USAGE();
-                        return;
-                    }
-                    extracts = argList.get(i + 1);
-                    i++;
-                    break;
-                case "-alterExtract":
-                    if (i + 1 >= argList.size()) {
-                        System.err.println("Must specify type 'as_is', 'first_only' or " + "'concatenate_content' after -alterExtract");
-                        ExtractComparer.USAGE();
-                        return;
-                    }
-                    alterExtract = argList.get(i + 1);
-                    i++;
-                    break;
-            }
-        }
-
-        if (alterExtract != null && !alterExtract.equals("as_is") && !alterExtract.equals("concatenate_content") && !alterExtract.equals("first_only")) {
-            System.out.println("Sorry, I don't understand:" + alterExtract + ". The values must be one of: as_is, first_only, concatenate_content");
-            ExtractProfiler.USAGE();
-            return;
-        }
-
-        //need to specify each in this commandline
-        //if only extracts is passed to tika-batch,
-        //the crawler will see no inputDir and start crawling "input".
-        //this allows the user to specify either extracts or inputDir
-        if (extracts == null && inputDir != null) {
-            argList.add("-extracts");
-            argList.add(inputDir);
-        } else if (inputDir == null && extracts != null) {
-            argList.add("-inputDir");
-            argList.add(extracts);
-        }
-
-        Path tmpBCConfig = null;
-        try {
-            tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
-            if (!containsBC) {
-                try (InputStream is = this
-                        .getClass()
-                        .getResourceAsStream("/tika-eval-profiler-config.xml")) {
-                    Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
-                }
-                argList.add("-bc");
-                argList.add(tmpBCConfig
-                        .toAbsolutePath()
-                        .toString());
-            }
-
-            String[] updatedArgs = argList.toArray(new String[0]);
-            DefaultParser defaultCLIParser = new DefaultParser();
-            try {
-                CommandLine commandLine = defaultCLIParser.parse(ExtractProfiler.OPTIONS, updatedArgs);
-                if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
-                    System.out.println("Please specify either the default -db or the full -jdbc, not both");
-                    ExtractProfiler.USAGE();
-                    return;
-                }
-            } catch (ParseException e) {
-                System.out.println(e.getMessage() + "\n");
-                ExtractProfiler.USAGE();
-                return;
-            }
-
-            // lazy delete because main() calls System.exit()
-            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
-                tmpBCConfig
-                        .toFile()
-                        .deleteOnExit();
-            }
-            FSBatchProcessCLI.main(updatedArgs);
-        } finally {
-            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
-                Files.delete(tmpBCConfig);
-            }
-        }
+        FileProfileRunner.main(subsetArgs);
     }
 
     private void handleCompare(String[] subsetArgs) throws Exception {
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileResource.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileResource.java
new file mode 100644
index 000000000..dc0a35d7e
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileResource.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.batch;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+
+
+/**
+ * This is a basic interface to handle a logical "file".
+ * This should enable code-agnostic handling of files from different
+ * sources: file system, database, etc.
+ */
+public interface FileResource {
+
+    //The literal lowercased extension of a file.  This may or may not
+    //have any relationship to the actual type of the file.
+    public static final Property FILE_EXTENSION = Property.internalText("tika:file_ext");
+
+    /**
+     * This is only used in logging to identify which file
+     * may have caused problems.
+     * While it is probably best
+     * to use unique ids for the sake of debugging, it is not
+     * necessary that the ids be unique.  This id
+     * is never used as a hashkey by the batch processors, for example.
+     *
+     * @return an id for a FileResource
+     */
+    public String getResourceId();
+
+    /**
+     * This gets the metadata available before the parsing of the file.
+     * This will typically be "external" metadata: file name,
+     * file size, file location, data stream, etc.  That is, things
+     * that are known about the file from outside information, not
+     * file-internal metadata.
+     *
+     * @return Metadata
+     */
+    public Metadata getMetadata();
+
+    /**
+     * @return an InputStream for the FileResource
+     * @throws java.io.IOException
+     */
+    public InputStream openInputStream() throws IOException;
+
+}
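For orientation, a minimal file-system-backed implementation of this interface might look like the sketch below (illustrative only; tika-batch's FSFileResource plays roughly this role today). One more thing to reconcile before merge: the new file declares package org.apache.tika.batch, but it lives under .../org/apache/tika/eval/app/batch/, and FileProfileRunner imports org.apache.tika.batch.FileResource -- the package line, the path, and the import need to agree.

    import java.io.IOException;
    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.util.Locale;

    import org.apache.tika.metadata.Metadata;

    // Illustrative only: wraps a file on disk as a FileResource.
    public class FsFileResource implements FileResource {

        private final Path root;
        private final Path file;

        public FsFileResource(Path root, Path file) {
            this.root = root;
            this.file = file;
        }

        @Override
        public String getResourceId() {
            // relative path: human-readable in logs, unique enough for debugging
            return root.relativize(file).toString();
        }

        @Override
        public Metadata getMetadata() {
            Metadata metadata = new Metadata();
            String name = file.getFileName().toString();
            int dot = name.lastIndexOf('.');
            String ext = dot < 0 ? "" : name.substring(dot + 1).toLowerCase(Locale.ROOT);
            // the literal lowercased extension, per the FILE_EXTENSION contract above
            metadata.set(FILE_EXTENSION, ext);
            return metadata;
        }

        @Override
        public InputStream openInputStream() throws IOException {
            return Files.newInputStream(file);
        }
    }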