This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4684-tika-eval in repository https://gitbox.apache.org/repos/asf/tika.git
commit a51c253a8225779875c7e6e0a0986006e0558031 Author: tallison <[email protected]> AuthorDate: Fri Mar 6 15:42:41 2026 -0500 add md summary and other cli improvements --- tika-eval/tika-eval-app/pom.xml | 4 + .../java/org/apache/tika/eval/app/EvalConfig.java | 2 +- .../tika/eval/app/ExtractComparerRunner.java | 83 ++- .../java/org/apache/tika/eval/app/TikaEvalCLI.java | 58 ++ .../eval/app/reports/MarkdownSummaryWriter.java | 611 +++++++++++++++++++++ .../tika/eval/app/reports/ResultsReporter.java | 7 +- .../org/apache/tika/eval/app/TikaEvalCLITest.java | 13 +- .../tika/serialization/JsonMetadataList.java | 2 +- 8 files changed, 767 insertions(+), 13 deletions(-) diff --git a/tika-eval/tika-eval-app/pom.xml b/tika-eval/tika-eval-app/pom.xml index b145c273dd..4b97f249c1 100644 --- a/tika-eval/tika-eval-app/pom.xml +++ b/tika-eval/tika-eval-app/pom.xml @@ -65,6 +65,10 @@ <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-compress</artifactId> + </dependency> <dependency> <groupId>org.apache.tika</groupId> <artifactId>tika-core</artifactId> diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java index fc0d72f0a6..b1aa848992 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java @@ -23,7 +23,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; public class EvalConfig { private long minExtractLength = 0; - private long maxExtractLength = 2_000_000; + private long maxExtractLength = 100_000_000; private String jdbcString = null; private String jdbcDriverClass = null; private boolean forceDrop = true; diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java index 6d23fe3653..c1faacc965 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java @@ -18,7 +18,9 @@ package org.apache.tika.eval.app; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.sql.Connection; @@ -39,12 +41,16 @@ import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Stream; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.DefaultParser; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.help.HelpFormatter; +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -56,6 +62,7 @@ import org.apache.tika.eval.app.io.DBWriter; import org.apache.tika.eval.app.io.ExtractReader; import org.apache.tika.eval.app.io.ExtractReaderException; import org.apache.tika.eval.app.io.IDBWriter; +import org.apache.tika.eval.app.reports.ResultsReporter; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.mime.MimeTypes; import org.apache.tika.pipes.api.FetchEmitTuple; @@ -79,10 +86,12 @@ public class ExtractComparerRunner { .addOption(Option.builder("b").longOpt("extractsB").hasArg().desc("required: directory of 'B' extracts").get()) .addOption(Option.builder("i").longOpt("inputDir").hasArg().desc("optional: 
directory for original binary input documents." + " If not specified, -extracts is crawled as is.").get()) - .addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db path").get()) + .addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db path (uses temp file if not specified)").get()) .addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json config file").get()) .addOption(Option.builder("n").longOpt("numWorkers").hasArg().desc("number of worker threads").get()) .addOption(Option.builder("m").longOpt("maxExtractLength").hasArg().desc("maximum extract length").get()) + .addOption(Option.builder("r").longOpt("report").desc("automatically run Report and tgz after Compare").get()) + .addOption(Option.builder("rd").longOpt("reportsDir").hasArg().desc("directory for reports (default: 'reports')").get()) ; } @@ -93,7 +102,16 @@ public class ExtractComparerRunner { Path extractsADir = commandLine.hasOption('a') ? Paths.get(commandLine.getOptionValue('a')) : Paths.get(USAGE_FAIL("Must specify extractsA dir: -a")); Path extractsBDir = commandLine.hasOption('b') ? Paths.get(commandLine.getOptionValue('b')) : Paths.get(USAGE_FAIL("Must specify extractsB dir: -b")); Path inputDir = commandLine.hasOption('i') ? Paths.get(commandLine.getOptionValue('i')) : extractsADir; - String dbPath = commandLine.hasOption('d') ? 
commandLine.getOptionValue('d') : USAGE_FAIL("Must specify the db name: -d"); + + boolean usesTempDb = !commandLine.hasOption('d'); + Path tempDbDir = null; + String dbPath; + if (usesTempDb) { + tempDbDir = Files.createTempDirectory("tika-eval-"); + dbPath = tempDbDir.resolve("eval-db").toAbsolutePath().toString(); + } else { + dbPath = commandLine.getOptionValue('d'); + } if (commandLine.hasOption('n')) { evalConfig.setNumWorkers(Integer.parseInt(commandLine.getOptionValue('n'))); @@ -103,8 +121,27 @@ public class ExtractComparerRunner { evalConfig.setMaxExtractLength(Long.parseLong(commandLine.getOptionValue('m'))); } - String jdbcString = getJdbcConnectionString(dbPath); - execute(inputDir, extractsADir, extractsBDir, jdbcString, evalConfig); + try { + String jdbcString = getJdbcConnectionString(dbPath); + execute(inputDir, extractsADir, extractsBDir, jdbcString, evalConfig); + + if (commandLine.hasOption('r')) { + String reportsDir = commandLine.getOptionValue("rd", "reports"); + LOG.info("Running Report..."); + ResultsReporter.main(new String[]{"-d", dbPath, "-rd", reportsDir}); + Path reportsDirPath = Paths.get(reportsDir); + if (Files.isDirectory(reportsDirPath)) { + Path tgzPath = reportsDirPath.resolveSibling(reportsDir + ".tar.gz"); + LOG.info("Creating {}", tgzPath); + createTarGz(reportsDirPath, tgzPath); + LOG.info("Reports archived to {}", tgzPath); + } + } + } finally { + if (usesTempDb && tempDbDir != null) { + deleteDirectory(tempDbDir); + } + } } private static String getJdbcConnectionString(String dbPath) { @@ -205,6 +242,44 @@ public class ExtractComparerRunner { return new MimeBuffer(jdbcUtil.getConnection(), builder.getMimeTable(), MimeTypes.getDefaultMimeTypes()); } + private static void deleteDirectory(Path dir) throws IOException { + if (!Files.exists(dir)) { + return; + } + try (Stream<Path> walk = Files.walk(dir)) { + walk.sorted(java.util.Comparator.reverseOrder()) + .forEach(p -> { + try { + Files.deleteIfExists(p); + } catch 
(IOException e) { + LOG.warn("Failed to delete {}", p, e); + } + }); + } + } + + private static void createTarGz(Path sourceDir, Path output) throws IOException { + try (OutputStream fos = Files.newOutputStream(output); + GzipCompressorOutputStream gzo = new GzipCompressorOutputStream(fos); + TarArchiveOutputStream tar = new TarArchiveOutputStream(gzo)) { + tar.setLongFileMode(TarArchiveOutputStream.LONGFILE_POSIX); + try (Stream<Path> walk = Files.walk(sourceDir)) { + walk.filter(Files::isRegularFile).forEach(file -> { + try { + String entryName = sourceDir.getFileName() + .resolve(sourceDir.relativize(file)).toString(); + TarArchiveEntry entry = new TarArchiveEntry(file, entryName); + tar.putArchiveEntry(entry); + Files.copy(file, tar); + tar.closeArchiveEntry(); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + } + } + private static void USAGE() throws IOException { HelpFormatter helpFormatter = HelpFormatter.builder().get(); helpFormatter.printHelp("java -jar tika-eval-app-x.y.z.jar FileProfiler -e docs -d mydb [-i inputDir, -c config.json]", diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java index ce32d78f4b..38ace5239e 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java @@ -50,6 +50,27 @@ public class TikaEvalCLI { private void execute(String[] args) throws Exception { String tool = args[0]; + // If the first arg looks like a flag, infer the tool from the args + if (tool.startsWith("-")) { + String inferred = inferTool(args); + if (inferred != null) { + tool = inferred; + // Don't strip the first arg — all args are flags for the tool + switch (tool) { + case "Compare": + handleCompare(args); + return; + case "Profile": + handleProfile(args); + return; + case "Report": + 
handleReport(args); + return; + } + } + System.err.println(specifyTools()); + return; + } String[] subsetArgs = new String[args.length - 1]; System.arraycopy(args, 1, subsetArgs, 0, args.length - 1); switch (tool) { @@ -72,6 +93,43 @@ public class TikaEvalCLI { } } + private static String inferTool(String[] args) { + boolean hasA = false; + boolean hasB = false; + boolean hasE = false; + boolean hasDb = false; + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "-a": + case "--extractsA": + hasA = true; + break; + case "-b": + case "--extractsB": + hasB = true; + break; + case "-e": + case "--extracts": + hasE = true; + break; + case "-d": + case "--db": + hasDb = true; + break; + } + } + if (hasA && hasB) { + return "Compare"; + } + if (hasE) { + return "Profile"; + } + if (hasDb && !hasA && !hasB && !hasE) { + return "Report"; + } + return null; + } + private void handleStartDB(String[] args) throws SQLException { List<String> argList = new ArrayList<>(); argList.add("-web"); diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/MarkdownSummaryWriter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/MarkdownSummaryWriter.java new file mode 100644 index 0000000000..0681e57887 --- /dev/null +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/MarkdownSummaryWriter.java @@ -0,0 +1,611 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.eval.app.reports; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.Connection; +import java.sql.DatabaseMetaData; +import java.sql.ResultSet; +import java.sql.ResultSetMetaData; +import java.sql.SQLException; +import java.sql.Statement; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Writes a markdown summary of a tika-eval comparison run. + * <p> + * This is designed to be read by both humans and LLMs for fast + * regression detection. It queries the same temp tables that the + * xlsx report pipeline creates (exceptions_compared, + * token_counts_compared, parse_time_compared) so it must be called + * after the "before" SQL has executed. 
+ */ +public class MarkdownSummaryWriter { + + private static final Logger LOG = LoggerFactory.getLogger(MarkdownSummaryWriter.class); + + private static final int TOP_N = 20; + + public static void write(Connection c, Path reportsDir) throws IOException, SQLException { + if (!isComparisonDb(c)) { + LOG.info("Not a comparison database; skipping markdown summary."); + return; + } + Path summaryPath = reportsDir.resolve("summary.md"); + Files.createDirectories(reportsDir); + + try (BufferedWriter w = Files.newBufferedWriter(summaryPath)) { + w.write("# Tika Eval Comparison Summary\n\n"); + + writeOverview(c, w); + writeExtractExceptionSummary(c, w); + writeExceptionSummary(c, w); + writeContentQualitySummary(c, w); + writeOovComparison(c, w); + writeLanguageChanges(c, w); + writeContentLengthRatio(c, w); + writeEmbeddedCountChanges(c, w); + writeTokenCountSummary(c, w); + writeParseTimeSummary(c, w); + writeMimeChanges(c, w); + writeTopRegressions(c, w); + writeTopImprovements(c, w); + writeContentLost(c, w); + writeContentGained(c, w); + writeMissingExtracts(c, w); + } + LOG.info("Wrote markdown summary to {}", summaryPath); + } + + private static void writeOverview(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Overview\n\n"); + + try (Statement st = c.createStatement()) { + try (ResultSet rs = st.executeQuery( + "select dir_name_a, dir_name_b from pair_names")) { + if (rs.next()) { + w.write("- **A**: " + rs.getString(1) + "\n"); + w.write("- **B**: " + rs.getString(2) + "\n"); + } + } + + writeScalar(st, w, "- **Total containers**: ", + "select count(1) from containers"); + writeScalar(st, w, "- **Total files (A)**: ", + "select count(1) from profiles_a"); + writeScalar(st, w, "- **Total files (B)**: ", + "select count(1) from profiles_b"); + writeScalar(st, w, "- **Exceptions (A)**: ", + "select count(1) from exceptions_a"); + writeScalar(st, w, "- **Exceptions (B)**: ", + "select count(1) from exceptions_b"); + } + 
w.write("\n"); + } + + private static void writeExtractExceptionSummary(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Extract File Issues\n\n"); + w.write("Problems reading extract files (before parsing). " + + "Includes missing files, zero-byte files, oversized files, and bad JSON.\n\n"); + + w.write("### Extract A\n\n"); + writeQueryAsTable(c, w, + "select r.extract_exception_description as TYPE, count(1) as COUNT " + + "from extract_exceptions_a ee " + + "join ref_extract_exception_types r " + + " on r.extract_exception_id = ee.extract_exception_id " + + "group by r.extract_exception_description " + + "order by COUNT desc"); + + w.write("\n### Extract B\n\n"); + writeQueryAsTable(c, w, + "select r.extract_exception_description as TYPE, count(1) as COUNT " + + "from extract_exceptions_b ee " + + "join ref_extract_exception_types r " + + " on r.extract_exception_id = ee.extract_exception_id " + + "group by r.extract_exception_description " + + "order by COUNT desc"); + w.write("\n"); + } + + private static void writeExceptionSummary(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Exception Changes by Mime Type\n\n"); + w.write("Mime types with >100 files where exception rate changed by >5%.\n\n"); + + writeQueryAsTable(c, w, + "select ma.mime_string as MIME_A, mb.mime_string as MIME_B, " + + "ec.total as TOTAL, " + + "ec.exc_cnt_a as EXC_A, ec.exc_cnt_b as EXC_B, " + + "round(ec.exc_prcnt_a * 100, 1) as EXC_PCT_A, " + + "round(ec.exc_prcnt_b * 100, 1) as EXC_PCT_B, " + + "ec.notes as FLAG " + + "from exceptions_compared ec " + + "join mimes ma on ma.mime_id = ec.mime_id_a " + + "join mimes mb on mb.mime_id = ec.mime_id_b " + + "where ec.total > 100 " + + "and abs(ec.exc_prcnt_a - ec.exc_prcnt_b) > 0.05 " + + "order by abs(ec.exc_prcnt_a - ec.exc_prcnt_b) desc"); + + w.write("\n### New Exception Types in B\n\n"); + writeQueryAsTable(c, w, + "select ma.mime_string as MIME_A, mb.mime_string as 
MIME_B, " + + "count(1) as COUNT " + + "from exceptions_b eb " + + "left join exceptions_a ea on ea.id = eb.id " + + "join profiles_a pa on pa.id = eb.id " + + "join profiles_b pb on pb.id = eb.id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "join mimes mb on mb.mime_id = pb.mime_id " + + "where ea.id is null and eb.parse_exception_id = 0 " + + "group by ma.mime_string, mb.mime_string " + + "order by COUNT desc " + + "limit " + TOP_N); + + w.write("\n### Fixed Exceptions in B\n\n"); + writeQueryAsTable(c, w, + "select ma.mime_string as MIME_A, mb.mime_string as MIME_B, " + + "count(1) as COUNT " + + "from exceptions_a ea " + + "left join exceptions_b eb on ea.id = eb.id " + + "join profiles_a pa on pa.id = ea.id " + + "join profiles_b pb on pb.id = pa.id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "join mimes mb on mb.mime_id = pb.mime_id " + + "where eb.id is null and ea.parse_exception_id = 0 " + + "group by ma.mime_string, mb.mime_string " + + "order by COUNT desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeContentQualitySummary(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Content Quality (Dice Coefficient) by Mime Type\n\n"); + w.write("Mean and median dice coefficient per mime type (higher = more similar).\n\n"); + + writeQueryAsTable(c, w, + "select ma.mime_string as MIME_A, mb.mime_string as MIME_B, " + + "count(1) as FILES, " + + "round(avg(cc.dice_coefficient), 4) as MEAN_DICE, " + + "round(median(cc.dice_coefficient), 4) as MEDIAN_DICE, " + + "round(min(cc.dice_coefficient), 4) as MIN_DICE " + + "from content_comparisons cc " + + "join profiles_a pa on cc.id = pa.id " + + "join profiles_b pb on cc.id = pb.id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "join mimes mb on mb.mime_id = pb.mime_id " + + "group by ma.mime_string, mb.mime_string " + + "having count(1) > 5 " + + "order by MEAN_DICE asc"); + w.write("\n"); + } + + private static void 
writeTokenCountSummary(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Token Count Changes by Mime Type\n\n"); + + writeQueryAsTable(c, w, + "select ma.mime_string as MIME_A, mb.mime_string as MIME_B, " + + "tcc.num_tokens_a as TOKENS_A, tcc.num_tokens_b as TOKENS_B, " + + "case when tcc.num_tokens_a > 0 " + + " then round(100.0 * (tcc.num_tokens_b - tcc.num_tokens_a) / tcc.num_tokens_a, 1) " + + " else null end as PCT_CHANGE, " + + "tcc.num_common_tokens_a as COMMON_A, tcc.num_common_tokens_b as COMMON_B " + + "from token_counts_compared tcc " + + "join mimes ma on ma.mime_id = tcc.mime_id_a " + + "join mimes mb on mb.mime_id = tcc.mime_id_b " + + "order by abs(tcc.num_tokens_a - tcc.num_tokens_b) desc"); + w.write("\n"); + } + + private static void writeParseTimeSummary(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Parse Time Changes by Mime Type\n\n"); + + writeQueryAsTable(c, w, + "select ma.mime_string as MIME_A, mb.mime_string as MIME_B, " + + "ptc.total_a as MS_A, ptc.total_b as MS_B, " + + "round(ptc.prcnt_increase, 1) as B_AS_PCT_OF_A " + + "from parse_time_compared ptc " + + "join mimes ma on ma.mime_id = ptc.mime_id_a " + + "join mimes mb on mb.mime_id = ptc.mime_id_b " + + "where ptc.total_a > 0 " + + "order by ptc.prcnt_increase desc"); + + w.write("\n### Parse Time Outliers (individual files, B > 10x A, A >= 1s)\n\n"); + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + "pa.elapsed_time_millis as MS_A, " + + "pb.elapsed_time_millis as MS_B, " + + "round(cast(pb.elapsed_time_millis as double) / " + + " cast(pa.elapsed_time_millis as double), 1) as RATIO " + + "from profiles_a pa " + + "join profiles_b pb on pa.id = pb.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "where pa.is_embedded = false " + + "and pa.elapsed_time_millis >= 1000 " + + "and pb.elapsed_time_millis 
> pa.elapsed_time_millis * 10 " + + "order by RATIO desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeMimeChanges(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Mime Type Changes (A -> B)\n\n"); + + writeQueryAsTable(c, w, + "select concat(ma.mime_string, ' -> ', mb.mime_string) as CHANGE, " + + "count(1) as COUNT " + + "from profiles_a a " + + "join profiles_b b on a.id = b.id " + + "join mimes ma on ma.mime_id = a.mime_id " + + "join mimes mb on mb.mime_id = b.mime_id " + + "where a.mime_id <> b.mime_id " + + "group by CHANGE " + + "order by COUNT desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeTopRegressions(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Top " + TOP_N + " Content Regressions (lowest dice)\n\n"); + w.write("Files where content changed the most (excluding perfect matches).\n\n"); + + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + "round(cc.dice_coefficient, 4) as DICE, " + + "round(cc.overlap, 4) as OVERLAP, " + + "cc.top_10_unique_token_diffs_a as ONLY_IN_A, " + + "cc.top_10_unique_token_diffs_b as ONLY_IN_B " + + "from content_comparisons cc " + + "join profiles_a pa on cc.id = pa.id " + + "join profiles_b pb on cc.id = pb.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "where cc.dice_coefficient < 1.0 " + + "and pa.is_embedded = false " + + "order by cc.dice_coefficient asc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeTopImprovements(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Top " + TOP_N + " Fixed Exceptions in B (with content gained)\n\n"); + + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + "cb.num_tokens as TOKENS_B, " + + "cb.num_common_tokens as COMMON_TOKENS_B, " + + 
"cb.lang_id_1 as LANG_B " + + "from exceptions_a ea " + + "left join exceptions_b eb on ea.id = eb.id " + + "join profiles_a pa on pa.id = ea.id " + + "join profiles_b pb on pb.id = pa.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "left join contents_b cb on cb.id = ea.id " + + "where eb.id is null and ea.parse_exception_id = 0 " + + "and pa.is_embedded = false " + + "order by cb.num_common_tokens desc nulls last " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeContentLost(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Content Lost (had content in A, empty/missing in B)\n\n"); + + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + "ca.num_tokens as TOKENS_A, " + + "ca.num_common_tokens as COMMON_A, " + + "coalesce(cb.num_tokens, 0) as TOKENS_B " + + "from contents_a ca " + + "join profiles_a pa on ca.id = pa.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "left join contents_b cb on ca.id = cb.id " + + "where ca.num_tokens > 10 " + + "and coalesce(cb.num_tokens, 0) = 0 " + + "and pa.is_embedded = false " + + "order by ca.num_tokens desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeContentGained(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Content Gained (empty in A, has content in B)\n\n"); + + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "mb.mime_string as MIME_B, " + + "coalesce(ca.num_tokens, 0) as TOKENS_A, " + + "cb.num_tokens as TOKENS_B, " + + "cb.num_common_tokens as COMMON_B " + + "from contents_b cb " + + "join profiles_b pb on cb.id = pb.id " + + "join profiles_a pa on cb.id = pa.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes mb on mb.mime_id = pb.mime_id " + + "left join contents_a ca on cb.id 
= ca.id " + + "where cb.num_tokens > 10 " + + "and coalesce(ca.num_tokens, 0) = 0 " + + "and pa.is_embedded = false " + + "order by cb.num_tokens desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeEmbeddedCountChanges(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Embedded Document Count Changes\n\n"); + w.write("Files where the number of embedded documents changed significantly.\n\n"); + + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + "pa.num_attachments as EMBEDDED_A, " + + "pb.num_attachments as EMBEDDED_B, " + + "(pb.num_attachments - pa.num_attachments) as DELTA " + + "from profiles_a pa " + + "join profiles_b pb on pa.id = pb.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "where pa.is_embedded = false " + + "and pa.num_attachments <> pb.num_attachments " + + "and (pa.num_attachments > 0 or pb.num_attachments > 0) " + + "order by abs(pb.num_attachments - pa.num_attachments) desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeContentLengthRatio(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Content Length Ratio Outliers\n\n"); + w.write("Files where content length changed by more than 2x " + + "(possible repeated text or truncation).\n\n"); + + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + "ca.content_length as LEN_A, " + + "cb.content_length as LEN_B, " + + "round(cast(cb.content_length as double) / " + + " cast(ca.content_length as double), 2) as RATIO_B_TO_A " + + "from contents_a ca " + + "join contents_b cb on ca.id = cb.id " + + "join profiles_a pa on ca.id = pa.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "where pa.is_embedded = false " + + "and ca.content_length > 100 " + + "and 
cb.content_length > 100 " + + "and (cast(cb.content_length as double) / " + + " cast(ca.content_length as double) > 2.0 " + + " or cast(cb.content_length as double) / " + + " cast(ca.content_length as double) < 0.5) " + + "order by abs(cast(cb.content_length as double) / " + + " cast(ca.content_length as double) - 1.0) desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeLanguageChanges(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Language Detection Changes\n\n"); + w.write("Files where the detected language changed between A and B.\n\n"); + + w.write("### By Language Pair (aggregate)\n\n"); + writeQueryAsTable(c, w, + "select ca.lang_id_1 as LANG_A, cb.lang_id_1 as LANG_B, " + + "count(1) as COUNT " + + "from contents_a ca " + + "join contents_b cb on ca.id = cb.id " + + "join profiles_a pa on ca.id = pa.id " + + "where pa.is_embedded = false " + + "and ca.lang_id_1 is not null " + + "and cb.lang_id_1 is not null " + + "and ca.lang_id_1 <> cb.lang_id_1 " + + "group by ca.lang_id_1, cb.lang_id_1 " + + "order by COUNT desc " + + "limit " + TOP_N); + + w.write("\n### Top " + TOP_N + " Individual Files\n\n"); + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + "ca.lang_id_1 as LANG_A, " + + "round(ca.lang_id_prob_1, 3) as PROB_A, " + + "cb.lang_id_1 as LANG_B, " + + "round(cb.lang_id_prob_1, 3) as PROB_B " + + "from contents_a ca " + + "join contents_b cb on ca.id = cb.id " + + "join profiles_a pa on ca.id = pa.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "where pa.is_embedded = false " + + "and ca.lang_id_1 is not null " + + "and cb.lang_id_1 is not null " + + "and ca.lang_id_1 <> cb.lang_id_1 " + + "order by ca.num_tokens desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeOovComparison(Connection c, BufferedWriter w) + throws IOException, SQLException { + 
w.write("## Out-of-Vocabulary (OOV) Rate Changes\n\n"); + w.write("Files where OOV rate increased significantly in B " + + "(possible mojibake or encoding regression).\n\n"); + + w.write("### By Mime Type (aggregate)\n\n"); + writeQueryAsTable(c, w, + "select ma.mime_string as MIME_A, " + + "count(1) as FILES, " + + "round(avg(ca.oov), 4) as MEAN_OOV_A, " + + "round(avg(cb.oov), 4) as MEAN_OOV_B, " + + "round(avg(cb.oov) - avg(ca.oov), 4) as OOV_DELTA " + + "from contents_a ca " + + "join contents_b cb on ca.id = cb.id " + + "join profiles_a pa on ca.id = pa.id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "where pa.is_embedded = false " + + "and ca.oov is not null and cb.oov is not null " + + "group by ma.mime_string " + + "having count(1) > 5 " + + "order by OOV_DELTA desc"); + + w.write("\n### Top " + TOP_N + " Individual OOV Increases\n\n"); + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + "round(ca.oov, 4) as OOV_A, " + + "round(cb.oov, 4) as OOV_B, " + + "round(cb.oov - ca.oov, 4) as OOV_DELTA, " + + "ca.lang_id_1 as LANG_A, " + + "cb.lang_id_1 as LANG_B " + + "from contents_a ca " + + "join contents_b cb on ca.id = cb.id " + + "join profiles_a pa on ca.id = pa.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "where pa.is_embedded = false " + + "and ca.oov is not null and cb.oov is not null " + + "and ca.num_tokens > 10 " + + "and (cb.oov - ca.oov) > 0.1 " + + "order by (cb.oov - ca.oov) desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeMissingExtracts(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Missing Extracts\n\n"); + w.write("Files where A had an extract file but B did not (or vice versa).\n\n"); + + w.write("### Had extract in A, missing in B\n\n"); + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + 
"c.extract_file_length_a as EXTRACT_LEN_A, " + + "c.extract_file_length_b as EXTRACT_LEN_B " + + "from containers c " + + "join profiles_a pa on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "where pa.is_embedded = false " + + "and c.extract_file_length_a > 0 " + + "and (c.extract_file_length_b is null or c.extract_file_length_b = 0) " + + "order by c.extract_file_length_a desc " + + "limit " + TOP_N); + + w.write("\n### Had extract in B, missing in A\n\n"); + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "mb.mime_string as MIME_B, " + + "c.extract_file_length_a as EXTRACT_LEN_A, " + + "c.extract_file_length_b as EXTRACT_LEN_B " + + "from containers c " + + "join profiles_b pb on pb.container_id = c.container_id " + + "join mimes mb on mb.mime_id = pb.mime_id " + + "where pb.is_embedded = false " + + "and c.extract_file_length_b > 0 " + + "and (c.extract_file_length_a is null or c.extract_file_length_a = 0) " + + "order by c.extract_file_length_b desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static boolean isComparisonDb(Connection c) throws SQLException { + DatabaseMetaData md = c.getMetaData(); + try (ResultSet rs = md.getTables(null, null, "%", null)) { + while (rs.next()) { + if ("CONTENT_COMPARISONS".equalsIgnoreCase(rs.getString(3))) { + return true; + } + } + } + return false; + } + + private static void writeScalar(Statement st, BufferedWriter w, + String prefix, String sql) + throws IOException, SQLException { + try (ResultSet rs = st.executeQuery(sql)) { + if (rs.next()) { + w.write(prefix + rs.getString(1) + "\n"); + } + } + } + + private static void writeQueryAsTable(Connection c, BufferedWriter w, + String sql) + throws IOException, SQLException { + try (Statement st = c.createStatement(); + ResultSet rs = st.executeQuery(sql)) { + ResultSetMetaData meta = rs.getMetaData(); + int cols = meta.getColumnCount(); + + if (!rs.isBeforeFirst()) { + w.write("_No data._\n\n"); + return; + 
} + + // Header + w.write("|"); + for (int i = 1; i <= cols; i++) { + w.write(" " + meta.getColumnLabel(i) + " |"); + } + w.write("\n|"); + for (int i = 1; i <= cols; i++) { + w.write(" --- |"); + } + w.write("\n"); + + // Rows + while (rs.next()) { + w.write("|"); + for (int i = 1; i <= cols; i++) { + String val = rs.getString(i); + if (val == null) { + val = ""; + } + // Escape pipes in values + val = val.replace("|", "\\|"); + w.write(" " + val + " |"); + } + w.write("\n"); + } + } + } +} diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java index bf83593023..d9eed292d7 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java @@ -65,8 +65,8 @@ public class ResultsReporter { "directory for the reports. " + "If not specified, will write to 'reports'" + "BEWARE: Will overwrite existing reports without warning!") .addOption("rf", "reportsFile", true, "xml specifying sql to call for the reports." + "If not specified, will use default reports in resources/tika-eval-*-config.xml") - .addOption("db", true, "default database (in memory H2). Specify a file name for the H2 database.") - .addOption("jdbc", true, "EXPERT: full jdbc connection string. Specify this or use -db <h2db_name>") + .addOption("d", "db", true, "default database (in memory H2). Specify a file name for the H2 database.") + .addOption("jdbc", true, "EXPERT: full jdbc connection string. 
Specify this or use -d <h2db_name>") .addOption("jdbcdriver", true, "EXPERT: specify the jdbc driver class if all else fails") .addOption("tablePrefix", true, "EXPERT: if not using the default tables, specify your table name prefix"); @@ -78,7 +78,7 @@ public class ResultsReporter { public static void USAGE() throws IOException { HelpFormatter helpFormatter = HelpFormatter.builder().get(); - helpFormatter.printHelp("java -jar tika-eval-x.y.jar Report -db mydb [-rd myreports] [-rf myreports.xml]", "Tool: Report", ResultsReporter.OPTIONS, + helpFormatter.printHelp("java -jar tika-eval-x.y.jar Report -d mydb [-rd myreports] [-rf myreports.xml]", "Tool: Report", ResultsReporter.OPTIONS, "Note: for h2 db, do not include the .mv.db at the end of the db name.", true); } @@ -323,6 +323,7 @@ public class ResultsReporter { for (Report r : reports) { r.writeReport(c, reportsDirectory); } + MarkdownSummaryWriter.write(c, reportsDirectory); for (String sql : after) { LOG.info("processing 'after': {}", sql); long start = System.currentTimeMillis(); diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java index 5d0fd93d18..6fa2ec6d6d 100644 --- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java +++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java @@ -206,15 +206,20 @@ public class TikaEvalCLITest extends TikaTest { CachingFileVisitor v = new CachingFileVisitor(); Files.walkFileTree(compareReportsDir, v); int cnt = 0; + boolean hasSummaryMd = false; for (Path report : v.getPaths()) { - if (report - .getFileName() - .toString() - .endsWith(".xlsx")) { + String name = report.getFileName().toString(); + if (name.endsWith(".xlsx")) { cnt++; } + if ("summary.md".equals(name)) { + hasSummaryMd = true; + assertTrue(Files.size(report) > 100, + "summary.md should not be empty"); + } } 
assertTrue(cnt > 33); + assertTrue(hasSummaryMd, "summary.md should be generated for comparison reports"); // If there is a failure, check for SQL errors in the previous log. // If it is a syntax error, for the position look for "[*]" in the exception message. // The "[42001-230]" is [<error number>-<build number>]. diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java index 21f413087f..03d6b53cc0 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java @@ -40,7 +40,7 @@ public class JsonMetadataList { private static final StreamReadConstraints DEFAULT_CONSTRAINTS = StreamReadConstraints .builder() .maxNestingDepth(10) - .maxStringLength(20_000_000) + .maxStringLength(200_000_000) .maxNumberLength(500) .build();
