This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 96002a73d4 add md summary and other cli improvements (#2676)
96002a73d4 is described below
commit 96002a73d48a7832339c2e5cdc1de8360937882d
Author: Tim Allison <[email protected]>
AuthorDate: Fri Mar 6 16:28:16 2026 -0500
add md summary and other cli improvements (#2676)
---
tika-eval/tika-eval-app/pom.xml | 4 +
.../java/org/apache/tika/eval/app/EvalConfig.java | 2 +-
.../tika/eval/app/ExtractComparerRunner.java | 88 ++-
.../java/org/apache/tika/eval/app/TikaEvalCLI.java | 58 ++
.../eval/app/reports/MarkdownSummaryWriter.java | 611 +++++++++++++++++++++
.../tika/eval/app/reports/ResultsReporter.java | 7 +-
.../org/apache/tika/eval/app/TikaEvalCLITest.java | 13 +-
.../tika/serialization/JsonMetadataList.java | 2 +-
8 files changed, 772 insertions(+), 13 deletions(-)
diff --git a/tika-eval/tika-eval-app/pom.xml b/tika-eval/tika-eval-app/pom.xml
index b145c273dd..4b97f249c1 100644
--- a/tika-eval/tika-eval-app/pom.xml
+++ b/tika-eval/tika-eval-app/pom.xml
@@ -65,6 +65,10 @@
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-compress</artifactId>
+ </dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
diff --git
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
index fc0d72f0a6..b1aa848992 100644
---
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
+++
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
@@ -23,7 +23,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
public class EvalConfig {
private long minExtractLength = 0;
- private long maxExtractLength = 2_000_000;
+ private long maxExtractLength = 100_000_000;
private String jdbcString = null;
private String jdbcDriverClass = null;
private boolean forceDrop = true;
diff --git
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java
index 6d23fe3653..3535574233 100644
---
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java
+++
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java
@@ -18,7 +18,9 @@ package org.apache.tika.eval.app;
import java.io.IOException;
import java.io.InputStream;
+import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Connection;
@@ -39,12 +41,16 @@ import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Stream;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.help.HelpFormatter;
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -56,6 +62,7 @@ import org.apache.tika.eval.app.io.DBWriter;
import org.apache.tika.eval.app.io.ExtractReader;
import org.apache.tika.eval.app.io.ExtractReaderException;
import org.apache.tika.eval.app.io.IDBWriter;
+import org.apache.tika.eval.app.reports.ResultsReporter;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.pipes.api.FetchEmitTuple;
@@ -79,10 +86,12 @@ public class ExtractComparerRunner {
.addOption(Option.builder("b").longOpt("extractsB").hasArg().desc("required:
directory of 'B' extracts").get())
.addOption(Option.builder("i").longOpt("inputDir").hasArg().desc("optional:
directory for original binary input documents."
+ " If not specified, -extracts is crawled as
is.").get())
-
.addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db
path").get())
+
.addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db path
(uses temp file if not specified)").get())
.addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json
config file").get())
.addOption(Option.builder("n").longOpt("numWorkers").hasArg().desc("number of
worker threads").get())
.addOption(Option.builder("m").longOpt("maxExtractLength").hasArg().desc("maximum
extract length").get())
+
.addOption(Option.builder("r").longOpt("report").desc("automatically run Report
and tgz after Compare").get())
+
.addOption(Option.builder("rd").longOpt("reportsDir").hasArg().desc("directory
for reports (default: 'reports')").get())
;
}
@@ -93,7 +102,16 @@ public class ExtractComparerRunner {
Path extractsADir = commandLine.hasOption('a') ?
Paths.get(commandLine.getOptionValue('a')) : Paths.get(USAGE_FAIL("Must specify
extractsA dir: -a"));
Path extractsBDir = commandLine.hasOption('b') ?
Paths.get(commandLine.getOptionValue('b')) : Paths.get(USAGE_FAIL("Must specify
extractsB dir: -b"));
Path inputDir = commandLine.hasOption('i') ?
Paths.get(commandLine.getOptionValue('i')) : extractsADir;
- String dbPath = commandLine.hasOption('d') ?
commandLine.getOptionValue('d') : USAGE_FAIL("Must specify the db name: -d");
+
+ boolean usesTempDb = !commandLine.hasOption('d');
+ Path tempDbDir = null;
+ String dbPath;
+ if (usesTempDb) {
+ tempDbDir = Files.createTempDirectory("tika-eval-");
+ dbPath = tempDbDir.resolve("eval-db").toAbsolutePath().toString();
+ } else {
+ dbPath = commandLine.getOptionValue('d');
+ }
if (commandLine.hasOption('n')) {
evalConfig.setNumWorkers(Integer.parseInt(commandLine.getOptionValue('n')));
@@ -103,8 +121,27 @@ public class ExtractComparerRunner {
evalConfig.setMaxExtractLength(Long.parseLong(commandLine.getOptionValue('m')));
}
- String jdbcString = getJdbcConnectionString(dbPath);
- execute(inputDir, extractsADir, extractsBDir, jdbcString, evalConfig);
+ try {
+ String jdbcString = getJdbcConnectionString(dbPath);
+ execute(inputDir, extractsADir, extractsBDir, jdbcString,
evalConfig);
+
+ if (commandLine.hasOption('r')) {
+ String reportsDir = commandLine.getOptionValue("rd",
"reports");
+ LOG.info("Running Report...");
+ ResultsReporter.main(new String[]{"-d", dbPath, "-rd",
reportsDir});
+ Path reportsDirPath = Paths.get(reportsDir);
+ if (Files.isDirectory(reportsDirPath)) {
+ Path tgzPath = reportsDirPath.resolveSibling(reportsDir +
".tar.gz");
+ LOG.info("Creating {}", tgzPath);
+ createTarGz(reportsDirPath, tgzPath);
+ LOG.info("Reports archived to {}", tgzPath);
+ }
+ }
+ } finally {
+ if (usesTempDb && tempDbDir != null) {
+ deleteDirectory(tempDbDir);
+ }
+ }
}
private static String getJdbcConnectionString(String dbPath) {
@@ -177,6 +214,11 @@ public class ExtractComparerRunner {
} finally {
mimeBuffer.close();
executorService.shutdownNow();
+ try {
+ jdbcUtil.getConnection().close();
+ } catch (SQLException e) {
+ LOG.warn("failed to close db connection", e);
+ }
}
}
@@ -205,6 +247,44 @@ public class ExtractComparerRunner {
return new MimeBuffer(jdbcUtil.getConnection(),
builder.getMimeTable(), MimeTypes.getDefaultMimeTypes());
}
+ private static void deleteDirectory(Path dir) throws IOException {
+ if (!Files.exists(dir)) {
+ return;
+ }
+ try (Stream<Path> walk = Files.walk(dir)) {
+ walk.sorted(java.util.Comparator.reverseOrder())
+ .forEach(p -> {
+ try {
+ Files.deleteIfExists(p);
+ } catch (IOException e) {
+ LOG.warn("Failed to delete {}", p, e);
+ }
+ });
+ }
+ }
+
+ private static void createTarGz(Path sourceDir, Path output) throws
IOException {
+ try (OutputStream fos = Files.newOutputStream(output);
+ GzipCompressorOutputStream gzo = new
GzipCompressorOutputStream(fos);
+ TarArchiveOutputStream tar = new TarArchiveOutputStream(gzo)) {
+ tar.setLongFileMode(TarArchiveOutputStream.LONGFILE_POSIX);
+ try (Stream<Path> walk = Files.walk(sourceDir)) {
+ walk.filter(Files::isRegularFile).forEach(file -> {
+ try {
+ String entryName = sourceDir.getFileName()
+
.resolve(sourceDir.relativize(file)).toString();
+ TarArchiveEntry entry = new TarArchiveEntry(file,
entryName);
+ tar.putArchiveEntry(entry);
+ Files.copy(file, tar);
+ tar.closeArchiveEntry();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ });
+ }
+ }
+ }
+
private static void USAGE() throws IOException {
HelpFormatter helpFormatter = HelpFormatter.builder().get();
helpFormatter.printHelp("java -jar tika-eval-app-x.y.z.jar
FileProfiler -e docs -d mydb [-i inputDir, -c config.json]",
diff --git
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
index ce32d78f4b..38ace5239e 100644
---
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
+++
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
@@ -50,6 +50,27 @@ public class TikaEvalCLI {
private void execute(String[] args) throws Exception {
String tool = args[0];
+ // If the first arg looks like a flag, infer the tool from the args
+ if (tool.startsWith("-")) {
+ String inferred = inferTool(args);
+ if (inferred != null) {
+ tool = inferred;
+ // Don't strip the first arg — all args are flags for the tool
+ switch (tool) {
+ case "Compare":
+ handleCompare(args);
+ return;
+ case "Profile":
+ handleProfile(args);
+ return;
+ case "Report":
+ handleReport(args);
+ return;
+ }
+ }
+ System.err.println(specifyTools());
+ return;
+ }
String[] subsetArgs = new String[args.length - 1];
System.arraycopy(args, 1, subsetArgs, 0, args.length - 1);
switch (tool) {
@@ -72,6 +93,43 @@ public class TikaEvalCLI {
}
}
+ private static String inferTool(String[] args) {
+ boolean hasA = false;
+ boolean hasB = false;
+ boolean hasE = false;
+ boolean hasDb = false;
+ for (int i = 0; i < args.length; i++) {
+ switch (args[i]) {
+ case "-a":
+ case "--extractsA":
+ hasA = true;
+ break;
+ case "-b":
+ case "--extractsB":
+ hasB = true;
+ break;
+ case "-e":
+ case "--extracts":
+ hasE = true;
+ break;
+ case "-d":
+ case "--db":
+ hasDb = true;
+ break;
+ }
+ }
+ if (hasA && hasB) {
+ return "Compare";
+ }
+ if (hasE) {
+ return "Profile";
+ }
+ if (hasDb && !hasA && !hasB && !hasE) {
+ return "Report";
+ }
+ return null;
+ }
+
private void handleStartDB(String[] args) throws SQLException {
List<String> argList = new ArrayList<>();
argList.add("-web");
diff --git
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/MarkdownSummaryWriter.java
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/MarkdownSummaryWriter.java
new file mode 100644
index 0000000000..0681e57887
--- /dev/null
+++
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/MarkdownSummaryWriter.java
@@ -0,0 +1,611 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.reports;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.DatabaseMetaData;
+import java.sql.ResultSet;
+import java.sql.ResultSetMetaData;
+import java.sql.SQLException;
+import java.sql.Statement;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Writes a markdown summary of a tika-eval comparison run.
+ * <p>
+ * This is designed to be read by both humans and LLMs for fast
+ * regression detection. It queries the same temp tables that the
+ * xlsx report pipeline creates (exceptions_compared,
+ * token_counts_compared, parse_time_compared) so it must be called
+ * after the "before" SQL has executed.
+ */
+public class MarkdownSummaryWriter {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(MarkdownSummaryWriter.class);
+
+ private static final int TOP_N = 20;
+
+ public static void write(Connection c, Path reportsDir) throws
IOException, SQLException {
+ if (!isComparisonDb(c)) {
+ LOG.info("Not a comparison database; skipping markdown summary.");
+ return;
+ }
+ Path summaryPath = reportsDir.resolve("summary.md");
+ Files.createDirectories(reportsDir);
+
+ try (BufferedWriter w = Files.newBufferedWriter(summaryPath)) {
+ w.write("# Tika Eval Comparison Summary\n\n");
+
+ writeOverview(c, w);
+ writeExtractExceptionSummary(c, w);
+ writeExceptionSummary(c, w);
+ writeContentQualitySummary(c, w);
+ writeOovComparison(c, w);
+ writeLanguageChanges(c, w);
+ writeContentLengthRatio(c, w);
+ writeEmbeddedCountChanges(c, w);
+ writeTokenCountSummary(c, w);
+ writeParseTimeSummary(c, w);
+ writeMimeChanges(c, w);
+ writeTopRegressions(c, w);
+ writeTopImprovements(c, w);
+ writeContentLost(c, w);
+ writeContentGained(c, w);
+ writeMissingExtracts(c, w);
+ }
+ LOG.info("Wrote markdown summary to {}", summaryPath);
+ }
+
+ private static void writeOverview(Connection c, BufferedWriter w)
+ throws IOException, SQLException {
+ w.write("## Overview\n\n");
+
+ try (Statement st = c.createStatement()) {
+ try (ResultSet rs = st.executeQuery(
+ "select dir_name_a, dir_name_b from pair_names")) {
+ if (rs.next()) {
+ w.write("- **A**: " + rs.getString(1) + "\n");
+ w.write("- **B**: " + rs.getString(2) + "\n");
+ }
+ }
+
+ writeScalar(st, w, "- **Total containers**: ",
+ "select count(1) from containers");
+ writeScalar(st, w, "- **Total files (A)**: ",
+ "select count(1) from profiles_a");
+ writeScalar(st, w, "- **Total files (B)**: ",
+ "select count(1) from profiles_b");
+ writeScalar(st, w, "- **Exceptions (A)**: ",
+ "select count(1) from exceptions_a");
+ writeScalar(st, w, "- **Exceptions (B)**: ",
+ "select count(1) from exceptions_b");
+ }
+ w.write("\n");
+ }
+
+ private static void writeExtractExceptionSummary(Connection c,
BufferedWriter w)
+ throws IOException, SQLException {
+ w.write("## Extract File Issues\n\n");
+ w.write("Problems reading extract files (before parsing). " +
+ "Includes missing files, zero-byte files, oversized files, and
bad JSON.\n\n");
+
+ w.write("### Extract A\n\n");
+ writeQueryAsTable(c, w,
+ "select r.extract_exception_description as TYPE, count(1) as
COUNT " +
+ "from extract_exceptions_a ee " +
+ "join ref_extract_exception_types r " +
+ " on r.extract_exception_id = ee.extract_exception_id " +
+ "group by r.extract_exception_description " +
+ "order by COUNT desc");
+
+ w.write("\n### Extract B\n\n");
+ writeQueryAsTable(c, w,
+ "select r.extract_exception_description as TYPE, count(1) as
COUNT " +
+ "from extract_exceptions_b ee " +
+ "join ref_extract_exception_types r " +
+ " on r.extract_exception_id = ee.extract_exception_id " +
+ "group by r.extract_exception_description " +
+ "order by COUNT desc");
+ w.write("\n");
+ }
+
+ private static void writeExceptionSummary(Connection c, BufferedWriter w)
+ throws IOException, SQLException {
+ w.write("## Exception Changes by Mime Type\n\n");
+ w.write("Mime types with >100 files where exception rate changed by
>5%.\n\n");
+
+ writeQueryAsTable(c, w,
+ "select ma.mime_string as MIME_A, mb.mime_string as MIME_B, " +
+ "ec.total as TOTAL, " +
+ "ec.exc_cnt_a as EXC_A, ec.exc_cnt_b as EXC_B, " +
+ "round(ec.exc_prcnt_a * 100, 1) as EXC_PCT_A, " +
+ "round(ec.exc_prcnt_b * 100, 1) as EXC_PCT_B, " +
+ "ec.notes as FLAG " +
+ "from exceptions_compared ec " +
+ "join mimes ma on ma.mime_id = ec.mime_id_a " +
+ "join mimes mb on mb.mime_id = ec.mime_id_b " +
+ "where ec.total > 100 " +
+ "and abs(ec.exc_prcnt_a - ec.exc_prcnt_b) > 0.05 " +
+ "order by abs(ec.exc_prcnt_a - ec.exc_prcnt_b) desc");
+
+ w.write("\n### New Exception Types in B\n\n");
+ writeQueryAsTable(c, w,
+ "select ma.mime_string as MIME_A, mb.mime_string as MIME_B, " +
+ "count(1) as COUNT " +
+ "from exceptions_b eb " +
+ "left join exceptions_a ea on ea.id = eb.id " +
+ "join profiles_a pa on pa.id = eb.id " +
+ "join profiles_b pb on pb.id = eb.id " +
+ "join mimes ma on ma.mime_id = pa.mime_id " +
+ "join mimes mb on mb.mime_id = pb.mime_id " +
+ "where ea.id is null and eb.parse_exception_id = 0 " +
+ "group by ma.mime_string, mb.mime_string " +
+ "order by COUNT desc " +
+ "limit " + TOP_N);
+
+ w.write("\n### Fixed Exceptions in B\n\n");
+ writeQueryAsTable(c, w,
+ "select ma.mime_string as MIME_A, mb.mime_string as MIME_B, " +
+ "count(1) as COUNT " +
+ "from exceptions_a ea " +
+ "left join exceptions_b eb on ea.id = eb.id " +
+ "join profiles_a pa on pa.id = ea.id " +
+ "join profiles_b pb on pb.id = pa.id " +
+ "join mimes ma on ma.mime_id = pa.mime_id " +
+ "join mimes mb on mb.mime_id = pb.mime_id " +
+ "where eb.id is null and ea.parse_exception_id = 0 " +
+ "group by ma.mime_string, mb.mime_string " +
+ "order by COUNT desc " +
+ "limit " + TOP_N);
+ w.write("\n");
+ }
+
+ private static void writeContentQualitySummary(Connection c,
BufferedWriter w)
+ throws IOException, SQLException {
+ w.write("## Content Quality (Dice Coefficient) by Mime Type\n\n");
+ w.write("Mean and median dice coefficient per mime type (higher = more
similar).\n\n");
+
+ writeQueryAsTable(c, w,
+ "select ma.mime_string as MIME_A, mb.mime_string as MIME_B, " +
+ "count(1) as FILES, " +
+ "round(avg(cc.dice_coefficient), 4) as MEAN_DICE, " +
+ "round(median(cc.dice_coefficient), 4) as MEDIAN_DICE, " +
+ "round(min(cc.dice_coefficient), 4) as MIN_DICE " +
+ "from content_comparisons cc " +
+ "join profiles_a pa on cc.id = pa.id " +
+ "join profiles_b pb on cc.id = pb.id " +
+ "join mimes ma on ma.mime_id = pa.mime_id " +
+ "join mimes mb on mb.mime_id = pb.mime_id " +
+ "group by ma.mime_string, mb.mime_string " +
+ "having count(1) > 5 " +
+ "order by MEAN_DICE asc");
+ w.write("\n");
+ }
+
+ private static void writeTokenCountSummary(Connection c, BufferedWriter w)
+ throws IOException, SQLException {
+ w.write("## Token Count Changes by Mime Type\n\n");
+
+ writeQueryAsTable(c, w,
+ "select ma.mime_string as MIME_A, mb.mime_string as MIME_B, " +
+ "tcc.num_tokens_a as TOKENS_A, tcc.num_tokens_b as TOKENS_B, "
+
+ "case when tcc.num_tokens_a > 0 " +
+ " then round(100.0 * (tcc.num_tokens_b - tcc.num_tokens_a) /
tcc.num_tokens_a, 1) " +
+ " else null end as PCT_CHANGE, " +
+ "tcc.num_common_tokens_a as COMMON_A, tcc.num_common_tokens_b
as COMMON_B " +
+ "from token_counts_compared tcc " +
+ "join mimes ma on ma.mime_id = tcc.mime_id_a " +
+ "join mimes mb on mb.mime_id = tcc.mime_id_b " +
+ "order by abs(tcc.num_tokens_a - tcc.num_tokens_b) desc");
+ w.write("\n");
+ }
+
+ private static void writeParseTimeSummary(Connection c, BufferedWriter w)
+ throws IOException, SQLException {
+ w.write("## Parse Time Changes by Mime Type\n\n");
+
+ writeQueryAsTable(c, w,
+ "select ma.mime_string as MIME_A, mb.mime_string as MIME_B, " +
+ "ptc.total_a as MS_A, ptc.total_b as MS_B, " +
+ "round(ptc.prcnt_increase, 1) as B_AS_PCT_OF_A " +
+ "from parse_time_compared ptc " +
+ "join mimes ma on ma.mime_id = ptc.mime_id_a " +
+ "join mimes mb on mb.mime_id = ptc.mime_id_b " +
+ "where ptc.total_a > 0 " +
+ "order by ptc.prcnt_increase desc");
+
+ w.write("\n### Parse Time Outliers (individual files, B > 10x A, A >=
1s)\n\n");
+ writeQueryAsTable(c, w,
+ "select c.file_path as FILE, " +
+ "ma.mime_string as MIME_A, " +
+ "pa.elapsed_time_millis as MS_A, " +
+ "pb.elapsed_time_millis as MS_B, " +
+ "round(cast(pb.elapsed_time_millis as double) / " +
+ " cast(pa.elapsed_time_millis as double), 1) as RATIO " +
+ "from profiles_a pa " +
+ "join profiles_b pb on pa.id = pb.id " +
+ "join containers c on pa.container_id = c.container_id " +
+ "join mimes ma on ma.mime_id = pa.mime_id " +
+ "where pa.is_embedded = false " +
+ "and pa.elapsed_time_millis >= 1000 " +
+ "and pb.elapsed_time_millis > pa.elapsed_time_millis * 10 " +
+ "order by RATIO desc " +
+ "limit " + TOP_N);
+ w.write("\n");
+ }
+
+ private static void writeMimeChanges(Connection c, BufferedWriter w)
+ throws IOException, SQLException {
+ w.write("## Mime Type Changes (A -> B)\n\n");
+
+ writeQueryAsTable(c, w,
+ "select concat(ma.mime_string, ' -> ', mb.mime_string) as
CHANGE, " +
+ "count(1) as COUNT " +
+ "from profiles_a a " +
+ "join profiles_b b on a.id = b.id " +
+ "join mimes ma on ma.mime_id = a.mime_id " +
+ "join mimes mb on mb.mime_id = b.mime_id " +
+ "where a.mime_id <> b.mime_id " +
+ "group by CHANGE " +
+ "order by COUNT desc " +
+ "limit " + TOP_N);
+ w.write("\n");
+ }
+
+ private static void writeTopRegressions(Connection c, BufferedWriter w)
+ throws IOException, SQLException {
+ w.write("## Top " + TOP_N + " Content Regressions (lowest dice)\n\n");
+ w.write("Files where content changed the most (excluding perfect
matches).\n\n");
+
+ writeQueryAsTable(c, w,
+ "select c.file_path as FILE, " +
+ "ma.mime_string as MIME_A, " +
+ "round(cc.dice_coefficient, 4) as DICE, " +
+ "round(cc.overlap, 4) as OVERLAP, " +
+ "cc.top_10_unique_token_diffs_a as ONLY_IN_A, " +
+ "cc.top_10_unique_token_diffs_b as ONLY_IN_B " +
+ "from content_comparisons cc " +
+ "join profiles_a pa on cc.id = pa.id " +
+ "join profiles_b pb on cc.id = pb.id " +
+ "join containers c on pa.container_id = c.container_id " +
+ "join mimes ma on ma.mime_id = pa.mime_id " +
+ "where cc.dice_coefficient < 1.0 " +
+ "and pa.is_embedded = false " +
+ "order by cc.dice_coefficient asc " +
+ "limit " + TOP_N);
+ w.write("\n");
+ }
+
+ private static void writeTopImprovements(Connection c, BufferedWriter w)
+ throws IOException, SQLException {
+ w.write("## Top " + TOP_N + " Fixed Exceptions in B (with content
gained)\n\n");
+
+ writeQueryAsTable(c, w,
+ "select c.file_path as FILE, " +
+ "ma.mime_string as MIME_A, " +
+ "cb.num_tokens as TOKENS_B, " +
+ "cb.num_common_tokens as COMMON_TOKENS_B, " +
+ "cb.lang_id_1 as LANG_B " +
+ "from exceptions_a ea " +
+ "left join exceptions_b eb on ea.id = eb.id " +
+ "join profiles_a pa on pa.id = ea.id " +
+ "join profiles_b pb on pb.id = pa.id " +
+ "join containers c on pa.container_id = c.container_id " +
+ "join mimes ma on ma.mime_id = pa.mime_id " +
+ "left join contents_b cb on cb.id = ea.id " +
+ "where eb.id is null and ea.parse_exception_id = 0 " +
+ "and pa.is_embedded = false " +
+ "order by cb.num_common_tokens desc nulls last " +
+ "limit " + TOP_N);
+ w.write("\n");
+ }
+
+ private static void writeContentLost(Connection c, BufferedWriter w)
+ throws IOException, SQLException {
+ w.write("## Content Lost (had content in A, empty/missing in B)\n\n");
+
+ writeQueryAsTable(c, w,
+ "select c.file_path as FILE, " +
+ "ma.mime_string as MIME_A, " +
+ "ca.num_tokens as TOKENS_A, " +
+ "ca.num_common_tokens as COMMON_A, " +
+ "coalesce(cb.num_tokens, 0) as TOKENS_B " +
+ "from contents_a ca " +
+ "join profiles_a pa on ca.id = pa.id " +
+ "join containers c on pa.container_id = c.container_id " +
+ "join mimes ma on ma.mime_id = pa.mime_id " +
+ "left join contents_b cb on ca.id = cb.id " +
+ "where ca.num_tokens > 10 " +
+ "and coalesce(cb.num_tokens, 0) = 0 " +
+ "and pa.is_embedded = false " +
+ "order by ca.num_tokens desc " +
+ "limit " + TOP_N);
+ w.write("\n");
+ }
+
+ private static void writeContentGained(Connection c, BufferedWriter w)
+ throws IOException, SQLException {
+ w.write("## Content Gained (empty in A, has content in B)\n\n");
+
+ writeQueryAsTable(c, w,
+ "select c.file_path as FILE, " +
+ "mb.mime_string as MIME_B, " +
+ "coalesce(ca.num_tokens, 0) as TOKENS_A, " +
+ "cb.num_tokens as TOKENS_B, " +
+ "cb.num_common_tokens as COMMON_B " +
+ "from contents_b cb " +
+ "join profiles_b pb on cb.id = pb.id " +
+ "join profiles_a pa on cb.id = pa.id " +
+ "join containers c on pa.container_id = c.container_id " +
+ "join mimes mb on mb.mime_id = pb.mime_id " +
+ "left join contents_a ca on cb.id = ca.id " +
+ "where cb.num_tokens > 10 " +
+ "and coalesce(ca.num_tokens, 0) = 0 " +
+ "and pa.is_embedded = false " +
+ "order by cb.num_tokens desc " +
+ "limit " + TOP_N);
+ w.write("\n");
+ }
+
+ private static void writeEmbeddedCountChanges(Connection c, BufferedWriter
w)
+ throws IOException, SQLException {
+ w.write("## Embedded Document Count Changes\n\n");
+ w.write("Files where the number of embedded documents changed
significantly.\n\n");
+
+ writeQueryAsTable(c, w,
+ "select c.file_path as FILE, " +
+ "ma.mime_string as MIME_A, " +
+ "pa.num_attachments as EMBEDDED_A, " +
+ "pb.num_attachments as EMBEDDED_B, " +
+ "(pb.num_attachments - pa.num_attachments) as DELTA " +
+ "from profiles_a pa " +
+ "join profiles_b pb on pa.id = pb.id " +
+ "join containers c on pa.container_id = c.container_id " +
+ "join mimes ma on ma.mime_id = pa.mime_id " +
+ "where pa.is_embedded = false " +
+ "and pa.num_attachments <> pb.num_attachments " +
+ "and (pa.num_attachments > 0 or pb.num_attachments > 0) " +
+ "order by abs(pb.num_attachments - pa.num_attachments) desc " +
+ "limit " + TOP_N);
+ w.write("\n");
+ }
+
+ private static void writeContentLengthRatio(Connection c, BufferedWriter w)
+ throws IOException, SQLException {
+ w.write("## Content Length Ratio Outliers\n\n");
+ w.write("Files where content length changed by more than 2x " +
+ "(possible repeated text or truncation).\n\n");
+
+ writeQueryAsTable(c, w,
+ "select c.file_path as FILE, " +
+ "ma.mime_string as MIME_A, " +
+ "ca.content_length as LEN_A, " +
+ "cb.content_length as LEN_B, " +
+ "round(cast(cb.content_length as double) / " +
+ " cast(ca.content_length as double), 2) as RATIO_B_TO_A " +
+ "from contents_a ca " +
+ "join contents_b cb on ca.id = cb.id " +
+ "join profiles_a pa on ca.id = pa.id " +
+ "join containers c on pa.container_id = c.container_id " +
+ "join mimes ma on ma.mime_id = pa.mime_id " +
+ "where pa.is_embedded = false " +
+ "and ca.content_length > 100 " +
+ "and cb.content_length > 100 " +
+ "and (cast(cb.content_length as double) / " +
+ " cast(ca.content_length as double) > 2.0 " +
+ " or cast(cb.content_length as double) / " +
+ " cast(ca.content_length as double) < 0.5) " +
+ "order by abs(cast(cb.content_length as double) / " +
+ " cast(ca.content_length as double) - 1.0) desc " +
+ "limit " + TOP_N);
+ w.write("\n");
+ }
+
+ private static void writeLanguageChanges(Connection c, BufferedWriter w)
+ throws IOException, SQLException {
+ w.write("## Language Detection Changes\n\n");
+ w.write("Files where the detected language changed between A and
B.\n\n");
+
+ w.write("### By Language Pair (aggregate)\n\n");
+ writeQueryAsTable(c, w,
+ "select ca.lang_id_1 as LANG_A, cb.lang_id_1 as LANG_B, " +
+ "count(1) as COUNT " +
+ "from contents_a ca " +
+ "join contents_b cb on ca.id = cb.id " +
+ "join profiles_a pa on ca.id = pa.id " +
+ "where pa.is_embedded = false " +
+ "and ca.lang_id_1 is not null " +
+ "and cb.lang_id_1 is not null " +
+ "and ca.lang_id_1 <> cb.lang_id_1 " +
+ "group by ca.lang_id_1, cb.lang_id_1 " +
+ "order by COUNT desc " +
+ "limit " + TOP_N);
+
+ w.write("\n### Top " + TOP_N + " Individual Files\n\n");
+ writeQueryAsTable(c, w,
+ "select c.file_path as FILE, " +
+ "ma.mime_string as MIME_A, " +
+ "ca.lang_id_1 as LANG_A, " +
+ "round(ca.lang_id_prob_1, 3) as PROB_A, " +
+ "cb.lang_id_1 as LANG_B, " +
+ "round(cb.lang_id_prob_1, 3) as PROB_B " +
+ "from contents_a ca " +
+ "join contents_b cb on ca.id = cb.id " +
+ "join profiles_a pa on ca.id = pa.id " +
+ "join containers c on pa.container_id = c.container_id " +
+ "join mimes ma on ma.mime_id = pa.mime_id " +
+ "where pa.is_embedded = false " +
+ "and ca.lang_id_1 is not null " +
+ "and cb.lang_id_1 is not null " +
+ "and ca.lang_id_1 <> cb.lang_id_1 " +
+ "order by ca.num_tokens desc " +
+ "limit " + TOP_N);
+ w.write("\n");
+ }
+
+ private static void writeOovComparison(Connection c, BufferedWriter w)
+ throws IOException, SQLException {
+ w.write("## Out-of-Vocabulary (OOV) Rate Changes\n\n");
+ w.write("Files where OOV rate increased significantly in B " +
+ "(possible mojibake or encoding regression).\n\n");
+
+ w.write("### By Mime Type (aggregate)\n\n");
+ writeQueryAsTable(c, w,
+ "select ma.mime_string as MIME_A, " +
+ "count(1) as FILES, " +
+ "round(avg(ca.oov), 4) as MEAN_OOV_A, " +
+ "round(avg(cb.oov), 4) as MEAN_OOV_B, " +
+ "round(avg(cb.oov) - avg(ca.oov), 4) as OOV_DELTA " +
+ "from contents_a ca " +
+ "join contents_b cb on ca.id = cb.id " +
+ "join profiles_a pa on ca.id = pa.id " +
+ "join mimes ma on ma.mime_id = pa.mime_id " +
+ "where pa.is_embedded = false " +
+ "and ca.oov is not null and cb.oov is not null " +
+ "group by ma.mime_string " +
+ "having count(1) > 5 " +
+ "order by OOV_DELTA desc");
+
+ w.write("\n### Top " + TOP_N + " Individual OOV Increases\n\n");
+ writeQueryAsTable(c, w,
+ "select c.file_path as FILE, " +
+ "ma.mime_string as MIME_A, " +
+ "round(ca.oov, 4) as OOV_A, " +
+ "round(cb.oov, 4) as OOV_B, " +
+ "round(cb.oov - ca.oov, 4) as OOV_DELTA, " +
+ "ca.lang_id_1 as LANG_A, " +
+ "cb.lang_id_1 as LANG_B " +
+ "from contents_a ca " +
+ "join contents_b cb on ca.id = cb.id " +
+ "join profiles_a pa on ca.id = pa.id " +
+ "join containers c on pa.container_id = c.container_id " +
+ "join mimes ma on ma.mime_id = pa.mime_id " +
+ "where pa.is_embedded = false " +
+ "and ca.oov is not null and cb.oov is not null " +
+ "and ca.num_tokens > 10 " +
+ "and (cb.oov - ca.oov) > 0.1 " +
+ "order by (cb.oov - ca.oov) desc " +
+ "limit " + TOP_N);
+ w.write("\n");
+ }
+
+ private static void writeMissingExtracts(Connection c, BufferedWriter w)
+ throws IOException, SQLException {
+ w.write("## Missing Extracts\n\n");
+ w.write("Files where A had an extract file but B did not (or vice
versa).\n\n");
+
+ w.write("### Had extract in A, missing in B\n\n");
+ writeQueryAsTable(c, w,
+ "select c.file_path as FILE, " +
+ "ma.mime_string as MIME_A, " +
+ "c.extract_file_length_a as EXTRACT_LEN_A, " +
+ "c.extract_file_length_b as EXTRACT_LEN_B " +
+ "from containers c " +
+ "join profiles_a pa on pa.container_id = c.container_id " +
+ "join mimes ma on ma.mime_id = pa.mime_id " +
+ "where pa.is_embedded = false " +
+ "and c.extract_file_length_a > 0 " +
+ "and (c.extract_file_length_b is null or
c.extract_file_length_b = 0) " +
+ "order by c.extract_file_length_a desc " +
+ "limit " + TOP_N);
+
+ w.write("\n### Had extract in B, missing in A\n\n");
+ writeQueryAsTable(c, w,
+ "select c.file_path as FILE, " +
+ "mb.mime_string as MIME_B, " +
+ "c.extract_file_length_a as EXTRACT_LEN_A, " +
+ "c.extract_file_length_b as EXTRACT_LEN_B " +
+ "from containers c " +
+ "join profiles_b pb on pb.container_id = c.container_id " +
+ "join mimes mb on mb.mime_id = pb.mime_id " +
+ "where pb.is_embedded = false " +
+ "and c.extract_file_length_b > 0 " +
+ "and (c.extract_file_length_a is null or
c.extract_file_length_a = 0) " +
+ "order by c.extract_file_length_b desc " +
+ "limit " + TOP_N);
+ w.write("\n");
+ }
+
+ private static boolean isComparisonDb(Connection c) throws SQLException {
+ DatabaseMetaData md = c.getMetaData();
+ try (ResultSet rs = md.getTables(null, null, "%", null)) {
+ while (rs.next()) {
+ if ("CONTENT_COMPARISONS".equalsIgnoreCase(rs.getString(3))) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ private static void writeScalar(Statement st, BufferedWriter w,
+ String prefix, String sql)
+ throws IOException, SQLException {
+ try (ResultSet rs = st.executeQuery(sql)) {
+ if (rs.next()) {
+ w.write(prefix + rs.getString(1) + "\n");
+ }
+ }
+ }
+
+ private static void writeQueryAsTable(Connection c, BufferedWriter w,
+ String sql)
+ throws IOException, SQLException {
+ try (Statement st = c.createStatement();
+ ResultSet rs = st.executeQuery(sql)) {
+ ResultSetMetaData meta = rs.getMetaData();
+ int cols = meta.getColumnCount();
+
+ if (!rs.isBeforeFirst()) {
+ w.write("_No data._\n\n");
+ return;
+ }
+
+ // Header
+ w.write("|");
+ for (int i = 1; i <= cols; i++) {
+ w.write(" " + meta.getColumnLabel(i) + " |");
+ }
+ w.write("\n|");
+ for (int i = 1; i <= cols; i++) {
+ w.write(" --- |");
+ }
+ w.write("\n");
+
+ // Rows
+ while (rs.next()) {
+ w.write("|");
+ for (int i = 1; i <= cols; i++) {
+ String val = rs.getString(i);
+ if (val == null) {
+ val = "";
+ }
+ // Escape pipes in values
+ val = val.replace("|", "\\|");
+ w.write(" " + val + " |");
+ }
+ w.write("\n");
+ }
+ }
+ }
+}
diff --git
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java
index bf83593023..d9eed292d7 100644
---
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java
+++
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java
@@ -65,8 +65,8 @@ public class ResultsReporter {
"directory for the reports. " + "If not specified,
will write to 'reports'" + "BEWARE: Will overwrite existing reports without
warning!")
.addOption("rf", "reportsFile", true,
"xml specifying sql to call for the reports." + "If
not specified, will use default reports in resources/tika-eval-*-config.xml")
- .addOption("db", true, "default database (in memory H2).
Specify a file name for the H2 database.")
- .addOption("jdbc", true, "EXPERT: full jdbc connection string.
Specify this or use -db <h2db_name>")
+ .addOption("d", "db", true, "default database (in memory H2).
Specify a file name for the H2 database.")
+ .addOption("jdbc", true, "EXPERT: full jdbc connection string.
Specify this or use -d <h2db_name>")
.addOption("jdbcdriver", true, "EXPERT: specify the jdbc
driver class if all else fails")
.addOption("tablePrefix", true, "EXPERT: if not using the
default tables, specify your table name prefix");
@@ -78,7 +78,7 @@ public class ResultsReporter {
public static void USAGE() throws IOException {
HelpFormatter helpFormatter = HelpFormatter.builder().get();
- helpFormatter.printHelp("java -jar tika-eval-x.y.jar Report -db mydb
[-rd myreports] [-rf myreports.xml]", "Tool: Report", ResultsReporter.OPTIONS,
+ helpFormatter.printHelp("java -jar tika-eval-x.y.jar Report -d mydb
[-rd myreports] [-rf myreports.xml]", "Tool: Report", ResultsReporter.OPTIONS,
"Note: for h2 db, do not include the .mv.db at the end of the
db name.", true);
}
@@ -323,6 +323,7 @@ public class ResultsReporter {
for (Report r : reports) {
r.writeReport(c, reportsDirectory);
}
+ MarkdownSummaryWriter.write(c, reportsDirectory);
for (String sql : after) {
LOG.info("processing 'after': {}", sql);
long start = System.currentTimeMillis();
diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
index 5d0fd93d18..6fa2ec6d6d 100644
--- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
+++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java
@@ -206,15 +206,20 @@ public class TikaEvalCLITest extends TikaTest {
CachingFileVisitor v = new CachingFileVisitor();
Files.walkFileTree(compareReportsDir, v);
int cnt = 0;
+ boolean hasSummaryMd = false;
for (Path report : v.getPaths()) {
- if (report
- .getFileName()
- .toString()
- .endsWith(".xlsx")) {
+ String name = report.getFileName().toString();
+ if (name.endsWith(".xlsx")) {
cnt++;
}
+ if ("summary.md".equals(name)) {
+ hasSummaryMd = true;
+ assertTrue(Files.size(report) > 100,
+ "summary.md should not be empty");
+ }
}
assertTrue(cnt > 33);
+ assertTrue(hasSummaryMd, "summary.md should be generated for
comparison reports");
// If there is a failure, check for SQL errors in the previous log.
// If it's is a syntax error, for the position look for "[*]" in the
exception message.
// The "[42001-230]" is [<error number>-<build number].
diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java
index 21f413087f..03d6b53cc0 100644
--- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java
+++ b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java
@@ -40,7 +40,7 @@ public class JsonMetadataList {
private static final StreamReadConstraints DEFAULT_CONSTRAINTS =
StreamReadConstraints
.builder()
.maxNestingDepth(10)
- .maxStringLength(20_000_000)
+ .maxStringLength(200_000_000)
.maxNumberLength(500)
.build();