This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4684-tika-eval in repository https://gitbox.apache.org/repos/asf/tika.git
commit a51c253a8225779875c7e6e0a0986006e0558031 Author: tallison <[email protected]> AuthorDate: Fri Mar 6 15:42:41 2026 -0500 add md summary and other cli improvements --- tika-eval/tika-eval-app/pom.xml | 4 + .../java/org/apache/tika/eval/app/EvalConfig.java | 2 +- .../tika/eval/app/ExtractComparerRunner.java | 83 ++- .../java/org/apache/tika/eval/app/TikaEvalCLI.java | 58 ++ .../eval/app/reports/MarkdownSummaryWriter.java | 611 +++++++++++++++++++++ .../tika/eval/app/reports/ResultsReporter.java | 7 +- .../org/apache/tika/eval/app/TikaEvalCLITest.java | 13 +- .../tika/serialization/JsonMetadataList.java | 2 +- 8 files changed, 767 insertions(+), 13 deletions(-) diff --git a/tika-eval/tika-eval-app/pom.xml b/tika-eval/tika-eval-app/pom.xml index b145c273dd..4b97f249c1 100644 --- a/tika-eval/tika-eval-app/pom.xml +++ b/tika-eval/tika-eval-app/pom.xml @@ -65,6 +65,10 @@ <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-compress</artifactId> + </dependency> <dependency> <groupId>org.apache.tika</groupId> <artifactId>tika-core</artifactId> diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java index fc0d72f0a6..b1aa848992 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java @@ -23,7 +23,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; public class EvalConfig { private long minExtractLength = 0; - private long maxExtractLength = 2_000_000; + private long maxExtractLength = 100_000_000; private String jdbcString = null; private String jdbcDriverClass = null; private boolean forceDrop = true; diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java index 6d23fe3653..c1faacc965 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java @@ -18,7 +18,9 @@ package org.apache.tika.eval.app; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.sql.Connection; @@ -39,12 +41,16 @@ import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Stream; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.DefaultParser; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.help.HelpFormatter; +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -56,6 +62,7 @@ import org.apache.tika.eval.app.io.DBWriter; import org.apache.tika.eval.app.io.ExtractReader; import org.apache.tika.eval.app.io.ExtractReaderException; import org.apache.tika.eval.app.io.IDBWriter; +import org.apache.tika.eval.app.reports.ResultsReporter; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.mime.MimeTypes; import org.apache.tika.pipes.api.FetchEmitTuple; @@ -79,10 +86,12 @@ public class ExtractComparerRunner { .addOption(Option.builder("b").longOpt("extractsB").hasArg().desc("required: directory of 'B' extracts").get()) .addOption(Option.builder("i").longOpt("inputDir").hasArg().desc("optional: 
directory for original binary input documents." + " If not specified, -extracts is crawled as is.").get()) - .addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db path").get()) + .addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db path (uses temp file if not specified)").get()) .addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json config file").get()) .addOption(Option.builder("n").longOpt("numWorkers").hasArg().desc("number of worker threads").get()) .addOption(Option.builder("m").longOpt("maxExtractLength").hasArg().desc("maximum extract length").get()) + .addOption(Option.builder("r").longOpt("report").desc("automatically run Report and tgz after Compare").get()) + .addOption(Option.builder("rd").longOpt("reportsDir").hasArg().desc("directory for reports (default: 'reports')").get()) ; } @@ -93,7 +102,16 @@ public class ExtractComparerRunner { Path extractsADir = commandLine.hasOption('a') ? Paths.get(commandLine.getOptionValue('a')) : Paths.get(USAGE_FAIL("Must specify extractsA dir: -a")); Path extractsBDir = commandLine.hasOption('b') ? Paths.get(commandLine.getOptionValue('b')) : Paths.get(USAGE_FAIL("Must specify extractsB dir: -b")); Path inputDir = commandLine.hasOption('i') ? Paths.get(commandLine.getOptionValue('i')) : extractsADir; - String dbPath = commandLine.hasOption('d') ? 
commandLine.getOptionValue('d') : USAGE_FAIL("Must specify the db name: -d"); + + boolean usesTempDb = !commandLine.hasOption('d'); + Path tempDbDir = null; + String dbPath; + if (usesTempDb) { + tempDbDir = Files.createTempDirectory("tika-eval-"); + dbPath = tempDbDir.resolve("eval-db").toAbsolutePath().toString(); + } else { + dbPath = commandLine.getOptionValue('d'); + } if (commandLine.hasOption('n')) { evalConfig.setNumWorkers(Integer.parseInt(commandLine.getOptionValue('n'))); @@ -103,8 +121,27 @@ public class ExtractComparerRunner { evalConfig.setMaxExtractLength(Long.parseLong(commandLine.getOptionValue('m'))); } - String jdbcString = getJdbcConnectionString(dbPath); - execute(inputDir, extractsADir, extractsBDir, jdbcString, evalConfig); + try { + String jdbcString = getJdbcConnectionString(dbPath); + execute(inputDir, extractsADir, extractsBDir, jdbcString, evalConfig); + + if (commandLine.hasOption('r')) { + String reportsDir = commandLine.getOptionValue("rd", "reports"); + LOG.info("Running Report..."); + ResultsReporter.main(new String[]{"-d", dbPath, "-rd", reportsDir}); + Path reportsDirPath = Paths.get(reportsDir); + if (Files.isDirectory(reportsDirPath)) { + Path tgzPath = reportsDirPath.resolveSibling(reportsDir + ".tar.gz"); + LOG.info("Creating {}", tgzPath); + createTarGz(reportsDirPath, tgzPath); + LOG.info("Reports archived to {}", tgzPath); + } + } + } finally { + if (usesTempDb && tempDbDir != null) { + deleteDirectory(tempDbDir); + } + } } private static String getJdbcConnectionString(String dbPath) { @@ -205,6 +242,44 @@ public class ExtractComparerRunner { return new MimeBuffer(jdbcUtil.getConnection(), builder.getMimeTable(), MimeTypes.getDefaultMimeTypes()); } + private static void deleteDirectory(Path dir) throws IOException { + if (!Files.exists(dir)) { + return; + } + try (Stream<Path> walk = Files.walk(dir)) { + walk.sorted(java.util.Comparator.reverseOrder()) + .forEach(p -> { + try { + Files.deleteIfExists(p); + } catch 
(IOException e) { + LOG.warn("Failed to delete {}", p, e); + } + }); + } + } + + private static void createTarGz(Path sourceDir, Path output) throws IOException { + try (OutputStream fos = Files.newOutputStream(output); + GzipCompressorOutputStream gzo = new GzipCompressorOutputStream(fos); + TarArchiveOutputStream tar = new TarArchiveOutputStream(gzo)) { + tar.setLongFileMode(TarArchiveOutputStream.LONGFILE_POSIX); + try (Stream<Path> walk = Files.walk(sourceDir)) { + walk.filter(Files::isRegularFile).forEach(file -> { + try { + String entryName = sourceDir.getFileName() + .resolve(sourceDir.relativize(file)).toString(); + TarArchiveEntry entry = new TarArchiveEntry(file, entryName); + tar.putArchiveEntry(entry); + Files.copy(file, tar); + tar.closeArchiveEntry(); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + } + } + private static void USAGE() throws IOException { HelpFormatter helpFormatter = HelpFormatter.builder().get(); helpFormatter.printHelp("java -jar tika-eval-app-x.y.z.jar FileProfiler -e docs -d mydb [-i inputDir, -c config.json]", diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java index ce32d78f4b..38ace5239e 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java @@ -50,6 +50,27 @@ public class TikaEvalCLI { private void execute(String[] args) throws Exception { String tool = args[0]; + // If the first arg looks like a flag, infer the tool from the args + if (tool.startsWith("-")) { + String inferred = inferTool(args); + if (inferred != null) { + tool = inferred; + // Don't strip the first arg — all args are flags for the tool + switch (tool) { + case "Compare": + handleCompare(args); + return; + case "Profile": + handleProfile(args); + return; + case "Report": + 
handleReport(args); + return; + } + } + System.err.println(specifyTools()); + return; + } String[] subsetArgs = new String[args.length - 1]; System.arraycopy(args, 1, subsetArgs, 0, args.length - 1); switch (tool) { @@ -72,6 +93,43 @@ public class TikaEvalCLI { } } + private static String inferTool(String[] args) { + boolean hasA = false; + boolean hasB = false; + boolean hasE = false; + boolean hasDb = false; + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "-a": + case "--extractsA": + hasA = true; + break; + case "-b": + case "--extractsB": + hasB = true; + break; + case "-e": + case "--extracts": + hasE = true; + break; + case "-d": + case "--db": + hasDb = true; + break; + } + } + if (hasA && hasB) { + return "Compare"; + } + if (hasE) { + return "Profile"; + } + if (hasDb && !hasA && !hasB && !hasE) { + return "Report"; + } + return null; + } + private void handleStartDB(String[] args) throws SQLException { List<String> argList = new ArrayList<>(); argList.add("-web"); diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/MarkdownSummaryWriter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/MarkdownSummaryWriter.java new file mode 100644 index 0000000000..0681e57887 --- /dev/null +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/MarkdownSummaryWriter.java @@ -0,0 +1,611 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.eval.app.reports; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.Connection; +import java.sql.DatabaseMetaData; +import java.sql.ResultSet; +import java.sql.ResultSetMetaData; +import java.sql.SQLException; +import java.sql.Statement; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Writes a markdown summary of a tika-eval comparison run. + * <p> + * This is designed to be read by both humans and LLMs for fast + * regression detection. It queries the same temp tables that the + * xlsx report pipeline creates (exceptions_compared, + * token_counts_compared, parse_time_compared) so it must be called + * after the "before" SQL has executed. 
+ */ +public class MarkdownSummaryWriter { + + private static final Logger LOG = LoggerFactory.getLogger(MarkdownSummaryWriter.class); + + private static final int TOP_N = 20; + + public static void write(Connection c, Path reportsDir) throws IOException, SQLException { + if (!isComparisonDb(c)) { + LOG.info("Not a comparison database; skipping markdown summary."); + return; + } + Path summaryPath = reportsDir.resolve("summary.md"); + Files.createDirectories(reportsDir); + + try (BufferedWriter w = Files.newBufferedWriter(summaryPath)) { + w.write("# Tika Eval Comparison Summary\n\n"); + + writeOverview(c, w); + writeExtractExceptionSummary(c, w); + writeExceptionSummary(c, w); + writeContentQualitySummary(c, w); + writeOovComparison(c, w); + writeLanguageChanges(c, w); + writeContentLengthRatio(c, w); + writeEmbeddedCountChanges(c, w); + writeTokenCountSummary(c, w); + writeParseTimeSummary(c, w); + writeMimeChanges(c, w); + writeTopRegressions(c, w); + writeTopImprovements(c, w); + writeContentLost(c, w); + writeContentGained(c, w); + writeMissingExtracts(c, w); + } + LOG.info("Wrote markdown summary to {}", summaryPath); + } + + private static void writeOverview(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Overview\n\n"); + + try (Statement st = c.createStatement()) { + try (ResultSet rs = st.executeQuery( + "select dir_name_a, dir_name_b from pair_names")) { + if (rs.next()) { + w.write("- **A**: " + rs.getString(1) + "\n"); + w.write("- **B**: " + rs.getString(2) + "\n"); + } + } + + writeScalar(st, w, "- **Total containers**: ", + "select count(1) from containers"); + writeScalar(st, w, "- **Total files (A)**: ", + "select count(1) from profiles_a"); + writeScalar(st, w, "- **Total files (B)**: ", + "select count(1) from profiles_b"); + writeScalar(st, w, "- **Exceptions (A)**: ", + "select count(1) from exceptions_a"); + writeScalar(st, w, "- **Exceptions (B)**: ", + "select count(1) from exceptions_b"); + } + 
w.write("\n"); + } + + private static void writeExtractExceptionSummary(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Extract File Issues\n\n"); + w.write("Problems reading extract files (before parsing). " + + "Includes missing files, zero-byte files, oversized files, and bad JSON.\n\n"); + + w.write("### Extract A\n\n"); + writeQueryAsTable(c, w, + "select r.extract_exception_description as TYPE, count(1) as COUNT " + + "from extract_exceptions_a ee " + + "join ref_extract_exception_types r " + + " on r.extract_exception_id = ee.extract_exception_id " + + "group by r.extract_exception_description " + + "order by COUNT desc"); + + w.write("\n### Extract B\n\n"); + writeQueryAsTable(c, w, + "select r.extract_exception_description as TYPE, count(1) as COUNT " + + "from extract_exceptions_b ee " + + "join ref_extract_exception_types r " + + " on r.extract_exception_id = ee.extract_exception_id " + + "group by r.extract_exception_description " + + "order by COUNT desc"); + w.write("\n"); + } + + private static void writeExceptionSummary(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Exception Changes by Mime Type\n\n"); + w.write("Mime types with >100 files where exception rate changed by >5%.\n\n"); + + writeQueryAsTable(c, w, + "select ma.mime_string as MIME_A, mb.mime_string as MIME_B, " + + "ec.total as TOTAL, " + + "ec.exc_cnt_a as EXC_A, ec.exc_cnt_b as EXC_B, " + + "round(ec.exc_prcnt_a * 100, 1) as EXC_PCT_A, " + + "round(ec.exc_prcnt_b * 100, 1) as EXC_PCT_B, " + + "ec.notes as FLAG " + + "from exceptions_compared ec " + + "join mimes ma on ma.mime_id = ec.mime_id_a " + + "join mimes mb on mb.mime_id = ec.mime_id_b " + + "where ec.total > 100 " + + "and abs(ec.exc_prcnt_a - ec.exc_prcnt_b) > 0.05 " + + "order by abs(ec.exc_prcnt_a - ec.exc_prcnt_b) desc"); + + w.write("\n### New Exception Types in B\n\n"); + writeQueryAsTable(c, w, + "select ma.mime_string as MIME_A, mb.mime_string as 
MIME_B, " + + "count(1) as COUNT " + + "from exceptions_b eb " + + "left join exceptions_a ea on ea.id = eb.id " + + "join profiles_a pa on pa.id = eb.id " + + "join profiles_b pb on pb.id = eb.id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "join mimes mb on mb.mime_id = pb.mime_id " + + "where ea.id is null and eb.parse_exception_id = 0 " + + "group by ma.mime_string, mb.mime_string " + + "order by COUNT desc " + + "limit " + TOP_N); + + w.write("\n### Fixed Exceptions in B\n\n"); + writeQueryAsTable(c, w, + "select ma.mime_string as MIME_A, mb.mime_string as MIME_B, " + + "count(1) as COUNT " + + "from exceptions_a ea " + + "left join exceptions_b eb on ea.id = eb.id " + + "join profiles_a pa on pa.id = ea.id " + + "join profiles_b pb on pb.id = pa.id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "join mimes mb on mb.mime_id = pb.mime_id " + + "where eb.id is null and ea.parse_exception_id = 0 " + + "group by ma.mime_string, mb.mime_string " + + "order by COUNT desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeContentQualitySummary(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Content Quality (Dice Coefficient) by Mime Type\n\n"); + w.write("Mean and median dice coefficient per mime type (higher = more similar).\n\n"); + + writeQueryAsTable(c, w, + "select ma.mime_string as MIME_A, mb.mime_string as MIME_B, " + + "count(1) as FILES, " + + "round(avg(cc.dice_coefficient), 4) as MEAN_DICE, " + + "round(median(cc.dice_coefficient), 4) as MEDIAN_DICE, " + + "round(min(cc.dice_coefficient), 4) as MIN_DICE " + + "from content_comparisons cc " + + "join profiles_a pa on cc.id = pa.id " + + "join profiles_b pb on cc.id = pb.id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "join mimes mb on mb.mime_id = pb.mime_id " + + "group by ma.mime_string, mb.mime_string " + + "having count(1) > 5 " + + "order by MEAN_DICE asc"); + w.write("\n"); + } + + private static void 
writeTokenCountSummary(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Token Count Changes by Mime Type\n\n"); + + writeQueryAsTable(c, w, + "select ma.mime_string as MIME_A, mb.mime_string as MIME_B, " + + "tcc.num_tokens_a as TOKENS_A, tcc.num_tokens_b as TOKENS_B, " + + "case when tcc.num_tokens_a > 0 " + + " then round(100.0 * (tcc.num_tokens_b - tcc.num_tokens_a) / tcc.num_tokens_a, 1) " + + " else null end as PCT_CHANGE, " + + "tcc.num_common_tokens_a as COMMON_A, tcc.num_common_tokens_b as COMMON_B " + + "from token_counts_compared tcc " + + "join mimes ma on ma.mime_id = tcc.mime_id_a " + + "join mimes mb on mb.mime_id = tcc.mime_id_b " + + "order by abs(tcc.num_tokens_a - tcc.num_tokens_b) desc"); + w.write("\n"); + } + + private static void writeParseTimeSummary(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Parse Time Changes by Mime Type\n\n"); + + writeQueryAsTable(c, w, + "select ma.mime_string as MIME_A, mb.mime_string as MIME_B, " + + "ptc.total_a as MS_A, ptc.total_b as MS_B, " + + "round(ptc.prcnt_increase, 1) as B_AS_PCT_OF_A " + + "from parse_time_compared ptc " + + "join mimes ma on ma.mime_id = ptc.mime_id_a " + + "join mimes mb on mb.mime_id = ptc.mime_id_b " + + "where ptc.total_a > 0 " + + "order by ptc.prcnt_increase desc"); + + w.write("\n### Parse Time Outliers (individual files, B > 10x A, A >= 1s)\n\n"); + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + "pa.elapsed_time_millis as MS_A, " + + "pb.elapsed_time_millis as MS_B, " + + "round(cast(pb.elapsed_time_millis as double) / " + + " cast(pa.elapsed_time_millis as double), 1) as RATIO " + + "from profiles_a pa " + + "join profiles_b pb on pa.id = pb.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "where pa.is_embedded = false " + + "and pa.elapsed_time_millis >= 1000 " + + "and pb.elapsed_time_millis 
> pa.elapsed_time_millis * 10 " + + "order by RATIO desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeMimeChanges(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Mime Type Changes (A -> B)\n\n"); + + writeQueryAsTable(c, w, + "select concat(ma.mime_string, ' -> ', mb.mime_string) as CHANGE, " + + "count(1) as COUNT " + + "from profiles_a a " + + "join profiles_b b on a.id = b.id " + + "join mimes ma on ma.mime_id = a.mime_id " + + "join mimes mb on mb.mime_id = b.mime_id " + + "where a.mime_id <> b.mime_id " + + "group by CHANGE " + + "order by COUNT desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeTopRegressions(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Top " + TOP_N + " Content Regressions (lowest dice)\n\n"); + w.write("Files where content changed the most (excluding perfect matches).\n\n"); + + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + "round(cc.dice_coefficient, 4) as DICE, " + + "round(cc.overlap, 4) as OVERLAP, " + + "cc.top_10_unique_token_diffs_a as ONLY_IN_A, " + + "cc.top_10_unique_token_diffs_b as ONLY_IN_B " + + "from content_comparisons cc " + + "join profiles_a pa on cc.id = pa.id " + + "join profiles_b pb on cc.id = pb.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "where cc.dice_coefficient < 1.0 " + + "and pa.is_embedded = false " + + "order by cc.dice_coefficient asc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeTopImprovements(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Top " + TOP_N + " Fixed Exceptions in B (with content gained)\n\n"); + + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + "cb.num_tokens as TOKENS_B, " + + "cb.num_common_tokens as COMMON_TOKENS_B, " + + 
"cb.lang_id_1 as LANG_B " + + "from exceptions_a ea " + + "left join exceptions_b eb on ea.id = eb.id " + + "join profiles_a pa on pa.id = ea.id " + + "join profiles_b pb on pb.id = pa.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "left join contents_b cb on cb.id = ea.id " + + "where eb.id is null and ea.parse_exception_id = 0 " + + "and pa.is_embedded = false " + + "order by cb.num_common_tokens desc nulls last " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeContentLost(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Content Lost (had content in A, empty/missing in B)\n\n"); + + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + "ca.num_tokens as TOKENS_A, " + + "ca.num_common_tokens as COMMON_A, " + + "coalesce(cb.num_tokens, 0) as TOKENS_B " + + "from contents_a ca " + + "join profiles_a pa on ca.id = pa.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "left join contents_b cb on ca.id = cb.id " + + "where ca.num_tokens > 10 " + + "and coalesce(cb.num_tokens, 0) = 0 " + + "and pa.is_embedded = false " + + "order by ca.num_tokens desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeContentGained(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Content Gained (empty in A, has content in B)\n\n"); + + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "mb.mime_string as MIME_B, " + + "coalesce(ca.num_tokens, 0) as TOKENS_A, " + + "cb.num_tokens as TOKENS_B, " + + "cb.num_common_tokens as COMMON_B " + + "from contents_b cb " + + "join profiles_b pb on cb.id = pb.id " + + "join profiles_a pa on cb.id = pa.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes mb on mb.mime_id = pb.mime_id " + + "left join contents_a ca on cb.id 
= ca.id " + + "where cb.num_tokens > 10 " + + "and coalesce(ca.num_tokens, 0) = 0 " + + "and pa.is_embedded = false " + + "order by cb.num_tokens desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeEmbeddedCountChanges(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Embedded Document Count Changes\n\n"); + w.write("Files where the number of embedded documents changed significantly.\n\n"); + + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + "pa.num_attachments as EMBEDDED_A, " + + "pb.num_attachments as EMBEDDED_B, " + + "(pb.num_attachments - pa.num_attachments) as DELTA " + + "from profiles_a pa " + + "join profiles_b pb on pa.id = pb.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "where pa.is_embedded = false " + + "and pa.num_attachments <> pb.num_attachments " + + "and (pa.num_attachments > 0 or pb.num_attachments > 0) " + + "order by abs(pb.num_attachments - pa.num_attachments) desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeContentLengthRatio(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Content Length Ratio Outliers\n\n"); + w.write("Files where content length changed by more than 2x " + + "(possible repeated text or truncation).\n\n"); + + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + "ca.content_length as LEN_A, " + + "cb.content_length as LEN_B, " + + "round(cast(cb.content_length as double) / " + + " cast(ca.content_length as double), 2) as RATIO_B_TO_A " + + "from contents_a ca " + + "join contents_b cb on ca.id = cb.id " + + "join profiles_a pa on ca.id = pa.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "where pa.is_embedded = false " + + "and ca.content_length > 100 " + + "and 
cb.content_length > 100 " + + "and (cast(cb.content_length as double) / " + + " cast(ca.content_length as double) > 2.0 " + + " or cast(cb.content_length as double) / " + + " cast(ca.content_length as double) < 0.5) " + + "order by abs(cast(cb.content_length as double) / " + + " cast(ca.content_length as double) - 1.0) desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeLanguageChanges(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Language Detection Changes\n\n"); + w.write("Files where the detected language changed between A and B.\n\n"); + + w.write("### By Language Pair (aggregate)\n\n"); + writeQueryAsTable(c, w, + "select ca.lang_id_1 as LANG_A, cb.lang_id_1 as LANG_B, " + + "count(1) as COUNT " + + "from contents_a ca " + + "join contents_b cb on ca.id = cb.id " + + "join profiles_a pa on ca.id = pa.id " + + "where pa.is_embedded = false " + + "and ca.lang_id_1 is not null " + + "and cb.lang_id_1 is not null " + + "and ca.lang_id_1 <> cb.lang_id_1 " + + "group by ca.lang_id_1, cb.lang_id_1 " + + "order by COUNT desc " + + "limit " + TOP_N); + + w.write("\n### Top " + TOP_N + " Individual Files\n\n"); + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + "ca.lang_id_1 as LANG_A, " + + "round(ca.lang_id_prob_1, 3) as PROB_A, " + + "cb.lang_id_1 as LANG_B, " + + "round(cb.lang_id_prob_1, 3) as PROB_B " + + "from contents_a ca " + + "join contents_b cb on ca.id = cb.id " + + "join profiles_a pa on ca.id = pa.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "where pa.is_embedded = false " + + "and ca.lang_id_1 is not null " + + "and cb.lang_id_1 is not null " + + "and ca.lang_id_1 <> cb.lang_id_1 " + + "order by ca.num_tokens desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeOovComparison(Connection c, BufferedWriter w) + throws IOException, SQLException { + 
w.write("## Out-of-Vocabulary (OOV) Rate Changes\n\n"); + w.write("Files where OOV rate increased significantly in B " + + "(possible mojibake or encoding regression).\n\n"); + + w.write("### By Mime Type (aggregate)\n\n"); + writeQueryAsTable(c, w, + "select ma.mime_string as MIME_A, " + + "count(1) as FILES, " + + "round(avg(ca.oov), 4) as MEAN_OOV_A, " + + "round(avg(cb.oov), 4) as MEAN_OOV_B, " + + "round(avg(cb.oov) - avg(ca.oov), 4) as OOV_DELTA " + + "from contents_a ca " + + "join contents_b cb on ca.id = cb.id " + + "join profiles_a pa on ca.id = pa.id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "where pa.is_embedded = false " + + "and ca.oov is not null and cb.oov is not null " + + "group by ma.mime_string " + + "having count(1) > 5 " + + "order by OOV_DELTA desc"); + + w.write("\n### Top " + TOP_N + " Individual OOV Increases\n\n"); + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + "round(ca.oov, 4) as OOV_A, " + + "round(cb.oov, 4) as OOV_B, " + + "round(cb.oov - ca.oov, 4) as OOV_DELTA, " + + "ca.lang_id_1 as LANG_A, " + + "cb.lang_id_1 as LANG_B " + + "from contents_a ca " + + "join contents_b cb on ca.id = cb.id " + + "join profiles_a pa on ca.id = pa.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "where pa.is_embedded = false " + + "and ca.oov is not null and cb.oov is not null " + + "and ca.num_tokens > 10 " + + "and (cb.oov - ca.oov) > 0.1 " + + "order by (cb.oov - ca.oov) desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static void writeMissingExtracts(Connection c, BufferedWriter w) + throws IOException, SQLException { + w.write("## Missing Extracts\n\n"); + w.write("Files where A had an extract file but B did not (or vice versa).\n\n"); + + w.write("### Had extract in A, missing in B\n\n"); + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + 
"c.extract_file_length_a as EXTRACT_LEN_A, " + + "c.extract_file_length_b as EXTRACT_LEN_B " + + "from containers c " + + "join profiles_a pa on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "where pa.is_embedded = false " + + "and c.extract_file_length_a > 0 " + + "and (c.extract_file_length_b is null or c.extract_file_length_b = 0) " + + "order by c.extract_file_length_a desc " + + "limit " + TOP_N); + + w.write("\n### Had extract in B, missing in A\n\n"); + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "mb.mime_string as MIME_B, " + + "c.extract_file_length_a as EXTRACT_LEN_A, " + + "c.extract_file_length_b as EXTRACT_LEN_B " + + "from containers c " + + "join profiles_b pb on pb.container_id = c.container_id " + + "join mimes mb on mb.mime_id = pb.mime_id " + + "where pb.is_embedded = false " + + "and c.extract_file_length_b > 0 " + + "and (c.extract_file_length_a is null or c.extract_file_length_a = 0) " + + "order by c.extract_file_length_b desc " + + "limit " + TOP_N); + w.write("\n"); + } + + private static boolean isComparisonDb(Connection c) throws SQLException { + DatabaseMetaData md = c.getMetaData(); + try (ResultSet rs = md.getTables(null, null, "%", null)) { + while (rs.next()) { + if ("CONTENT_COMPARISONS".equalsIgnoreCase(rs.getString(3))) { + return true; + } + } + } + return false; + } + + private static void writeScalar(Statement st, BufferedWriter w, + String prefix, String sql) + throws IOException, SQLException { + try (ResultSet rs = st.executeQuery(sql)) { + if (rs.next()) { + w.write(prefix + rs.getString(1) + "\n"); + } + } + } + + private static void writeQueryAsTable(Connection c, BufferedWriter w, + String sql) + throws IOException, SQLException { + try (Statement st = c.createStatement(); + ResultSet rs = st.executeQuery(sql)) { + ResultSetMetaData meta = rs.getMetaData(); + int cols = meta.getColumnCount(); + + if (!rs.isBeforeFirst()) { + w.write("_No data._\n\n"); + return; + 
} + + // Header + w.write("|"); + for (int i = 1; i <= cols; i++) { + w.write(" " + meta.getColumnLabel(i) + " |"); + } + w.write("\n|"); + for (int i = 1; i <= cols; i++) { + w.write(" --- |"); + } + w.write("\n"); + + // Rows + while (rs.next()) { + w.write("|"); + for (int i = 1; i <= cols; i++) { + String val = rs.getString(i); + if (val == null) { + val = ""; + } + // Escape pipes in values + val = val.replace("|", "\\|"); + w.write(" " + val + " |"); + } + w.write("\n"); + } + } + } +} diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java index bf83593023..d9eed292d7 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java @@ -65,8 +65,8 @@ public class ResultsReporter { "directory for the reports. " + "If not specified, will write to 'reports'" + "BEWARE: Will overwrite existing reports without warning!") .addOption("rf", "reportsFile", true, "xml specifying sql to call for the reports." + "If not specified, will use default reports in resources/tika-eval-*-config.xml") - .addOption("db", true, "default database (in memory H2). Specify a file name for the H2 database.") - .addOption("jdbc", true, "EXPERT: full jdbc connection string. Specify this or use -db <h2db_name>") + .addOption("d", "db", true, "default database (in memory H2). Specify a file name for the H2 database.") + .addOption("jdbc", true, "EXPERT: full jdbc connection string. 
Specify this or use -d <h2db_name>") .addOption("jdbcdriver", true, "EXPERT: specify the jdbc driver class if all else fails") .addOption("tablePrefix", true, "EXPERT: if not using the default tables, specify your table name prefix"); @@ -78,7 +78,7 @@ public class ResultsReporter { public static void USAGE() throws IOException { HelpFormatter helpFormatter = HelpFormatter.builder().get(); - helpFormatter.printHelp("java -jar tika-eval-x.y.jar Report -db mydb [-rd myreports] [-rf myreports.xml]", "Tool: Report", ResultsReporter.OPTIONS, + helpFormatter.printHelp("java -jar tika-eval-x.y.jar Report -d mydb [-rd myreports] [-rf myreports.xml]", "Tool: Report", ResultsReporter.OPTIONS, "Note: for h2 db, do not include the .mv.db at the end of the db name.", true); } @@ -323,6 +323,7 @@ public class ResultsReporter { for (Report r : reports) { r.writeReport(c, reportsDirectory); } + MarkdownSummaryWriter.write(c, reportsDirectory); for (String sql : after) { LOG.info("processing 'after': {}", sql); long start = System.currentTimeMillis(); diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java index 5d0fd93d18..6fa2ec6d6d 100644 --- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java +++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java @@ -206,15 +206,20 @@ public class TikaEvalCLITest extends TikaTest { CachingFileVisitor v = new CachingFileVisitor(); Files.walkFileTree(compareReportsDir, v); int cnt = 0; + boolean hasSummaryMd = false; for (Path report : v.getPaths()) { - if (report - .getFileName() - .toString() - .endsWith(".xlsx")) { + String name = report.getFileName().toString(); + if (name.endsWith(".xlsx")) { cnt++; } + if ("summary.md".equals(name)) { + hasSummaryMd = true; + assertTrue(Files.size(report) > 100, + "summary.md should not be empty"); + } } 
assertTrue(cnt > 33); + assertTrue(hasSummaryMd, "summary.md should be generated for comparison reports"); // If there is a failure, check for SQL errors in the previous log. // If it is a syntax error, for the position look for "[*]" in the exception message. // The "[42001-230]" is [<error number>-<build number>]. diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java index 21f413087f..03d6b53cc0 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java @@ -40,7 +40,7 @@ public class JsonMetadataList { private static final StreamReadConstraints DEFAULT_CONSTRAINTS = StreamReadConstraints .builder() .maxNestingDepth(10) - .maxStringLength(20_000_000) + .maxStringLength(200_000_000) .maxNumberLength(500) .build();
