This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4731-common-script in repository https://gitbox.apache.org/repos/asf/tika.git
commit 2c18dd8109d854bea6e180fff8af6fca7647d20e Author: tallison <[email protected]> AuthorDate: Thu May 21 14:30:37 2026 -0400 TIKA-4731 - checkpoint --- .../integration-testing/tika-eval-regression.adoc | 52 ++++++++++++++++++++++ .../ml/chardetect/MojibusterEncodingDetector.java | 32 ++++++------- 2 files changed, 69 insertions(+), 15 deletions(-) diff --git a/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc b/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc index 8c7f5b6438..24460db449 100644 --- a/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc +++ b/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc @@ -310,3 +310,55 @@ Compare step finishes in 2-5 minutes depending on extract size. A regression analysis writeup goes in `~/Desktop/claude-todo/<reports-dir-name>-analysis.md` per the `regression` skill. + +== Querying the H2 database directly + +`tika-eval` stores everything in an *H2* database — the `-d <name>` passed to +`Compare` produces `<name>.mv.db`. The generated reports surface only pre-canned +views; for custom counts and joins, connect to the H2 file and run SQL. + +The database is created with *no username and no password* +(`H2Util` opens it with `DriverManager.getConnection(url)` and no credentials). +So the old H2 default `-user sa` *fails* with "Wrong user name or password" — use +empty credentials. Append `;IFEXISTS=TRUE` (so a typo opens nothing rather than +creating a new empty db) and `;ACCESS_MODE_DATA=r` for a read-only open. H2 is +single-writer: query only after the `Compare`/`Profile`/`Report` run has released +the file lock. Use the `h2-*.jar` shipped with `tika-eval-app`. 
+ +[source,bash] +---- +H2=path/to/h2-x.y.z.jar +# note: omit the .mv.db suffix in the URL +DB='jdbc:h2:path/to/eval-db;IFEXISTS=TRUE;ACCESS_MODE_DATA=r' +java -cp "$H2" org.h2.tools.Shell -url "$DB" -user '' -password '' \ + -sql "SHOW TABLES" +---- + +Pass SQL to `-sql` as a *single line* (a multi-line argument leaves the Shell +waiting on stdin). + +Key tables: `profiles_a`/`profiles_b` (one row per extracted file: `file_name`, +`mime_id`, `length`, …), `contents_a`/`contents_b` (text profile: `oov`, +`languageness`, `num_tokens`, `lang_id_1`, …), `content_comparisons` +(`dice_coefficient`, `overlap`), `mimes`, `containers`. *A and B are paired by +`id`* — the same row `id` is the same file in both runs (this is how the built-in +reports join: `join profiles_b pb on pa.id = pb.id`). Always join on `id`. + +[source,sql] +---- +-- OOV / languageness: how many files improved vs regressed in B (fast PK join) +SELECT SUM(CASE WHEN cb.oov < ca.oov THEN 1 ELSE 0 END) AS oov_better, + SUM(CASE WHEN cb.oov > ca.oov THEN 1 ELSE 0 END) AS oov_worse +FROM contents_a ca JOIN contents_b cb ON ca.id = cb.id; + +-- net common-tokens A vs B (headline "more real text recovered" metric) +SELECT SUM(ca.num_common_tokens) AS common_a, + SUM(cb.num_common_tokens) AS common_b, + SUM(cb.num_common_tokens) - SUM(ca.num_common_tokens) AS delta +FROM contents_a ca JOIN contents_b cb ON ca.id = cb.id; + +-- distribution of detected mime_string values in the B run (any charset shows up only as part of that string) +SELECT m.mime_string, COUNT(*) n +FROM profiles_b p JOIN mimes m ON p.mime_id = m.mime_id +GROUP BY m.mime_string ORDER BY n DESC; +---- diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java index 550b4fa946..00254dcd96 100644 --- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java @@ -135,25 +135,26 @@ public class MojibusterEncodingDetector implements EncodingDetector { * to drop a high-confidence UTF-8 classification on otherwise-valid * text and fall through to {@code AutoDetectReader.detect}, which * raises {@code TikaException} when the chain returns no candidates. - * 0.5% (1 byte per 200) accommodates "tiny corruption" while still - * rejecting genuinely-non-UTF-8 streams (which would have many more - * malformed bytes). + * Per-byte error rate governing LONG probes (the absolute cap below is a + * floor for short ones). 0.01% (~1 malformed sequence per 10 KB) + * accommodates real UTF-8 with a few stray/corrupt bytes (e.g. a 150 KB + * page with 4 errors = 0.003%) while still rejecting a win-1252 page + * misread as UTF-8 (a 20 KB Western page surfaces ~14 invalid sequences = + * 0.07%, 7× over). * * <p>TACTICAL: remove or revisit when Mojibuster's UTF-8 grammar * check is replaced with a probabilistic decoder that returns a * confidence score directly.</p> */ - private static final double UTF8_MALFORMED_TOLERANCE = 0.005; + private static final double UTF8_MALFORMED_TOLERANCE = 0.0001; /** - * Absolute cap on UTF-8 error events tolerated alongside - * {@link #UTF8_MALFORMED_TOLERANCE}. Tolerance fires only when - * BOTH the rate AND the absolute count are within bounds — a - * 20 KB French win-1252 probe with 14 invalid UTF-8 sequences - * has a 0.07% error rate (under the 0.5% rate cap) but 14 - * scattered errors is decisively "not UTF-8". Cap of 1 matches - * the original comment intent ("a single bad continuation byte - * in 2KB of CJK is nearly always corruption"). 
+ * Absolute floor on tolerated UTF-8 error events for SHORT probes, where a + * rate is meaningless (a 20-byte string with 1 bad byte is 5%). The + * effective cap is {@code max(this, probeLen * UTF8_MALFORMED_TOLERANCE)} — + * so short probes allow 1, long probes are governed by the rate. (Earlier + * this was a hard cap applied at all lengths, which wrongly rejected long, + * genuinely-UTF-8 pages carrying a couple of stray bytes.) */ private static final int UTF8_MAX_TOLERATED_ERRORS = 1; @@ -309,9 +310,10 @@ public class MojibusterEncodingDetector implements EncodingDetector { boolean utf8Tolerated = false; if (utf8 == StructuralEncodingRules.Utf8Result.NOT_UTF8) { int errors = StructuralEncodingRules.countUtf8Errors(probe); - if (errors > 0 - && errors <= UTF8_MAX_TOLERATED_ERRORS - && (double) errors / probe.length <= UTF8_MALFORMED_TOLERANCE) { + // Length-aware: absolute floor for short probes, rate for long ones. + int maxTolerated = Math.max(UTF8_MAX_TOLERATED_ERRORS, + (int) (probe.length * UTF8_MALFORMED_TOLERANCE)); + if (errors > 0 && errors <= maxTolerated) { utf8Tolerated = true; LOG.trace("mojibuster utf8 NOT_UTF8 tolerated: {} error events in {}B ({}%)", errors, probe.length,
