This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 333d281bfa small updates to tika-eval (#2881)
333d281bfa is described below
commit 333d281bfaf42dc63a2d42e922d6eca3485d6940
Author: Tim Allison <[email protected]>
AuthorDate: Tue Jun 9 12:17:20 2026 -0400
small updates to tika-eval (#2881)
---
.skills/tika-eval-compare.md | 3 +-
.../integration-testing/tika-eval-regression.adoc | 11 +-
.../tika/eval/app/ExtractComparerRunner.java | 60 +++++-
.../src/main/resources/comparison-reports-pg.xml | 202 +++++++++++++++++++++
.../src/main/resources/comparison-reports.xml | 202 +++++++++++++++++++++
5 files changed, 472 insertions(+), 6 deletions(-)
diff --git a/.skills/tika-eval-compare.md b/.skills/tika-eval-compare.md
index d4549d26c8..cdb581e386 100644
--- a/.skills/tika-eval-compare.md
+++ b/.skills/tika-eval-compare.md
@@ -88,8 +88,9 @@ java -jar <tika-eval>/tika-eval-app-*.jar Compare \
| `-a` | Directory of "before" extracts (required) |
| `-b` | Directory of "after" extracts (required) |
| `-d` | H2 database path (temp file if omitted) |
-| `-r` | Auto-run Report + tar.gz after Compare |
+| `-r` | Auto-run Report + tgz the reports dir (`<reportsDir>.tgz`) after Compare |
| `-rd` | Reports output directory (default: `reports`) |
+| `-z` | Gzip the H2 db (`<db>.mv.db.gz`) after Compare for transfer; requires `-d` (no-op + warning for a temp db). Combine with `-r` to package both. |
| `-n` | Number of worker threads |
## Step 3 — Review Results
diff --git a/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc b/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc
index 61a8981633..deeed7bee3 100644
--- a/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc
+++ b/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc
@@ -191,7 +191,8 @@ java -jar tika-eval/tika-eval-app/target/tika-eval-app-{tika-version}.jar \
The `Compare` subcommand keyword is optional — the CLI infers it from
the `-a` / `-b` flags. The `-r` flag both runs the Report stage and
-zips the resulting reports directory for easy archiving.
+creates a `.tgz` archive of the resulting reports directory
+(`<reportsDir>.tgz`) for easy archiving.
Options:
@@ -203,9 +204,13 @@ Options:
tika-eval will create `\{label}.mv.db` and a `\{label}-reports/` dir
alongside. Persist the db if you want to re-run Report later.
* `-r` / `--report` — automatically run the Report step after Compare,
- and zip the reports directory.
+ and tgz the reports directory to `<reportsDir>.tgz`.
* `-rd` / `--reportsDir` — explicit reports directory (overrides the
default derived from `-d`).
+* `-z` / `--gzip` — gzip the H2 db file to `<db>.mv.db.gz` after Compare
+ so it can be transferred. Requires `-d` (no-op with a warning for a
+ temp db or a non-file jdbc connection (e.g. `jdbc:h2:mem:`, tcp)). Combine with `-r` to package
+ both the reports and the db.
* `-n` / `--numWorkers` — comparison worker count.
* `-c` / `--config` — optional tika-eval JSON config.
@@ -296,7 +301,7 @@ java -jar tika-eval-app-{tika-version}.jar \
-b ~/data/extracts/cc-html-sample-B
----
+
-Produces `cc-html-29k-A-vs-B-reports/` plus a `.tar.gz` of the same
+Produces `cc-html-29k-A-vs-B-reports/` plus a `.tgz` of the same
alongside `cc-html-29k-A-vs-B.mv.db`.
For a 29 K-file HTML sample on a typical workstation (8 forked workers,
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java
index 3ffcc5a31d..9b48f06bf7 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java
@@ -92,6 +92,7 @@ public class ExtractComparerRunner {
.addOption(Option.builder("m").longOpt("maxExtractLength").hasArg().desc("maximum
extract length").get())
.addOption(Option.builder("r").longOpt("report").desc("automatically run Report
and tgz after Compare").get())
.addOption(Option.builder("rd").longOpt("reportsDir").hasArg().desc("directory
for reports (default: 'reports')").get())
+ .addOption(Option.builder("z").longOpt("gzip").desc("gzip the
H2 db file (<db>.mv.db.gz) after Compare for transfer; requires -d").get())
;
}
@@ -128,15 +129,23 @@ public class ExtractComparerRunner {
if (commandLine.hasOption('r')) {
String reportsDir = commandLine.getOptionValue("rd",
"reports");
LOG.info("Running Report...");
- ResultsReporter.main(new String[]{"-d", dbPath, "-rd",
reportsDir});
+ if (dbPath.startsWith("jdbc:")) {
+ ResultsReporter.main(new String[]{"-jdbc", dbPath, "-rd",
reportsDir});
+ } else {
+ ResultsReporter.main(new String[]{"-d", dbPath, "-rd",
reportsDir});
+ }
Path reportsDirPath = Paths.get(reportsDir);
if (Files.isDirectory(reportsDirPath)) {
- Path tgzPath = reportsDirPath.resolveSibling(reportsDir +
".tar.gz");
+ Path tgzPath =
reportsDirPath.resolveSibling(reportsDirPath.getFileName() + ".tgz");
LOG.info("Creating {}", tgzPath);
createTarGz(reportsDirPath, tgzPath);
LOG.info("Reports archived to {}", tgzPath);
}
}
+
+ if (commandLine.hasOption('z')) {
+ gzipDb(dbPath, usesTempDb);
+ }
} finally {
if (usesTempDb && tempDbDir != null) {
deleteDirectory(tempDbDir);
@@ -263,6 +272,53 @@ public class ExtractComparerRunner {
}
}
+ /**
+ * Gzip the H2 db file (<dbPath>.mv.db -> <dbPath>.mv.db.gz) so it can be
+ * transferred. The db connection is already closed by {@link #execute} before
+ * this runs, so the file is unlocked. No-op (with a warning) when there is no
+ * on-disk file db to gzip: a temp db (no -d), or a non-file jdbc connection
+ * (e.g. mem/tcp). A {@code jdbc:h2:file:} URL is supported by extracting its
+ * file path.
+ */
+ private static void gzipDb(String dbPath, boolean usesTempDb) throws
IOException {
+ if (usesTempDb) {
+ LOG.warn("-z (gzip) ignored: no -d db specified, so there is no db
file to transfer");
+ return;
+ }
+ String filePath = dbPath;
+ if (dbPath.startsWith("jdbc:")) {
+ String prefix = "jdbc:h2:file:";
+ if (!dbPath.startsWith(prefix)) {
+ LOG.warn("-z (gzip) ignored: db is a non-file jdbc connection
({}), no local file to transfer",
+ dbPath);
+ return;
+ }
+ // Strip the jdbc:h2:file: prefix and any ;OPTION=... suffix to
get the file base path.
+ filePath = dbPath.substring(prefix.length());
+ int semi = filePath.indexOf(';');
+ if (semi >= 0) {
+ filePath = filePath.substring(0, semi);
+ }
+ }
+ Path dbFile = Paths.get(filePath + ".mv.db");
+ if (!Files.isRegularFile(dbFile)) {
+ LOG.warn("-z (gzip) ignored: expected db file {} not found",
dbFile);
+ return;
+ }
+ Path gzPath = dbFile.resolveSibling(dbFile.getFileName() + ".gz");
+ LOG.info("Creating {}", gzPath);
+ gzipFile(dbFile, gzPath);
+ LOG.info("Db archived to {}", gzPath);
+ }
+
+ private static void gzipFile(Path source, Path output) throws IOException {
+ try (InputStream is = Files.newInputStream(source);
+ OutputStream fos = Files.newOutputStream(output);
+ GzipCompressorOutputStream gzo = new
GzipCompressorOutputStream(fos)) {
+ is.transferTo(gzo);
+ }
+ }
+
private static void createTarGz(Path sourceDir, Path output) throws
IOException {
try (OutputStream fos = Files.newOutputStream(output);
GzipCompressorOutputStream gzo = new
GzipCompressorOutputStream(fos);
diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml b/tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml
index 0773e51dd3..1343e002bf 100644
--- a/tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml
+++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml
@@ -838,6 +838,208 @@
</report>
+ <!-- CHARSET / ENCODING DETECTION
+ Reports over the encodings_a / encodings_b tables. One row per file that ran
+ charset detection: detected_encoding (final pick), encoding_detector (winning
+ detector), declared_metadata (charset declared via Content-Type-Hint). A row
+ exists only when an encoding was detected, so counts are over text-y files and
+ joins to encodings are effectively inner. A and B are paired by id. These
+ tables are not covered by any other report. -->
+
+ <report reportName="Charset Detection Coverage"
+ reportFilename="charset/charset_coverage.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select 'paired_files' as METRIC, count(1) as CNT
+ from profiles_a pa join profiles_b pb on pa.id=pb.id
+ union all
+ select 'detected_in_a', count(1) from encodings_a
+ union all
+ select 'detected_in_b', count(1) from encodings_b
+ union all
+ select 'detected_in_both', count(1)
+ from encodings_a ea join encodings_b eb on ea.id=eb.id
+ union all
+ select 'flipped_a_to_b', count(1)
+ from encodings_a ea join encodings_b eb on ea.id=eb.id
+ where ea.detected_encoding <> eb.detected_encoding
+ </sql>
+ </report>
+
+ <report reportName="Detected Encoding Distribution A"
+ reportFilename="charset/detected_encoding_distribution_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select detected_encoding as DETECTED_ENCODING, count(1) as COUNT
+ from encodings_a
+ group by detected_encoding
+ order by COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="Detected Encoding Distribution B"
+ reportFilename="charset/detected_encoding_distribution_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select detected_encoding as DETECTED_ENCODING, count(1) as COUNT
+ from encodings_b
+ group by detected_encoding
+ order by COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="Encoding Differences A -> B"
+ reportFilename="charset/encoding_diffs_A_to_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select concat(ea.detected_encoding, ' -> ', eb.detected_encoding) as
ENCODING_A_TO_ENCODING_B,
+ count(1) as COUNT
+ from encodings_a ea
+ join encodings_b eb on ea.id=eb.id
+ where ea.detected_encoding <> eb.detected_encoding
+ group by ENCODING_A_TO_ENCODING_B
+ order by COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="Encoding Differences A -> B Details"
+ reportFilename="charset/encoding_diffs_A_to_B_details.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select c.file_path as FILE_PATH,
+ case
+ when pb.embedded_depth > 0
+ then pb.embedded_file_path
+ else pb.file_name
+ end as FILE_NAME_B,
+ ea.detected_encoding as ENCODING_A,
+ eb.detected_encoding as ENCODING_B,
+ ea.encoding_detector as DETECTOR_A,
+ eb.encoding_detector as DETECTOR_B,
+ eb.declared_metadata as DECLARED_B,
+ mb.mime_string as MIME_B,
+ ca.oov as OOV_A, cb.oov as OOV_B,
+ cb.oov - ca.oov as OOV_DELTA_B,
+ ca.languageness as LANGUAGENESS_A, cb.languageness as LANGUAGENESS_B,
+ cb.languageness - ca.languageness as LANGUAGENESS_DELTA_B,
+ ca.num_replacement as FFFD_A, cb.num_replacement as FFFD_B,
+ ca.num_non_ascii as NON_ASCII_A, cb.num_non_ascii as NON_ASCII_B,
+ round(100.0*cb.num_replacement/nullif(cb.num_non_ascii,0),2) as
FFFD_PCT_B,
+ ca.num_common_tokens as COMMON_TOKENS_A, cb.num_common_tokens as
COMMON_TOKENS_B,
+ coalesce(cb.num_common_tokens,0)-coalesce(ca.num_common_tokens,0) as
COMMON_TOKENS_DELTA_B,
+ ca.lang_id_1 as LANG_A, cb.lang_id_1 as LANG_B
+ from encodings_a ea
+ join encodings_b eb on ea.id=eb.id
+ join profiles_b pb on pb.id=ea.id
+ join containers c on c.container_id=pb.container_id
+ left join contents_a ca on ca.id=ea.id
+ left join contents_b cb on cb.id=ea.id
+ left join mimes mb on mb.mime_id=pb.mime_id
+ where ea.detected_encoding <> eb.detected_encoding
+ order by
coalesce(cb.num_common_tokens,0)-coalesce(ca.num_common_tokens,0) asc
+ limit 100000
+ </sql>
+ </report>
+
+ <report reportName="SBCS-Western to CJK Flips A -> B"
+ reportFilename="charset/sbcs_to_cjk_flips_A_to_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select concat(ea.detected_encoding, ' -> ', eb.detected_encoding) as
ENCODING_A_TO_ENCODING_B,
+ count(1) as COUNT
+ from encodings_a ea
+ join encodings_b eb on ea.id=eb.id
+ where lower(ea.detected_encoding) in
+ ('windows-1252','iso-8859-1','iso-8859-15','iso-8859-2','iso-8859-3',
+
'windows-1250','windows-1254','windows-1257','iso-8859-13','windows-1258',
+ 'x-macroman','ibm850','ibm852')
+ and lower(eb.detected_encoding) in
+
('gb18030','gbk','gb2312','big5','big5-hkscs','shift_jis','euc-jp','euc-kr',
+
'x-euc-tw','x-windows-874','x-windows-949','iso-2022-jp','iso-2022-kr','iso-2022-cn')
+ group by ENCODING_A_TO_ENCODING_B
+ order by COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="Encoding Detector Distribution A"
+ reportFilename="charset/encoding_detector_distribution_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select encoding_detector as ENCODING_DETECTOR, count(1) as COUNT
+ from encodings_a
+ group by encoding_detector
+ order by COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="Encoding Detector Distribution B"
+ reportFilename="charset/encoding_detector_distribution_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select encoding_detector as ENCODING_DETECTOR, count(1) as COUNT
+ from encodings_b
+ group by encoding_detector
+ order by COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="Detector Quality B (by detector and encoding)"
+ reportFilename="charset/detector_quality_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select eb.encoding_detector as ENCODING_DETECTOR,
+ eb.detected_encoding as DETECTED_ENCODING,
+ count(1) as COUNT,
+ round(avg(cb.oov),4) as AVG_OOV,
+ round(avg(cb.languageness),2) as AVG_LANGUAGENESS,
+ sum(cb.num_replacement) as TOTAL_FFFD,
+ sum(cb.num_non_ascii) as TOTAL_NON_ASCII,
+ round(100.0*sum(cb.num_replacement)/nullif(sum(cb.num_non_ascii),0),2)
as FFFD_PCT
+ from encodings_b eb
+ join contents_b cb on cb.id=eb.id
+ group by eb.encoding_detector, eb.detected_encoding
+ order by AVG_OOV desc
+ </sql>
+ </report>
+
+ <report reportName="Declared vs Detected B"
+ reportFilename="charset/declared_vs_detected_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select eb.declared_metadata as DECLARED, eb.detected_encoding as
DETECTED,
+ count(1) as COUNT
+ from encodings_b eb
+ where eb.declared_metadata is not null
+ group by eb.declared_metadata, eb.detected_encoding
+ order by COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="Declared vs Detected A"
+ reportFilename="charset/declared_vs_detected_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select ea.declared_metadata as DECLARED, ea.detected_encoding as
DETECTED,
+ count(1) as COUNT
+ from encodings_a ea
+ where ea.declared_metadata is not null
+ group by ea.declared_metadata, ea.detected_encoding
+ order by COUNT desc
+ </sql>
+ </report>
+
+
<!-- Exceptions -->
<report reportName="AllExceptionsByMimeA"
reportFilename="exceptions/exceptions_by_mime_A.xlsx"
diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
index cf2aca82b4..69efc926e1 100644
--- a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
+++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
@@ -477,6 +477,208 @@
</report>
+ <!-- CHARSET / ENCODING DETECTION
+ Reports over the encodings_a / encodings_b tables. One row per file that ran
+ charset detection: detected_encoding (final pick), encoding_detector (winning
+ detector), declared_metadata (charset declared via Content-Type-Hint). A row
+ exists only when an encoding was detected, so counts are over text-y files and
+ joins to encodings are effectively inner. A and B are paired by id. These
+ tables are not covered by any other report. -->
+
+ <report reportName="Charset Detection Coverage"
+ reportFilename="charset/charset_coverage.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select 'paired_files' as METRIC, count(1) as CNT
+ from profiles_a pa join profiles_b pb on pa.id=pb.id
+ union all
+ select 'detected_in_a', count(1) from encodings_a
+ union all
+ select 'detected_in_b', count(1) from encodings_b
+ union all
+ select 'detected_in_both', count(1)
+ from encodings_a ea join encodings_b eb on ea.id=eb.id
+ union all
+ select 'flipped_a_to_b', count(1)
+ from encodings_a ea join encodings_b eb on ea.id=eb.id
+ where ea.detected_encoding <> eb.detected_encoding
+ </sql>
+ </report>
+
+ <report reportName="Detected Encoding Distribution A"
+ reportFilename="charset/detected_encoding_distribution_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select detected_encoding as DETECTED_ENCODING, count(1) as COUNT
+ from encodings_a
+ group by detected_encoding
+ order by COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="Detected Encoding Distribution B"
+ reportFilename="charset/detected_encoding_distribution_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select detected_encoding as DETECTED_ENCODING, count(1) as COUNT
+ from encodings_b
+ group by detected_encoding
+ order by COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="Encoding Differences A -> B"
+ reportFilename="charset/encoding_diffs_A_to_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select concat(ea.detected_encoding, ' -> ', eb.detected_encoding) as
ENCODING_A_TO_ENCODING_B,
+ count(1) as COUNT
+ from encodings_a ea
+ join encodings_b eb on ea.id=eb.id
+ where ea.detected_encoding <> eb.detected_encoding
+ group by ENCODING_A_TO_ENCODING_B
+ order by COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="Encoding Differences A -> B Details"
+ reportFilename="charset/encoding_diffs_A_to_B_details.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select c.file_path as FILE_PATH,
+ case
+ when pb.embedded_depth > 0
+ then pb.embedded_file_path
+ else pb.file_name
+ end as FILE_NAME_B,
+ ea.detected_encoding as ENCODING_A,
+ eb.detected_encoding as ENCODING_B,
+ ea.encoding_detector as DETECTOR_A,
+ eb.encoding_detector as DETECTOR_B,
+ eb.declared_metadata as DECLARED_B,
+ mb.mime_string as MIME_B,
+ ca.oov as OOV_A, cb.oov as OOV_B,
+ cb.oov - ca.oov as OOV_DELTA_B,
+ ca.languageness as LANGUAGENESS_A, cb.languageness as LANGUAGENESS_B,
+ cb.languageness - ca.languageness as LANGUAGENESS_DELTA_B,
+ ca.num_replacement as FFFD_A, cb.num_replacement as FFFD_B,
+ ca.num_non_ascii as NON_ASCII_A, cb.num_non_ascii as NON_ASCII_B,
+ round(100.0*cb.num_replacement/nullif(cb.num_non_ascii,0),2) as
FFFD_PCT_B,
+ ca.num_common_tokens as COMMON_TOKENS_A, cb.num_common_tokens as
COMMON_TOKENS_B,
+ ifnull(cb.num_common_tokens,0)-ifnull(ca.num_common_tokens,0) as
COMMON_TOKENS_DELTA_B,
+ ca.lang_id_1 as LANG_A, cb.lang_id_1 as LANG_B
+ from encodings_a ea
+ join encodings_b eb on ea.id=eb.id
+ join profiles_b pb on pb.id=ea.id
+ join containers c on c.container_id=pb.container_id
+ left join contents_a ca on ca.id=ea.id
+ left join contents_b cb on cb.id=ea.id
+ left join mimes mb on mb.mime_id=pb.mime_id
+ where ea.detected_encoding <> eb.detected_encoding
+ order by ifnull(cb.num_common_tokens,0)-ifnull(ca.num_common_tokens,0)
asc
+ limit 100000
+ </sql>
+ </report>
+
+ <report reportName="SBCS-Western to CJK Flips A -> B"
+ reportFilename="charset/sbcs_to_cjk_flips_A_to_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select concat(ea.detected_encoding, ' -> ', eb.detected_encoding) as
ENCODING_A_TO_ENCODING_B,
+ count(1) as COUNT
+ from encodings_a ea
+ join encodings_b eb on ea.id=eb.id
+ where lower(ea.detected_encoding) in
+ ('windows-1252','iso-8859-1','iso-8859-15','iso-8859-2','iso-8859-3',
+
'windows-1250','windows-1254','windows-1257','iso-8859-13','windows-1258',
+ 'x-macroman','ibm850','ibm852')
+ and lower(eb.detected_encoding) in
+
('gb18030','gbk','gb2312','big5','big5-hkscs','shift_jis','euc-jp','euc-kr',
+
'x-euc-tw','x-windows-874','x-windows-949','iso-2022-jp','iso-2022-kr','iso-2022-cn')
+ group by ENCODING_A_TO_ENCODING_B
+ order by COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="Encoding Detector Distribution A"
+ reportFilename="charset/encoding_detector_distribution_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select encoding_detector as ENCODING_DETECTOR, count(1) as COUNT
+ from encodings_a
+ group by encoding_detector
+ order by COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="Encoding Detector Distribution B"
+ reportFilename="charset/encoding_detector_distribution_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select encoding_detector as ENCODING_DETECTOR, count(1) as COUNT
+ from encodings_b
+ group by encoding_detector
+ order by COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="Detector Quality B (by detector and encoding)"
+ reportFilename="charset/detector_quality_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select eb.encoding_detector as ENCODING_DETECTOR,
+ eb.detected_encoding as DETECTED_ENCODING,
+ count(1) as COUNT,
+ round(avg(cb.oov),4) as AVG_OOV,
+ round(avg(cb.languageness),2) as AVG_LANGUAGENESS,
+ sum(cb.num_replacement) as TOTAL_FFFD,
+ sum(cb.num_non_ascii) as TOTAL_NON_ASCII,
+ round(100.0*sum(cb.num_replacement)/nullif(sum(cb.num_non_ascii),0),2)
as FFFD_PCT
+ from encodings_b eb
+ join contents_b cb on cb.id=eb.id
+ group by eb.encoding_detector, eb.detected_encoding
+ order by AVG_OOV desc
+ </sql>
+ </report>
+
+ <report reportName="Declared vs Detected B"
+ reportFilename="charset/declared_vs_detected_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select eb.declared_metadata as DECLARED, eb.detected_encoding as
DETECTED,
+ count(1) as COUNT
+ from encodings_b eb
+ where eb.declared_metadata is not null
+ group by eb.declared_metadata, eb.detected_encoding
+ order by COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="Declared vs Detected A"
+ reportFilename="charset/declared_vs_detected_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select ea.declared_metadata as DECLARED, ea.detected_encoding as
DETECTED,
+ count(1) as COUNT
+ from encodings_a ea
+ where ea.declared_metadata is not null
+ group by ea.declared_metadata, ea.detected_encoding
+ order by COUNT desc
+ </sql>
+ </report>
+
+
<!-- Exceptions -->
<report reportName="AllExceptionsByMimeA"
reportFilename="exceptions/exceptions_by_mime_A.xlsx"