(tika) branch main updated: small updates to tika-eval (#2881)

tallison Tue, 09 Jun 2026 09:17:33 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/main by this push:
     new 333d281bfa small updates to tika-eval (#2881)
333d281bfa is described below

commit 333d281bfaf42dc63a2d42e922d6eca3485d6940
Author: Tim Allison <[email protected]>
AuthorDate: Tue Jun 9 12:17:20 2026 -0400

    small updates to tika-eval (#2881)
---
 .skills/tika-eval-compare.md                       |   3 +-
 .../integration-testing/tika-eval-regression.adoc  |  11 +-
 .../tika/eval/app/ExtractComparerRunner.java       |  60 +++++-
 .../src/main/resources/comparison-reports-pg.xml   | 202 +++++++++++++++++++++
 .../src/main/resources/comparison-reports.xml      | 202 +++++++++++++++++++++
 5 files changed, 472 insertions(+), 6 deletions(-)

diff --git a/.skills/tika-eval-compare.md b/.skills/tika-eval-compare.md
index d4549d26c8..cdb581e386 100644
--- a/.skills/tika-eval-compare.md
+++ b/.skills/tika-eval-compare.md
@@ -88,8 +88,9 @@ java -jar <tika-eval>/tika-eval-app-*.jar Compare \
 | `-a` | Directory of "before" extracts (required) |
 | `-b` | Directory of "after" extracts (required) |
 | `-d` | H2 database path (temp file if omitted) |
-| `-r` | Auto-run Report + tar.gz after Compare |
+| `-r` | Auto-run Report + tgz the reports dir (`<reportsDir>.tgz`) after Compare |
 | `-rd` | Reports output directory (default: `reports`) |
+| `-z` | Gzip the H2 db (`<db>.mv.db.gz`) after Compare for transfer; requires `-d` (no-op + warning for a temp db). Combine with `-r` to package both. |
 | `-n` | Number of worker threads |
 
 ## Step 3 — Review Results
diff --git a/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc b/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc
index 61a8981633..deeed7bee3 100644
--- a/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc
+++ b/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc
@@ -191,7 +191,8 @@ java -jar tika-eval/tika-eval-app/target/tika-eval-app-{tika-version}.jar \
 
 The `Compare` subcommand keyword is optional — the CLI infers it from
 the `-a` / `-b` flags.  The `-r` flag both runs the Report stage and
-zips the resulting reports directory for easy archiving.
+creates a `.tgz` archive of the resulting reports directory
+(`<reportsDir>.tgz`) for easy archiving.
 
 Options:
 
@@ -203,9 +204,13 @@ Options:
   tika-eval will create `\{label}.mv.db` and a `\{label}-reports/` dir
   alongside.  Persist the db if you want to re-run Report later.
 * `-r` / `--report` — automatically run the Report step after Compare,
-  and zip the reports directory.
+  and tgz the reports directory to `<reportsDir>.tgz`.
 * `-rd` / `--reportsDir` — explicit reports directory (overrides the
   default derived from `-d`).
+* `-z` / `--gzip` — gzip the H2 db file to `<db>.mv.db.gz` after Compare
+  so it can be transferred.  Requires `-d` (no-op with a warning for a
+  temp db or a non-file jdbc connection (e.g. `jdbc:h2:mem:`, tcp)).  Combine with `-r` to package
+  both the reports and the db.
 * `-n` / `--numWorkers` — comparison worker count.
 * `-c` / `--config` — optional tika-eval JSON config.
 
@@ -296,7 +301,7 @@ java -jar tika-eval-app-{tika-version}.jar \
     -b ~/data/extracts/cc-html-sample-B
 ----
 +
-Produces `cc-html-29k-A-vs-B-reports/` plus a `.tar.gz` of the same
+Produces `cc-html-29k-A-vs-B-reports/` plus a `.tgz` of the same
 alongside `cc-html-29k-A-vs-B.mv.db`.
 
 For a 29 K-file HTML sample on a typical workstation (8 forked workers,
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java
index 3ffcc5a31d..9b48f06bf7 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java
@@ -92,6 +92,7 @@ public class ExtractComparerRunner {
                 
.addOption(Option.builder("m").longOpt("maxExtractLength").hasArg().desc("maximum
 extract length").get())
                 
.addOption(Option.builder("r").longOpt("report").desc("automatically run Report 
and tgz after Compare").get())
                 
.addOption(Option.builder("rd").longOpt("reportsDir").hasArg().desc("directory 
for reports (default: 'reports')").get())
+                .addOption(Option.builder("z").longOpt("gzip").desc("gzip the 
H2 db file (<db>.mv.db.gz) after Compare for transfer; requires -d").get())
                 ;
     }
 
@@ -128,15 +129,23 @@ public class ExtractComparerRunner {
             if (commandLine.hasOption('r')) {
                 String reportsDir = commandLine.getOptionValue("rd", 
"reports");
                 LOG.info("Running Report...");
-                ResultsReporter.main(new String[]{"-d", dbPath, "-rd", 
reportsDir});
+                if (dbPath.startsWith("jdbc:")) {
+                    ResultsReporter.main(new String[]{"-jdbc", dbPath, "-rd", 
reportsDir});
+                } else {
+                    ResultsReporter.main(new String[]{"-d", dbPath, "-rd", 
reportsDir});
+                }
                 Path reportsDirPath = Paths.get(reportsDir);
                 if (Files.isDirectory(reportsDirPath)) {
-                    Path tgzPath = reportsDirPath.resolveSibling(reportsDir + 
".tar.gz");
+                    Path tgzPath = 
reportsDirPath.resolveSibling(reportsDirPath.getFileName() + ".tgz");
                     LOG.info("Creating {}", tgzPath);
                     createTarGz(reportsDirPath, tgzPath);
                     LOG.info("Reports archived to {}", tgzPath);
                 }
             }
+
+            if (commandLine.hasOption('z')) {
+                gzipDb(dbPath, usesTempDb);
+            }
         } finally {
             if (usesTempDb && tempDbDir != null) {
                 deleteDirectory(tempDbDir);
@@ -263,6 +272,53 @@ public class ExtractComparerRunner {
         }
     }
 
+    /**
+     * Gzip the H2 db file (&lt;dbPath&gt;.mv.db -&gt; 
&lt;dbPath&gt;.mv.db.gz) so it can be
+     * transferred. The db connection is already closed by {@link #execute} 
before
+     * this runs, so the file is unlocked. No-op (with a warning) when there 
is no
+     * on-disk file db to gzip: a temp db (no -d), or a non-file jdbc 
connection
+     * (e.g. mem/tcp). A {@code jdbc:h2:file:} URL is supported by extracting 
its
+     * file path.
+     */
+    private static void gzipDb(String dbPath, boolean usesTempDb) throws 
IOException {
+        if (usesTempDb) {
+            LOG.warn("-z (gzip) ignored: no -d db specified, so there is no db 
file to transfer");
+            return;
+        }
+        String filePath = dbPath;
+        if (dbPath.startsWith("jdbc:")) {
+            String prefix = "jdbc:h2:file:";
+            if (!dbPath.startsWith(prefix)) {
+                LOG.warn("-z (gzip) ignored: db is a non-file jdbc connection 
({}), no local file to transfer",
+                        dbPath);
+                return;
+            }
+            // Strip the jdbc:h2:file: prefix and any ;OPTION=... suffix to 
get the file base path.
+            filePath = dbPath.substring(prefix.length());
+            int semi = filePath.indexOf(';');
+            if (semi >= 0) {
+                filePath = filePath.substring(0, semi);
+            }
+        }
+        Path dbFile = Paths.get(filePath + ".mv.db");
+        if (!Files.isRegularFile(dbFile)) {
+            LOG.warn("-z (gzip) ignored: expected db file {} not found", 
dbFile);
+            return;
+        }
+        Path gzPath = dbFile.resolveSibling(dbFile.getFileName() + ".gz");
+        LOG.info("Creating {}", gzPath);
+        gzipFile(dbFile, gzPath);
+        LOG.info("Db archived to {}", gzPath);
+    }
+
+    private static void gzipFile(Path source, Path output) throws IOException {
+        try (InputStream is = Files.newInputStream(source);
+             OutputStream fos = Files.newOutputStream(output);
+             GzipCompressorOutputStream gzo = new 
GzipCompressorOutputStream(fos)) {
+            is.transferTo(gzo);
+        }
+    }
+
     private static void createTarGz(Path sourceDir, Path output) throws 
IOException {
         try (OutputStream fos = Files.newOutputStream(output);
              GzipCompressorOutputStream gzo = new 
GzipCompressorOutputStream(fos);
diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml b/tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml
index 0773e51dd3..1343e002bf 100644
--- a/tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml
+++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml
@@ -838,6 +838,208 @@
   </report>
 
 
+  <!-- CHARSET / ENCODING DETECTION
+       Reports over the encodings_a / encodings_b tables.  One row per file 
that ran
+       charset detection: detected_encoding (final pick), encoding_detector 
(winning
+       detector), declared_metadata (charset declared via Content-Type-Hint).  
A row
+       exists only when an encoding was detected, so counts are over text-y 
files and
+       joins to encodings are effectively inner.  A and B are paired by id.  
These
+       tables are not covered by any other report. -->
+
+  <report reportName="Charset Detection Coverage"
+          reportFilename="charset/charset_coverage.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select 'paired_files' as METRIC, count(1) as CNT
+      from profiles_a pa join profiles_b pb on pa.id=pb.id
+      union all
+      select 'detected_in_a', count(1) from encodings_a
+      union all
+      select 'detected_in_b', count(1) from encodings_b
+      union all
+      select 'detected_in_both', count(1)
+      from encodings_a ea join encodings_b eb on ea.id=eb.id
+      union all
+      select 'flipped_a_to_b', count(1)
+      from encodings_a ea join encodings_b eb on ea.id=eb.id
+      where ea.detected_encoding &lt;&gt; eb.detected_encoding
+    </sql>
+  </report>
+
+  <report reportName="Detected Encoding Distribution A"
+          reportFilename="charset/detected_encoding_distribution_A.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select detected_encoding as DETECTED_ENCODING, count(1) as COUNT
+      from encodings_a
+      group by detected_encoding
+      order by COUNT desc
+    </sql>
+  </report>
+
+  <report reportName="Detected Encoding Distribution B"
+          reportFilename="charset/detected_encoding_distribution_B.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select detected_encoding as DETECTED_ENCODING, count(1) as COUNT
+      from encodings_b
+      group by detected_encoding
+      order by COUNT desc
+    </sql>
+  </report>
+
+  <report reportName="Encoding Differences A -> B"
+          reportFilename="charset/encoding_diffs_A_to_B.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select concat(ea.detected_encoding, ' -&gt; ', eb.detected_encoding) as 
ENCODING_A_TO_ENCODING_B,
+      count(1) as COUNT
+      from encodings_a ea
+      join encodings_b eb on ea.id=eb.id
+      where ea.detected_encoding &lt;&gt; eb.detected_encoding
+      group by ENCODING_A_TO_ENCODING_B
+      order by COUNT desc
+    </sql>
+  </report>
+
+  <report reportName="Encoding Differences A -> B Details"
+          reportFilename="charset/encoding_diffs_A_to_B_details.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select c.file_path as FILE_PATH,
+      case
+        when pb.embedded_depth &gt; 0
+        then pb.embedded_file_path
+        else pb.file_name
+      end as FILE_NAME_B,
+      ea.detected_encoding as ENCODING_A,
+      eb.detected_encoding as ENCODING_B,
+      ea.encoding_detector as DETECTOR_A,
+      eb.encoding_detector as DETECTOR_B,
+      eb.declared_metadata as DECLARED_B,
+      mb.mime_string as MIME_B,
+      ca.oov as OOV_A, cb.oov as OOV_B,
+      cb.oov - ca.oov as OOV_DELTA_B,
+      ca.languageness as LANGUAGENESS_A, cb.languageness as LANGUAGENESS_B,
+      cb.languageness - ca.languageness as LANGUAGENESS_DELTA_B,
+      ca.num_replacement as FFFD_A, cb.num_replacement as FFFD_B,
+      ca.num_non_ascii as NON_ASCII_A, cb.num_non_ascii as NON_ASCII_B,
+      round(100.0*cb.num_replacement/nullif(cb.num_non_ascii,0),2) as 
FFFD_PCT_B,
+      ca.num_common_tokens as COMMON_TOKENS_A, cb.num_common_tokens as 
COMMON_TOKENS_B,
+      coalesce(cb.num_common_tokens,0)-coalesce(ca.num_common_tokens,0) as 
COMMON_TOKENS_DELTA_B,
+      ca.lang_id_1 as LANG_A, cb.lang_id_1 as LANG_B
+      from encodings_a ea
+      join encodings_b eb on ea.id=eb.id
+      join profiles_b pb on pb.id=ea.id
+      join containers c on c.container_id=pb.container_id
+      left join contents_a ca on ca.id=ea.id
+      left join contents_b cb on cb.id=ea.id
+      left join mimes mb on mb.mime_id=pb.mime_id
+      where ea.detected_encoding &lt;&gt; eb.detected_encoding
+      order by 
coalesce(cb.num_common_tokens,0)-coalesce(ca.num_common_tokens,0) asc
+      limit 100000
+    </sql>
+  </report>
+
+  <report reportName="SBCS-Western to CJK Flips A -> B"
+          reportFilename="charset/sbcs_to_cjk_flips_A_to_B.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select concat(ea.detected_encoding, ' -&gt; ', eb.detected_encoding) as 
ENCODING_A_TO_ENCODING_B,
+      count(1) as COUNT
+      from encodings_a ea
+      join encodings_b eb on ea.id=eb.id
+      where lower(ea.detected_encoding) in
+        ('windows-1252','iso-8859-1','iso-8859-15','iso-8859-2','iso-8859-3',
+         
'windows-1250','windows-1254','windows-1257','iso-8859-13','windows-1258',
+         'x-macroman','ibm850','ibm852')
+      and lower(eb.detected_encoding) in
+        
('gb18030','gbk','gb2312','big5','big5-hkscs','shift_jis','euc-jp','euc-kr',
+         
'x-euc-tw','x-windows-874','x-windows-949','iso-2022-jp','iso-2022-kr','iso-2022-cn')
+      group by ENCODING_A_TO_ENCODING_B
+      order by COUNT desc
+    </sql>
+  </report>
+
+  <report reportName="Encoding Detector Distribution A"
+          reportFilename="charset/encoding_detector_distribution_A.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select encoding_detector as ENCODING_DETECTOR, count(1) as COUNT
+      from encodings_a
+      group by encoding_detector
+      order by COUNT desc
+    </sql>
+  </report>
+
+  <report reportName="Encoding Detector Distribution B"
+          reportFilename="charset/encoding_detector_distribution_B.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select encoding_detector as ENCODING_DETECTOR, count(1) as COUNT
+      from encodings_b
+      group by encoding_detector
+      order by COUNT desc
+    </sql>
+  </report>
+
+  <report reportName="Detector Quality B (by detector and encoding)"
+          reportFilename="charset/detector_quality_B.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select eb.encoding_detector as ENCODING_DETECTOR,
+      eb.detected_encoding as DETECTED_ENCODING,
+      count(1) as COUNT,
+      round(avg(cb.oov),4) as AVG_OOV,
+      round(avg(cb.languageness),2) as AVG_LANGUAGENESS,
+      sum(cb.num_replacement) as TOTAL_FFFD,
+      sum(cb.num_non_ascii) as TOTAL_NON_ASCII,
+      round(100.0*sum(cb.num_replacement)/nullif(sum(cb.num_non_ascii),0),2) 
as FFFD_PCT
+      from encodings_b eb
+      join contents_b cb on cb.id=eb.id
+      group by eb.encoding_detector, eb.detected_encoding
+      order by AVG_OOV desc
+    </sql>
+  </report>
+
+  <report reportName="Declared vs Detected B"
+          reportFilename="charset/declared_vs_detected_B.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select eb.declared_metadata as DECLARED, eb.detected_encoding as 
DETECTED,
+      count(1) as COUNT
+      from encodings_b eb
+      where eb.declared_metadata is not null
+      group by eb.declared_metadata, eb.detected_encoding
+      order by COUNT desc
+    </sql>
+  </report>
+
+  <report reportName="Declared vs Detected A"
+          reportFilename="charset/declared_vs_detected_A.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select ea.declared_metadata as DECLARED, ea.detected_encoding as 
DETECTED,
+      count(1) as COUNT
+      from encodings_a ea
+      where ea.declared_metadata is not null
+      group by ea.declared_metadata, ea.detected_encoding
+      order by COUNT desc
+    </sql>
+  </report>
+
+
   <!-- Exceptions -->
   <report reportName="AllExceptionsByMimeA"
           reportFilename="exceptions/exceptions_by_mime_A.xlsx"
diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
index cf2aca82b4..69efc926e1 100644
--- a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
+++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
@@ -477,6 +477,208 @@
   </report>
 
 
+  <!-- CHARSET / ENCODING DETECTION
+       Reports over the encodings_a / encodings_b tables.  One row per file 
that ran
+       charset detection: detected_encoding (final pick), encoding_detector 
(winning
+       detector), declared_metadata (charset declared via Content-Type-Hint).  
A row
+       exists only when an encoding was detected, so counts are over text-y 
files and
+       joins to encodings are effectively inner.  A and B are paired by id.  
These
+       tables are not covered by any other report. -->
+
+  <report reportName="Charset Detection Coverage"
+          reportFilename="charset/charset_coverage.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select 'paired_files' as METRIC, count(1) as CNT
+      from profiles_a pa join profiles_b pb on pa.id=pb.id
+      union all
+      select 'detected_in_a', count(1) from encodings_a
+      union all
+      select 'detected_in_b', count(1) from encodings_b
+      union all
+      select 'detected_in_both', count(1)
+      from encodings_a ea join encodings_b eb on ea.id=eb.id
+      union all
+      select 'flipped_a_to_b', count(1)
+      from encodings_a ea join encodings_b eb on ea.id=eb.id
+      where ea.detected_encoding &lt;&gt; eb.detected_encoding
+    </sql>
+  </report>
+
+  <report reportName="Detected Encoding Distribution A"
+          reportFilename="charset/detected_encoding_distribution_A.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select detected_encoding as DETECTED_ENCODING, count(1) as COUNT
+      from encodings_a
+      group by detected_encoding
+      order by COUNT desc
+    </sql>
+  </report>
+
+  <report reportName="Detected Encoding Distribution B"
+          reportFilename="charset/detected_encoding_distribution_B.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select detected_encoding as DETECTED_ENCODING, count(1) as COUNT
+      from encodings_b
+      group by detected_encoding
+      order by COUNT desc
+    </sql>
+  </report>
+
+  <report reportName="Encoding Differences A -> B"
+          reportFilename="charset/encoding_diffs_A_to_B.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select concat(ea.detected_encoding, ' -&gt; ', eb.detected_encoding) as 
ENCODING_A_TO_ENCODING_B,
+      count(1) as COUNT
+      from encodings_a ea
+      join encodings_b eb on ea.id=eb.id
+      where ea.detected_encoding &lt;&gt; eb.detected_encoding
+      group by ENCODING_A_TO_ENCODING_B
+      order by COUNT desc
+    </sql>
+  </report>
+
+  <report reportName="Encoding Differences A -> B Details"
+          reportFilename="charset/encoding_diffs_A_to_B_details.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select c.file_path as FILE_PATH,
+      case
+        when pb.embedded_depth &gt; 0
+        then pb.embedded_file_path
+        else pb.file_name
+      end as FILE_NAME_B,
+      ea.detected_encoding as ENCODING_A,
+      eb.detected_encoding as ENCODING_B,
+      ea.encoding_detector as DETECTOR_A,
+      eb.encoding_detector as DETECTOR_B,
+      eb.declared_metadata as DECLARED_B,
+      mb.mime_string as MIME_B,
+      ca.oov as OOV_A, cb.oov as OOV_B,
+      cb.oov - ca.oov as OOV_DELTA_B,
+      ca.languageness as LANGUAGENESS_A, cb.languageness as LANGUAGENESS_B,
+      cb.languageness - ca.languageness as LANGUAGENESS_DELTA_B,
+      ca.num_replacement as FFFD_A, cb.num_replacement as FFFD_B,
+      ca.num_non_ascii as NON_ASCII_A, cb.num_non_ascii as NON_ASCII_B,
+      round(100.0*cb.num_replacement/nullif(cb.num_non_ascii,0),2) as 
FFFD_PCT_B,
+      ca.num_common_tokens as COMMON_TOKENS_A, cb.num_common_tokens as 
COMMON_TOKENS_B,
+      ifnull(cb.num_common_tokens,0)-ifnull(ca.num_common_tokens,0) as 
COMMON_TOKENS_DELTA_B,
+      ca.lang_id_1 as LANG_A, cb.lang_id_1 as LANG_B
+      from encodings_a ea
+      join encodings_b eb on ea.id=eb.id
+      join profiles_b pb on pb.id=ea.id
+      join containers c on c.container_id=pb.container_id
+      left join contents_a ca on ca.id=ea.id
+      left join contents_b cb on cb.id=ea.id
+      left join mimes mb on mb.mime_id=pb.mime_id
+      where ea.detected_encoding &lt;&gt; eb.detected_encoding
+      order by ifnull(cb.num_common_tokens,0)-ifnull(ca.num_common_tokens,0) 
asc
+      limit 100000
+    </sql>
+  </report>
+
+  <report reportName="SBCS-Western to CJK Flips A -> B"
+          reportFilename="charset/sbcs_to_cjk_flips_A_to_B.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select concat(ea.detected_encoding, ' -&gt; ', eb.detected_encoding) as 
ENCODING_A_TO_ENCODING_B,
+      count(1) as COUNT
+      from encodings_a ea
+      join encodings_b eb on ea.id=eb.id
+      where lower(ea.detected_encoding) in
+        ('windows-1252','iso-8859-1','iso-8859-15','iso-8859-2','iso-8859-3',
+         
'windows-1250','windows-1254','windows-1257','iso-8859-13','windows-1258',
+         'x-macroman','ibm850','ibm852')
+      and lower(eb.detected_encoding) in
+        
('gb18030','gbk','gb2312','big5','big5-hkscs','shift_jis','euc-jp','euc-kr',
+         
'x-euc-tw','x-windows-874','x-windows-949','iso-2022-jp','iso-2022-kr','iso-2022-cn')
+      group by ENCODING_A_TO_ENCODING_B
+      order by COUNT desc
+    </sql>
+  </report>
+
+  <report reportName="Encoding Detector Distribution A"
+          reportFilename="charset/encoding_detector_distribution_A.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select encoding_detector as ENCODING_DETECTOR, count(1) as COUNT
+      from encodings_a
+      group by encoding_detector
+      order by COUNT desc
+    </sql>
+  </report>
+
+  <report reportName="Encoding Detector Distribution B"
+          reportFilename="charset/encoding_detector_distribution_B.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select encoding_detector as ENCODING_DETECTOR, count(1) as COUNT
+      from encodings_b
+      group by encoding_detector
+      order by COUNT desc
+    </sql>
+  </report>
+
+  <report reportName="Detector Quality B (by detector and encoding)"
+          reportFilename="charset/detector_quality_B.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select eb.encoding_detector as ENCODING_DETECTOR,
+      eb.detected_encoding as DETECTED_ENCODING,
+      count(1) as COUNT,
+      round(avg(cb.oov),4) as AVG_OOV,
+      round(avg(cb.languageness),2) as AVG_LANGUAGENESS,
+      sum(cb.num_replacement) as TOTAL_FFFD,
+      sum(cb.num_non_ascii) as TOTAL_NON_ASCII,
+      round(100.0*sum(cb.num_replacement)/nullif(sum(cb.num_non_ascii),0),2) 
as FFFD_PCT
+      from encodings_b eb
+      join contents_b cb on cb.id=eb.id
+      group by eb.encoding_detector, eb.detected_encoding
+      order by AVG_OOV desc
+    </sql>
+  </report>
+
+  <report reportName="Declared vs Detected B"
+          reportFilename="charset/declared_vs_detected_B.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select eb.declared_metadata as DECLARED, eb.detected_encoding as 
DETECTED,
+      count(1) as COUNT
+      from encodings_b eb
+      where eb.declared_metadata is not null
+      group by eb.declared_metadata, eb.detected_encoding
+      order by COUNT desc
+    </sql>
+  </report>
+
+  <report reportName="Declared vs Detected A"
+          reportFilename="charset/declared_vs_detected_A.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select ea.declared_metadata as DECLARED, ea.detected_encoding as 
DETECTED,
+      count(1) as COUNT
+      from encodings_a ea
+      where ea.declared_metadata is not null
+      group by ea.declared_metadata, ea.detected_encoding
+      order by COUNT desc
+    </sql>
+  </report>
+
+
   <!-- Exceptions -->
   <report reportName="AllExceptionsByMimeA"
           reportFilename="exceptions/exceptions_by_mime_A.xlsx"

(tika) branch main updated: small updates to tika-eval (#2881)

Reply via email to