This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4731-common-script in repository https://gitbox.apache.org/repos/asf/tika.git
commit 63dc5ed7066dca81add348fe8656d7baae49f7c2 Author: tallison <[email protected]> AuthorDate: Wed May 20 08:04:32 2026 -0400 TIKA-4731 - further refinements --- tika-ml/tika-ml-junkdetect/pom.xml | 12 ++++++++++++ .../tika/ml/junkdetect/tools/TrainJunkModel.java | 6 ++++++ .../org/apache/tika/ml/junkdetect/junkdetect.bin | Bin 2784427 -> 2901358 bytes 3 files changed, 18 insertions(+) diff --git a/tika-ml/tika-ml-junkdetect/pom.xml b/tika-ml/tika-ml-junkdetect/pom.xml index a10d73ad64..fe717998cf 100644 --- a/tika-ml/tika-ml-junkdetect/pom.xml +++ b/tika-ml/tika-ml-junkdetect/pom.xml @@ -61,6 +61,18 @@ </dependency> <!-- Test dependencies --> + <!-- + tika-serialization is test-scope only because the one consumer + (BuildJunkAugmentationData) lives in src/test/java — it's a corpus-prep + tool, not part of the runtime detector. Keeps the production classpath of + tika-ml-junkdetect free of the serialization dep. + --> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-serialization</artifactId> + <version>${revision}</version> + <scope>test</scope> + </dependency> <dependency> <groupId>org.junit.jupiter</groupId> <artifactId>junit-jupiter-api</artifactId> diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java index 88807c1cdb..b52e185eff 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java @@ -226,6 +226,12 @@ public class TrainJunkModel { {"ISO-8859-1", "windows-1252"}, {"windows-1252", "ISO-8859-1"}, {"x-MacRoman", "windows-1252"}, + // The exact win-1252/ISO-8859-2 sibling pathology: a win-1252 page with + // ©/®/£ symbols read as ISO-8859-2 yields isolated Latin-Extended-A + // letters (Š/Ž/Ł). Included as classifier negatives so the LR trains + // against this pattern directly. + {"windows-1252", "ISO-8859-2"}, + {"ISO-8859-2", "windows-1252"}, // SBCS Cyrillic / Greek / RTL {"windows-1251", "windows-1252"}, {"windows-1252", "windows-1251"}, diff --git a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin index 50f7dfe2e6..af491ba162 100644 Binary files a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin and b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin differ
