This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 127665a5d8 TIKA-4751 - decode as (#2867)
127665a5d8 is described below
commit 127665a5d820e88302ec2c2c017ff4e09e427027
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jun 5 08:49:18 2026 -0400
TIKA-4751 - decode as (#2867)
---
.../java/org/apache/tika/detect/AutoDetectReader.java | 11 +++++------
.../java/org/apache/tika/detect/CharsetSupersets.java | 11 +++++++++++
.../main/java/org/apache/tika/detect/EncodingResult.java | 14 ++++++++++++++
.../java/org/apache/tika/parser/html/JSoupParser.java | 4 +++-
.../apache/tika/parser/microsoft/OutlookExtractor.java | 16 ++++++++++++----
.../main/java/org/apache/tika/parser/dbf/DBFParser.java | 9 ++++++---
.../main/java/org/apache/tika/parser/pkg/ZipParser.java | 2 +-
7 files changed, 52 insertions(+), 15 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
index f17844bf78..0bea81f32d 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
@@ -100,13 +100,12 @@ public class AutoDetectReader extends BufferedReader {
// Ask all given detectors for the character encoding
List<EncodingResult> results = detector.detect(tis, metadata, new ParseContext());
if (!results.isEmpty()) {
- Charset detected = results.get(0).getCharset();
- Charset superset = CharsetSupersets.supersetOf(detected);
- if (superset != null) {
- metadata.set(TikaCoreProperties.DECODED_CHARSET, superset.name());
- return superset;
+ EncodingResult result = results.get(0);
+ Charset decodeAs = result.getDecodeAs();
+ if (!decodeAs.equals(result.getCharset())) {
+ metadata.set(TikaCoreProperties.DECODED_CHARSET, decodeAs.name());
}
- return detected;
+ return decodeAs;
}
// Try determining the encoding based on hints in document metadata.
diff --git a/tika-core/src/main/java/org/apache/tika/detect/CharsetSupersets.java b/tika-core/src/main/java/org/apache/tika/detect/CharsetSupersets.java
index 88bd5416bb..c9935feb3a 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/CharsetSupersets.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/CharsetSupersets.java
@@ -88,4 +88,15 @@ public final class CharsetSupersets {
return null;
}
}
+
+ /**
+ * The charset to decode with: {@link #supersetOf(Charset) superset} of
+ * {@code detected} if one exists, else {@code detected} (null only if
+ * {@code detected} is null). For bare-{@link Charset} callers; detection
+ * results can use {@link EncodingResult#getDecodeAs()}.
+ */
+ public static Charset decodeAs(Charset detected) {
+ Charset superset = supersetOf(detected);
+ return superset != null ? superset : detected;
+ }
}
diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java
index 55724aefc7..26df7d3b50 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java
@@ -126,10 +126,24 @@ public class EncodingResult {
this.resultType = resultType;
}
+ /**
+ * The detected charset. For <em>decoding</em> bytes prefer {@link #getDecodeAs()};
+ * this is the charset to <em>report</em> (Content-Type / detected-encoding).
+ */
public Charset getCharset() {
return charset;
}
+ /**
+ * The charset to decode with: {@link #getCharset()} widened to its superset when
+ * one exists (e.g. GBK → GB18030), else the detected charset unchanged.
+ *
+ * @see CharsetSupersets
+ */
+ public Charset getDecodeAs() {
+ return CharsetSupersets.decodeAs(charset);
+ }
+
/**
* Detection confidence in {@code [0.0, 1.0]}. Meaningful for ranking
* among {@link ResultType#STATISTICAL} candidates. For
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
index 4e45a8723e..05d8f836e6 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
@@ -163,6 +163,8 @@ public class JSoupParser extends AbstractEncodingDetectorParser {
List<EncodingResult> encResults = encodingDetector.detect(tis, metadata, context);
Charset charset = encResults.isEmpty() ? DEFAULT_CHARSET
: encResults.get(0).getCharset();
+ Charset decodeAs = encResults.isEmpty() ? DEFAULT_CHARSET
+ : encResults.get(0).getDecodeAs();
String previous = metadata.get(Metadata.CONTENT_TYPE);
MediaType contentType = null;
if (previous == null || previous.startsWith("text/html")) {
@@ -195,7 +197,7 @@ public class JSoupParser extends AbstractEncodingDetectorParser {
tis.setCloseShield();
Document document;
try {
- document = Jsoup.parse(tis, charset.name(), "",
+ document = Jsoup.parse(tis, decodeAs.name(), "",
Parser.htmlParser().tagSet(tagSet));
} finally {
tis.removeCloseShield();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 01b357bddf..c6f4bc8db6 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -68,6 +68,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
+import org.apache.tika.detect.CharsetSupersets;
import org.apache.tika.detect.EncodingResult;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -903,7 +904,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
try (TikaInputStream tis =
TikaInputStream.get(html.getBytes(UTF_8))) {
List<EncodingResult> encResults =
detector.detect(tis, EMPTY_METADATA, context);
- charset = encResults.isEmpty() ? null : encResults.get(0).getCharset();
+ charset = encResults.isEmpty() ? null : encResults.get(0).getDecodeAs();
} catch (IOException e) {
//swallow
}
@@ -921,9 +922,16 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
CharsetDetector detector = new CharsetDetector();
detector.setText(text.getRawValue());
CharsetMatch match = detector.detect();
- if (match != null && match.getConfidence() > 35 &&
- tryToSet7BitEncoding(msg, match.getName())) {
- return;
+ if (match != null && match.getConfidence() > 35) {
+ String charsetName = match.getName();
+ try {
+ charsetName = CharsetSupersets.decodeAs(Charset.forName(charsetName)).name();
+ } catch (IllegalArgumentException e) {
+ //ICU name not a resolvable Java charset; use as-is
+ }
+ if (tryToSet7BitEncoding(msg, charsetName)) {
+ return;
+ }
}
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java
index d6e9d8d863..f894593be4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java
@@ -30,6 +30,7 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.config.TikaComponent;
+import org.apache.tika.detect.CharsetSupersets;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.detect.EncodingResult;
import org.apache.tika.exception.TikaException;
@@ -89,6 +90,8 @@ public class DBFParser implements Parser {
Charset charset = getCharset(firstRows, header, context);
metadata.set(Metadata.CONTENT_ENCODING, charset.toString());
+ //report detected (above); decode with its superset
+ Charset decodeAs = CharsetSupersets.decodeAs(charset);
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata,
context);
xhtml.startDocument();
@@ -96,7 +99,7 @@ public class DBFParser implements Parser {
xhtml.startElement("thead");
for (DBFColumnHeader col : header.getCols()) {
xhtml.startElement("th");
- xhtml.characters(col.getName(charset));
+ xhtml.characters(col.getName(decodeAs));
xhtml.endElement("th");
}
xhtml.endElement("thead");
@@ -106,12 +109,12 @@ public class DBFParser implements Parser {
//now write cached rows
while (firstRows.size() > 0) {
DBFRow cachedRow = firstRows.remove(0);
- writeRow(cachedRow, charset, xhtml);
+ writeRow(cachedRow, decodeAs, xhtml);
}
//now continue with rest
while (row != null) {
- writeRow(row, charset, xhtml);
+ writeRow(row, decodeAs, xhtml);
row = reader.next();
}
xhtml.endElement("tbody");
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
index 2933200bea..d01fa3ab41 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
@@ -565,7 +565,7 @@ public class ZipParser extends AbstractArchiveParser {
try (TikaInputStream detectStream =
TikaInputStream.get(entryName)) {
List<EncodingResult> encResults =
getEncodingDetector().detect(detectStream,
parentMetadata, context);
- Charset candidate = encResults.isEmpty() ? null : encResults.get(0).getCharset();
+ Charset candidate = encResults.isEmpty() ? null : encResults.get(0).getDecodeAs();
if (candidate != null) {
return new String(entry.getRawName(), candidate);
}