This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4752
in repository https://gitbox.apache.org/repos/asf/tika.git

commit eff4f458bb73268ff6dd42f0067e51795e85a0c9
Merge: 652ffb27b2 ffd712980b
Author: tallison <[email protected]>
AuthorDate: Fri Jun 5 09:12:26 2026 -0400

    merge main

 .../org/apache/tika/detect/AutoDetectReader.java   | 11 +++--
 .../org/apache/tika/detect/CharsetSupersets.java   | 11 +++++
 .../org/apache/tika/detect/EncodingResult.java     | 14 +++++++
 .../org/apache/tika/parser/html/JSoupParser.java   |  4 +-
 .../tika/parser/microsoft/OutlookExtractor.java    | 16 +++++--
 .../java/org/apache/tika/parser/dbf/DBFParser.java |  9 ++--
 .../java/org/apache/tika/parser/pkg/ZipParser.java |  2 +-
 .../tika/serialization/ComponentNameResolver.java  | 49 ++++++++++++++++++++--
 .../serialization/ComponentNameResolverTest.java   | 45 ++++++++++++++++++++
 9 files changed, 142 insertions(+), 19 deletions(-)

diff --cc tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
index cb0c52208a,d01fa3ab41..fe9b5236d6
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
@@@ -570,18 -562,10 +570,18 @@@ public class ZipParser extends Abstract
          // 9-30 bytes); no byte-extension trick needed.
          if (config.isDetectCharsetsInEntryNames()) {
              byte[] entryName = entry.getRawName();
 +            // The EFS flag (general purpose bit 11) also declares UTF-8, but is
 +            // unvalidated. Record it as a content-type hint for the detector to
 +            // evaluate against the bytes, not trust outright.
 +            Metadata nameMetadata = new Metadata();
 +            if (entry.getNameSource() == 
ZipArchiveEntry.NameSource.NAME_WITH_EFS_FLAG) {
 +                nameMetadata.set(TikaCoreProperties.CONTENT_TYPE_HINT,
 +                        new MediaType(MediaType.TEXT_PLAIN, 
StandardCharsets.UTF_8).toString());
 +            }
              try (TikaInputStream detectStream = 
TikaInputStream.get(entryName)) {
                  List<EncodingResult> encResults =
 -                        getEncodingDetector().detect(detectStream, 
parentMetadata, context);
 +                        getEncodingDetector(context).detect(detectStream, 
nameMetadata, context);
-                 Charset candidate = encResults.isEmpty() ? null : 
encResults.get(0).getCharset();
+                 Charset candidate = encResults.isEmpty() ? null : 
encResults.get(0).getDecodeAs();
                  if (candidate != null) {
                      return new String(entry.getRawName(), candidate);
                  }

Reply via email to