tballison commented on code in PR #2871:
URL: https://github.com/apache/tika/pull/2871#discussion_r3363838346


##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java:
##########
@@ -549,22 +550,37 @@ private void parseStreamEntry(ZipArchiveInputStream zis, ZipArchiveEntry entry,
         }
     }
 
-    private String detectEntryName(ZipArchiveEntry entry, Metadata parentMetadata,
-                                    ParseContext context, ZipParserConfig config) throws IOException {
+    private String detectEntryName(ZipArchiveEntry entry, ParseContext context,
+                                    ZipParserConfig config) throws IOException {
         // If user specified an encoding, decode raw bytes with that charset
         // This avoids needing to reopen the ZipFile with a different charset
         if (config.getEntryEncoding() != null) {
             return new String(entry.getRawName(), config.getEntryEncoding());
         }
 
+        // A zip only ever declares a name as UTF-8 (it can't name a legacy charset),
+        // two ways. The Unicode extra field carries a CRC-validated UTF-8 name -- that
+        // CRC check is the evaluation, so trust commons-compress's getName().
+        if (entry.getNameSource() == ZipArchiveEntry.NameSource.UNICODE_EXTRA_FIELD) {
+            return entry.getName();
+        }
+
         // If charset detection is enabled, try to detect and decode.
         // Mojibuster handles short inputs natively (zip filenames are often
         // 9-30 bytes); no byte-extension trick needed.
         if (config.isDetectCharsetsInEntryNames()) {
             byte[] entryName = entry.getRawName();
+            // The EFS flag (general purpose bit 11) also declares UTF-8, but is
+            // unvalidated. Record it as a content-type hint for the detector to
+            // evaluate against the bytes, not trust outright.
+            Metadata nameMetadata = new Metadata();
+            if (entry.getNameSource() == ZipArchiveEntry.NameSource.NAME_WITH_EFS_FLAG) {
+                nameMetadata.set(TikaCoreProperties.CONTENT_TYPE_HINT,
+                        new MediaType(MediaType.TEXT_PLAIN, StandardCharsets.UTF_8).toString());
+            }
             try (TikaInputStream detectStream = TikaInputStream.get(entryName)) {
                 List<EncodingResult> encResults =
-                        getEncodingDetector().detect(detectStream, parentMetadata, context);
+                        getEncodingDetector(context).detect(detectStream, nameMetadata, context);

Review Comment:
   not a concern.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to