This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4752 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 652ffb27b2a88700f844e224a79d0ba73cc67326 Author: tallison <[email protected]> AuthorDate: Fri Jun 5 09:05:41 2026 -0400 TIKA-4752 -- improve zip name detection --- .../tika/detect/MetadataCharsetDetector.java | 27 +++++- .../tika/detect/MetadataCharsetDetectorTest.java | 101 +++++++++++++++++++ .../java/org/apache/tika/parser/pkg/ZipParser.java | 26 ++++- .../tika/parser/pkg/ZipEntryNameEncodingTest.java | 107 +++++++++++++++++++++ 4 files changed, 252 insertions(+), 9 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java b/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java index 13102ea01d..1581bc9b74 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.tika.config.TikaComponent; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; @@ -33,10 +34,13 @@ import org.apache.tika.parser.ParseContext; * reading any bytes from the stream. Returns a single * {@link EncodingResult.ResultType#DECLARATIVE} result when a charset is found. * - * <p>Two metadata keys are consulted in order: + * <p>Three metadata keys are consulted in order: * <ol> * <li>{@link Metadata#CONTENT_TYPE} — the {@code charset} parameter of the * HTTP/MIME Content-Type header (e.g. {@code text/html; charset=UTF-8}).</li> + * <li>{@link TikaCoreProperties#CONTENT_TYPE_HINT} — the {@code charset} parameter + * of a content-type a source <em>claimed</em> for the bytes (e.g. an HTML + * {@code <meta>} tag, or a zip entry's UTF-8 (EFS) flag). A hint, not a verdict.</li> * <li>{@link Metadata#CONTENT_ENCODING} — a bare charset label set by parsers * such as {@code RFC822Parser}, which splits Content-Type into a bare * media-type key and a separate charset key.</li> @@ -56,6 +60,9 @@ public class MetadataCharsetDetector implements EncodingDetector { public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata, ParseContext context) throws IOException { Charset cs = charsetFromContentType(metadata); + if (cs == null) { + cs = charsetFromContentTypeHint(metadata); + } if (cs == null) { cs = charsetFromContentEncoding(metadata); } @@ -71,7 +78,20 @@ public class MetadataCharsetDetector implements EncodingDetector { * {@link Metadata#CONTENT_TYPE} value, or {@code null} if absent or unparseable. */ public static Charset charsetFromContentType(Metadata metadata) { - String contentType = metadata.get(Metadata.CONTENT_TYPE); + return charsetFromMediaType(metadata.get(Metadata.CONTENT_TYPE)); + } + + /** + * Returns the charset named in the {@code charset} parameter of the + * {@link TikaCoreProperties#CONTENT_TYPE_HINT} value — a content-type a source + * claimed for the bytes (HTML {@code <meta>}, a zip entry's UTF-8 flag, ...) — + * or {@code null} if absent or unparseable. + */ + public static Charset charsetFromContentTypeHint(Metadata metadata) { + return charsetFromMediaType(metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT)); + } + + private static Charset charsetFromMediaType(String contentType) { if (contentType == null) { return null; } @@ -79,8 +99,7 @@ public class MetadataCharsetDetector implements EncodingDetector { if (mediaType == null) { return null; } - String label = mediaType.getParameters().get("charset"); - return parseCharset(label); + return parseCharset(mediaType.getParameters().get("charset")); } /** diff --git a/tika-core/src/test/java/org/apache/tika/detect/MetadataCharsetDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/MetadataCharsetDetectorTest.java new file mode 100644 index 0000000000..b8ca8531b9 --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/detect/MetadataCharsetDetectorTest.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.detect; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.List; + +import org.junit.jupiter.api.Test; + +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; + +public class MetadataCharsetDetectorTest { + + private final MetadataCharsetDetector detector = new MetadataCharsetDetector(); + + private Charset detect(Metadata metadata) throws IOException { + try (TikaInputStream tis = TikaInputStream.get(new byte[0])) { + List<EncodingResult> results = detector.detect(tis, metadata, new ParseContext()); + if (results.isEmpty()) { + return null; + } + assertEquals(EncodingResult.ResultType.DECLARATIVE, results.get(0).getResultType()); + return results.get(0).getCharset(); + } + } + + @Test + public void testContentTypeHint() throws Exception { + // TIKA-4752: the charset claimed via CONTENT_TYPE_HINT (e.g. a zip entry's + // UTF-8/EFS flag, recorded as text/plain; charset=UTF-8) is consumed. + Metadata m = new Metadata(); + m.set(TikaCoreProperties.CONTENT_TYPE_HINT, "text/plain; charset=UTF-8"); + assertEquals(StandardCharsets.UTF_8, detect(m)); + } + + @Test + public void testContentType() throws Exception { + Metadata m = new Metadata(); + // ISO-8859-1 normalizes to its windows-1252 superset (WHATWG), existing behavior. + m.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1"); + assertEquals(Charset.forName("windows-1252"), detect(m)); + } + + @Test + public void testContentEncoding() throws Exception { + Metadata m = new Metadata(); + m.set(Metadata.CONTENT_ENCODING, "Shift_JIS"); + assertEquals(Charset.forName("Shift_JIS"), detect(m)); + } + + @Test + public void testContentTypeWinsOverHint() throws Exception { + Metadata m = new Metadata(); + m.set(Metadata.CONTENT_TYPE, "text/plain; charset=UTF-16"); + m.set(TikaCoreProperties.CONTENT_TYPE_HINT, "text/plain; charset=UTF-8"); + assertEquals(StandardCharsets.UTF_16, detect(m)); + } + + @Test + public void testHintWinsOverContentEncoding() throws Exception { + Metadata m = new Metadata(); + m.set(TikaCoreProperties.CONTENT_TYPE_HINT, "text/plain; charset=UTF-8"); + m.set(Metadata.CONTENT_ENCODING, "Shift_JIS"); + assertEquals(StandardCharsets.UTF_8, detect(m)); + } + + @Test + public void testNoDeclarationIsEmpty() throws Exception { + assertEquals(null, detect(new Metadata())); + // A content-type with no charset parameter is not a declaration. + Metadata m = new Metadata(); + m.set(Metadata.CONTENT_TYPE, "text/plain"); + assertEquals(null, detect(m)); + // An unparseable charset label is ignored, not thrown. + Metadata bad = new Metadata(); + bad.set(Metadata.CONTENT_ENCODING, "not-a-charset"); + assertTrue(detect(bad) == null); + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java index 2933200bea..cb0c52208a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java @@ -22,6 +22,7 @@ import static org.apache.tika.detect.zip.PackageConstants.ZIP; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.nio.file.attribute.FileTime; import java.util.ArrayList; import java.util.Collections; @@ -468,7 +469,7 @@ public class ZipParser extends AbstractArchiveParser { ZipParserConfig config) throws SAXException, IOException, TikaException { - String name = detectEntryName(entry, parentMetadata, context, config); + String name = detectEntryName(entry, context, config); if (entry.getGeneralPurposeBit().usesEncryption()) { handleEncryptedEntry(name, parentMetadata, xhtml); @@ -513,7 +514,7 @@ public class ZipParser extends AbstractArchiveParser { ZipParserConfig config) throws SAXException, IOException, TikaException { - String name = detectEntryName(entry, parentMetadata, context, config); + String name = detectEntryName(entry, context, config); if (!zis.canReadEntryData(entry)) { if (entry.getGeneralPurposeBit().usesEncryption()) { @@ -549,22 +550,37 @@ public class ZipParser extends AbstractArchiveParser { } } - private String detectEntryName(ZipArchiveEntry entry, Metadata parentMetadata, - ParseContext context, ZipParserConfig config) throws IOException { + private String detectEntryName(ZipArchiveEntry entry, ParseContext context, + ZipParserConfig config) throws IOException { // If user specified an encoding, decode raw bytes with that charset // This avoids needing to reopen the ZipFile with a different charset if (config.getEntryEncoding() != null) { return new String(entry.getRawName(), config.getEntryEncoding()); } + // A zip only ever declares a name as UTF-8 (it can't name a legacy charset), + // two ways. The Unicode extra field carries a CRC-validated UTF-8 name -- that + // CRC check is the evaluation, so trust commons-compress's getName(). + if (entry.getNameSource() == ZipArchiveEntry.NameSource.UNICODE_EXTRA_FIELD) { + return entry.getName(); + } + // If charset detection is enabled, try to detect and decode. // Mojibuster handles short inputs natively (zip filenames are often // 9-30 bytes); no byte-extension trick needed. if (config.isDetectCharsetsInEntryNames()) { byte[] entryName = entry.getRawName(); + // The EFS flag (general purpose bit 11) also declares UTF-8, but is + // unvalidated. Record it as a content-type hint for the detector to + // evaluate against the bytes, not trust outright. + Metadata nameMetadata = new Metadata(); + if (entry.getNameSource() == ZipArchiveEntry.NameSource.NAME_WITH_EFS_FLAG) { + nameMetadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, + new MediaType(MediaType.TEXT_PLAIN, StandardCharsets.UTF_8).toString()); + } try (TikaInputStream detectStream = TikaInputStream.get(entryName)) { List<EncodingResult> encResults = - getEncodingDetector().detect(detectStream, parentMetadata, context); + getEncodingDetector(context).detect(detectStream, nameMetadata, context); Charset candidate = encResults.isEmpty() ? null : encResults.get(0).getCharset(); if (candidate != null) { return new String(entry.getRawName(), candidate); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipEntryNameEncodingTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipEntryNameEncodingTest.java new file mode 100644 index 0000000000..a8fe4e58be --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipEntryNameEncodingTest.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pkg; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.List; + +import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; +import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; +import org.junit.jupiter.api.Test; + +import org.apache.tika.TikaTest; +import org.apache.tika.detect.CompositeEncodingDetector; +import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.detect.MetadataCharsetDetector; +import org.apache.tika.detect.OverrideEncodingDetector; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; + +/** + * TIKA-4752: a zip can only declare an entry name as UTF-8 (never a legacy charset), + * two ways -- the EFS flag (general purpose bit 11) and the Unicode path extra field. + * ZipParser must honor both. + */ +public class ZipEntryNameEncodingTest extends TikaTest { + + private static final String LATIN = "café-Köln-Süß.txt"; + private static final String CJK = "日本語.txt"; + + @Test + public void testEfsFlagHint() throws Exception { + // Deterministic + discriminating: MetadataCharsetDetector consumes the + // EFS->UTF-8 hint; the override garbles anything it doesn't catch. So only the + // hint yields UTF-8 -- an empty-returning detector wouldn't isolate it, because + // ZipParser would fall back to getName(), already UTF-8 for a flagged entry. + ParseContext context = new ParseContext(); + context.set(EncodingDetector.class, new CompositeEncodingDetector(List.of( + new MetadataCharsetDetector(), + new OverrideEncodingDetector(Charset.forName("windows-1252"))))); + assertEquals(LATIN, entryName(efsZip(LATIN), context)); + } + + @Test + public void testUnicodeExtraField() throws Exception { + // CRC-validated UTF-8 name in the extra field; the main-header name is a garbled + // CP437 fallback. We must use the extra-field name, not detect the raw bytes. + assertEquals(CJK, entryName(unicodeExtraFieldZip(CJK), new ParseContext())); + } + + private String entryName(byte[] zipBytes, ParseContext context) throws Exception { + try (TikaInputStream tis = TikaInputStream.get(zipBytes)) { + List<Metadata> list = getRecursiveMetadata(tis, new Metadata(), context, false); + assertEquals(2, list.size()); + return list.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY); + } + } + + private static byte[] efsZip(String name) throws IOException { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(bos)) { + zos.setEncoding("UTF-8"); + zos.setUseLanguageEncodingFlag(true); + zos.setCreateUnicodeExtraFields(ZipArchiveOutputStream.UnicodeExtraFieldPolicy.NEVER); + writeEntry(zos, name); + } + return bos.toByteArray(); + } + + private static byte[] unicodeExtraFieldZip(String name) throws IOException { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(bos)) { + zos.setEncoding("Cp437"); + zos.setUseLanguageEncodingFlag(false); + zos.setCreateUnicodeExtraFields(ZipArchiveOutputStream.UnicodeExtraFieldPolicy.ALWAYS); + writeEntry(zos, name); + } + return bos.toByteArray(); + } + + private static void writeEntry(ZipArchiveOutputStream zos, String name) throws IOException { + ZipArchiveEntry entry = new ZipArchiveEntry(name); + zos.putArchiveEntry(entry); + zos.write("hello".getBytes(StandardCharsets.US_ASCII)); + zos.closeArchiveEntry(); + } +}
