This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_3x by this push:
     new 03f6397364 improve isatab parsing (#2875)
03f6397364 is described below

commit 03f6397364551155f71ef54d38969299e5d1a6f9
Author: tallison <[email protected]>
AuthorDate: Fri Jun 5 13:20:06 2026 -0400

    improve isatab parsing (#2875)
    
    Co-authored-by: Copilot Autofix powered by AI <[email protected]>
    # Conflicts:
    #       tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
    #       tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
---
 .../java/org/apache/tika/io/FilenameUtils.java     | 32 +++++++++++++++++
 .../java/org/apache/tika/io/FilenameUtilsTest.java | 31 +++++++++++++++++
 .../apache/tika/parser/isatab/ISArchiveParser.java | 11 ++++--
 .../tika/parser/isatab/ISArchiveParserTest.java    | 40 ++++++++++++++++++++++
 4 files changed, 111 insertions(+), 3 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
index 66dbcb3ea4..5f2a35fa99 100644
--- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
@@ -16,6 +16,8 @@
  */
 package org.apache.tika.io;
 
+import java.io.IOException;
+import java.nio.file.Path;
 import java.util.HashSet;
 import java.util.Locale;
 import java.util.regex.Pattern;
@@ -130,4 +132,34 @@ public class FilenameUtils {
         }
         return StringUtils.EMPTY;
     }
+
+    /**
+     * Resolves {@code name} under {@code dir}, rejecting anything that escapes it (zip-slip
+     * guard). Normalizes first so {@code ..} is collapsed before the element-wise containment
+     * check.
+     *
+     * @throws IOException if {@code name} resolves outside {@code dir}
+     */
+    public static Path resolveWithin(Path dir, String name) throws IOException {
+        Path normalizedDir = dir.normalize();
+        Path resolved = normalizedDir.resolve(name).normalize();
+        if (!resolved.startsWith(normalizedDir)) {
+            throw new IOException(
+                    "'" + name + "' resolves to '" + resolved + "', which is outside of '" +
+                            normalizedDir + "'");
+        }
+
+        // Defense in depth against symlink traversal (only possible if the paths exist).
+        if (java.nio.file.Files.exists(resolved) && java.nio.file.Files.exists(normalizedDir)) {
+            Path realDir = normalizedDir.toRealPath();
+            Path realResolved = resolved.toRealPath();
+            if (!realResolved.startsWith(realDir)) {
+                throw new IOException(
+                        "'" + name + "' resolves to '" + realResolved + "', which is outside of '" +
+                                realDir + "'");
+            }
+        }
+
+        return resolved;
+    }
 }
diff --git a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
index 06b8a7ecf6..23fc7bcd29 100644
--- a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
@@ -17,9 +17,14 @@
 package org.apache.tika.io;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assertions.fail;
 
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.utils.StringUtils;
@@ -102,6 +107,32 @@ public class FilenameUtilsTest {
         testFilenameEquality("HW.txt", "_1457338542/HW.txt");
     }
 
+    @Test
+    public void testResolveWithin() throws Exception {
+        Path dir = Paths.get("base", "isa");
+
+        // plain and nested children resolve to a path under dir
+        assertEquals(Paths.get("base", "isa", "a_assay.txt"),
+                FilenameUtils.resolveWithin(dir, "a_assay.txt"));
+        assertEquals(Paths.get("base", "isa", "sub", "a_assay.txt"),
+                FilenameUtils.resolveWithin(dir, "sub/a_assay.txt"));
+
+        // resolving the directory itself stays within it
+        assertEquals(dir.normalize(), FilenameUtils.resolveWithin(dir, "."));
+
+        // names that resolve out of dir are rejected. These also pin the normalize() call:
+        // without it the resolved path would still textually begin with dir and be accepted.
+        assertThrows(IOException.class, () -> FilenameUtils.resolveWithin(dir, "../outside.txt"));
+        assertThrows(IOException.class, () -> FilenameUtils.resolveWithin(dir, "../../outside.txt"));
+        assertThrows(IOException.class,
+                () -> FilenameUtils.resolveWithin(dir, "sub/../../outside.txt"));
+
+        // element-wise (not string-prefix) containment: a/bc is not within a/b even though
+        // the string "a/bc" starts with the string "a/b".
+        assertThrows(IOException.class,
+                () -> FilenameUtils.resolveWithin(Paths.get("a", "b"), "../bc"));
+    }
+
     @Test
     public void testExtension() throws Exception {
         assertEquals(".pdf", FilenameUtils.getSuffixFromPath("blah/blah/or/something.pdf"));
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
index 6c7975f82b..197c6c27fa 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.isatab;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.file.Path;
 import java.util.Collections;
 import java.util.Set;
 
@@ -26,6 +27,7 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.FilenameUtils;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -131,12 +133,15 @@ public class ISArchiveParser implements Parser {
 
     private void parseAssay(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
+        // location starts with "/C:" on windows, so build the directory Path from a File
+        // rather than Paths.get(). The assay file names come from the investigation file and
+        // are resolved within this directory.
+        Path locationDir = new File(this.location).toPath();
         for (String assayFileName : metadata.getValues(studyAssayFileNameField)) {
             xhtml.startElement("div");
             xhtml.element("h3", "ASSAY " + assayFileName);
-            // location starts with "/C:" on windows, can't use Paths.get()
-            try (InputStream stream = TikaInputStream.get(new File(this.location + assayFileName).toPath()))
-            {
+            Path assayFile = FilenameUtils.resolveWithin(locationDir, assayFileName);
+            try (InputStream stream = TikaInputStream.get(assayFile)) {
                 ISATabUtils.parseAssay(stream, xhtml, metadata, context);
             }
             xhtml.endElement("div");
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
index de9cf25ed0..884754a5bf 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
@@ -17,13 +17,22 @@
 package org.apache.tika.parser.isatab;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 
+import java.io.IOException;
 import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
 
 import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
 import org.xml.sax.ContentHandler;
 
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
@@ -66,4 +75,35 @@ public class ISArchiveParserTest {
         assertEquals("Stephen", metadata.get("Investigation Person First Name"),
                 "Invalid Investigation Person First Name");
     }
+
+    @Test
+    public void testAssayFileNameResolvedWithinDirectory(@TempDir Path root) throws Exception {
+        // A file sitting next to (but outside of) the ISA-Tab directory.
+        Path outside = root.resolve("outside.txt");
+        Files.write(outside, "OUTSIDE_DIRECTORY_CONTENT".getBytes(StandardCharsets.UTF_8));
+
+        // An ISA-Tab directory whose investigation file points an assay at the sibling file
+        // using a relative name.
+        Path isaDir = Files.createDirectory(root.resolve("isa"));
+        Files.write(isaDir.resolve("i_test.txt"),
+                ("STUDY\n"
+                        + "Study File Name\t\"s_test.txt\"\n"
+                        + "Study Assay File Name\t\"../outside.txt\"\n")
+                        .getBytes(StandardCharsets.UTF_8));
+        Path study = isaDir.resolve("s_test.txt");
+        Files.write(study, "\"Source Name\"\n\"culture1\"\n".getBytes(StandardCharsets.UTF_8));
+
+        Parser parser = new ISArchiveParser(isaDir.toString());
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "s_test.txt");
+        ParseContext context = new ParseContext();
+
+        try (TikaInputStream tis = TikaInputStream.get(study)) {
+            assertThrows(IOException.class,
+                    () -> parser.parse(tis, handler, metadata, context));
+        }
+        assertFalse(handler.toString().contains("OUTSIDE_DIRECTORY_CONTENT"),
+                "assay reader read a file outside the ISA-Tab directory");
+    }
 }

Reply via email to