This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new 03f6397364 improve isatab parsing (#2875)
03f6397364 is described below
commit 03f6397364551155f71ef54d38969299e5d1a6f9
Author: tallison <[email protected]>
AuthorDate: Fri Jun 5 13:20:06 2026 -0400
improve isatab parsing (#2875)
Co-authored-by: Copilot Autofix powered by AI
<[email protected]>
# Conflicts:
# tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
# tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
---
.../java/org/apache/tika/io/FilenameUtils.java | 32 +++++++++++++++++
.../java/org/apache/tika/io/FilenameUtilsTest.java | 31 +++++++++++++++++
.../apache/tika/parser/isatab/ISArchiveParser.java | 11 ++++--
.../tika/parser/isatab/ISArchiveParserTest.java | 40 ++++++++++++++++++++++
4 files changed, 111 insertions(+), 3 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
index 66dbcb3ea4..5f2a35fa99 100644
--- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
@@ -16,6 +16,8 @@
*/
package org.apache.tika.io;
+import java.io.IOException;
+import java.nio.file.Path;
import java.util.HashSet;
import java.util.Locale;
import java.util.regex.Pattern;
@@ -130,4 +132,34 @@ public class FilenameUtils {
}
return StringUtils.EMPTY;
}
+
+ /**
+ * Resolves {@code name} under {@code dir}, rejecting anything that
escapes it (zip-slip
+ * guard). Normalizes first so {@code ..} is collapsed before the
element-wise containment
+ * check.
+ *
+ * @throws IOException if {@code name} resolves outside {@code dir}
+ */
+ public static Path resolveWithin(Path dir, String name) throws IOException
{
+ Path normalizedDir = dir.normalize();
+ Path resolved = normalizedDir.resolve(name).normalize();
+ if (!resolved.startsWith(normalizedDir)) {
+ throw new IOException(
+ "'" + name + "' resolves to '" + resolved + "', which is
outside of '" +
+ normalizedDir + "'");
+ }
+
+ // Defense in depth against symlink traversal (only possible if the
paths exist).
+ if (java.nio.file.Files.exists(resolved) &&
java.nio.file.Files.exists(normalizedDir)) {
+ Path realDir = normalizedDir.toRealPath();
+ Path realResolved = resolved.toRealPath();
+ if (!realResolved.startsWith(realDir)) {
+ throw new IOException(
+ "'" + name + "' resolves to '" + realResolved + "',
which is outside of '" +
+ realDir + "'");
+ }
+ }
+
+ return resolved;
+ }
}
diff --git a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
index 06b8a7ecf6..23fc7bcd29 100644
--- a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
@@ -17,9 +17,14 @@
package org.apache.tika.io;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
import org.junit.jupiter.api.Test;
import org.apache.tika.utils.StringUtils;
@@ -102,6 +107,32 @@ public class FilenameUtilsTest {
testFilenameEquality("HW.txt", "_1457338542/HW.txt");
}
+ @Test
+ public void testResolveWithin() throws Exception {
+ Path dir = Paths.get("base", "isa");
+
+ // plain and nested children resolve to a path under dir
+ assertEquals(Paths.get("base", "isa", "a_assay.txt"),
+ FilenameUtils.resolveWithin(dir, "a_assay.txt"));
+ assertEquals(Paths.get("base", "isa", "sub", "a_assay.txt"),
+ FilenameUtils.resolveWithin(dir, "sub/a_assay.txt"));
+
+ // resolving the directory itself stays within it
+ assertEquals(dir.normalize(), FilenameUtils.resolveWithin(dir, "."));
+
+ // names that resolve out of dir are rejected. These also pin the
normalize() call:
+ // without it the resolved path would still textually begin with dir
and be accepted.
+ assertThrows(IOException.class, () -> FilenameUtils.resolveWithin(dir,
"../outside.txt"));
+ assertThrows(IOException.class, () -> FilenameUtils.resolveWithin(dir,
"../../outside.txt"));
+ assertThrows(IOException.class,
+ () -> FilenameUtils.resolveWithin(dir,
"sub/../../outside.txt"));
+
+ // element-wise (not string-prefix) containment: a/bc is not within
a/b even though
+ // the string "a/bc" starts with the string "a/b".
+ assertThrows(IOException.class,
+ () -> FilenameUtils.resolveWithin(Paths.get("a", "b"),
"../bc"));
+ }
+
@Test
public void testExtension() throws Exception {
assertEquals(".pdf",
FilenameUtils.getSuffixFromPath("blah/blah/or/something.pdf"));
diff --git
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
index 6c7975f82b..197c6c27fa 100644
---
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
+++
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.isatab;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.file.Path;
import java.util.Collections;
import java.util.Set;
@@ -26,6 +27,7 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -131,12 +133,15 @@ public class ISArchiveParser implements Parser {
private void parseAssay(XHTMLContentHandler xhtml, Metadata metadata,
ParseContext context)
throws IOException, SAXException, TikaException {
+ // location starts with "/C:" on windows, so build the directory Path
from a File
+ // rather than Paths.get(). The assay file names come from the
investigation file and
+ // are resolved within this directory.
+ Path locationDir = new File(this.location).toPath();
for (String assayFileName :
metadata.getValues(studyAssayFileNameField)) {
xhtml.startElement("div");
xhtml.element("h3", "ASSAY " + assayFileName);
- // location starts with "/C:" on windows, can't use Paths.get()
- try (InputStream stream = TikaInputStream.get(new
File(this.location + assayFileName).toPath()))
- {
+ Path assayFile = FilenameUtils.resolveWithin(locationDir,
assayFileName);
+ try (InputStream stream = TikaInputStream.get(assayFile)) {
ISATabUtils.parseAssay(stream, xhtml, metadata, context);
}
xhtml.endElement("div");
diff --git
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
index de9cf25ed0..884754a5bf 100644
---
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
+++
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
@@ -17,13 +17,22 @@
package org.apache.tika.parser.isatab;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import java.io.IOException;
import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
import org.xml.sax.ContentHandler;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
@@ -66,4 +75,35 @@ public class ISArchiveParserTest {
assertEquals("Stephen", metadata.get("Investigation Person First
Name"),
"Invalid Investigation Person First Name");
}
+
+ @Test
+ public void testAssayFileNameResolvedWithinDirectory(@TempDir Path root)
throws Exception {
+ // A file sitting next to (but outside of) the ISA-Tab directory.
+ Path outside = root.resolve("outside.txt");
+ Files.write(outside,
"OUTSIDE_DIRECTORY_CONTENT".getBytes(StandardCharsets.UTF_8));
+
+ // An ISA-Tab directory whose investigation file points an assay at
the sibling file
+ // using a relative name.
+ Path isaDir = Files.createDirectory(root.resolve("isa"));
+ Files.write(isaDir.resolve("i_test.txt"),
+ ("STUDY\n"
+ + "Study File Name\t\"s_test.txt\"\n"
+ + "Study Assay File Name\t\"../outside.txt\"\n")
+ .getBytes(StandardCharsets.UTF_8));
+ Path study = isaDir.resolve("s_test.txt");
+ Files.write(study, "\"Source
Name\"\n\"culture1\"\n".getBytes(StandardCharsets.UTF_8));
+
+ Parser parser = new ISArchiveParser(isaDir.toString());
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "s_test.txt");
+ ParseContext context = new ParseContext();
+
+ try (TikaInputStream tis = TikaInputStream.get(study)) {
+ assertThrows(IOException.class,
+ () -> parser.parse(tis, handler, metadata, context));
+ }
+ assertFalse(handler.toString().contains("OUTSIDE_DIRECTORY_CONTENT"),
+ "assay reader read a file outside the ISA-Tab directory");
+ }
}