Repository: tika Updated Branches: refs/heads/master bc0b1f7f7 -> b2821d921
TIKA-1928 Fix detection for filenames containing a #, avoid mis-detecting that part as a page anchor Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/b2821d92 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/b2821d92 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/b2821d92 Branch: refs/heads/master Commit: b2821d921ac4cfd3be468e8bea9123f5cb627cbf Parents: bc0b1f7 Author: Nick Burch <[email protected]> Authored: Mon May 16 01:02:16 2016 +0100 Committer: Nick Burch <[email protected]> Committed: Mon May 16 01:02:16 2016 +0100 ---------------------------------------------------------------------- .../java/org/apache/tika/detect/NameDetector.java | 15 ++++++++++----- .../org/apache/tika/detect/NameDetectorTest.java | 10 ++++++++++ 2 files changed, 20 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/b2821d92/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java b/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java index 1638d50..7135493 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java @@ -97,15 +97,11 @@ public class NameDetector implements Detector { // Look for a resource name in the input metadata String name = metadata.get(Metadata.RESOURCE_NAME_KEY); if (name != null) { - // If the name is a URL, skip the trailing query and fragment parts + // If the name is a URL, skip the trailing query int question = name.indexOf('?'); if (question != -1) { name = name.substring(0, question); } - int hash = name.indexOf('#'); - if (hash != -1) { - name = name.substring(0, hash); - } // If the name is a URL or a path, skip all but the last component int slash = name.lastIndexOf('/'); @@ -117,6 +113,15 @@ public class NameDetector implements Detector { name = name.substring(backslash + 1); } + // Strip any fragments from the end, but only ones after the extension + int hash = name.lastIndexOf('#'); + int dot = name.indexOf('.'); + if (hash != -1) { + if (dot == -1 || hash > dot) { + name = name.substring(0, hash); + } + } + // Decode any potential URL encoding int percent = name.indexOf('%'); if (percent != -1) { http://git-wip-us.apache.org/repos/asf/tika/blob/b2821d92/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java index 24d4d42..23fbb15 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java @@ -65,6 +65,16 @@ public class NameDetectorTest { assertDetect(MediaType.OCTET_STREAM, "ReadMe"); // case sensitive assertDetect(MediaType.OCTET_STREAM, "README.NOW"); + // TIKA-1928 # in the filename + assertDetect(MediaType.TEXT_PLAIN, "text.txt"); + assertDetect(MediaType.TEXT_PLAIN, "text#.txt"); // # before extension + assertDetect(MediaType.TEXT_PLAIN, "text#123.txt");// # before extension + assertDetect(MediaType.TEXT_PLAIN, "text.txt#pdf");// # after extension + + // Check # as URL fragment too + assertDetect(MediaType.TEXT_PLAIN, "http://foo/test.txt?1=2#pdf"); + assertDetect(MediaType.TEXT_PLAIN, "http://foo/test.txt#pdf"); + // tough one assertDetect( MediaType.TEXT_PLAIN,
