Repository: tika Updated Branches: refs/heads/master aadccbf97 -> 7b45c7ceb
TIKA-1896 -- add test files and unit tests, no fix yet Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7b45c7ce Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7b45c7ce Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7b45c7ce Branch: refs/heads/master Commit: 7b45c7ceb0830cb33a04571da87dd86a817d4138 Parents: aadccbf Author: tballison <[email protected]> Authored: Mon Nov 7 21:19:56 2016 -0500 Committer: tballison <[email protected]> Committed: Mon Nov 7 21:19:56 2016 -0500 ---------------------------------------------------------------------- .../apache/tika/parser/html/HtmlParserTest.java | 17 ++++++++++++++++- .../test-documents/testHTMLBadScript.html | 9 +++++++++ .../test-documents/testHTMLGoodScript.html | 9 +++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/7b45c7ce/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index 41efcc0..75744ca 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -127,7 +127,7 @@ public class HtmlParserTest extends TikaTest { } @Test - @Ignore("The file 'testXHTML_utf8.html' is not available fo testing") + @Ignore("The file 'testXHTML_utf8.html' is not available for testing") public void XtestParseUTF8() throws IOException, SAXException, TikaException { String path = "/test-documents/testXHTML_utf8.html"; Metadata metadata = new Metadata(); @@ -1219,6 +1219,21 @@ public class HtmlParserTest extends TikaTest { } @Test + @Ignore("until we fix TIKA-1896") + public void testBadScript() throws Exception { + String xml = getXML("testHTMLBadScript.html").xml; + assertContains("This is a test", xml); + assertNotContained("cool", xml); + } + + @Test + public void testGoodScript() throws Exception { + String xml = getXML("testHTMLGoodScript.html").xml; + assertContains("This is a test", xml); + assertNotContained("cool", xml); + } + + @Test public void testMultiThreadingEncodingDetection() throws Exception { List<EncodingDetector> detectors = new ArrayList<>(); ServiceLoader loader = http://git-wip-us.apache.org/repos/asf/tika/blob/7b45c7ce/tika-parsers/src/test/resources/test-documents/testHTMLBadScript.html ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testHTMLBadScript.html b/tika-parsers/src/test/resources/test-documents/testHTMLBadScript.html new file mode 100644 index 0000000..2c61f4f --- /dev/null +++ b/tika-parsers/src/test/resources/test-documents/testHTMLBadScript.html @@ -0,0 +1,9 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> +<html> +<head> + <script lang="javascript">cool script</script language> +</head> +<body> +<p>This is a test.</p> +</body> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/7b45c7ce/tika-parsers/src/test/resources/test-documents/testHTMLGoodScript.html ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testHTMLGoodScript.html b/tika-parsers/src/test/resources/test-documents/testHTMLGoodScript.html new file mode 100644 index 0000000..f37eb98 --- /dev/null +++ b/tika-parsers/src/test/resources/test-documents/testHTMLGoodScript.html @@ -0,0 +1,9 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> +<html> +<head> + <script lang="javascript">cool script</script> +</head> +<body> +<p>This is a test.</p> +</body> +</html> \ No newline at end of file
